You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
444 lines
12 KiB
444 lines
12 KiB
15 years ago
|
// -*- c-basic-offset: 2 -*-
|
||
|
/*
|
||
|
* This file is part of the KDE libraries
|
||
|
* Copyright (C) 1999-2001 Harri Porten (porten@kde.org)
|
||
|
* Copyright (C) 2003,2004 Apple Computer, Inc.
|
||
|
* Copyright (C) 2006 Maksim Orlovich (maksim@kde.org)
|
||
|
*
|
||
|
* This library is free software; you can redistribute it and/or
|
||
|
* modify it under the terms of the GNU Lesser General Public
|
||
|
* License as published by the Free Software Foundation; either
|
||
|
* version 2 of the License, or (at your option) any later version.
|
||
|
*
|
||
|
* This library is distributed in the hope that it will be useful,
|
||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||
|
* Lesser General Public License for more details.
|
||
|
*
|
||
|
* You should have received a copy of the GNU Lesser General Public
|
||
|
* License along with this library; if not, write to the Free Software
|
||
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||
|
*
|
||
|
*/
|
||
|
|
||
|
#include "regexp.h"
|
||
|
|
||
|
#include "lexer.h"
|
||
|
#include <assert.h>
|
||
|
#include <stdio.h>
|
||
|
#include <stdlib.h>
|
||
|
#include <string.h>
|
||
|
|
||
|
using namespace KJS;
|
||
|
|
||
|
#ifdef PCRE_CONFIG_UTF8
|
||
|
RegExp::UTF8SupportState RegExp::utf8Support = RegExp::Unknown;
|
||
|
#endif
|
||
|
|
||
|
RegExp::RegExp(const UString &p, int f)
|
||
|
: pat(p), flgs(f), m_notEmpty(false), valid(true), buffer(0), originalPos(0)
|
||
|
{
|
||
|
// Determine whether libpcre has unicode support if need be..
|
||
|
#ifdef PCRE_CONFIG_UTF8
|
||
|
if (utf8Support == Unknown) {
|
||
|
int supported;
|
||
|
pcre_config(PCRE_CONFIG_UTF8, (void*)&supported);
|
||
|
utf8Support = supported ? Supported : Unsupported;
|
||
|
}
|
||
|
#endif
|
||
|
|
||
|
nrSubPatterns = 0; // determined in match() with POSIX regex.
|
||
|
|
||
|
// JS regexps can contain Unicode escape sequences (\uxxxx) which
|
||
|
// are rather uncommon elsewhere. As our regexp libs don't understand
|
||
|
// them we do the unescaping ourselves internally.
|
||
|
// Also make sure to expand out any nulls as pcre_compile
|
||
|
// expects null termination..
|
||
|
UString intern;
|
||
|
const char* const nil = "\\x00";
|
||
|
if (p.find('\\') >= 0 || p.find(KJS::UChar('\0')) >= 0) {
|
||
|
bool escape = false;
|
||
|
for (int i = 0; i < p.size(); ++i) {
|
||
|
UChar c = p[i];
|
||
|
if (escape) {
|
||
|
escape = false;
|
||
|
// we only care about \u
|
||
|
if (c == 'u') {
|
||
|
// standard unicode escape sequence looks like \uxxxx but
|
||
|
// other browsers also accept less then 4 hex digits
|
||
|
unsigned short u = 0;
|
||
|
int j = 0;
|
||
|
for (j = 0; j < 4; ++j) {
|
||
|
if (i + 1 < p.size() && Lexer::isHexDigit(p[i + 1].unicode())) {
|
||
|
u = (u << 4) + Lexer::convertHex(p[i + 1].unicode());
|
||
|
++i;
|
||
|
} else {
|
||
|
// sequence incomplete. restore index.
|
||
|
// TODO: cleaner way to propagate warning
|
||
|
fprintf(stderr, "KJS: saw %d digit \\u sequence.\n", j);
|
||
|
i -= j;
|
||
|
break;
|
||
|
}
|
||
|
}
|
||
|
if (j < 4) {
|
||
|
// sequence was incomplete. treat \u as u which IE always
|
||
|
// and FF sometimes does.
|
||
|
intern.append(UString('u'));
|
||
|
} else {
|
||
|
c = UChar(u);
|
||
|
switch (u) {
|
||
|
case 0:
|
||
|
// Make sure to encode 0, to avoid terminating the string
|
||
|
intern += UString(nil);
|
||
|
break;
|
||
|
case '^':
|
||
|
case '$':
|
||
|
case '\\':
|
||
|
case '.':
|
||
|
case '*':
|
||
|
case '+':
|
||
|
case '?':
|
||
|
case '(': case ')':
|
||
|
case '{': case '}':
|
||
|
case '[': case ']':
|
||
|
case '|':
|
||
|
// escape pattern characters have to remain escaped
|
||
|
intern.append(UString('\\'));
|
||
|
// intentional fallthrough
|
||
|
default:
|
||
|
intern += UString(&c, 1);
|
||
|
break;
|
||
|
}
|
||
|
}
|
||
|
continue;
|
||
|
}
|
||
|
intern += UString('\\');
|
||
|
intern += UString(&c, 1);
|
||
|
} else {
|
||
|
if (c == '\\')
|
||
|
escape = true;
|
||
|
else if (c == '\0')
|
||
|
intern += UString(nil);
|
||
|
else
|
||
|
intern += UString(&c, 1);
|
||
|
}
|
||
|
}
|
||
|
} else {
|
||
|
intern = p;
|
||
|
}
|
||
|
|
||
|
#ifdef HAVE_PCREPOSIX
|
||
|
int pcreflags = 0;
|
||
|
const char *perrormsg;
|
||
|
int errorOffset;
|
||
|
|
||
|
if (flgs & IgnoreCase)
|
||
|
pcreflags |= PCRE_CASELESS;
|
||
|
|
||
|
if (flgs & Multiline)
|
||
|
pcreflags |= PCRE_MULTILINE;
|
||
|
|
||
|
#ifdef PCRE_CONFIG_UTF8
|
||
|
if (utf8Support == Supported)
|
||
|
pcreflags |= (PCRE_UTF8 | PCRE_NO_UTF8_CHECK);
|
||
|
#endif
|
||
|
|
||
|
// Fill our buffer with an encoded version, whether utf-8, or,
|
||
|
// if PCRE is incapable, truncated.
|
||
|
prepareMatch(intern);
|
||
|
|
||
|
pcregex = pcre_compile(buffer, pcreflags,
|
||
|
&perrormsg, &errorOffset, NULL);
|
||
|
doneMatch(); // Cleanup buffers
|
||
|
if (!pcregex) {
|
||
|
#ifndef NDEBUG
|
||
|
fprintf(stderr, "KJS: pcre_compile() failed with '%s'\n", perrormsg);
|
||
|
#endif
|
||
|
valid = false;
|
||
|
return;
|
||
|
}
|
||
|
|
||
|
#ifdef PCRE_INFO_CAPTURECOUNT
|
||
|
// Get number of subpatterns that will be returned
|
||
|
int rc = pcre_fullinfo( pcregex, NULL, PCRE_INFO_CAPTURECOUNT, &nrSubPatterns);
|
||
|
if (rc != 0)
|
||
|
#endif
|
||
|
nrSubPatterns = 0; // fallback. We always need the first pair of offsets.
|
||
|
|
||
|
#else /* HAVE_PCREPOSIX */
|
||
|
|
||
|
int regflags = 0;
|
||
|
#ifdef REG_EXTENDED
|
||
|
regflags |= REG_EXTENDED;
|
||
|
#endif
|
||
|
#ifdef REG_ICASE
|
||
|
if ( f & IgnoreCase )
|
||
|
regflags |= REG_ICASE;
|
||
|
#endif
|
||
|
|
||
|
//NOTE: Multiline is not feasible with POSIX regex.
|
||
|
//if ( f & Multiline )
|
||
|
// ;
|
||
|
// Note: the Global flag is already handled by RegExpProtoFunc::execute
|
||
|
|
||
|
int errorCode = regcomp(&preg, intern.ascii(), regflags);
|
||
|
if (errorCode != 0) {
|
||
|
#ifndef NDEBUG
|
||
|
char errorMessage[80];
|
||
|
regerror(errorCode, &preg, errorMessage, sizeof errorMessage);
|
||
|
fprintf(stderr, "KJS: regcomp failed with '%s'\n", errorMessage);
|
||
|
#endif
|
||
|
valid = false;
|
||
|
}
|
||
|
#endif
|
||
|
}
|
||
|
|
||
|
RegExp::~RegExp()
|
||
|
{
|
||
|
doneMatch(); // Be 100% sure buffers are freed
|
||
|
#ifdef HAVE_PCREPOSIX
|
||
|
if (pcregex)
|
||
|
pcre_free(pcregex);
|
||
|
#else
|
||
|
/* TODO: is this really okay after an error ? */
|
||
|
regfree(&preg);
|
||
|
#endif
|
||
|
}
|
||
|
|
||
|
void RegExp::prepareUtf8(const UString& s)
|
||
|
{
|
||
|
// Allocate a buffer big enough to hold all the characters plus \0
|
||
|
const int length = s.size();
|
||
|
buffer = new char[length * 3 + 1];
|
||
|
|
||
|
// Also create buffer for positions. We need one extra character in there,
|
||
|
// even past the \0 since the non-empty handling may jump one past the end
|
||
|
originalPos = new int[length * 3 + 2];
|
||
|
|
||
|
// Convert to runs of 8-bit characters, and generate indeces
|
||
|
// Note that we do NOT combine surrogate pairs here, as
|
||
|
// regexps operate on them as separate characters
|
||
|
char *p = buffer;
|
||
|
int *posOut = originalPos;
|
||
|
const UChar *d = s.data();
|
||
|
for (int i = 0; i != length; ++i) {
|
||
|
unsigned short c = d[i].unicode();
|
||
|
|
||
|
int sequenceLen;
|
||
|
if (c < 0x80) {
|
||
|
*p++ = (char)c;
|
||
|
sequenceLen = 1;
|
||
|
} else if (c < 0x800) {
|
||
|
*p++ = (char)((c >> 6) | 0xC0); // C0 is the 2-byte flag for UTF-8
|
||
|
*p++ = (char)((c | 0x80) & 0xBF); // next 6 bits, with high bit set
|
||
|
sequenceLen = 2;
|
||
|
} else {
|
||
|
*p++ = (char)((c >> 12) | 0xE0); // E0 is the 3-byte flag for UTF-8
|
||
|
*p++ = (char)(((c >> 6) | 0x80) & 0xBF); // next 6 bits, with high bit set
|
||
|
*p++ = (char)((c | 0x80) & 0xBF); // next 6 bits, with high bit set
|
||
|
sequenceLen = 3;
|
||
|
}
|
||
|
|
||
|
while (sequenceLen > 0) {
|
||
|
*posOut = i;
|
||
|
++posOut;
|
||
|
--sequenceLen;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
bufferSize = p - buffer;
|
||
|
|
||
|
*p++ = '\0';
|
||
|
|
||
|
// Record positions for \0, and the fictional character after that.
|
||
|
*posOut = length;
|
||
|
*(posOut+1) = length+1;
|
||
|
}
|
||
|
|
||
|
void RegExp::prepareASCII (const UString& s)
|
||
|
{
|
||
|
originalPos = 0;
|
||
|
|
||
|
// Best-effort attempt to get something done
|
||
|
// when we don't have utf 8 available -- use
|
||
|
// truncated version, and pray for the best
|
||
|
CString truncated = s.cstring();
|
||
|
buffer = new char[truncated.size() + 1];
|
||
|
memcpy(buffer, truncated.c_str(), truncated.size());
|
||
|
buffer[truncated.size()] = '\0'; // For _compile use
|
||
|
bufferSize = truncated.size();
|
||
|
}
|
||
|
|
||
|
void RegExp::prepareMatch(const UString &s)
|
||
|
{
|
||
|
delete[] originalPos; // Just to be sure..
|
||
|
delete[] buffer;
|
||
|
#ifdef PCRE_CONFIG_UTF8
|
||
|
if (utf8Support == Supported)
|
||
|
prepareUtf8(s);
|
||
|
else
|
||
|
#endif
|
||
|
prepareASCII(s);
|
||
|
|
||
|
#ifndef NDEBUG
|
||
|
originalS = s;
|
||
|
#endif
|
||
|
}
|
||
|
|
||
|
void RegExp::doneMatch()
|
||
|
{
|
||
|
delete[] originalPos; originalPos = 0;
|
||
|
delete[] buffer; buffer = 0;
|
||
|
}
|
||
|
|
||
|
UString RegExp::match(const UString &s, int i, int *pos, int **ovector)
|
||
|
{
|
||
|
#ifndef NDEBUG
|
||
|
assert(s.data() == originalS.data()); // Make sure prepareMatch got called right..
|
||
|
#endif
|
||
|
assert(valid);
|
||
|
|
||
|
if (i < 0)
|
||
|
i = 0;
|
||
|
if (ovector)
|
||
|
*ovector = 0L;
|
||
|
int dummyPos;
|
||
|
if (!pos)
|
||
|
pos = &dummyPos;
|
||
|
*pos = -1;
|
||
|
if (i > s.size() || s.isNull())
|
||
|
return UString::null;
|
||
|
|
||
|
#ifdef HAVE_PCREPOSIX
|
||
|
int ovecsize = (nrSubPatterns+1)*3; // see pcre docu
|
||
|
if (ovector) *ovector = new int[ovecsize];
|
||
|
if (!pcregex)
|
||
|
return UString::null;
|
||
|
|
||
|
int startPos;
|
||
|
int nextPos;
|
||
|
|
||
|
#ifdef PCRE_CONFIG_UTF8
|
||
|
if (utf8Support == Supported) {
|
||
|
startPos = i;
|
||
|
while (originalPos[startPos] < i)
|
||
|
++startPos;
|
||
|
|
||
|
nextPos = startPos;
|
||
|
while (originalPos[nextPos] < (i + 1))
|
||
|
++nextPos;
|
||
|
} else
|
||
|
#endif
|
||
|
{
|
||
|
startPos = i;
|
||
|
nextPos = i + 1;
|
||
|
}
|
||
|
|
||
|
int baseFlags =
|
||
|
#ifdef PCRE_CONFIG_UTF8
|
||
|
utf8Support == Supported ? PCRE_NO_UTF8_CHECK :
|
||
|
#endif
|
||
|
0;
|
||
|
if (pcre_exec(pcregex, NULL, buffer, bufferSize, startPos,
|
||
|
m_notEmpty ? (PCRE_NOTEMPTY | PCRE_ANCHORED | baseFlags) : baseFlags, // see man pcretest
|
||
|
ovector ? *ovector : 0L, ovecsize) == PCRE_ERROR_NOMATCH)
|
||
|
{
|
||
|
// Failed to match.
|
||
|
if ((flgs & Global) && m_notEmpty && ovector)
|
||
|
{
|
||
|
// We set m_notEmpty ourselves, to look for a non-empty match
|
||
|
// (see man pcretest or pcretest.c for details).
|
||
|
// So we don't stop here, we want to try again at i+1.
|
||
|
#ifdef KJS_VERBOSE
|
||
|
fprintf(stderr, "No match after m_notEmpty. +1 and keep going.\n");
|
||
|
#endif
|
||
|
m_notEmpty = 0;
|
||
|
if (pcre_exec(pcregex, NULL, buffer, bufferSize, nextPos, baseFlags,
|
||
|
ovector ? *ovector : 0L, ovecsize) == PCRE_ERROR_NOMATCH)
|
||
|
return UString::null;
|
||
|
}
|
||
|
else // done
|
||
|
return UString::null;
|
||
|
}
|
||
|
|
||
|
// Got a match, proceed with it.
|
||
|
// But fix up the ovector if need be..
|
||
|
if (ovector && originalPos) {
|
||
|
for (unsigned c = 0; c < 2 * (nrSubPatterns + 1); ++c) {
|
||
|
if ((*ovector)[c] != -1)
|
||
|
(*ovector)[c] = originalPos[(*ovector)[c]];
|
||
|
}
|
||
|
}
|
||
|
|
||
|
if (!ovector)
|
||
|
return UString::null; // don't rely on the return value if you pass ovector==0
|
||
|
#else
|
||
|
const uint maxMatch = 10;
|
||
|
regmatch_t rmatch[maxMatch];
|
||
|
|
||
|
char *str = strdup(s.ascii()); // TODO: why ???
|
||
|
if (regexec(&preg, str + i, maxMatch, rmatch, 0)) {
|
||
|
free(str);
|
||
|
return UString::null;
|
||
|
}
|
||
|
free(str);
|
||
|
|
||
|
if (!ovector) {
|
||
|
*pos = rmatch[0].rm_so + i;
|
||
|
return s.substr(rmatch[0].rm_so + i, rmatch[0].rm_eo - rmatch[0].rm_so);
|
||
|
}
|
||
|
|
||
|
// map rmatch array to ovector used in PCRE case
|
||
|
nrSubPatterns = 0;
|
||
|
for (uint j = 0; j < maxMatch && rmatch[j].rm_so >= 0 ; j++) {
|
||
|
nrSubPatterns++;
|
||
|
// if the nonEmpty flag is set, return a failed match if any of the
|
||
|
// subMatches happens to be an empty string.
|
||
|
if (m_notEmpty && rmatch[j].rm_so == rmatch[j].rm_eo)
|
||
|
return UString::null;
|
||
|
}
|
||
|
// Allow an ovector slot to return the (failed) match result.
|
||
|
if (nrSubPatterns == 0) nrSubPatterns = 1;
|
||
|
|
||
|
int ovecsize = (nrSubPatterns)*3; // see above
|
||
|
*ovector = new int[ovecsize];
|
||
|
for (uint j = 0; j < nrSubPatterns; j++) {
|
||
|
(*ovector)[2*j] = rmatch[j].rm_so + i;
|
||
|
(*ovector)[2*j+1] = rmatch[j].rm_eo + i;
|
||
|
}
|
||
|
#endif
|
||
|
|
||
|
*pos = (*ovector)[0];
|
||
|
if ( *pos == (*ovector)[1] && (flgs & Global) )
|
||
|
{
|
||
|
// empty match, next try will be with m_notEmpty=true
|
||
|
m_notEmpty=true;
|
||
|
}
|
||
|
return s.substr((*ovector)[0], (*ovector)[1] - (*ovector)[0]);
|
||
|
}
|
||
|
|
||
|
#if 0 // unused
|
||
|
bool RegExp::test(const UString &s, int)
|
||
|
{
|
||
|
#ifdef HAVE_PCREPOSIX
|
||
|
int ovector[300];
|
||
|
CString buffer(s.cstring());
|
||
|
|
||
|
if (s.isNull() ||
|
||
|
pcre_exec(pcregex, NULL, buffer.c_str(), buffer.size(), 0,
|
||
|
0, ovector, 300) == PCRE_ERROR_NOMATCH)
|
||
|
return false;
|
||
|
else
|
||
|
return true;
|
||
|
|
||
|
#else
|
||
|
|
||
|
char *str = strdup(s.ascii());
|
||
|
int r = regexec(&preg, str, 0, 0, 0);
|
||
|
free(str);
|
||
|
|
||
|
return r == 0;
|
||
|
#endif
|
||
|
}
|
||
|
#endif
|