|
|
@ -30,21 +30,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
using namespace KJS;
|
|
|
|
using namespace KJS;
|
|
|
|
|
|
|
|
|
|
|
|
#ifdef PCRE_CONFIG_UTF8
|
|
|
|
|
|
|
|
RegExp::UTF8SupportState RegExp::utf8Support = RegExp::Unknown;
|
|
|
|
RegExp::UTF8SupportState RegExp::utf8Support = RegExp::Unknown;
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
RegExp::RegExp(const UString &p, int f)
|
|
|
|
RegExp::RegExp(const UString &p, int f)
|
|
|
|
: pat(p), flgs(f), m_notEmpty(false), valid(true), buffer(0), originalPos(0)
|
|
|
|
: pat(p), flgs(f), m_notEmpty(false), valid(true), buffer(0), originalPos(0)
|
|
|
|
{
|
|
|
|
{
|
|
|
|
// Determine whether libpcre has unicode support if need be..
|
|
|
|
// Determine whether libpcre has unicode support if need be..
|
|
|
|
#ifdef PCRE_CONFIG_UTF8
|
|
|
|
|
|
|
|
if (utf8Support == Unknown) {
|
|
|
|
if (utf8Support == Unknown) {
|
|
|
|
int supported;
|
|
|
|
uint32_t supported;
|
|
|
|
pcre_config(PCRE_CONFIG_UTF8, (void*)&supported);
|
|
|
|
pcre2_config(PCRE2_CONFIG_COMPILED_WIDTHS, (void*)&supported);
|
|
|
|
utf8Support = supported ? Supported : Unsupported;
|
|
|
|
utf8Support = (supported & 0x0001) ? Supported : Unsupported;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
nrSubPatterns = 0; // determined in match() with POSIX regex.
|
|
|
|
nrSubPatterns = 0; // determined in match() with POSIX regex.
|
|
|
|
|
|
|
|
|
|
|
@ -126,45 +122,46 @@ RegExp::RegExp(const UString &p, int f)
|
|
|
|
intern = p;
|
|
|
|
intern = p;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
#ifdef HAVE_PCREPOSIX
|
|
|
|
#ifdef HAVE_PCRE2POSIX
|
|
|
|
int pcreflags = 0;
|
|
|
|
uint32_t pcre2flags = 0;
|
|
|
|
const char *perrormsg;
|
|
|
|
int errorCode;
|
|
|
|
int errorOffset;
|
|
|
|
PCRE2_SIZE errorOffset;
|
|
|
|
|
|
|
|
|
|
|
|
if (flgs & IgnoreCase)
|
|
|
|
if (flgs & IgnoreCase)
|
|
|
|
pcreflags |= PCRE_CASELESS;
|
|
|
|
pcre2flags |= PCRE2_CASELESS;
|
|
|
|
|
|
|
|
|
|
|
|
if (flgs & Multiline)
|
|
|
|
if (flgs & Multiline)
|
|
|
|
pcreflags |= PCRE_MULTILINE;
|
|
|
|
pcre2flags |= PCRE2_MULTILINE;
|
|
|
|
|
|
|
|
|
|
|
|
#ifdef PCRE_CONFIG_UTF8
|
|
|
|
|
|
|
|
if (utf8Support == Supported)
|
|
|
|
if (utf8Support == Supported)
|
|
|
|
pcreflags |= (PCRE_UTF8 | PCRE_NO_UTF8_CHECK);
|
|
|
|
pcre2flags |= (PCRE2_UTF | PCRE2_NO_UTF_CHECK);
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Fill our buffer with an encoded version, whether utf-8, or,
|
|
|
|
// Fill our buffer with an encoded version, whether utf-8, or,
|
|
|
|
// if PCRE is incapable, truncated.
|
|
|
|
// if PCRE is incapable, truncated.
|
|
|
|
prepareMatch(intern);
|
|
|
|
prepareMatch(intern);
|
|
|
|
|
|
|
|
|
|
|
|
pcregex = pcre_compile(buffer, pcreflags,
|
|
|
|
pcregex = pcre2_compile(buffer, PCRE2_ZERO_TERMINATED, pcre2flags,
|
|
|
|
&perrormsg, &errorOffset, NULL);
|
|
|
|
&errorCode, &errorOffset, NULL);
|
|
|
|
doneMatch(); // Cleanup buffers
|
|
|
|
doneMatch(); // Cleanup buffers
|
|
|
|
if (!pcregex) {
|
|
|
|
if (!pcregex) {
|
|
|
|
#ifndef NDEBUG
|
|
|
|
#ifndef NDEBUG
|
|
|
|
fprintf(stderr, "KJS: pcre_compile() failed with '%s'\n", perrormsg);
|
|
|
|
PCRE2_UCHAR errorMsg[256];
|
|
|
|
|
|
|
|
pcre2_get_error_message(errorCode, errorMsg, sizeof(errorMsg));
|
|
|
|
|
|
|
|
fprintf(stderr, "KJS: pcre_compile() failed with '%s'\n", errorMsg);
|
|
|
|
#endif
|
|
|
|
#endif
|
|
|
|
valid = false;
|
|
|
|
valid = false;
|
|
|
|
return;
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
#ifdef PCRE_INFO_CAPTURECOUNT
|
|
|
|
|
|
|
|
// Get number of subpatterns that will be returned
|
|
|
|
// Get number of subpatterns that will be returned
|
|
|
|
int rc = pcre_fullinfo( pcregex, NULL, PCRE_INFO_CAPTURECOUNT, &nrSubPatterns);
|
|
|
|
int rc = pcre2_pattern_info(pcregex, PCRE2_INFO_CAPTURECOUNT, &nrSubPatterns);
|
|
|
|
if (rc != 0)
|
|
|
|
if (rc != 0)
|
|
|
|
#endif
|
|
|
|
{
|
|
|
|
nrSubPatterns = 0; // fallback. We always need the first pair of offsets.
|
|
|
|
nrSubPatterns = 0; // fallback. We always need the first pair of offsets.
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
#else /* HAVE_PCREPOSIX */
|
|
|
|
match_data = pcre2_match_data_create_from_pattern(pcregex, NULL);
|
|
|
|
|
|
|
|
#else
|
|
|
|
|
|
|
|
|
|
|
|
int regflags = 0;
|
|
|
|
int regflags = 0;
|
|
|
|
#ifdef REG_EXTENDED
|
|
|
|
#ifdef REG_EXTENDED
|
|
|
@ -195,9 +192,15 @@ RegExp::RegExp(const UString &p, int f)
|
|
|
|
RegExp::~RegExp()
|
|
|
|
RegExp::~RegExp()
|
|
|
|
{
|
|
|
|
{
|
|
|
|
doneMatch(); // Be 100% sure buffers are freed
|
|
|
|
doneMatch(); // Be 100% sure buffers are freed
|
|
|
|
#ifdef HAVE_PCREPOSIX
|
|
|
|
#ifdef HAVE_PCRE2POSIX
|
|
|
|
|
|
|
|
if (match_data)
|
|
|
|
|
|
|
|
{
|
|
|
|
|
|
|
|
pcre2_match_data_free(match_data);
|
|
|
|
|
|
|
|
}
|
|
|
|
if (pcregex)
|
|
|
|
if (pcregex)
|
|
|
|
pcre_free(pcregex);
|
|
|
|
{
|
|
|
|
|
|
|
|
pcre2_code_free(pcregex);
|
|
|
|
|
|
|
|
}
|
|
|
|
#else
|
|
|
|
#else
|
|
|
|
/* TODO: is this really okay after an error ? */
|
|
|
|
/* TODO: is this really okay after an error ? */
|
|
|
|
regfree(&preg);
|
|
|
|
regfree(&preg);
|
|
|
@ -208,7 +211,7 @@ void RegExp::prepareUtf8(const UString& s)
|
|
|
|
{
|
|
|
|
{
|
|
|
|
// Allocate a buffer big enough to hold all the characters plus \0
|
|
|
|
// Allocate a buffer big enough to hold all the characters plus \0
|
|
|
|
const int length = s.size();
|
|
|
|
const int length = s.size();
|
|
|
|
buffer = new char[length * 3 + 1];
|
|
|
|
buffer = new buftype_t[length * 3 + 1];
|
|
|
|
|
|
|
|
|
|
|
|
// Also create buffer for positions. We need one extra character in there,
|
|
|
|
// Also create buffer for positions. We need one extra character in there,
|
|
|
|
// even past the \0 since the non-empty handling may jump one past the end
|
|
|
|
// even past the \0 since the non-empty handling may jump one past the end
|
|
|
@ -217,7 +220,7 @@ void RegExp::prepareUtf8(const UString& s)
|
|
|
|
// Convert to runs of 8-bit characters, and generate indices
|
|
|
|
// Convert to runs of 8-bit characters, and generate indices
|
|
|
|
// Note that we do NOT combine surrogate pairs here, as
|
|
|
|
// Note that we do NOT combine surrogate pairs here, as
|
|
|
|
// regexps operate on them as separate characters
|
|
|
|
// regexps operate on them as separate characters
|
|
|
|
char *p = buffer;
|
|
|
|
buftype_t *p = buffer;
|
|
|
|
int *posOut = originalPos;
|
|
|
|
int *posOut = originalPos;
|
|
|
|
const UChar *d = s.data();
|
|
|
|
const UChar *d = s.data();
|
|
|
|
for (int i = 0; i != length; ++i) {
|
|
|
|
for (int i = 0; i != length; ++i) {
|
|
|
@ -225,16 +228,16 @@ void RegExp::prepareUtf8(const UString& s)
|
|
|
|
|
|
|
|
|
|
|
|
int sequenceLen;
|
|
|
|
int sequenceLen;
|
|
|
|
if (c < 0x80) {
|
|
|
|
if (c < 0x80) {
|
|
|
|
*p++ = (char)c;
|
|
|
|
*p++ = (buftype_t)c;
|
|
|
|
sequenceLen = 1;
|
|
|
|
sequenceLen = 1;
|
|
|
|
} else if (c < 0x800) {
|
|
|
|
} else if (c < 0x800) {
|
|
|
|
*p++ = (char)((c >> 6) | 0xC0); // C0 is the 2-byte flag for UTF-8
|
|
|
|
*p++ = (buftype_t)((c >> 6) | 0xC0); // C0 is the 2-byte flag for UTF-8
|
|
|
|
*p++ = (char)((c | 0x80) & 0xBF); // next 6 bits, with high bit set
|
|
|
|
*p++ = (buftype_t)((c | 0x80) & 0xBF); // next 6 bits, with high bit set
|
|
|
|
sequenceLen = 2;
|
|
|
|
sequenceLen = 2;
|
|
|
|
} else {
|
|
|
|
} else {
|
|
|
|
*p++ = (char)((c >> 12) | 0xE0); // E0 is the 3-byte flag for UTF-8
|
|
|
|
*p++ = (buftype_t)((c >> 12) | 0xE0); // E0 is the 3-byte flag for UTF-8
|
|
|
|
*p++ = (char)(((c >> 6) | 0x80) & 0xBF); // next 6 bits, with high bit set
|
|
|
|
*p++ = (buftype_t)(((c >> 6) | 0x80) & 0xBF); // next 6 bits, with high bit set
|
|
|
|
*p++ = (char)((c | 0x80) & 0xBF); // next 6 bits, with high bit set
|
|
|
|
*p++ = (buftype_t)((c | 0x80) & 0xBF); // next 6 bits, with high bit set
|
|
|
|
sequenceLen = 3;
|
|
|
|
sequenceLen = 3;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
@ -262,7 +265,7 @@ void RegExp::prepareASCII (const UString& s)
|
|
|
|
// when we don't have utf 8 available -- use
|
|
|
|
// when we don't have utf 8 available -- use
|
|
|
|
// truncated version, and pray for the best
|
|
|
|
// truncated version, and pray for the best
|
|
|
|
CString truncated = s.cstring();
|
|
|
|
CString truncated = s.cstring();
|
|
|
|
buffer = new char[truncated.size() + 1];
|
|
|
|
buffer = new buftype_t[truncated.size() + 1];
|
|
|
|
memcpy(buffer, truncated.c_str(), truncated.size());
|
|
|
|
memcpy(buffer, truncated.c_str(), truncated.size());
|
|
|
|
buffer[truncated.size()] = '\0'; // For _compile use
|
|
|
|
buffer[truncated.size()] = '\0'; // For _compile use
|
|
|
|
bufferSize = truncated.size();
|
|
|
|
bufferSize = truncated.size();
|
|
|
@ -272,11 +275,9 @@ void RegExp::prepareMatch(const UString &s)
|
|
|
|
{
|
|
|
|
{
|
|
|
|
delete[] originalPos; // Just to be sure..
|
|
|
|
delete[] originalPos; // Just to be sure..
|
|
|
|
delete[] buffer;
|
|
|
|
delete[] buffer;
|
|
|
|
#ifdef PCRE_CONFIG_UTF8
|
|
|
|
|
|
|
|
if (utf8Support == Supported)
|
|
|
|
if (utf8Support == Supported)
|
|
|
|
prepareUtf8(s);
|
|
|
|
prepareUtf8(s);
|
|
|
|
else
|
|
|
|
else
|
|
|
|
#endif
|
|
|
|
|
|
|
|
prepareASCII(s);
|
|
|
|
prepareASCII(s);
|
|
|
|
|
|
|
|
|
|
|
|
#ifndef NDEBUG
|
|
|
|
#ifndef NDEBUG
|
|
|
@ -308,17 +309,16 @@ UString RegExp::match(const UString &s, int i, int *pos, int **ovector)
|
|
|
|
if (i > s.size() || s.isNull())
|
|
|
|
if (i > s.size() || s.isNull())
|
|
|
|
return UString::null;
|
|
|
|
return UString::null;
|
|
|
|
|
|
|
|
|
|
|
|
#ifdef HAVE_PCREPOSIX
|
|
|
|
#ifdef HAVE_PCRE2POSIX
|
|
|
|
int ovecsize = (nrSubPatterns+1)*3; // see pcre docu
|
|
|
|
if (!pcregex || !match_data)
|
|
|
|
if (ovector) *ovector = new int[ovecsize];
|
|
|
|
return UString::null;
|
|
|
|
if (!pcregex)
|
|
|
|
if (!ovector)
|
|
|
|
return UString::null;
|
|
|
|
return UString::null;
|
|
|
|
|
|
|
|
|
|
|
|
int startPos;
|
|
|
|
int startPos;
|
|
|
|
int nextPos;
|
|
|
|
int nextPos;
|
|
|
|
|
|
|
|
if (utf8Support == Supported)
|
|
|
|
#ifdef PCRE_CONFIG_UTF8
|
|
|
|
{
|
|
|
|
if (utf8Support == Supported) {
|
|
|
|
|
|
|
|
startPos = i;
|
|
|
|
startPos = i;
|
|
|
|
while (originalPos[startPos] < i)
|
|
|
|
while (originalPos[startPos] < i)
|
|
|
|
++startPos;
|
|
|
|
++startPos;
|
|
|
@ -328,53 +328,59 @@ UString RegExp::match(const UString &s, int i, int *pos, int **ovector)
|
|
|
|
while (originalPos[nextPos] < (i + 1))
|
|
|
|
while (originalPos[nextPos] < (i + 1))
|
|
|
|
++nextPos;
|
|
|
|
++nextPos;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
} else
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
else
|
|
|
|
{
|
|
|
|
{
|
|
|
|
startPos = i;
|
|
|
|
startPos = i;
|
|
|
|
nextPos = i + (i < s.size() ? 1 : 0);
|
|
|
|
nextPos = i + (i < s.size() ? 1 : 0);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
int baseFlags =
|
|
|
|
uint32_t baseFlags = (utf8Support == Supported ? PCRE2_NO_UTF_CHECK : 0);
|
|
|
|
#ifdef PCRE_CONFIG_UTF8
|
|
|
|
if (m_notEmpty)
|
|
|
|
utf8Support == Supported ? PCRE_NO_UTF8_CHECK :
|
|
|
|
{
|
|
|
|
#endif
|
|
|
|
baseFlags |= PCRE2_NOTEMPTY | PCRE2_ANCHORED;
|
|
|
|
0;
|
|
|
|
}
|
|
|
|
int numMatches = pcre_exec(pcregex, NULL, buffer, bufferSize, startPos,
|
|
|
|
int numMatches = pcre2_match(pcregex, buffer, PCRE2_ZERO_TERMINATED, startPos, baseFlags, match_data, NULL);
|
|
|
|
m_notEmpty ? (PCRE_NOTEMPTY | PCRE_ANCHORED | baseFlags) : baseFlags, // see man pcretest
|
|
|
|
if (numMatches <= 0)
|
|
|
|
ovector ? *ovector : 0L, ovecsize);
|
|
|
|
|
|
|
|
if (numMatches < 0)
|
|
|
|
|
|
|
|
{
|
|
|
|
{
|
|
|
|
// Failed to match.
|
|
|
|
// Failed to match.
|
|
|
|
if (numMatches == PCRE_ERROR_NOMATCH && (flgs & Global) && m_notEmpty && ovector && startPos < nextPos)
|
|
|
|
if (numMatches == PCRE2_ERROR_NOMATCH && (flgs & Global) && m_notEmpty && startPos < nextPos)
|
|
|
|
{
|
|
|
|
{
|
|
|
|
// We set m_notEmpty ourselves, to look for a non-empty match
|
|
|
|
// We set m_notEmpty ourselves, to look for a non-empty match
|
|
|
|
// (see man pcretest or pcretest.c for details).
|
|
|
|
|
|
|
|
// So we don't stop here, we want to try again at i+1.
|
|
|
|
// So we don't stop here, we want to try again at i+1.
|
|
|
|
#ifdef KJS_VERBOSE
|
|
|
|
#ifdef KJS_VERBOSE
|
|
|
|
fprintf(stderr, "No match after m_notEmpty. +1 and keep going.\n");
|
|
|
|
fprintf(stderr, "No match after m_notEmpty. +1 and keep going.\n");
|
|
|
|
#endif
|
|
|
|
#endif
|
|
|
|
m_notEmpty = 0;
|
|
|
|
m_notEmpty = 0;
|
|
|
|
numMatches = pcre_exec(pcregex, NULL, buffer, bufferSize, nextPos, baseFlags,
|
|
|
|
baseFlags = (utf8Support == Supported ? PCRE2_NO_UTF_CHECK : 0);
|
|
|
|
ovector ? *ovector : 0L, ovecsize);
|
|
|
|
numMatches = pcre2_match(pcregex, buffer, PCRE2_ZERO_TERMINATED, nextPos, baseFlags, match_data, NULL);
|
|
|
|
if (numMatches < 0)
|
|
|
|
if (numMatches <= 0)
|
|
|
|
return UString::null;
|
|
|
|
return UString::null;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
else // done
|
|
|
|
else
|
|
|
|
return UString::null;
|
|
|
|
return UString::null;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// Got a match, proceed with it.
|
|
|
|
PCRE2_SIZE *pcre2_ovector = pcre2_get_ovector_pointer(match_data);
|
|
|
|
// But fix up the ovector if need be..
|
|
|
|
if (!pcre2_ovector)
|
|
|
|
if (ovector && originalPos) {
|
|
|
|
return UString::null;
|
|
|
|
for (unsigned c = 0; c < 2 * TQMIN((unsigned)numMatches, nrSubPatterns+1); ++c) {
|
|
|
|
|
|
|
|
if ((*ovector)[c] != -1)
|
|
|
|
uint32_t pcre2_ovecCount = pcre2_get_ovector_count(match_data);
|
|
|
|
(*ovector)[c] = originalPos[(*ovector)[c]];
|
|
|
|
*ovector = new int[pcre2_ovecCount * 2];
|
|
|
|
|
|
|
|
if (originalPos)
|
|
|
|
|
|
|
|
{
|
|
|
|
|
|
|
|
for (size_t c = 0; c < 2 * pcre2_ovecCount; ++c)
|
|
|
|
|
|
|
|
{
|
|
|
|
|
|
|
|
(*ovector)[c] = (pcre2_ovector[c] != -1) ? originalPos[pcre2_ovector[c]] : -1;
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
else
|
|
|
|
|
|
|
|
{
|
|
|
|
|
|
|
|
for (size_t c = 0; c < 2 * pcre2_ovecCount; ++c)
|
|
|
|
|
|
|
|
{
|
|
|
|
|
|
|
|
(*ovector)[c] = pcre2_ovector[c];
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
if (!ovector)
|
|
|
|
|
|
|
|
return UString::null; // don't rely on the return value if you pass ovector==0
|
|
|
|
|
|
|
|
#else
|
|
|
|
#else
|
|
|
|
const uint maxMatch = 10;
|
|
|
|
const uint maxMatch = 10;
|
|
|
|
regmatch_t rmatch[maxMatch];
|
|
|
|
regmatch_t rmatch[maxMatch];
|
|
|
@ -419,28 +425,3 @@ UString RegExp::match(const UString &s, int i, int *pos, int **ovector)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return s.substr((*ovector)[0], (*ovector)[1] - (*ovector)[0]);
|
|
|
|
return s.substr((*ovector)[0], (*ovector)[1] - (*ovector)[0]);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
#if 0 // unused
|
|
|
|
|
|
|
|
// Tests the subject string `s` against this object's compiled pattern and
// reports match / no match. The second, unnamed int parameter is ignored.
// This definition sits inside an `#if 0 // unused` guard, so it is never
// compiled.
// Returns true when the backend reports a match, false otherwise (see the
// per-backend notes below).
bool RegExp::test(const UString &s, int)
|
|
|
|
|
|
|
|
{
|
|
|
|
|
|
|
|
#ifdef HAVE_PCREPOSIX
|
|
|
|
|
|
|
|
// Scratch array for the offsets pcre_exec writes; it is never read back here.
int ovector[300];
|
|
|
|
|
|
|
|
// CString form of the subject; note it is built before the isNull() check below.
CString buffer(s.cstring());
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// A null subject never matches. Otherwise the search starts at offset 0 with
// no option flags, and only PCRE_ERROR_NOMATCH is treated as "no match".
// NOTE(review): any other pcre_exec result, including other error codes,
// falls through to `return true` -- confirm that is intended.
if (s.isNull() ||
|
|
|
|
|
|
|
|
pcre_exec(pcregex, NULL, buffer.c_str(), buffer.size(), 0,
|
|
|
|
|
|
|
|
0, ovector, 300) == PCRE_ERROR_NOMATCH)
|
|
|
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
else
|
|
|
|
|
|
|
|
return true;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#else
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// POSIX regex path: work on a heap copy of s.ascii(), released right after
// the call. NOTE(review): the strdup result is not checked for NULL.
char *str = strdup(s.ascii());
|
|
|
|
|
|
|
|
// nmatch and pmatch are both 0: only the match / no-match status is wanted.
int r = regexec(&preg, str, 0, 0, 0);
|
|
|
|
|
|
|
|
free(str);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// regexec returns 0 when the pattern matched.
return r == 0;
|
|
|
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|