You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
554 lines
18 KiB
C++
554 lines
18 KiB
C++
#include "tequivchars.h"

#include <locale.h>
#include <stdio.h>
#include <wchar.h>

#include <vector>
|
|
|
|
// #define TSM_PCRE2
|
|
// #define TEQUIVCHARS_HASH_LOOKUP
|
|
|
|
#ifdef TEQUIVCHARS_HASH_LOOKUP      // Hash table lookup

#include "tsl/hopscotch_map.h"

#else                               // Binary search on a sorted array

// Dimensions of the lookup table. Both values must be kept in step with
// the contents of tequivchars-mapping.h.
inline constexpr uint EquivTableROWS = 2993;
inline constexpr uint EquivTableCOLS = 2;

#endif // TEQUIVCHARS_HASH_LOOKUP
|
|
|
|
// Private data of TEquivChars: holds the table that maps a character to its
// equivalent. The table contents come from tequivchars-mapping.h in both
// build variants.
class TEquivChars_Private
{
  public:

#ifdef TEQUIVCHARS_HASH_LOOKUP

    // Hash table variant: maps a character to its replacement.
    tsl::hopscotch_map<wchar_t, wchar_t> EquivalentsTable = {
#include "tequivchars-mapping.h"
    };

#else // Sorted array variant, searched with lookup()

    // One row per mapping; lookup() compares column 0 and returns column 1.
    wchar_t EquivalentsTable[EquivTableROWS][EquivTableCOLS] = {
#include "tequivchars-mapping.h"
    };

    // Binary search over EquivalentsTable (defined below the class).
    inline wchar_t lookup( wchar_t key );

#endif // TEQUIVCHARS_HASH_LOOKUP
};
|
|
|
|
#ifndef TEQUIVCHARS_HASH_LOOKUP // Using binary search on sorted array
|
|
inline wchar_t TEquivChars_Private::lookup( wchar_t key )
|
|
{
|
|
int low = 0;
|
|
int high = EquivTableROWS - 1;
|
|
int iteration = 0;
|
|
while (low <= high) {
|
|
iteration++;
|
|
int mid = low + (high - low) / 2;
|
|
// fprintf( stderr, "high = %u, mid = %u, low = %u\n", high, mid, low );
|
|
if (EquivalentsTable[mid][0] == key) {
|
|
//-Debug: fprintf( stderr,
|
|
//-Debug: "After %d lookups, character 0x%04x was mapped to 0x%04x \n",
|
|
//-Debug: iteration, key, EquivalentsTable[mid][1]
|
|
//-Debug: );
|
|
return EquivalentsTable[mid][1];
|
|
}
|
|
if ( EquivalentsTable[mid][0] < key )
|
|
low = mid + 1;
|
|
else
|
|
high = mid - 1;
|
|
}
|
|
//-Debug: fprintf( stderr, "Not found after %d lookups\n", iteration );
|
|
return key;
|
|
}
|
|
#endif // TEQUIVCHARS_HASH_LOOKUP
|
|
|
|
// Creates the private data object (and with it the equivalents table) with new.
TEquivChars::TEquivChars()
    : p( new TEquivChars_Private )
{
}
|
|
|
|
// Releases the private data object held in 'p'.
TEquivChars::~TEquivChars()
{
    delete p;
}
|
|
|
|
/*
|
|
There are 2 implementations of table lookup within this function: one that
|
|
uses hash table lookup and another that uses binary search on a presorted
|
|
table. We default using the binary search implementation for these reasons:
|
|
|
|
1. The hash table implementation consumes more RAM.
|
|
2. The hash table implementation consumes slightly less CPU when used on a
|
|
large dataset that contains no replaceable characters (all misses) BUT
|
|
consumes significantly more CPU when used on dataset that contains many
|
|
replaceable characters (mostly hits).
|
|
3. The theoretical benefits of using hash lookups on such a small table
|
|
will probably never be realized.
|
|
|
|
The hashing used was
|
|
|
|
https://github.com/Tessil/robin-map
|
|
*/
|
|
|
|
/*
    Returns a copy of inputQstring in which each character that has an entry
    in the equivalents table is replaced by its equivalent.

    inputQstring  Text, or regular expression, to convert.
    isRegex       When false every character is looked up.
                  When true the string is scanned as a regular expression and
                  the following are copied through without lookup:
                    - a backslash and the character that follows it
                    - the '[' opening and the ']' closing a character class
                    - a '^', ']' or ':' directly after the opening '['
                    - POSIX bracket expressions [:___:] inside a class
                  and, only when TSM_PCRE2 is defined:
                    - \Q___\E literals
                    - the braces part of \N{___} \p{___} \P{___} \g{___}
                    - '*' and '?' characters, (*___) directives and
                      (?<___> group names
*/
TQString TEquivChars::replaceChars( TQString inputQstring, bool isRegex )
{
    const int inStrLen = inputQstring.length();
    TQString outString = TQString::fromLatin1( "" );
    const TQChar *char16 = inputQstring.unicode();

    bool backSlashed = false;           // \_
    bool startedCharClass = false;      // Previous character was starting '[' of character class
    bool inCharacterClass = false;      // [___]
    bool inPosixBracketExpr = false;    // [:___:]
#ifdef TSM_PCRE2
    bool quoteLiteral = false;          // \Q___\E
    bool inBraceExpr = false;           // \c{___} where 'c' is any of: 'N' 'P' 'p' 'g'
    bool inDirective = false;           // (*___)
    bool inGroupName = false;           // (?<___>
#endif // TSM_PCRE2
    wchar_t currChar = 0;
    wchar_t prevChar = 0;
    wchar_t nextChar = 0;

    // The increment clause appends currChar to the output on every pass,
    // including passes ended by 'continue'. Inside the loop 'continue'
    // therefore means "copy this character unchanged".
    for ( int i = 0 ; i < inStrLen ; i++ , outString += TQChar(currChar) )
    {
        prevChar = currChar;
        currChar = char16[i].unicode();

        if ( isRegex ) {

            // Look ahead one character; 0 stands for "end of string"
            if ( i < ( inStrLen - 1 ) )
                nextChar = char16[i+1].unicode();
            else
                nextChar = 0;

            if ( currChar == L'\\' ) {
                backSlashed = true;
                continue;
            }

            // Don't convert backSlashed characters
            if ( backSlashed ) {
#ifdef TSM_PCRE2
                switch (currChar) {
                    case 'Q' : quoteLiteral = true;  break; // Entering literal \Q___\E
                    case 'E' : quoteLiteral = false; break; // Leaving literal \Q___\E
                    case 'N' : // Entering Unicode codepoint specification \N{U+___} ?
                    case 'P' : // Entering (negated) Unicode property specification \P{} ?
                    case 'p' : // Entering Unicode property specification \p{} ?
                    case 'g' : // Entering a named backreference \g{___} ?
                        if ( nextChar == L'{' ) inBraceExpr = true;
                        break;
                }
#endif // TSM_PCRE2
                backSlashed = false;
                continue;
            }

#ifdef TSM_PCRE2
            if ( quoteLiteral )
                continue;

            if ( inBraceExpr ) {
                // Is it time to leave brace expression {___} ?
                // The flag is cleared on the character before '}' so that
                // scanning is back to normal once the '}' has been passed.
                if ( nextChar == L'}' ) inBraceExpr = false;
                continue;
            }
#endif // TSM_PCRE2

            if ( startedCharClass ) {
                switch (currChar) {
                    case L'^' : // Negated character class, proceed to next character
                        continue; // Bypass converting this special character
                    case L']' : // Treat as part of character class, not as a closure
                    case L':' : // Treat as part of character class, not as start of bracket expression
                        startedCharClass = false;
                        continue; // Bypass converting these special characters
                }
                startedCharClass = false;
            } // startedCharClass

            if ( inCharacterClass ) {

                if ( inPosixBracketExpr ) {
                    // Is it time to leave POSIX bracket expression [:___:] ?
                    if ( currChar == L':' && nextChar == L']' ) inPosixBracketExpr = false;
                    continue;
                } // inPosixBracketExpr

                else { // ! inPosixBracketExpr

                    if ( prevChar == L'[' && currChar == L':' ) {
                        // Enter POSIX bracket expression [:___:]
                        inPosixBracketExpr = true;
                        continue;
                    }

                    if ( currChar == L']' ) {
                        // Leaving character class [___]
                        inCharacterClass = false;
                        continue;
                    }

                } // ! inPosixBracketExpr

            } // inCharacterClass

            else { // ! inCharacterClass

                switch (currChar) {

                    case '[' :
                        // Entering a character class [___]
                        startedCharClass = true;
                        inCharacterClass = true;
                        continue;
#ifdef TSM_PCRE2
                    case '*' :
                        if ( prevChar != '(' ) continue;
                        // Entering a PCRE2 directive (*___)
                        inDirective = true;
                        continue;

                    case '?' :
                        if ( prevChar != '(' ) continue;
                        if ( nextChar != '<' ) continue;
                        // Entering PCRE2 group name (?<___>)
                        inGroupName = true;
                        continue;
#endif // TSM_PCRE2
                }
#ifdef TSM_PCRE2
                if ( inDirective ) {
                    // Is it time to leave PCRE2 directive (*___) ?
                    if (currChar == ')' ) inDirective = false;
                    continue;
                }

                if ( inGroupName ) {
                    // Is it time to leave PCRE2 group name (?<___>) ?
                    if (currChar == '>' ) inGroupName = false;
                    continue;
                }
#endif // TSM_PCRE2
            } // ! inCharacterClass

            // If we reach here, currChar will be converted

        } // isRegex

#ifdef TEQUIVCHARS_HASH_LOOKUP
        // A character without a table entry is a normal outcome, so probe
        // with find() instead of paying for an exception from at().
        const auto hit = p->EquivalentsTable.find( currChar );
        if ( hit != p->EquivalentsTable.end() )
            currChar = hit->second;
#else // TEQUIVCHARS_HASH_LOOKUP => Use binary search
        currChar = p->lookup(currChar);
#endif // TEQUIVCHARS_HASH_LOOKUP
    }

    return outString;
}
|
|
|
|
/*
|
|
This is the original implementation of the replaceChars function. It works
|
|
by converting inputString (UTF16) to a "multibyte" (typically UTF8) string,
|
|
then to an array of "wide" (32-bit) characters which is used for character
|
|
replacement. The result is converted back into a TQString and returned.
|
|
Despite the multiple conversions, this function is almost as fast as
|
|
the one that replaced it.
|
|
*/
|
|
|
|
/*
    Multibyte/wide-character variant of character replacement.

    inputQstring  Text, or regular expression, to convert.
    isRegex       When true, regular expression syntax is copied through
                  without lookup: backslash escapes, character class
                  delimiters, a leading '^' ']' ':' in a class and POSIX
                  bracket expressions [:___:] (plus the PCRE2 constructs
                  handled below when TSM_PCRE2 is defined).

    Returns the converted string. If the string cannot be converted to or
    from the locale's multibyte encoding, inputQstring is returned unchanged.

    Side effect: calls setlocale(LC_CTYPE, "") on every invocation.
*/
TQString TEquivChars::replaceCharsMB( TQString inputQstring, bool isRegex )
{
    setlocale(LC_CTYPE, "");

    // local8Bit() returns a temporary; hold it in a named object so that the
    // pointer obtained from data() stays valid while it is being read.
    const auto inputMB = inputQstring.local8Bit();
    const char *inputMBstring = inputMB.data();
    if ( inputMBstring == nullptr )
        return inputQstring;    // nothing to convert

    //-Debug: printf("Size of inputMBstring (metric 1): %d\n", inputQstring.length() );
    //-Debug: printf("Size of inputMBstring (metric 2): %ld\n", strlen( inputMBstring ) );

    //--- Allocate "wide" work string (zero filled, released automatically)
    const size_t numCharsMax = inputQstring.length() + 2; // Start with a hopefully safe overestimate
    std::vector<wchar_t> wideBuffer( numCharsMax, L'\0' );
    wchar_t *tempWstring = wideBuffer.data();
    const size_t numBytesMax = sizeof(wchar_t) * numCharsMax;

    //--- Load work string with "wide" characters from temporary input
    const char *p_inputMBstring = inputMBstring; // mbsrtowcs() advances this copy
    mbstate_t mbs = mbstate_t();
    const size_t numConverted = mbsrtowcs( tempWstring, &p_inputMBstring, numCharsMax, &mbs );
    if ( numConverted == static_cast<size_t>(-1) )
        return inputQstring;    // invalid multibyte sequence for this locale

    // mbsrtowcs() writes no terminator when it stops because the buffer is full
    tempWstring[numCharsMax - 1] = L'\0';

#ifdef verbose
    const size_t szWidechar = sizeof ( wchar_t );
    const size_t numChars = wcslen(tempWstring);
    const size_t numBytes = numChars * szWidechar + 1;
    fwprintf( stderr,
        L"\nTemporary wide string contains %zu wide characters (using %zu of %zu allocated bytes)\n",
        numChars, numBytes, numBytesMax
    );
    fwprintf( stderr, L"Wide string before: \t%ls\n", tempWstring );
#endif // verbose

    bool backSlashed = false;           // \_
    bool startedCharClass = false;      // Previous character was starting '[' of character class
    bool inCharacterClass = false;      // [___]
    bool inPosixBracketExpr = false;    // [:___:]
#ifdef TSM_PCRE2
    bool quoteLiteral = false;          // \Q___\E
    bool inBraceExpr = false;           // \c{___} where 'c' is any of: 'N' 'P' 'p' 'g'
    bool inDirective = false;           // (*___)
    bool inGroupName = false;           // (?<___>
#endif // TSM_PCRE2
    wchar_t prevChar = 0;
    wchar_t *currChar = tempWstring;
    wchar_t *nextChar = tempWstring+1;  // at most the terminator, so always readable

    // Characters are replaced in place; 'continue' leaves *currChar unchanged.
    for ( ; *currChar != L'\0' ; prevChar = *currChar, currChar++, nextChar++ )
    {

        if ( isRegex ) {

            if ( *currChar == L'\\' ) {
                backSlashed = true;
                continue;
            }

            // Don't convert backSlashed characters
            if ( backSlashed ) {
#ifdef TSM_PCRE2
                switch (*currChar) {
                    case 'Q' : quoteLiteral = true;  break; // Entering literal \Q___\E
                    case 'E' : quoteLiteral = false; break; // Leaving literal \Q___\E
                    case 'N' : // Entering Unicode codepoint specification \N{U+___} ?
                    case 'P' : // Entering (negated) Unicode property specification \P{} ?
                    case 'p' : // Entering Unicode property specification \p{} ?
                    case 'g' : // Entering a named backreference \g{___} ?
                        if ( *nextChar == L'{' ) inBraceExpr = true;
                        break;
                }
#endif // TSM_PCRE2
                backSlashed = false;
                continue;
            }

#ifdef TSM_PCRE2
            if ( quoteLiteral )
                continue;

            if ( inBraceExpr ) {
                // Is it time to leave brace expression {___} ?
                // The flag is cleared on the character before '}' so that
                // scanning is back to normal once the '}' has been passed.
                if ( *nextChar == L'}' ) inBraceExpr = false;
                continue;
            }
#endif // TSM_PCRE2

            if ( startedCharClass ) {
                switch (*currChar) {
                    case L'^' : // Negated character class, proceed to next character
                        continue; // Bypass converting this special character
                    case L']' : // Treat as part of character class, not as a closure
                    case L':' : // Treat as part of character class, not as start of bracket expression
                        startedCharClass = false;
                        continue; // Bypass converting these special characters
                }
                startedCharClass = false;
            } // startedCharClass

            if ( inCharacterClass ) {

                if ( inPosixBracketExpr ) {
                    // Is it time to leave POSIX bracket expression [:___:] ?
                    if ( *currChar == L':' && *nextChar == L']' ) inPosixBracketExpr = false;
                    continue;
                } // inPosixBracketExpr

                else { // ! inPosixBracketExpr

                    if ( prevChar == L'[' && *currChar == L':' ) {
                        // Enter POSIX bracket expression [:___:]
                        inPosixBracketExpr = true;
                        continue;
                    }

                    if ( *currChar == L']' ) {
                        // Leaving character class [___]
                        inCharacterClass = false;
                        continue;
                    }

                } // ! inPosixBracketExpr

            } // inCharacterClass

            else { // ! inCharacterClass

                switch (*currChar) {

                    case '[' :
                        // Entering a character class [___]
                        startedCharClass = true;
                        inCharacterClass = true;
                        continue;
#ifdef TSM_PCRE2
                    case '*' :
                        if ( prevChar != '(' ) continue;
                        // Entering a PCRE2 directive (*___)
                        inDirective = true;
                        continue;

                    case '?' :
                        if ( prevChar != '(' ) continue;
                        if ( *nextChar != '<' ) continue;
                        // Entering PCRE2 group name (?<___>)
                        inGroupName = true;
                        continue;
#endif // TSM_PCRE2
                }
#ifdef TSM_PCRE2
                if ( inDirective ) {
                    // Is it time to leave PCRE2 directive (*___) ?
                    if (*currChar == ')' ) inDirective = false;
                    continue;
                }

                if ( inGroupName ) {
                    // Is it time to leave PCRE2 group name (?<___>) ?
                    if (*currChar == '>' ) inGroupName = false;
                    continue;
                }
#endif // TSM_PCRE2
            } // ! inCharacterClass

            // If we reach here, *currChar will be converted

        } // isRegex

#ifdef TEQUIVCHARS_HASH_LOOKUP
        // A character without a table entry is a normal outcome, so probe
        // with find() instead of paying for an exception from at().
        const auto hit = p->EquivalentsTable.find( *currChar );
        if ( hit != p->EquivalentsTable.end() )
            *currChar = hit->second;
#else // TEQUIVCHARS_HASH_LOOKUP => Use binary search
        *currChar = p->lookup(*currChar);
#endif // TEQUIVCHARS_HASH_LOOKUP
    }

#ifdef verbose
    fwprintf( stderr, L"Wide string after: \t%ls\n\n", tempWstring );
#endif // verbose

    //--- Convert the work string back to multibyte form.
    // One spare zeroed byte keeps the result terminated even if wcsrtombs()
    // fills all numBytesMax bytes.
    std::vector<char> outputMBstring( numBytesMax + 1, '\0' );
    const wchar_t *p_tempWstring = tempWstring; // wcsrtombs() advances this copy
    mbstate_t outState = mbstate_t();           // start from the initial shift state
    const size_t numWritten = wcsrtombs( outputMBstring.data(), &p_tempWstring, numBytesMax, &outState );
    if ( numWritten == static_cast<size_t>(-1) )
        return inputQstring;    // a character has no representation in this locale

    return TQString::fromLocal8Bit( outputMBstring.data() );
}
|