#include "tequivchars.h"

// NOTE(review): the header names on the three system #include lines were lost
// in the copy under review. The list below is what this file needs for the
// names it uses - confirm against the upstream file.
#include <locale.h>   // setlocale, LC_CTYPE
#include <stdio.h>    // stderr
#include <stdlib.h>   // MB_CUR_MAX
#include <wchar.h>    // mbsrtowcs, wcsrtombs, mbstate_t, wcslen, fwprintf

#include <vector>

// #define TSM_PCRE2
// #define TEQUIVCHARS_HASH_LOOKUP

#ifndef TEQUIVCHARS_HASH_LOOKUP
// Using binary search on sorted array
inline constexpr uint EquivTableROWS {2993}; // Make sure that these constants accurately
inline constexpr uint EquivTableCOLS {   2}; // reflect what's in tequivchars-mapping.h
#else
// Using hash table lookup
#include "tsl/hopscotch_map.h"
#endif // TEQUIVCHARS_HASH_LOOKUP

/*
    Private data of TEquivChars: the table that maps a character to its
    equivalent, plus the lookup that queries it.

    Whichever table variant is compiled in, lookup() has the same contract:
    it returns the mapped character, or the key itself when the table has no
    entry for it.
*/
class TEquivChars_Private
{
    public:
        inline wchar_t lookup( wchar_t key ) const;

#ifndef TEQUIVCHARS_HASH_LOOKUP
        // Using binary search on sorted array: column 0 is the key, column 1 the replacement
        wchar_t EquivalentsTable[EquivTableROWS][EquivTableCOLS] = {
#include "tequivchars-mapping.h"
        };
#else
        // Using hash table lookup
        // NOTE(review): the template arguments were missing in the copy under
        // review; <wchar_t, wchar_t> matches how the map is queried - confirm.
        tsl::hopscotch_map<wchar_t, wchar_t> EquivalentsTable = {
#include "tequivchars-mapping.h"
        };
#endif // TEQUIVCHARS_HASH_LOOKUP
};

#ifndef TEQUIVCHARS_HASH_LOOKUP
// Binary search over column 0 of EquivalentsTable (the search requires the
// rows to be in ascending key order).
inline wchar_t TEquivChars_Private::lookup( wchar_t key ) const
{
    int low  = 0;
    int high = static_cast<int>( EquivTableROWS ) - 1;

    while ( low <= high ) {
        const int mid = low + ( high - low ) / 2;
        const wchar_t candidate = EquivalentsTable[mid][0];

        if ( candidate == key )
            return EquivalentsTable[mid][1];

        if ( candidate < key )
            low = mid + 1;
        else
            high = mid - 1;
    }

    return key; // No entry: the character maps to itself
}
#else
// Hash lookup. find() is used instead of at() so that a miss - the common
// case for text without replaceable characters - does not throw.
inline wchar_t TEquivChars_Private::lookup( wchar_t key ) const
{
    const auto hit = EquivalentsTable.find( key );
    return ( hit != EquivalentsTable.end() ) ? hit->second : key;
}
#endif // TEQUIVCHARS_HASH_LOOKUP

namespace {

/*
    Tracks where a left-to-right scan currently is inside a regular
    expression and decides, character by character, whether the character is
    regex syntax that must be copied through untouched.

    Both replaceChars() and replaceCharsMB() drive one instance of this class
    per input string.
*/
class RegexScanState
{
    public:
        // Feed the next character of the pattern.
        //   prevChar  character emitted for the previous position (0 at the start)
        //   currChar  character at the current position
        //   nextChar  character at the following position (0 at the end)
        // Returns true when currChar must NOT be converted.
        bool shouldSkip( wchar_t prevChar, wchar_t currChar, wchar_t nextChar );

    private:
        bool backSlashed        = false; // \_
        bool startedCharClass   = false; // Previous character was starting '[' of character class
        bool inCharacterClass   = false; // [___]
        bool inPosixBracketExpr = false; // [:___:]
#ifdef TSM_PCRE2
        bool quoteLiteral       = false; // \Q___\E
        bool inBraceExpr        = false; // \c{___} for the escapes handled in shouldSkip()
        bool inDirective        = false; // (*___)
        bool inGroupName        = false; // (?<___>
#endif // TSM_PCRE2
};

bool RegexScanState::shouldSkip( wchar_t prevChar, wchar_t currChar, wchar_t nextChar )
{
    // Don't convert backSlashed characters.
    // This test must come before the test for '\' itself: in "\\x" the second
    // backslash is the escaped character, so it has to close the escape
    // rather than open a new one that would swallow 'x'.
    if ( backSlashed ) {
#ifdef TSM_PCRE2
        switch ( currChar ) {
            case 'Q' : quoteLiteral = true;  break; // Entering literal \Q___\E
            case 'E' : quoteLiteral = false; break; // Leaving literal \Q___\E
            case 'N' : // Entering Unicode codepoint specification \N{U+___} ?
            case 'P' : // Entering (negated) Unicode property specification \P{} ?
            case 'p' : // Entering Unicode property specification \p{} ?
            case 'g' : // Entering a named backreference \g{___} ?
                if ( nextChar == L'{' ) inBraceExpr = true;
                break;
        }
#endif // TSM_PCRE2
        backSlashed = false;
        return true;
    }

    if ( currChar == L'\\' ) {
        backSlashed = true;
        return true;
    }

#ifdef TSM_PCRE2
    if ( quoteLiteral ) return true;

    if ( inBraceExpr ) {
        // Is it time to leave brace expression {___} ?
        // The closing '}' is the next character, so this is the last one inside.
        if ( nextChar == L'}' ) inBraceExpr = false;
        return true;
    }
#endif // TSM_PCRE2

    if ( startedCharClass ) {
        switch ( currChar ) {
            case L'^' : // Negated character class, proceed to next character
                return true; // Still at the start of the class: flag stays set
            case L']' : // Treat as part of character class, not as a closure
            case L':' : // Treat as part of character class, not as start of bracket expression
                startedCharClass = false;
                return true;
        }
        startedCharClass = false;
    }

    if ( inCharacterClass ) {
        if ( inPosixBracketExpr ) {
            // Is it time to leave POSIX bracket expression [:___:] ?
            if ( currChar == L':' && nextChar == L']' ) inPosixBracketExpr = false;
            return true;
        }

        if ( prevChar == L'[' && currChar == L':' ) {
            // Enter POSIX bracket expression [:___:]
            inPosixBracketExpr = true;
            return true;
        }

        if ( currChar == L']' ) {
            // Leaving character class [___]
            inCharacterClass = false;
            return true;
        }

        return false; // Ordinary member of the character class: convert it
    }

    // Outside of any character class
    switch ( currChar ) {
        case '[' : // Entering a character class [___]
            startedCharClass = true;
            inCharacterClass = true;
            return true;
#ifdef TSM_PCRE2
        case '*' :
            if ( prevChar == '(' ) inDirective = true; // Entering a PCRE2 directive (*___)
            return true;
        case '?' :
            if ( prevChar == '(' && nextChar == '<' ) inGroupName = true; // Entering PCRE2 group name (?<___>)
            return true;
#endif // TSM_PCRE2
    }

#ifdef TSM_PCRE2
    if ( inDirective ) {
        // Is it time to leave PCRE2 directive (*___) ?
        if ( currChar == ')' ) inDirective = false;
        return true;
    }

    if ( inGroupName ) {
        // Is it time to leave PCRE2 group name (?<___>) ?
        if ( currChar == '>' ) inGroupName = false;
        return true;
    }
#endif // TSM_PCRE2

    return false; // If we reach here, currChar will be converted
}

} // anonymous namespace

TEquivChars::TEquivChars()
{
    p = new TEquivChars_Private;
}

TEquivChars::~TEquivChars()
{
    delete p;
}

/*
    There are 2 implementations of table lookup behind TEquivChars_Private::lookup():
    one that uses hash table lookup and another that uses binary search on a
    presorted table. We default to the binary search implementation for these
    reasons:
        1. The hash table implementation consumes more RAM.
        2. The hash table implementation consumes slightly less CPU when used
           on a large dataset that contains no replaceable characters (all
           misses) BUT consumes significantly more CPU when used on a dataset
           that contains many replaceable characters (mostly hits).
        3. The theoretical benefits of using hash lookups on such a small
           table will probably never be realized.
    The hashing used was https://github.com/Tessil/robin-map
    NOTE(review): the header included above is tsl/hopscotch_map.h, whereas
    this URL names robin-map - confirm which of the two was measured.
*/

/*
    Returns a copy of inputQstring in which every UTF-16 code unit that has an
    entry in the equivalence table is replaced by its equivalent.

    When isRegex is true the string is treated as a regular expression:
    backslash-escaped characters, the brackets of character classes, POSIX
    bracket expressions ([:___:]) and - when TSM_PCRE2 is defined - the PCRE2
    constructs handled by RegexScanState are copied through unchanged.
*/
TQString TEquivChars::replaceChars( TQString inputQstring, bool isRegex )
{
    const int inStrLen = inputQstring.length();
    TQString outString = TQString::fromLatin1( "" );
    const TQChar *char16 = inputQstring.unicode();

    RegexScanState regexState;
    wchar_t currChar = 0;

    for ( int i = 0 ; i < inStrLen ; i++ ) {
        // The scanner is handed the previous character as it was emitted,
        // i.e. after any replacement.
        const wchar_t prevChar = currChar;
        currChar = char16[i].unicode();

        bool convert = true;
        if ( isRegex ) {
            const wchar_t nextChar = ( i < ( inStrLen - 1 ) )
                                   ? static_cast<wchar_t>( char16[i+1].unicode() )
                                   : static_cast<wchar_t>( 0 );
            convert = !regexState.shouldSkip( prevChar, currChar, nextChar );
        }

        if ( convert )
            currChar = p->lookup( currChar );

        outString += TQChar(currChar);
    }

    return outString;
}

/*
    This is the original implementation of the replaceChars function. It works
    by converting inputQstring (UTF16) to a "multibyte" (typically UTF8) string,
    then to an array of "wide" characters which is used for character
    replacement. The result is converted back into a TQString and returned.
    Despite the multiple conversions, this function is almost as fast as the
    one that replaced it.

    Side effect: sets the LC_CTYPE locale of the process from the environment.

    If either multibyte conversion fails, or the input is empty, the input
    string is returned unchanged.
*/
TQString TEquivChars::replaceCharsMB( TQString inputQstring, bool isRegex )
{
    setlocale( LC_CTYPE, "" );

    // The 8-bit copy must outlive every use of its data() pointer, so it is
    // bound to a named object instead of being left as a temporary.
    const auto inputLocal8 = inputQstring.local8Bit();
    const char *inputMBstring = inputLocal8.data();
    if ( inputMBstring == nullptr || inputQstring.length() == 0 )
        return inputQstring;

    //--- Allocate "wide" work string (zero-filled, so it is always terminated)
    const size_t numCharsMax = inputQstring.length() + 2; // Start with a hopefully safe overestimate
    std::vector<wchar_t> wideBuf( numCharsMax, L'\0' );

    //--- Load work string with "wide" characters from temporary input
    const char *srcCursor = inputMBstring; // mbsrtowcs() advances this 2nd pointer
    mbstate_t inState = mbstate_t();
    // One slot is held back so the terminator can never be overwritten
    const size_t numWide = mbsrtowcs( wideBuf.data(), &srcCursor, numCharsMax - 1, &inState );
    if ( numWide == static_cast<size_t>( -1 ) )
        return inputQstring; // Invalid multibyte sequence for the current locale

#ifdef verbose
    {
        const size_t szWidechar = sizeof( wchar_t );
        const size_t numChars   = wcslen( wideBuf.data() );
        const size_t numBytes   = numChars * szWidechar + 1;
        fwprintf( stderr,
                  L"\nTemporary wide string contains %zu wide characters (using %zu of %zu allocated bytes)\n",
                  numChars, numBytes, szWidechar * numCharsMax );
        fwprintf( stderr, L"Wide string before: \t%ls\n", wideBuf.data() );
    }
#endif // verbose

    //--- Replace characters in place
    RegexScanState regexState;
    wchar_t prevChar = 0;
    for ( wchar_t *currChar = wideBuf.data() ; *currChar != L'\0' ; prevChar = *currChar, ++currChar ) {
        // currChar[1] is at worst the terminator, which is a valid read
        if ( isRegex && regexState.shouldSkip( prevChar, currChar[0], currChar[1] ) )
            continue;

        *currChar = p->lookup( *currChar );
    }

#ifdef verbose
    fwprintf( stderr, L"Wide string after: \t%ls\n\n", wideBuf.data() );
#endif // verbose

    //--- Convert back to multibyte; MB_CUR_MAX bounds the bytes needed per character
    const size_t numBytesMax = static_cast<size_t>( MB_CUR_MAX ) * numCharsMax + 1;
    std::vector<char> outBuf( numBytesMax, '\0' );

    const wchar_t *wideCursor = wideBuf.data(); // wcsrtombs() advances this 2nd pointer
    mbstate_t outState = mbstate_t();
    const size_t numBytes = wcsrtombs( outBuf.data(), &wideCursor, numBytesMax - 1, &outState );
    if ( numBytes == static_cast<size_t>( -1 ) )
        return inputQstring; // A character could not be represented in the current locale

    return TQString::fromLocal8Bit( outBuf.data() );
}