Introduce TEquivChars class and an associated global instance that

provides access to a table mapping alphanumeric characters to their collation equivalents as defined in the Default Unicode Collation Element Table (DUCET). The class is used by TSM to implement alphanumeric character EQUIVALENCE handling. Utilize fnmatch(3) on GLIBC systems for processing wildcard matching. Non-GLIBC systems will continue to handle wildcard expressions by converting them to regex. Internal AuxData class introduced to further de-couple internal implementation details from public interfaces. Other changes were made based on feedback from @MicheleC. Signed-off-by: Vincent Reher <tde@4reher.org>
2 years ago · 0c107b54a2
parent 46fedcbe96
commit 0c107b54a2
3 changed files with 3664 additions and 0 deletions
--- a/tdecore/tequivchars-mapping.h
+++ b/tdecore/tequivchars-mapping.h
--- a/tdecore/tequivchars.cpp
+++ b/tdecore/tequivchars.cpp
@ -0,0 +1,553 @@
 #include "tequivchars.h"
 #include <wchar.h>
 #include <locale.h>
 #include <stdio.h>
 // Build-time switches (both are disabled unless the lines below are uncommented):
 //   TSM_PCRE2               - when defined, the replaceChars functions also skip
 //                             PCRE2-specific constructs: \Q...\E literals, \c{...}
 //                             brace expressions, (*...) directives and (?<...>
 //                             group names.
 //   TEQUIVCHARS_HASH_LOOKUP - when defined, the equivalents table is held in a
 //                             tsl::hopscotch_map; otherwise it is a sorted
 //                             two-column array searched with binary search.
 // #define TSM_PCRE2
 // #define TEQUIVCHARS_HASH_LOOKUP
 #ifndef TEQUIVCHARS_HASH_LOOKUP  // Using binary search on sorted array
 // Dimensions of the array that is initialized from tequivchars-mapping.h.
 // EquivTableROWS is also the upper search bound used by the binary search.
 inline constexpr uint EquivTableROWS {2993}; // Make sure that these constants accurately
 inline constexpr uint EquivTableCOLS {   2}; // reflect what's in tequivchars-mapping.h
 #else // Using hash table lookup
 #include "tsl/hopscotch_map.h"
 #endif // TEQUIVCHARS_HASH_LOOKUP
 /**
  * Private implementation data of TEquivChars: the table that maps a character
  * to its collation equivalent, generated into tequivchars-mapping.h.
  *
  * Binary-search build (default): the table is a two-column array in which
  * column 0 holds the source character and column 1 its equivalent. lookup()
  * performs a binary search on column 0, so the rows must be sorted by
  * ascending column-0 value. The array is a static constexpr member so that a
  * single read-only copy is shared by all instances instead of the whole table
  * being copied into every newly constructed object.
  *
  * Hash-lookup build (TEQUIVCHARS_HASH_LOOKUP): the same data is loaded into a
  * per-instance tsl::hopscotch_map keyed by the source character.
  */
 class TEquivChars_Private
 {
 public:
 #ifndef TEQUIVCHARS_HASH_LOOKUP // Using binary search on sorted array
   /** @return the equivalent of @p widechar, or @p widechar itself if it has no table entry. */
   inline wchar_t lookup( wchar_t widechar );
   static constexpr wchar_t EquivalentsTable[EquivTableROWS][EquivTableCOLS] = {
     #include "tequivchars-mapping.h"
   };
 #else // Using hash table lookup
   tsl::hopscotch_map<wchar_t, wchar_t> EquivalentsTable = {
     #include "tequivchars-mapping.h"
   };
 #endif // TEQUIVCHARS_HASH_LOOKUP
 };
 #ifndef TEQUIVCHARS_HASH_LOOKUP // Using binary search on sorted array
 /**
  * Binary search of EquivalentsTable for @p key.
  *
  * Column 0 of each row is compared against @p key; on a match the value in
  * column 1 of that row is returned. If no row matches, @p key is returned
  * unchanged.
  */
 inline wchar_t TEquivChars_Private::lookup( wchar_t key )
 {
   int lower = 0;
   int upper = EquivTableROWS - 1;
   // Closed search interval [lower, upper]; it is empty once lower > upper.
   while ( lower <= upper ) {
     const int probe = lower + (upper - lower) / 2; // Midpoint without overflow
     const wchar_t probeKey = EquivalentsTable[probe][0];
     if ( probeKey < key ) {
       lower = probe + 1; // Match, if any, lies in the upper half
     }
     else if ( probeKey != key ) {
       upper = probe - 1; // Match, if any, lies in the lower half
     }
     else {
       return EquivalentsTable[probe][1];
     }
   }
   // No table entry: the character is its own equivalent
   return key;
 }
 #endif // TEQUIVCHARS_HASH_LOOKUP
 /** Creates the private implementation object that holds the equivalents table. */
 TEquivChars::TEquivChars()
   : p( new TEquivChars_Private )
 {
 }
 /** Destroys the private implementation object allocated by the constructor. */
 TEquivChars::~TEquivChars()
 {
  delete p;
 }
 /*
   There are 2 implementations of table lookup within this function: one that
   uses hash table lookup and another that uses binary search on a presorted
   table. We default to using the binary search implementation for these reasons:
   1. The hash table implementation consumes more RAM.
   2. The hash table implementation consumes slightly less CPU when used on a
      large dataset that contains no replaceable characters (all misses) BUT
      consumes significantly more CPU when used on a dataset that contains many
      replaceable characters (mostly hits).
   3. The theoretical benefits of using hash lookups on such a small table
      will probably never be realized.
  The hash table implementation used was tsl::hopscotch_map:
  https://github.com/Tessil/hopscotch-map
 */
 /**
  * Returns a copy of inputQstring in which every character that has an entry
  * in the equivalents table is replaced by its collation equivalent. All other
  * characters are copied unchanged.
  *
  * When isRegex is true the string is scanned as a regular expression and the
  * following characters are copied without being looked up:
  *  - a backslash and the single character it escapes,
  *  - the '[' and ']' that delimit a character class, and a '^', ']' or ':'
  *    that directly follows the opening '[',
  *  - everything inside a POSIX bracket expression [:___:],
  *  - with TSM_PCRE2 defined: \Q___\E literals, brace expressions following
  *    \N \P \p \g, (*___) directives and (?<___> group names.
  */
 TQString TEquivChars::replaceChars( TQString inputQstring, bool isRegex )
 {
   const int inStrLen = inputQstring.length();
   TQString outString = TQString::fromLatin1( "" );
   const TQChar *char16 = inputQstring.unicode();
   bool backSlashed        = false; // \_
   bool startedCharClass   = false; // Previous character was starting '[' of character class
   bool inCharacterClass   = false; // [___]
   bool inPosixBracketExpr = false; // [:___:]
 #ifdef TSM_PCRE2
   bool quoteLiteral       = false; // \Q___\E
   bool inBraceExpr        = false; // \c{___} where 'c' is any of: 'N' 'P' 'p' 'g'
   // NOTE(review): \x{___} and \o{___} are not recognised below - confirm whether they should be.
   bool inDirective        = false; // (*___)
   bool inGroupName        = false; // (?<___>
   wchar_t nextChar  = 0;           // One character of lookahead, only used by the PCRE2 checks
 #endif // TSM_PCRE2
   wchar_t currChar  = 0;
   wchar_t prevChar  = 0;
   // The increment expression appends currChar to the output on every pass,
   // so a 'continue' inside the body copies the current character unconverted.
   for ( int i = 0 ; i < inStrLen ; i++ , outString += TQChar(currChar)  )
   {
     prevChar = currChar;
     currChar = char16[i].unicode();
     if ( isRegex ) {
 #ifdef TSM_PCRE2
       if ( i < ( inStrLen - 1 ) )
         nextChar = char16[i+1].unicode();
       else
         nextChar = 0;
 #endif // TSM_PCRE2
       // Don't convert backSlashed characters. This test must come before the
       // test for a backslash so that the second backslash of "\\" is consumed
       // as the escaped character instead of starting a new escape.
       if ( backSlashed ) {
 #ifdef TSM_PCRE2
         switch (currChar) {
           case 'Q' : quoteLiteral = true;  break; // Entering literal \Q___\E
           case 'E' : quoteLiteral = false; break; // Leaving literal \Q___\E
           case 'N' : // Entering Unicode codepoint specification \N{U+___} ?
           case 'P' : // Entering (negated) Unicode property specification \P{} ?
           case 'p' : // Entering Unicode property specification \p{} ?
           case 'g' : // Entering a named backreference \g{___} ?
             if ( nextChar == L'{' ) inBraceExpr = true;
             break;
         }
 #endif // TSM_PCRE2
         backSlashed = false;
         continue;
       }
       if ( currChar == L'\\' ) {
         backSlashed = true;
         continue;
       }
 #ifdef TSM_PCRE2
       if ( quoteLiteral )
         continue;
       if ( inBraceExpr ) {
         // Leave the brace expression {___} once its last character has been
         // reached, i.e. when the closing brace is the next character.
         if ( nextChar == L'}' ) inBraceExpr = false;
         continue;
       }
 #endif // TSM_PCRE2
       if ( startedCharClass ) {
         switch (currChar) {
           case L'^' : // Negated character class, proceed to next character
             continue; // Bypass converting this special character
           case L']' : // Treat as part of character class, not as a closure
           case L':' : // Treat as part of character class, not as start of bracket expression
             startedCharClass = false;
             continue;  // Bypass converting these special characters
         }
         startedCharClass = false;
       } // startedCharClass
       if ( inCharacterClass ) {
         if ( inPosixBracketExpr ) {
           // The POSIX bracket expression [:___:] ends with ":]". Leave it on
           // the ']' so that this ']' is consumed here and is not mistaken
           // for the closure of the enclosing character class.
           if ( prevChar == L':' && currChar == L']' ) inPosixBracketExpr = false;
           continue;
         } // inPosixBracketExpr
         else { // ! inPosixBracketExpr
           if ( prevChar == L'[' && currChar == L':' ) {
             // Enter POSIX bracket expression [:___:]
             inPosixBracketExpr = true;
             continue;
           }
           if ( currChar == L']' ) {
             // Leaving character class [___]
             inCharacterClass = false;
             continue;
           }
         } // ! inPosixBracketExpr
       } // inCharacterClass
       else { // ! inCharacterClass
         switch (currChar) {
           case '[' :
             // Entering a character class [___]
             startedCharClass = true;
             inCharacterClass = true;
             continue;
             break;
 #ifdef TSM_PCRE2
           case '*' :
             if ( prevChar != '(' ) continue;
             // Entering a PCRE2 directive (*___)
             inDirective = true;
             continue;
             break;
           case '?' :
             if ( prevChar != '(' ) continue;
             if ( nextChar != '<' ) continue;
             // Entering PCRE2 group name (?<___>)
             inGroupName = true;
             continue;
             break;
 #endif // TSM_PCRE2
         }
 #ifdef TSM_PCRE2
         if ( inDirective ) {
           // Is it time to leave PCRE2 directive (*___) ?
           if (currChar == ')' ) inDirective = false;
           continue;
         }
         if ( inGroupName ) {
           // Is it time to leave PCRE2 group name (?<___>) ?
           if (currChar == '>' ) inGroupName = false;
           continue;
         }
 #endif // TSM_PCRE2
       } // ! inCharacterClass
       // If we reach here, currChar will be converted
     } // isRegex
     #ifdef TEQUIVCHARS_HASH_LOOKUP
     try {
       currChar = p->EquivalentsTable.at(currChar);
     }
     catch(const std::exception&) {
       // No need to "handle" exception, currChar was not changed
     }
     #else // TEQUIVCHARS_HASH_LOOKUP => Use binary search
     currChar = p->lookup(currChar);
     #endif // TEQUIVCHARS_HASH_LOOKUP
   }
   return outString;
 }
 /*
    This is the original implementation of the replaceChars function. It works
    by converting inputString (UTF16) to a "multibyte" (typically UTF8) string,
    then to an array of "wide" (32-bit) characters which is used for character
    replacement. The result is converted back into a TQString and returned.
    Despite the multiple conversions, this function is almost as fast as
    the one that replaced it.
 */
 /**
  * Variant of replaceChars() that works on a wide-character copy of the input.
  *
  * The input is converted to the locale's multibyte encoding (local8Bit), then
  * to a wchar_t array with mbsrtowcs(). Characters are replaced in place in
  * that array, and the result is converted back with wcsrtombs() and
  * TQString::fromLocal8Bit(). The same characters are skipped in regex mode as
  * in replaceChars().
  *
  * Side effect: calls setlocale(LC_CTYPE, "") so that the multibyte conversion
  * functions use the locale taken from the environment.
  *
  * If either multibyte conversion fails or does not fit the work buffers, the
  * result of replaceChars() is returned instead.
  */
 TQString TEquivChars::replaceCharsMB( TQString inputQstring, bool isRegex )
 {
   setlocale(LC_CTYPE, "");
   // Hold the TQCString in a named object: data() points into its buffer and
   // would dangle if it were taken from the temporary returned by local8Bit().
   const TQCString inputMBcstring = inputQstring.local8Bit();
   const char* inputMBstring = inputMBcstring.data();
   if ( inputMBstring == nullptr )
     return inputQstring; // Nothing to convert
   //-Debug: printf("Size of inputMBstring (metric 1): %d\n", inputQstring.length()   );
   //-Debug: printf("Size of inputMBstring (metric 2): %ld\n", strlen( inputMBstring ) );
   //--- Allocate "wide" work string
   const size_t numCharsMax = inputQstring.length()+2; // Start with a hopefully safe overestimate
   wchar_t *tempWstring = new wchar_t[numCharsMax]; // HEAP
   const size_t numBytesMax = sizeof(wchar_t) * numCharsMax ;
   //--- Load work string with "wide" characters from temporary input
   const char * p_inputMBstring = inputMBstring; // need this 2nd pointer!
   mbstate_t mbs = mbstate_t();
   const size_t numWideChars = mbsrtowcs( tempWstring, &p_inputMBstring, numCharsMax, &mbs);
   // mbsrtowcs() only writes the terminating L'\0' (and sets the source pointer
   // to NULL) when the whole input was converted. On an invalid sequence or a
   // full buffer the work string is unterminated and must not be scanned.
   if ( numWideChars == static_cast<size_t>(-1) || p_inputMBstring != nullptr ) {
     delete[] tempWstring;
     return replaceChars( inputQstring, isRegex );
   }
   #ifdef verbose
   size_t szWidechar = sizeof ( wchar_t ) ;
   size_t numChars = wcslen(tempWstring);
   size_t numBytes = numChars * szWidechar + 1;
   // wchar_t charType[] = L"wchar_t" ;
   // fwprintf( stderr, L"Wide character '%ls' occupies %ld bytes\n", charType, szWidechar );
   fwprintf( stderr,
     L"\nTemporary wide string contains %ld wide characters (using %ld of %ld allocated bytes)\n",
     numChars, numBytes, numBytesMax
   );
   fwprintf( stderr, L"Wide string before: \t%ls\n", tempWstring );
   #endif // verbose
   bool backSlashed        = false; // \_
   bool startedCharClass   = false; // Previous character was starting '[' of character class
   bool inCharacterClass   = false; // [___]
   bool inPosixBracketExpr = false; // [:___:]
 #ifdef TSM_PCRE2
   bool quoteLiteral       = false; // \Q___\E
   bool inBraceExpr        = false; // \c{___} where 'c' is any of: 'N' 'P' 'p' 'g'
   // NOTE(review): \x{___} and \o{___} are not recognised below - confirm whether they should be.
   bool inDirective        = false; // (*___)
   bool inGroupName        = false; // (?<___>
 #endif // TSM_PCRE2
   wchar_t  prevChar = 0;
   // Characters are replaced in place; a 'continue' leaves *currChar untouched.
   for ( wchar_t *currChar = tempWstring ; *currChar != L'\0' ; prevChar = *currChar, currChar++ )
   {
     if ( isRegex ) {
 #ifdef TSM_PCRE2
       // *currChar is not the terminator here, so reading one element ahead is
       // within the terminated string.
       const wchar_t nextChar = currChar[1];
 #endif // TSM_PCRE2
       // Don't convert backSlashed characters. This test must come before the
       // test for a backslash so that the second backslash of "\\" is consumed
       // as the escaped character instead of starting a new escape.
       if ( backSlashed ) {
 #ifdef TSM_PCRE2
         switch (*currChar) {
           case 'Q' : quoteLiteral = true;  break; // Entering literal \Q___\E
           case 'E' : quoteLiteral = false; break; // Leaving literal \Q___\E
           case 'N' : // Entering Unicode codepoint specification \N{U+___} ?
           case 'P' : // Entering (negated) Unicode property specification \P{} ?
           case 'p' : // Entering Unicode property specification \p{} ?
           case 'g' : // Entering a named backreference \g{___} ?
             if ( nextChar == L'{' ) inBraceExpr = true;
             break;
         }
 #endif // TSM_PCRE2
         backSlashed = false;
         continue;
       }
       if ( *currChar == L'\\' ) {
         backSlashed = true;
         continue;
       }
 #ifdef TSM_PCRE2
       if ( quoteLiteral )
         continue;
       if ( inBraceExpr ) {
         // Leave the brace expression {___} once its last character has been
         // reached, i.e. when the closing brace is the next character.
         if ( nextChar == L'}' ) inBraceExpr = false;
         continue;
       }
 #endif // TSM_PCRE2
       if ( startedCharClass ) {
         switch (*currChar) {
           case L'^' : // Negated character class, proceed to next character
             continue; // Bypass converting this special character
           case L']' : // Treat as part of character class, not as a closure
           case L':' : // Treat as part of character class, not as start of bracket expression
             startedCharClass = false;
             continue;  // Bypass converting these special characters
         }
         startedCharClass = false;
       } // startedCharClass
       if ( inCharacterClass ) {
         if ( inPosixBracketExpr ) {
           // The POSIX bracket expression [:___:] ends with ":]". Leave it on
           // the ']' so that this ']' is consumed here and is not mistaken
           // for the closure of the enclosing character class.
           if ( prevChar == L':' && *currChar == L']' ) inPosixBracketExpr = false;
           continue;
         } // inPosixBracketExpr
         else { // ! inPosixBracketExpr
           if ( prevChar == L'[' && *currChar == L':' ) {
             // Enter POSIX bracket expression [:___:]
             inPosixBracketExpr = true;
             continue;
           }
           if ( *currChar == L']' ) {
             // Leaving character class [___]
             inCharacterClass = false;
             continue;
           }
         } // ! inPosixBracketExpr
       } // inCharacterClass
       else { // ! inCharacterClass
         switch (*currChar) {
           case '[' :
             // Entering a character class [___]
             startedCharClass = true;
             inCharacterClass = true;
             continue;
             break;
 #ifdef TSM_PCRE2
           case '*' :
             if ( prevChar != '(' ) continue;
             // Entering a PCRE2 directive (*___)
             inDirective = true;
             continue;
             break;
           case '?' :
             if ( prevChar != '(' ) continue;
             if ( nextChar != '<' ) continue;
             // Entering PCRE2 group name (?<___>)
             inGroupName = true;
             continue;
             break;
 #endif // TSM_PCRE2
         }
 #ifdef TSM_PCRE2
         if ( inDirective ) {
           // Is it time to leave PCRE2 directive (*___) ?
           if (*currChar == ')' ) inDirective = false;
           continue;
         }
         if ( inGroupName ) {
           // Is it time to leave PCRE2 group name (?<___>) ?
           if (*currChar == '>' ) inGroupName = false;
           continue;
         }
 #endif // TSM_PCRE2
       } // ! inCharacterClass
       // If we reach here, *currChar will be converted
     } // isRegex
     #ifdef TEQUIVCHARS_HASH_LOOKUP
     try {
       *currChar = p->EquivalentsTable.at(*currChar);
     }
     catch(const std::exception&) {
       // No need to "handle" exception, *currChar was not changed
     }
     #else // TEQUIVCHARS_HASH_LOOKUP => Use binary search
     *currChar = p->lookup(*currChar);
     #endif // TEQUIVCHARS_HASH_LOOKUP
   }
   #ifdef verbose
   fwprintf( stderr, L"Wide string after:  \t%ls\n\n", tempWstring );
   #endif // verbose
   //--- Convert the work string back to a multibyte string
   char *outputMBstring = new char[numBytesMax];
   const wchar_t * p_tempWstring = tempWstring; // need this 2nd pointer!
   mbs = mbstate_t(); // Start the reverse conversion from the initial shift state
   const size_t numBytesOut = wcsrtombs( outputMBstring, &p_tempWstring, numBytesMax, &mbs );
   TQString outputQstring;
   // As with mbsrtowcs(): the output is only terminated (and the source pointer
   // set to NULL) when every wide character was converted and fitted.
   if ( numBytesOut == static_cast<size_t>(-1) || p_tempWstring != nullptr )
     outputQstring = replaceChars( inputQstring, isRegex );
   else
     outputQstring = TQString::fromLocal8Bit( outputMBstring );
   delete[] tempWstring;
   delete[] outputMBstring;
   return outputQstring;
 }
--- a/tdecore/tequivchars.h
+++ b/tdecore/tequivchars.h
@ -0,0 +1,38 @@
 #ifndef TEQUIVCHARS_H
 #define TEQUIVCHARS_H
 #include "tdelibs_export.h"
 #include <tqstring.h>
 /**
 *  Class representing a mapping of each alphanumeric character to its "collating
 *  equivalent" as defined by the Default Unicode Collation Element Table (DUCET).
 *  The mapping is limited to single-codepoint characters <= U+FFFF.
 *
 *  NOTE(review): the class owns its private data through a raw pointer that is
 *  deleted in the destructor, and it declares no copy constructor or assignment
 *  operator. Copying an instance would therefore make two objects delete the
 *  same pointer - consider disabling copying.
 */
 class TDECORE_EXPORT TEquivChars
 {
 public:
  TEquivChars();
  ~TEquivChars();
  /**
      Replaces each character that has a collating equivalent with that
      equivalent. Characters without an equivalent are copied unchanged.

      @param inputString the string to convert
      @param isRegex if true, @p inputString is treated as a regular
             expression: backslash-escaped characters and the contents of
             POSIX bracket expressions such as [:alpha:] are left as-is
      @return the converted copy of @p inputString
   */
  TQString replaceChars( TQString inputString, bool isRegex = false );
  /**
      Alternative implementation of replaceChars function that uses some
      "multibyte string" / "wide character" functions defined in wchar.h.
      Takes the same parameters as replaceChars.
   */
  TQString replaceCharsMB( TQString inputString, bool isRegex = false );
 private:
  // Owning pointer to the implementation data; created in the constructor
  // and deleted in the destructor.
  class TEquivChars_Private *p;
 };
 #endif // TEQUIVCHARS_H