Introduce TEquivChars class and an associated global instance that

provides access to a table mapping alphanumeric characters to their collation equivalents as defined in the Default Unicode Collation Element Table (DUCET). Class is used by TSM to implement alphanumeric character handling EQUIVALENCE. Utilize fnmatch(3) on GLIBC systems for processing wildcard matching. Non-GLIBC systems will continue to handle wildcard expressions by converting them to regex. Internal AuxData class introduced to further de-couple internal implementation details from public interfaces. Other changes were made based on feedback from @MicheleC. Signed-off-by: Vincent Reher <tde@4reher.org>
2 years ago · 0c107b54a2
parent 46fedcbe96
commit 0c107b54a2
3 changed files with 3664 additions and 0 deletions
--- a/tdecore/tequivchars-mapping.h
+++ b/tdecore/tequivchars-mapping.h
--- a/tdecore/tequivchars.cpp
+++ b/tdecore/tequivchars.cpp
@ -0,0 +1,553 @@
+#include "tequivchars.h"
+
+#include <wchar.h>
+#include <locale.h>
+#include <stdio.h>
+
+// #define TSM_PCRE2
+// #define TEQUIVCHARS_HASH_LOOKUP
+
+#ifndef TEQUIVCHARS_HASH_LOOKUP  // Using binary search on sorted array
+inline constexpr uint EquivTableROWS {2993}; // Number of rows in tequivchars-mapping.h; must be kept in sync with that file
+inline constexpr uint EquivTableCOLS {   2}; // Columns per row: [0] = character searched for, [1] = its replacement
+#else // Using hash table lookup
+#include "tsl/hopscotch_map.h"
+#endif // TEQUIVCHARS_HASH_LOOKUP
+
+// Private implementation data of TEquivChars: holds the table of character
+// equivalents whose contents are pulled in from tequivchars-mapping.h.
+class TEquivChars_Private
+{
+public:
+#ifndef TEQUIVCHARS_HASH_LOOKUP // Using binary search on sorted array
+  // Returns the replacement stored for widechar, or widechar itself when the
+  // table has no row for it.
+  inline wchar_t lookup( wchar_t widechar );
+  // Each row is { character, replacement }. lookup() binary-searches column 0,
+  // so the rows must be sorted in ascending order of that column.
+  wchar_t EquivalentsTable[EquivTableROWS][EquivTableCOLS] = {
+    #include "tequivchars-mapping.h"
+  };
+#else // Using hash table lookup
+  // Maps a character to its replacement.
+  tsl::hopscotch_map<wchar_t, wchar_t> EquivalentsTable = {
+    #include "tequivchars-mapping.h"
+  };
+#endif // TEQUIVCHARS_HASH_LOOKUP
+};
+
+#ifndef TEQUIVCHARS_HASH_LOOKUP // Using binary search on sorted array
+// Binary search of EquivalentsTable on column 0.
+// Returns column 1 of the matching row, or key unchanged if no row matches.
+inline wchar_t TEquivChars_Private::lookup( wchar_t key )
+{
+  int first = 0;
+  int last  = EquivTableROWS - 1;
+
+  while ( first <= last ) {
+    // Midpoint written this way so that first + last is never computed
+    const int probe = first + (last - first) / 2;
+    const wchar_t candidate = EquivalentsTable[probe][0];
+
+    if ( candidate == key )
+      return EquivalentsTable[probe][1];
+
+    if ( candidate < key )
+      first = probe + 1; // Match, if any, lies in the upper half
+    else
+      last = probe - 1;  // Match, if any, lies in the lower half
+  }
+
+  // No row for this character: it is its own equivalent
+  return key;
+}
+#endif // TEQUIVCHARS_HASH_LOOKUP
+
+// Creates the private implementation object (and with it the lookup table).
+TEquivChars::TEquivChars()
+  : p( new TEquivChars_Private )
+{
+}
+
+// Releases the private implementation object allocated by the constructor.
+TEquivChars::~TEquivChars()
+{
+  delete p;
+}
+
+/*
+   There are 2 implementations of table lookup within this function: one that
+   uses hash table lookup and another that uses binary search on a presorted
+   table. We default to using the binary search implementation for these reasons:
+
+   1. The hash table implementation consumes more RAM.
+   2. The hash table implementation consumes slightly less CPU when used on a
+      large dataset that contains no replaceable characters (all misses) BUT
+      consumes significantly more CPU when used on a dataset that contains many
+      replaceable characters (mostly hits).
+   3. The theoretical benefits of using hash lookups on such a small table
+      will probably never be realized.
+
+  The hash table implementation used was tsl::hopscotch_map. See also:
+
+  https://github.com/Tessil/robin-map
+*/
+
+/*
+   Returns a copy of inputQstring in which every character is replaced by the
+   equivalent found in the lookup table (characters without a table entry are
+   copied unchanged).
+
+   When isRegex is true the string is scanned as a regular expression and the
+   following characters are copied without being looked up:
+     - a backslash and the character it escapes,
+     - the brackets delimiting a character class, and a leading '^', ']' or ':'
+       inside one,
+     - everything inside a POSIX bracket expression [:___:],
+     - (TSM_PCRE2 only) \Q___\E literals, \c{___} brace expressions,
+       (*___) directives and (?<___>) group names.
+*/
+TQString TEquivChars::replaceChars( TQString inputQstring, bool isRegex )
+{
+  int inStrLen = inputQstring.length();
+  TQString outString = TQString::fromLatin1( "" );
+  const TQChar *char16 = inputQstring.unicode();
+
+  bool backSlashed        = false; // \_
+  bool startedCharClass   = false; // Previous character was starting '[' of character class
+  bool inCharacterClass   = false; // [___]
+  bool inPosixBracketExpr = false; // [:___:]
+#ifdef TSM_PCRE2
+  bool quoteLiteral       = false; // \Q___\E
+  bool inBraceExpr        = false; // \c{___} where 'c' is any of: 'x' 'o' 'p' 'P' 'N' 'g'
+  bool inDirective        = false; // (*___)
+  bool inGroupName        = false; // (?<___>
+  wchar_t nextChar  = 0;           // One character of lookahead, only needed by the PCRE2 rules
+#endif // TSM_PCRE2
+  wchar_t currChar  = 0;
+  wchar_t prevChar  = 0;
+
+  // NOTE: the increment expression appends currChar to the output on EVERY
+  // iteration, including those ended by 'continue'. A 'continue' therefore
+  // means "copy this character as-is", not "drop this character".
+  for ( int i = 0 ; i < inStrLen ; i++ , outString += TQChar(currChar)  )
+  {
+    prevChar = currChar;
+    currChar = char16[i].unicode();
+
+    if ( isRegex ) {
+
+#ifdef TSM_PCRE2
+      if ( i < ( inStrLen - 1 ) )
+        nextChar = char16[i+1].unicode();
+      else
+        nextChar = 0;
+#endif // TSM_PCRE2
+
+      // Don't convert backSlashed characters. This test must come BEFORE the
+      // test for the backslash itself: otherwise the second backslash of "\\"
+      // would start a new escape and the following (unescaped) character
+      // would wrongly be left unconverted.
+      if ( backSlashed ) {
+#ifdef TSM_PCRE2
+        switch (currChar) {
+          case 'Q' : quoteLiteral = true;  break; // Entering literal \Q___\E
+          case 'E' : quoteLiteral = false; break; // Leaving literal \Q___\E
+          case 'N' : // Entering Unicode codepoint specification \N{U+___} ?
+          case 'P' : // Entering (negated) Unicode property specification \p{} ?
+          case 'p' : // Entering Unicode property specification \p{} ?
+          case 'g' : // Entering a named backreference \g{___} ?
+            if ( nextChar == L'{' ) inBraceExpr = true;
+            break;
+        }
+#endif // TSM_PCRE2
+        backSlashed = false;
+        continue;
+      }
+
+      if ( currChar == L'\\' ) {
+        backSlashed = true;
+        // An escape sequence occupies the first position of a character
+        // class, so a later ']' closes the class instead of being literal.
+        startedCharClass = false;
+        continue;
+      }
+
+#ifdef TSM_PCRE2
+      if ( quoteLiteral )
+        continue;
+
+      if ( inBraceExpr ) {
+        // Is it time to leave brace expression {___} ?
+        if ( nextChar == L'}' ) inBraceExpr = false;
+        continue;
+      }
+#endif // TSM_PCRE2
+
+      if ( startedCharClass ) {
+        switch (currChar) {
+          case L'^' : // Negated character class, proceed to next character
+            continue; // Bypass converting this special character
+          case L']' : // Treat as part of character class, not as a closure
+          case L':' : // Treat as part of character class, not as start of bracket expression
+            startedCharClass = false;
+            continue;  // Bypass converting these special characters
+        }
+        startedCharClass = false;
+      } // startedCharClass
+
+      if ( inCharacterClass ) {
+
+        if ( inPosixBracketExpr ) {
+          // Leave the POSIX bracket expression [:___:] on the ']' of its
+          // closing ":]", so that this ']' is consumed here and is not
+          // mistaken for the end of the enclosing character class.
+          if ( prevChar == L':' && currChar == L']' ) inPosixBracketExpr = false;
+          continue;
+        } // inPosixBracketExpr
+
+        else { // ! inPosixBracketExpr
+
+          if ( prevChar == L'[' && currChar == L':' ) {
+            // Enter POSIX bracket expression [:___:]
+            inPosixBracketExpr = true;
+            continue;
+          }
+
+          if ( currChar == L']' ) {
+            // Leaving character class [___]
+            inCharacterClass = false;
+            continue;
+          }
+
+        } // ! inPosixBracketExpr
+
+      } // inCharacterClass
+
+      else { // ! inCharacterClass
+
+        switch (currChar) {
+
+          case '[' :
+            // Entering a character class [___]
+            startedCharClass = true;
+            inCharacterClass = true;
+            continue;
+#ifdef TSM_PCRE2
+          case '*' :
+            if ( prevChar != '(' ) continue;
+            // Entering a PCRE2 directive (*___)
+            inDirective = true;
+            continue;
+
+          case '?' :
+            if ( prevChar != '(' ) continue;
+            if ( nextChar != '<' ) continue;
+            // Entering PCRE2 group name (?<___>)
+            inGroupName = true;
+            continue;
+#endif // TSM_PCRE2
+        }
+#ifdef TSM_PCRE2
+        if ( inDirective ) {
+          // Is it time to leave PCRE2 directive (*___) ?
+          if (currChar == ')' ) inDirective = false;
+          continue;
+        }
+
+        if ( inGroupName ) {
+          // Is it time to leave PCRE2 group name (?<___>) ?
+          if (currChar == '>' ) inGroupName = false;
+          continue;
+        }
+#endif // TSM_PCRE2
+      } // ! inCharacterClass
+
+      // If we reach here, currChar will be converted
+
+    } // isRegex
+
+    #ifdef TEQUIVCHARS_HASH_LOOKUP
+    try {
+      currChar = p->EquivalentsTable.at(currChar);
+    }
+    catch(const std::exception&) {
+      // No need to "handle" exception, currChar was not changed
+    }
+    #else // TEQUIVCHARS_HASH_LOOKUP => Use binary search
+    currChar = p->lookup(currChar);
+    #endif // TEQUIVCHARS_HASH_LOOKUP
+  }
+
+  return outString;
+}
+
+/*
+    This is the original implementation of the replaceChars function. It works
+    by converting inputString (UTF16) to a "multibyte" (typically UTF8) string,
+    then to an array of "wide" (32-bit) characters which is used for character
+    replacement. The result is converted back into a TQString and returned.
+    Despite the multiple conversions, this function is almost as fast as
+    the one that replaced it.
+*/
+
+/*
+   Multibyte / wide-character variant of replaceChars(): returns a copy of
+   inputQstring in which every character is replaced by the equivalent found
+   in the lookup table. When isRegex is true, regular expression syntax
+   (escapes, character class delimiters, POSIX bracket expressions and, with
+   TSM_PCRE2, the PCRE2 constructs) is copied without being looked up.
+
+   Side effect: sets the LC_CTYPE locale from the environment.
+   If the input is empty, or cannot be converted to or from wide characters
+   in that locale, the input string is returned unchanged.
+*/
+TQString TEquivChars::replaceCharsMB( TQString inputQstring, bool isRegex )
+{
+  setlocale(LC_CTYPE, "");
+
+  // Nothing to convert (this also covers a null string, whose 8-bit form may
+  // have no data pointer to hand to mbsrtowcs)
+  if ( inputQstring.isEmpty() )
+    return inputQstring;
+
+  // Keep the 8-bit copy alive in a named object: taking .data() of the
+  // temporary returned by local8Bit() would leave a dangling pointer.
+  const TQCString inputBytes = inputQstring.local8Bit();
+  const char* inputMBstring = inputBytes.data();
+  if ( inputMBstring == NULL )
+    return inputQstring;
+
+  //-Debug: printf("Size of inputMBstring (metric 1): %d\n", inputQstring.length()   );
+  //-Debug: printf("Size of inputMBstring (metric 2): %ld\n", strlen( inputMBstring ) );
+
+  //--- Allocate "wide" work string
+  size_t numCharsMax;
+  numCharsMax = inputQstring.length()+2; // Start with a hopefully safe overestimate
+  wchar_t *tempWstring = new wchar_t[numCharsMax]; // HEAP
+    // Versus stack allocated: wchar_t tempWstring[numCharsMax];
+  size_t numBytesMax = sizeof(wchar_t) * numCharsMax ;
+
+  //--- Load work string with "wide" characters from temporary input
+  const char * p_inputMBstring = inputMBstring; // need this 2nd pointer!
+  mbstate_t mbs = mbstate_t();
+  // Reserve the last element for the terminator written below
+  size_t numWideChars = mbsrtowcs( tempWstring, &p_inputMBstring, numCharsMax - 1, &mbs);
+  if ( numWideChars == static_cast<size_t>(-1) ) {
+    // Invalid multibyte sequence for the current locale: give up cleanly
+    delete[] tempWstring;
+    return inputQstring;
+  }
+  tempWstring[numWideChars] = L'\0'; // Terminated even if the buffer was filled
+
+  #ifdef verbose
+  size_t szWidechar = sizeof ( wchar_t ) ;
+  size_t numChars = wcslen(tempWstring);
+  size_t numBytes = numChars * szWidechar + 1;
+  // wchar_t charType[] = L"wchar_t" ;
+  // fwprintf( stderr, L"Wide character '%ls' occupies %ld bytes\n", charType, szWidechar );
+  fwprintf( stderr,
+    L"\nTemporary wide string contains %ld wide characters (using %ld of %ld allocated bytes)\n",
+    numChars, numBytes, numBytesMax
+  );
+  fwprintf( stderr, L"Wide string before: \t%ls\n", tempWstring );
+  #endif // verbose
+
+  bool backSlashed        = false; // \_
+  bool startedCharClass   = false; // Previous character was starting '[' of character class
+  bool inCharacterClass   = false; // [___]
+  bool inPosixBracketExpr = false; // [:___:]
+#ifdef TSM_PCRE2
+  bool quoteLiteral       = false; // \Q___\E
+  bool inBraceExpr        = false; // \c{___} where 'c' is any of: 'x' 'o' 'p' 'P' 'N' 'g'
+  bool inDirective        = false; // (*___)
+  bool inGroupName        = false; // (?<___>
+#endif // TSM_PCRE2
+  wchar_t  prevChar = 0;
+  wchar_t *currChar = tempWstring;
+  wchar_t *nextChar = tempWstring+1;
+
+  // Characters are replaced in place; 'continue' leaves *currChar untouched.
+  for ( ; *currChar != L'\0' ; prevChar = *currChar, currChar++, nextChar++ )
+  {
+
+    if ( isRegex ) {
+
+      // Don't convert backSlashed characters. This test must come BEFORE the
+      // test for the backslash itself: otherwise the second backslash of "\\"
+      // would start a new escape and the following (unescaped) character
+      // would wrongly be left unconverted.
+      if ( backSlashed ) {
+#ifdef TSM_PCRE2
+        switch (*currChar) {
+          case 'Q' : quoteLiteral = true;  break; // Entering literal \Q___\E
+          case 'E' : quoteLiteral = false; break; // Leaving literal \Q___\E
+          case 'N' : // Entering Unicode codepoint specification \N{U+___} ?
+          case 'P' : // Entering (negated) Unicode property specification \p{} ?
+          case 'p' : // Entering Unicode property specification \p{} ?
+          case 'g' : // Entering a named backreference \g{___} ?
+            if ( *nextChar == L'{' ) inBraceExpr = true;
+            break;
+        }
+#endif // TSM_PCRE2
+        backSlashed = false;
+        continue;
+      }
+
+      if ( *currChar == L'\\' ) {
+        backSlashed = true;
+        // An escape sequence occupies the first position of a character
+        // class, so a later ']' closes the class instead of being literal.
+        startedCharClass = false;
+        continue;
+      }
+
+#ifdef TSM_PCRE2
+      if ( quoteLiteral )
+        continue;
+
+      if ( inBraceExpr ) {
+        // Is it time to leave brace expression {___} ?
+        if ( *nextChar == L'}' ) inBraceExpr = false;
+        continue;
+      }
+#endif // TSM_PCRE2
+
+      if ( startedCharClass ) {
+        switch (*currChar) {
+          case L'^' : // Negated character class, proceed to next character
+            continue; // Bypass converting this special character
+          case L']' : // Treat as part of character class, not as a closure
+          case L':' : // Treat as part of character class, not as start of bracket expression
+            startedCharClass = false;
+            continue;  // Bypass converting these special characters
+        }
+        startedCharClass = false;
+      } // startedCharClass
+
+      if ( inCharacterClass ) {
+
+        if ( inPosixBracketExpr ) {
+          // Leave the POSIX bracket expression [:___:] on the ']' of its
+          // closing ":]", so that this ']' is consumed here and is not
+          // mistaken for the end of the enclosing character class.
+          if ( prevChar == L':' && *currChar == L']' ) inPosixBracketExpr = false;
+          continue;
+        } // inPosixBracketExpr
+
+        else { // ! inPosixBracketExpr
+
+          if ( prevChar == L'[' && *currChar == L':' ) {
+            // Enter POSIX bracket expression [:___:]
+            inPosixBracketExpr = true;
+            continue;
+          }
+
+          if ( *currChar == L']' ) {
+            // Leaving character class [___]
+            inCharacterClass = false;
+            continue;
+          }
+
+        } // ! inPosixBracketExpr
+
+      } // inCharacterClass
+
+      else { // ! inCharacterClass
+
+        switch (*currChar) {
+
+          case '[' :
+            // Entering a character class [___]
+            startedCharClass = true;
+            inCharacterClass = true;
+            continue;
+#ifdef TSM_PCRE2
+          case '*' :
+            if ( prevChar != '(' ) continue;
+            // Entering a PCRE2 directive (*___)
+            inDirective = true;
+            continue;
+
+          case '?' :
+            if ( prevChar != '(' ) continue;
+            if ( *nextChar != '<' ) continue;
+            // Entering PCRE2 group name (?<___>)
+            inGroupName = true;
+            continue;
+#endif // TSM_PCRE2
+        }
+#ifdef TSM_PCRE2
+        if ( inDirective ) {
+          // Is it time to leave PCRE2 directive (*___) ?
+          if (*currChar == ')' ) inDirective = false;
+          continue;
+        }
+
+        if ( inGroupName ) {
+          // Is it time to leave PCRE2 group name (?<___>) ?
+          if (*currChar == '>' ) inGroupName = false;
+          continue;
+        }
+#endif // TSM_PCRE2
+      } // ! inCharacterClass
+
+      // If we reach here, *currChar will be converted
+
+    } // isRegex
+
+    #ifdef TEQUIVCHARS_HASH_LOOKUP
+    try {
+      *currChar = p->EquivalentsTable.at(*currChar);
+    }
+    catch(const std::exception&) {
+      // No need to "handle" exception, *currChar was not changed
+    }
+    #else // TEQUIVCHARS_HASH_LOOKUP => Use binary search
+    *currChar = p->lookup(*currChar);
+    #endif // TEQUIVCHARS_HASH_LOOKUP
+  }
+
+  #ifdef verbose
+  fwprintf( stderr, L"Wide string after:  \t%ls\n\n", tempWstring );
+  #endif // verbose
+
+  //--- Convert the work string back to a multibyte string
+  char *outputMBstring = new char[numBytesMax];
+    // Versus stack allocated: char outputMBstring[numBytesMax];
+  const wchar_t * p_tempWstring = tempWstring; // need this 2nd pointer!
+  mbs = mbstate_t(); // Start the reverse conversion from the initial state
+  // Reserve the last byte for the terminator written below
+  size_t numBytesOut = wcsrtombs( outputMBstring, &p_tempWstring, numBytesMax - 1, &mbs );
+  delete[] tempWstring;
+
+  if ( numBytesOut == static_cast<size_t>(-1) ) {
+    // A wide character has no representation in the current locale
+    delete[] outputMBstring;
+    return inputQstring;
+  }
+  outputMBstring[numBytesOut] = '\0'; // Terminated even if the buffer was filled
+
+  TQString outputQstring = TQString::fromLocal8Bit( outputMBstring );
+  delete[] outputMBstring;
+  return outputQstring;
+}
--- a/tdecore/tequivchars.h
+++ b/tdecore/tequivchars.h
@ -0,0 +1,38 @@
+#ifndef TEQUIVCHARS_H
+#define TEQUIVCHARS_H
+
+#include "tdelibs_export.h"
+
+#include <tqstring.h>
+
+/**
+ *  Class representing a mapping of each alphanumeric character to its "collating
+ *  equivalent" as defined by the Default Unicode Collation Element Table (DUCET).
+ *  The mapping is limited to single-codepoint characters <= U+FFFF.
+ */
+class TDECORE_EXPORT TEquivChars
+{
+public:
+  /** Creates the object together with its private lookup table. */
+  TEquivChars();
+  /** Destroys the object and its private lookup table. */
+  ~TEquivChars();
+
+  /**
+      Copying is disabled: the object owns its private data through a raw
+      pointer that the destructor deletes, so a copy sharing that pointer
+      would delete it a second time.
+   */
+  TEquivChars( const TEquivChars & ) = delete;
+  TEquivChars &operator=( const TEquivChars & ) = delete;
+
+  /**
+      Replaces each alphanumeric character with its collating character
+      equivalent.
+
+      @param inputString the string to convert
+      @param isRegex if true, the input string is treated as a regular
+      expression and the alphabetical characters inside POSIX bracket [::]
+      expressions are left as-is
+      @return modified copy of @p inputString
+   */
+  TQString replaceChars( TQString inputString, bool isRegex = false );
+
+  /**
+      Alternative implementation of replaceChars function that uses some
+      "multibyte string" / "wide character" functions defined in wchar.h.
+
+      @param inputString the string to convert
+      @param isRegex if true, the input string is treated as a regular
+      expression (see replaceChars)
+      @return modified copy of @p inputString
+   */
+  TQString replaceCharsMB( TQString inputString, bool isRegex = false );
+
+private:
+  class TEquivChars_Private *p;
+};
+
+#endif // TEQUIVCHARS_H