tdelibs/tdecore/tequivchars.cpp

#include "tequivchars.h"

#include <wchar.h>
#include <locale.h>
#include <stdio.h>

#include <vector>

// Build-time switches (both disabled by default):
//   TSM_PCRE2               - when processing a regex, also leave the constructs
//                             \Q___\E, \c{___}, (*___) and (?<___> unconverted.
//   TEQUIVCHARS_HASH_LOOKUP - look characters up in a hash table instead of
//                             binary-searching a sorted array.
// #define TSM_PCRE2
// #define TEQUIVCHARS_HASH_LOOKUP

#ifndef TEQUIVCHARS_HASH_LOOKUP  // Using binary search on sorted array
// Dimensions of the lookup array: one row per mapping, each row holding
// the character to look for followed by its equivalent.
inline constexpr uint EquivTableROWS {2993}; // Make sure that these constants accurately
inline constexpr uint EquivTableCOLS {   2}; // reflect what's in tequivchars-mapping.h
#else // Using hash table lookup
#include "tsl/hopscotch_map.h"
#endif // TEQUIVCHARS_HASH_LOOKUP

/*
   Private data of TEquivChars: the character equivalents table, whose
   contents come from tequivchars-mapping.h, stored either as a 2-column
   array (default) or as a hash map (TEQUIVCHARS_HASH_LOOKUP).
   The table is a non-static member, so every instance carries its own copy.
*/
class TEquivChars_Private
{
public:
#ifndef TEQUIVCHARS_HASH_LOOKUP // Using binary search on sorted array
  // Returns the equivalent stored for widechar, or widechar itself when the
  // table has no row for it.
  inline wchar_t lookup( wchar_t widechar );
  // Column 0: character to look for; column 1: its equivalent.
  // lookup() binary-searches column 0, so rows must be sorted ascending by it.
  wchar_t EquivalentsTable[EquivTableROWS][EquivTableCOLS] = {
    #include "tequivchars-mapping.h"
  };
#else // Using hash table lookup
  // Key: character to look for; value: its equivalent.
  tsl::hopscotch_map<wchar_t, wchar_t> EquivalentsTable = {
    #include "tequivchars-mapping.h"
  };
#endif // TEQUIVCHARS_HASH_LOOKUP
};

#ifndef TEQUIVCHARS_HASH_LOOKUP // Using binary search on sorted array
/*
   Binary search of EquivalentsTable for `key` (compared against column 0).
   Returns the value in column 1 of the matching row, or `key` unchanged
   when no row matches.
*/
inline wchar_t TEquivChars_Private::lookup( wchar_t key )
{
  int probes = 0; // Only reported by the (disabled) debug output below
  int first  = 0;
  int last   = EquivTableROWS - 1;

  while ( first <= last ) {
    ++probes;
    const int middle = first + (last - first) / 2; // Cannot overflow, unlike (first+last)/2
    const wchar_t *row = EquivalentsTable[middle];
    // fprintf( stderr, "last = %u, middle = %u, first = %u\n", last, middle, first );

    if ( row[0] == key ) {
      //-Debug: fprintf( stderr,
      //-Debug:   "After %d lookups, character 0x%04x was mapped to 0x%04x \n",
      //-Debug:   probes, key, row[1]
      //-Debug: );
      return row[1];
    }

    // Discard the half of the range that cannot contain the key
    if ( row[0] < key )
      first = middle + 1;
    else
      last = middle - 1;
  }

  //-Debug: fprintf( stderr, "Not found after %d lookups\n", probes );
  return key;
}
#endif // TEQUIVCHARS_HASH_LOOKUP

// Allocates this instance's private data (which holds the equivalents table).
TEquivChars::TEquivChars()
{
  p = new TEquivChars_Private;
}

// Frees the private data allocated by the constructor.
TEquivChars::~TEquivChars()
{
  delete p;
}

/*
   There are 2 implementations of table lookup within this function: one that
   uses hash table lookup and another that uses binary search on a presorted
   table. We default to the binary search implementation for these reasons:

   1. The hash table implementation consumes more RAM.
   2. The hash table implementation consumes slightly less CPU when used on a
      large dataset that contains no replaceable characters (all misses) BUT
      consumes significantly more CPU when used on a dataset that contains many
      replaceable characters (mostly hits).
   3. The theoretical benefits of using hash lookups on such a small table
      will probably never be realized.

   The hash table implementation used was:

   https://github.com/Tessil/robin-map

   NOTE(review): the hash lookup build includes "tsl/hopscotch_map.h" and uses
   tsl::hopscotch_map rather than robin-map; confirm which library this link
   is meant to name.
*/

/*
   Returns a copy of inputQstring in which every character that has an entry
   in the equivalents table is replaced by its equivalent; all other
   characters are copied through unchanged.

   inputQstring  Text (or regular expression) to process.
   isRegex       When true, the input is scanned as a regular expression and
                 these parts are copied through without conversion:
                   - a backslash and the character it escapes;
                   - the '[' opening a character class, a '^', ']' or ':'
                     directly after it, and the ']' closing the class;
                   - a POSIX bracket expression [:___:] inside a class;
                   - (TSM_PCRE2 builds only) \Q___\E literals, \N{ \P{ \p{ \g{
                     brace expressions, (*___) directives, (?<___> group names.
*/
TQString TEquivChars::replaceChars( TQString inputQstring, bool isRegex )
{
  int inStrLen = inputQstring.length();
  TQString outString = TQString::fromLatin1( "" );
  const TQChar *char16 = inputQstring.unicode();

  bool backSlashed        = false; // \_
  bool startedCharClass   = false; // Previous character was starting '[' of character class
  bool inCharacterClass   = false; // [___]
  bool inPosixBracketExpr = false; // [:___:]
#ifdef TSM_PCRE2
  bool quoteLiteral       = false; // \Q___\E
  bool inBraceExpr        = false; // \c{___} where 'c' is any of: 'p' 'P' 'N' 'g'
  bool inDirective        = false; // (*___)
  bool inGroupName        = false; // (?<___>
  wchar_t nextChar  = 0;           // One character of look-ahead, 0 at end of input
#endif // TSM_PCRE2
  wchar_t currChar  = 0;
  wchar_t prevChar  = 0;

  // The increment expression appends currChar to the output, so every
  // 'continue' below copies the current character through unconverted.
  for ( int i = 0 ; i < inStrLen ; i++ , outString += TQChar(currChar)  )
  {
    prevChar = currChar;
    currChar = char16[i].unicode();

    if ( isRegex ) {

#ifdef TSM_PCRE2
      if ( i < ( inStrLen - 1 ) )
        nextChar = char16[i+1].unicode();
      else
        nextChar = 0;
#endif // TSM_PCRE2

      // Don't convert backSlashed characters.
      // This test must precede the backslash test below: in "\\" the second
      // backslash is the escaped character and must not start a new escape.
      if ( backSlashed ) {
#ifdef TSM_PCRE2
        switch (currChar) {
          case 'Q' : quoteLiteral = true;  break; // Entering literal \Q___\E
          case 'E' : quoteLiteral = false; break; // Leaving literal \Q___\E
          case 'N' : // Entering Unicode codepoint specification \N{U+___} ?
          case 'P' : // Entering (negated) Unicode property specification \P{} ?
          case 'p' : // Entering Unicode property specification \p{} ?
          case 'g' : // Entering a named backreference \g{___} ?
            if ( nextChar == L'{' ) inBraceExpr = true;
            break;
        }
#endif // TSM_PCRE2
        backSlashed = false;
        continue;
      }

      if ( currChar == L'\\' ) {
        backSlashed = true;
        continue;
      }

#ifdef TSM_PCRE2
      if ( quoteLiteral )
        continue;

      if ( inBraceExpr ) {
        // Is it time to leave brace expression {___} ?
        if ( nextChar == L'}' ) inBraceExpr = false;
        continue;
      }
#endif // TSM_PCRE2

      if ( startedCharClass ) {
        switch (currChar) {
          case L'^' : // Negated character class, proceed to next character
            continue; // Bypass converting this special character
          case L']' : // Treat as part of character class, not as a closure
          case L':' : // Treat as part of character class, not as start of bracket expression
            startedCharClass = false;
            continue;  // Bypass converting these special characters
        }
        startedCharClass = false;
      } // startedCharClass

      if ( inCharacterClass ) {

        if ( inPosixBracketExpr ) {
          // Is it time to leave POSIX bracket expression [:___:] ?
          // Leave on the closing ']' itself (not on the ':' before it), so
          // that this ']' is not mistaken for the end of the character class.
          if ( prevChar == L':' && currChar == L']' ) inPosixBracketExpr = false;
          continue;
        } // inPosixBracketExpr

        else { // ! inPosixBracketExpr

          if ( prevChar == L'[' && currChar == L':' ) {
            // Enter POSIX bracket expression [:___:]
            inPosixBracketExpr = true;
            continue;
          }

          if ( currChar == L']' ) {
            // Leaving character class [___]
            inCharacterClass = false;
            continue;
          }

        } // ! inPosixBracketExpr

      } // inCharacterClass

      else { // ! inCharacterClass

        switch (currChar) {

          case '[' :
            // Entering a character class [___]
            startedCharClass = true;
            inCharacterClass = true;
            continue;
#ifdef TSM_PCRE2
          case '*' :
            if ( prevChar != '(' ) continue;
            // Entering a PCRE2 directive (*___)
            inDirective = true;
            continue;

          case '?' :
            if ( prevChar != '(' ) continue;
            if ( nextChar != '<' ) continue;
            // Entering PCRE2 group name (?<___>)
            inGroupName = true;
            continue;
#endif // TSM_PCRE2
        }
#ifdef TSM_PCRE2
        if ( inDirective ) {
          // Is it time to leave PCRE2 directive (*___) ?
          if (currChar == ')' ) inDirective = false;
          continue;
        }

        if ( inGroupName ) {
          // Is it time to leave PCRE2 group name (?<___>) ?
          if (currChar == '>' ) inGroupName = false;
          continue;
        }
#endif // TSM_PCRE2
      } // ! inCharacterClass

      // If we reach here, currChar will be converted

    } // isRegex

    #ifdef TEQUIVCHARS_HASH_LOOKUP
    {
      // find() instead of at(), so that a character with no table entry
      // does not cost a thrown-and-caught exception.
      const auto hit = p->EquivalentsTable.find( currChar );
      if ( hit != p->EquivalentsTable.end() )
        currChar = hit->second;
    }
    #else // TEQUIVCHARS_HASH_LOOKUP => Use binary search
    currChar = p->lookup(currChar);
    #endif // TEQUIVCHARS_HASH_LOOKUP
  }

  return outString;
}

/*
    This is the original implementation of the replaceChars function. It works
    by converting inputQstring (UTF16) to a "multibyte" (typically UTF8) string,
    then to an array of "wide" (32-bit) characters which is used for character
    replacement. The result is converted back into a TQString and returned.
    Despite the multiple conversions, this function is almost as fast as
    the one that replaced it.
*/

/*
   Replaces every character of inputQstring that has an entry in the
   equivalents table by its equivalent, working on a wchar_t copy of the text
   obtained through the locale's multibyte encoding.

   inputQstring  Text (or regular expression) to process.
   isRegex       When true, the input is scanned as a regular expression and
                 these parts are left unconverted:
                   - a backslash and the character it escapes;
                   - the '[' opening a character class, a '^', ']' or ':'
                     directly after it, and the ']' closing the class;
                   - a POSIX bracket expression [:___:] inside a class;
                   - (TSM_PCRE2 builds only) \Q___\E literals, \N{ \P{ \p{ \g{
                     brace expressions, (*___) directives, (?<___> group names.

   Returns the converted text. If the text cannot be converted to or from the
   multibyte encoding, or does not fit the work buffers, the input is returned
   unchanged.

   Side effect: calls setlocale(LC_CTYPE, "") on every invocation.
*/
TQString TEquivChars::replaceCharsMB( TQString inputQstring, bool isRegex )
{
  setlocale(LC_CTYPE, "");

  // local8Bit() returns a temporary: keep it in a named object so that the
  // pointer obtained from data() stays valid for as long as it is read.
  const auto inputLocal8Bit = inputQstring.local8Bit();
  const char* inputMBstring = inputLocal8Bit.data();
  if ( inputMBstring == nullptr ) inputMBstring = ""; // mbsrtowcs() must be given a string

  //-Debug: printf("Size of inputMBstring (metric 1): %d\n", inputQstring.length()   );
  //-Debug: printf("Size of inputMBstring (metric 2): %ld\n", strlen( inputMBstring ) );

  //--- Allocate "wide" work string (zero-filled, hence always terminated)
  const size_t numCharsMax = inputQstring.length()+2; // Start with a hopefully safe overestimate
  std::vector<wchar_t> wideBuffer( numCharsMax, L'\0' );
  wchar_t *tempWstring = wideBuffer.data();
  const size_t numBytesMax = sizeof(wchar_t) * numCharsMax ;

  //--- Load work string with "wide" characters from temporary input
  const char * p_inputMBstring = inputMBstring; // need this 2nd pointer!
  mbstate_t mbs = mbstate_t();
  const size_t numConverted =
    mbsrtowcs( tempWstring, &p_inputMBstring, numCharsMax - 1, &mbs );
  // (size_t)-1: invalid multibyte sequence. Source pointer not reset to null:
  // conversion stopped before the end of the input because the buffer is full.
  if ( numConverted == static_cast<size_t>(-1) || p_inputMBstring != nullptr )
    return inputQstring;

  #ifdef verbose
  size_t szWidechar = sizeof ( wchar_t ) ;
  size_t numChars = wcslen(tempWstring);
  size_t numBytes = numChars * szWidechar + 1;
  // wchar_t charType[] = L"wchar_t" ;
  // fwprintf( stderr, L"Wide character '%ls' occupies %zu bytes\n", charType, szWidechar );
  fwprintf( stderr,
    L"\nTemporary wide string contains %zu wide characters (using %zu of %zu allocated bytes)\n",
    numChars, numBytes, numBytesMax
  );
  fwprintf( stderr, L"Wide string before: \t%ls\n", tempWstring );
  #endif // verbose

  bool backSlashed        = false; // \_
  bool startedCharClass   = false; // Previous character was starting '[' of character class
  bool inCharacterClass   = false; // [___]
  bool inPosixBracketExpr = false; // [:___:]
#ifdef TSM_PCRE2
  bool quoteLiteral       = false; // \Q___\E
  bool inBraceExpr        = false; // \c{___} where 'c' is any of: 'p' 'P' 'N' 'g'
  bool inDirective        = false; // (*___)
  bool inGroupName        = false; // (?<___>
#endif // TSM_PCRE2
  wchar_t  prevChar = 0;
  wchar_t *currChar = tempWstring;

  // Characters are converted in place; 'continue' leaves the current one as is.
  // Look-ahead is currChar[1], which is at worst the terminating L'\0'.
  for ( ; *currChar != L'\0' ; prevChar = *currChar, currChar++ )
  {

    if ( isRegex ) {

      // Don't convert backSlashed characters.
      // This test must precede the backslash test below: in "\\" the second
      // backslash is the escaped character and must not start a new escape.
      if ( backSlashed ) {
#ifdef TSM_PCRE2
        switch (*currChar) {
          case 'Q' : quoteLiteral = true;  break; // Entering literal \Q___\E
          case 'E' : quoteLiteral = false; break; // Leaving literal \Q___\E
          case 'N' : // Entering Unicode codepoint specification \N{U+___} ?
          case 'P' : // Entering (negated) Unicode property specification \P{} ?
          case 'p' : // Entering Unicode property specification \p{} ?
          case 'g' : // Entering a named backreference \g{___} ?
            if ( currChar[1] == L'{' ) inBraceExpr = true;
            break;
        }
#endif // TSM_PCRE2
        backSlashed = false;
        continue;
      }

      if ( *currChar == L'\\' ) {
        backSlashed = true;
        continue;
      }

#ifdef TSM_PCRE2
      if ( quoteLiteral )
        continue;

      if ( inBraceExpr ) {
        // Is it time to leave brace expression {___} ?
        if ( currChar[1] == L'}' ) inBraceExpr = false;
        continue;
      }
#endif // TSM_PCRE2

      if ( startedCharClass ) {
        switch (*currChar) {
          case L'^' : // Negated character class, proceed to next character
            continue; // Bypass converting this special character
          case L']' : // Treat as part of character class, not as a closure
          case L':' : // Treat as part of character class, not as start of bracket expression
            startedCharClass = false;
            continue;  // Bypass converting these special characters
        }
        startedCharClass = false;
      } // startedCharClass

      if ( inCharacterClass ) {

        if ( inPosixBracketExpr ) {
          // Is it time to leave POSIX bracket expression [:___:] ?
          // Leave on the closing ']' itself (not on the ':' before it), so
          // that this ']' is not mistaken for the end of the character class.
          if ( prevChar == L':' && *currChar == L']' ) inPosixBracketExpr = false;
          continue;
        } // inPosixBracketExpr

        else { // ! inPosixBracketExpr

          if ( prevChar == L'[' && *currChar == L':' ) {
            // Enter POSIX bracket expression [:___:]
            inPosixBracketExpr = true;
            continue;
          }

          if ( *currChar == L']' ) {
            // Leaving character class [___]
            inCharacterClass = false;
            continue;
          }

        } // ! inPosixBracketExpr

      } // inCharacterClass

      else { // ! inCharacterClass

        switch (*currChar) {

          case '[' :
            // Entering a character class [___]
            startedCharClass = true;
            inCharacterClass = true;
            continue;
#ifdef TSM_PCRE2
          case '*' :
            if ( prevChar != '(' ) continue;
            // Entering a PCRE2 directive (*___)
            inDirective = true;
            continue;

          case '?' :
            if ( prevChar != '(' ) continue;
            if ( currChar[1] != '<' ) continue;
            // Entering PCRE2 group name (?<___>)
            inGroupName = true;
            continue;
#endif // TSM_PCRE2
        }
#ifdef TSM_PCRE2
        if ( inDirective ) {
          // Is it time to leave PCRE2 directive (*___) ?
          if (*currChar == ')' ) inDirective = false;
          continue;
        }

        if ( inGroupName ) {
          // Is it time to leave PCRE2 group name (?<___>) ?
          if (*currChar == '>' ) inGroupName = false;
          continue;
        }
#endif // TSM_PCRE2
      } // ! inCharacterClass

      // If we reach here, *currChar will be converted

    } // isRegex

    #ifdef TEQUIVCHARS_HASH_LOOKUP
    {
      // find() instead of at(), so that a character with no table entry
      // does not cost a thrown-and-caught exception.
      const auto hit = p->EquivalentsTable.find( *currChar );
      if ( hit != p->EquivalentsTable.end() )
        *currChar = hit->second;
    }
    #else // TEQUIVCHARS_HASH_LOOKUP => Use binary search
    *currChar = p->lookup(*currChar);
    #endif // TEQUIVCHARS_HASH_LOOKUP
  }

  #ifdef verbose
  fwprintf( stderr, L"Wide string after:  \t%ls\n\n", tempWstring );
  #endif // verbose

  //--- Convert the work string back to multibyte (zero-filled, hence always terminated)
  std::vector<char> outputBuffer( numBytesMax, '\0' );
  const wchar_t * p_tempWstring = tempWstring; // need this 2nd pointer!
  mbstate_t outState = mbstate_t();            // Fresh initial shift state for the way back
  const size_t numBytesOut =
    wcsrtombs( outputBuffer.data(), &p_tempWstring, numBytesMax - 1, &outState );
  // (size_t)-1: a character has no multibyte representation. Source pointer
  // not reset to null: the output buffer filled up before the end of the text.
  if ( numBytesOut == static_cast<size_t>(-1) || p_tempWstring != nullptr )
    return inputQstring;

  return TQString::fromLocal8Bit( outputBuffer.data() );
}