Introduce TEquivChars class and an associated global instance that

provides access to a table mapping alphanumeric characters to their
collation equivalents as defined in the Default Unicode Collation
Element Table (DUCET). Class is used by TSM to implement
alphanumeric character handling EQUIVALENCE.

Utilize fnmatch(3) on GLIBC systems for processing wildcard matching.
Non-GLIBC systems will continue to handle wildcard expressions by
converting them to regex.

Internal AuxData class introduced to further de-couple internal
implementation details from public interfaces.

Other changes were made based on feedback from @MicheleC.

Signed-off-by: Vincent Reher <tde@4reher.org>
pull/179/head
Vincent Reher 2 years ago
parent 46fedcbe96
commit 0c107b54a2

File diff suppressed because it is too large Load Diff

@ -0,0 +1,553 @@
#include "tequivchars.h"

#include <locale.h>
#include <stdio.h>
#include <wchar.h>

#include <vector>
// Build-time switches (both disabled by default):
//   TSM_PCRE2               - enables the PCRE2-specific regex handling that is
//                             guarded by "#ifdef TSM_PCRE2" further down.
//   TEQUIVCHARS_HASH_LOOKUP - selects the hash-map based table instead of the
//                             sorted array that is searched with binary search.
// #define TSM_PCRE2
// #define TEQUIVCHARS_HASH_LOOKUP
#ifndef TEQUIVCHARS_HASH_LOOKUP // Using binary search on sorted array
// Dimensions of the two-dimensional EquivalentsTable array. They are not
// derived from the data: keep them in sync with tequivchars-mapping.h by hand.
inline constexpr uint EquivTableROWS {2993}; // Make sure that these constants accurately
inline constexpr uint EquivTableCOLS { 2}; // reflect what's in tequivchars-mapping.h
#else // Using hash table lookup
#include "tsl/hopscotch_map.h"
#endif // TEQUIVCHARS_HASH_LOOKUP
// Private implementation data of TEquivChars: holds the character equivalence
// table, whose contents are pulled in from tequivchars-mapping.h.
// Exactly one of the two table representations is compiled, depending on
// whether TEQUIVCHARS_HASH_LOOKUP is defined.
class TEquivChars_Private
{
public:
#ifndef TEQUIVCHARS_HASH_LOOKUP // Using binary search on sorted array
// Returns the equivalent of widechar, or widechar itself when the table has
// no entry for it.
inline wchar_t lookup( wchar_t widechar );
// Each row is a pair: column 0 is the character to look up, column 1 is its
// replacement. lookup() binary-searches column 0, so the rows are expected
// to be sorted ascending by column 0 (presumably guaranteed by the generated
// mapping file - verify when regenerating it).
// NOTE(review): this is a non-static member, so every TEquivChars_Private
// instance carries its own copy of the table.
wchar_t EquivalentsTable[EquivTableROWS][EquivTableCOLS] = {
#include "tequivchars-mapping.h"
};
#else // Using hash table lookup
// Hash map from a character to its replacement, initialised from the same
// mapping file.
tsl::hopscotch_map<wchar_t, wchar_t> EquivalentsTable = {
#include "tequivchars-mapping.h"
};
#endif // TEQUIVCHARS_HASH_LOOKUP
};
#ifndef TEQUIVCHARS_HASH_LOOKUP // Using binary search on sorted array
// Binary search of EquivalentsTable for 'key' (compared against column 0).
// Returns the replacement stored in column 1 of the matching row, or 'key'
// itself when no row matches.
inline wchar_t TEquivChars_Private::lookup( wchar_t key )
{
    int first = 0;
    int last = EquivTableROWS - 1;
    int probes = 0; // Only reported by the (disabled) debug output below

    while ( first <= last ) {
        ++probes;
        const int middle = first + ( last - first ) / 2;
        const wchar_t candidate = EquivalentsTable[middle][0];
        // fprintf( stderr, "last = %u, middle = %u, first = %u\n", last, middle, first );

        if ( candidate == key ) {
            //-Debug: fprintf( stderr,
            //-Debug: "After %d lookups, character 0x%04x was mapped to 0x%04x \n",
            //-Debug: probes, key, EquivalentsTable[middle][1]
            //-Debug: );
            return EquivalentsTable[middle][1];
        }

        if ( candidate < key ) {
            first = middle + 1; // Continue in the upper half
        }
        else {
            last = middle - 1; // Continue in the lower half
        }
    }

    //-Debug: fprintf( stderr, "Not found after %d lookups\n", probes );
    return key;
}
#endif // TEQUIVCHARS_HASH_LOOKUP
// Creates the private implementation object (and with it the equivalence
// table) that this instance owns until it is destroyed.
TEquivChars::TEquivChars()
    : p( new TEquivChars_Private )
{
}
// Releases the private implementation object that the constructor allocated.
TEquivChars::~TEquivChars()
{
delete p;
}
/*
There are 2 implementations of table lookup within this function: one that
uses hash table lookup and another that uses binary search on a presorted
table. We default to the binary search implementation for these reasons:
1. The hash table implementation consumes more RAM.
2. The hash table implementation consumes slightly less CPU when used on a
large dataset that contains no replaceable characters (all misses) BUT
consumes significantly more CPU when used on a dataset that contains many
replaceable characters (mostly hits).
3. The theoretical benefits of using hash lookups on such a small table
will probably never be realized.
The hashing used was
https://github.com/Tessil/robin-map
NOTE: the hash lookup build of this file includes "tsl/hopscotch_map.h", which
is provided by https://github.com/Tessil/hopscotch-map - confirm which of the
two libraries the measurements above refer to.
*/
/**
 * Returns a copy of inputQstring in which every UTF-16 code unit that has an
 * entry in the equivalence table is replaced by its mapped equivalent. Code
 * units without an entry are copied unchanged.
 *
 * When isRegex is true the input is scanned as a regular expression and the
 * following parts are copied through without conversion:
 *   - a backslash and the single character it escapes,
 *   - the '[' that opens a character class, a '^', ']' or ':' directly
 *     after it, and the ']' that closes the class,
 *   - everything inside a POSIX bracket expression [:___:],
 *   - only when TSM_PCRE2 is defined: \Q___\E literals, \c{___} brace
 *     expressions, (*___) directives and (?<___> group names.
 *
 * @param inputQstring  text (or regular expression) to convert
 * @param isRegex       true if inputQstring must be treated as a regex
 * @return the converted copy
 */
TQString TEquivChars::replaceChars( TQString inputQstring, bool isRegex )
{
    const int inStrLen = inputQstring.length();
    TQString outString = TQString::fromLatin1( "" );
    const TQChar *char16 = inputQstring.unicode();

    bool backSlashed         = false; // \_
    bool startedCharClass    = false; // Previous character was starting '[' of character class
    bool inCharacterClass    = false; // [___]
    bool inPosixBracketExpr  = false; // [:___:]
    bool closingPosixBracket = false; // Current ']' is the second half of the ":]" terminator
#ifdef TSM_PCRE2
    bool quoteLiteral = false; // \Q___\E
    bool inBraceExpr  = false; // \c{___} where 'c' is any of: 'x' 'o' 'p' 'P' 'N' 'g'
    bool inDirective  = false; // (*___)
    bool inGroupName  = false; // (?<___>
#endif // TSM_PCRE2

    wchar_t currChar = 0;
    wchar_t prevChar = 0;
    wchar_t nextChar = 0;

    // NOTE: the loop's increment expression appends currChar to outString, and
    // it also runs after every 'continue'. A 'continue' therefore means "copy
    // this character as-is"; falling through to the end of the body means
    // "copy the converted character".
    for ( int i = 0 ; i < inStrLen ; i++ , outString += TQChar(currChar) )
    {
        prevChar = currChar;
        currChar = char16[i].unicode();

        if ( isRegex ) {
            nextChar = ( i < ( inStrLen - 1 ) ) ? char16[i+1].unicode() : 0;

            // Don't convert backSlashed characters. This test must come before
            // the test for a backslash so that the second half of "\\" ends
            // the escape instead of starting a new one.
            if ( backSlashed ) {
#ifdef TSM_PCRE2
                switch (currChar) {
                    case 'Q' : quoteLiteral = true;  break; // Entering literal \Q___\E
                    case 'E' : quoteLiteral = false; break; // Leaving literal \Q___\E
                    case 'x' : // Entering hexadecimal character code \x{___} ?
                    case 'o' : // Entering octal character code \o{___} ?
                    case 'N' : // Entering Unicode codepoint specification \N{U+___} ?
                    case 'P' : // Entering (negated) Unicode property specification \P{} ?
                    case 'p' : // Entering Unicode property specification \p{} ?
                    case 'g' : // Entering a named backreference \g{___} ?
                        if ( nextChar == L'{' ) inBraceExpr = true;
                        break;
                }
#endif // TSM_PCRE2
                backSlashed = false;
                // An escape directly after '[' is the first member of the
                // class, so a following ']' must be able to close the class.
                startedCharClass = false;
                continue;
            }

            if ( currChar == L'\\' ) {
                backSlashed = true;
                continue;
            }

#ifdef TSM_PCRE2
            if ( quoteLiteral )
                continue;

            if ( inBraceExpr ) {
                // Is it time to leave brace expression {___} ?
                if ( nextChar == L'}' ) inBraceExpr = false;
                continue;
            }
#endif // TSM_PCRE2

            if ( startedCharClass ) {
                switch (currChar) {
                    case L'^' : // Negated character class, proceed to next character
                        continue; // Bypass converting this special character
                    case L']' : // Treat as part of character class, not as a closure
                    case L':' : // Treat as part of character class, not as start of bracket expression
                        startedCharClass = false;
                        continue; // Bypass converting these special characters
                }
                startedCharClass = false;
            } // startedCharClass

            if ( inCharacterClass ) {
                if ( inPosixBracketExpr ) {
                    // Is it time to leave POSIX bracket expression [:___:] ?
                    if ( currChar == L':' && nextChar == L']' ) {
                        inPosixBracketExpr  = false;
                        closingPosixBracket = true;
                    }
                    continue;
                } // inPosixBracketExpr

                if ( closingPosixBracket ) {
                    // This ']' belongs to the ":]" terminator of the POSIX
                    // bracket expression; it does not close the character class.
                    closingPosixBracket = false;
                    continue;
                }

                if ( prevChar == L'[' && currChar == L':' ) {
                    // Enter POSIX bracket expression [:___:]
                    inPosixBracketExpr = true;
                    continue;
                }

                if ( currChar == L']' ) {
                    // Leaving character class [___]
                    inCharacterClass = false;
                    continue;
                }
            } // inCharacterClass
            else { // ! inCharacterClass
                switch (currChar) {
                    case '[' :
                        // Entering a character class [___]
                        startedCharClass = true;
                        inCharacterClass = true;
                        continue;
#ifdef TSM_PCRE2
                    case '*' :
                        if ( prevChar != '(' ) continue;
                        // Entering a PCRE2 directive (*___)
                        inDirective = true;
                        continue;
                    case '?' :
                        if ( prevChar != '(' ) continue;
                        if ( nextChar != '<' ) continue;
                        // Entering PCRE2 group name (?<___>)
                        inGroupName = true;
                        continue;
#endif // TSM_PCRE2
                }
#ifdef TSM_PCRE2
                if ( inDirective ) {
                    // Is it time to leave PCRE2 directive (*___) ?
                    if ( currChar == ')' ) inDirective = false;
                    continue;
                }
                if ( inGroupName ) {
                    // Is it time to leave PCRE2 group name (?<___>) ?
                    if ( currChar == '>' ) inGroupName = false;
                    continue;
                }
#endif // TSM_PCRE2
            } // ! inCharacterClass

            // If we reach here, currChar will be converted
        } // isRegex

#ifdef TEQUIVCHARS_HASH_LOOKUP
        try {
            currChar = p->EquivalentsTable.at(currChar);
        }
        catch(const std::exception&) {
            // No need to "handle" exception, currChar was not changed
        }
#else // TEQUIVCHARS_HASH_LOOKUP => Use binary search
        currChar = p->lookup(currChar);
#endif // TEQUIVCHARS_HASH_LOOKUP
    }

    return outString;
}
/*
This is the original implementation of the replaceChars function. It works
by converting inputString (UTF16) to a "multibyte" (typically UTF8) string,
then to an array of "wide" (32-bit) characters which is used for character
replacement. The result is converted back into a TQString and returned.
Despite the multiple conversions, this function is almost as fast as
the one that replaced it.
*/
/**
 * Multibyte / wide-character variant of replaceChars().
 *
 * The input is converted to the locale's multibyte encoding, from there to a
 * wchar_t array on which the replacement is done in place, and finally back
 * to a TQString. The regex handling (isRegex == true) skips the same parts of
 * the expression as replaceChars(): escaped characters, character class
 * delimiters, POSIX bracket expressions [:___:] and, when TSM_PCRE2 is
 * defined, the PCRE2 specific constructs.
 *
 * Side effect: calls setlocale( LC_CTYPE, "" ).
 *
 * If any of the multibyte <-> wide conversions fails or would be truncated,
 * the input string is returned unchanged.
 *
 * @param inputQstring  text (or regular expression) to convert
 * @param isRegex       true if inputQstring must be treated as a regex
 * @return the converted copy, or inputQstring itself on conversion failure
 */
TQString TEquivChars::replaceCharsMB( TQString inputQstring, bool isRegex )
{
    setlocale( LC_CTYPE, "" );

    // Keep the TQCString alive for as long as its data() pointer is in use;
    // taking data() from the temporary would leave the pointer dangling.
    const TQCString inputMB = inputQstring.local8Bit();
    const char *inputMBstring = inputMB.data();
    if ( inputMBstring == nullptr )
        return inputQstring; // Nothing to convert
    //-Debug: printf("Size of inputMBstring (metric 1): %d\n", inputQstring.length() );
    //-Debug: printf("Size of inputMBstring (metric 2): %ld\n", strlen( inputMBstring ) );

    //--- Allocate "wide" work string (zero-filled, so it is always terminated)
    const size_t numCharsMax = inputQstring.length() + 2; // Start with a hopefully safe overestimate
    const size_t numBytesMax = sizeof(wchar_t) * numCharsMax;
    std::vector<wchar_t> wideBuffer( numCharsMax, L'\0' );
    wchar_t *tempWstring = wideBuffer.data();

    //--- Load work string with "wide" characters from temporary input
    const char *p_inputMBstring = inputMBstring; // mbsrtowcs() advances this pointer
    mbstate_t mbs = mbstate_t();
    const size_t numConverted =
        mbsrtowcs( tempWstring, &p_inputMBstring, numCharsMax - 1, &mbs );
    // mbsrtowcs() sets the source pointer to NULL only when it has consumed
    // the terminating null; anything else means an error or a truncation.
    if ( numConverted == static_cast<size_t>(-1) || p_inputMBstring != nullptr )
        return inputQstring;

#ifdef verbose
    size_t szWidechar = sizeof ( wchar_t ) ;
    size_t numChars = wcslen(tempWstring);
    size_t numBytes = numChars * szWidechar + 1;
    fwprintf( stderr,
        L"\nTemporary wide string contains %zu wide characters (using %zu of %zu allocated bytes)\n",
        numChars, numBytes, numBytesMax
    );
    fwprintf( stderr, L"Wide string before: \t%ls\n", tempWstring );
#endif // verbose

    bool backSlashed         = false; // \_
    bool startedCharClass    = false; // Previous character was starting '[' of character class
    bool inCharacterClass    = false; // [___]
    bool inPosixBracketExpr  = false; // [:___:]
    bool closingPosixBracket = false; // Current ']' is the second half of the ":]" terminator
#ifdef TSM_PCRE2
    bool quoteLiteral = false; // \Q___\E
    bool inBraceExpr  = false; // \c{___} where 'c' is any of: 'x' 'o' 'p' 'P' 'N' 'g'
    bool inDirective  = false; // (*___)
    bool inGroupName  = false; // (?<___>
#endif // TSM_PCRE2

    wchar_t  prevChar = 0;
    wchar_t *currChar = tempWstring;
    wchar_t *nextChar = tempWstring + 1; // At the last character this points at the terminator

    // A 'continue' leaves *currChar untouched; falling through to the end of
    // the loop body replaces it with its equivalent.
    for ( ; *currChar != L'\0' ; prevChar = *currChar, currChar++, nextChar++ )
    {
        if ( isRegex ) {
            // Don't convert backSlashed characters. This test must come before
            // the test for a backslash so that the second half of "\\" ends
            // the escape instead of starting a new one.
            if ( backSlashed ) {
#ifdef TSM_PCRE2
                switch (*currChar) {
                    case 'Q' : quoteLiteral = true;  break; // Entering literal \Q___\E
                    case 'E' : quoteLiteral = false; break; // Leaving literal \Q___\E
                    case 'x' : // Entering hexadecimal character code \x{___} ?
                    case 'o' : // Entering octal character code \o{___} ?
                    case 'N' : // Entering Unicode codepoint specification \N{U+___} ?
                    case 'P' : // Entering (negated) Unicode property specification \P{} ?
                    case 'p' : // Entering Unicode property specification \p{} ?
                    case 'g' : // Entering a named backreference \g{___} ?
                        if ( *nextChar == L'{' ) inBraceExpr = true;
                        break;
                }
#endif // TSM_PCRE2
                backSlashed = false;
                // An escape directly after '[' is the first member of the
                // class, so a following ']' must be able to close the class.
                startedCharClass = false;
                continue;
            }

            if ( *currChar == L'\\' ) {
                backSlashed = true;
                continue;
            }

#ifdef TSM_PCRE2
            if ( quoteLiteral )
                continue;

            if ( inBraceExpr ) {
                // Is it time to leave brace expression {___} ?
                if ( *nextChar == L'}' ) inBraceExpr = false;
                continue;
            }
#endif // TSM_PCRE2

            if ( startedCharClass ) {
                switch (*currChar) {
                    case L'^' : // Negated character class, proceed to next character
                        continue; // Bypass converting this special character
                    case L']' : // Treat as part of character class, not as a closure
                    case L':' : // Treat as part of character class, not as start of bracket expression
                        startedCharClass = false;
                        continue; // Bypass converting these special characters
                }
                startedCharClass = false;
            } // startedCharClass

            if ( inCharacterClass ) {
                if ( inPosixBracketExpr ) {
                    // Is it time to leave POSIX bracket expression [:___:] ?
                    if ( *currChar == L':' && *nextChar == L']' ) {
                        inPosixBracketExpr  = false;
                        closingPosixBracket = true;
                    }
                    continue;
                } // inPosixBracketExpr

                if ( closingPosixBracket ) {
                    // This ']' belongs to the ":]" terminator of the POSIX
                    // bracket expression; it does not close the character class.
                    closingPosixBracket = false;
                    continue;
                }

                if ( prevChar == L'[' && *currChar == L':' ) {
                    // Enter POSIX bracket expression [:___:]
                    inPosixBracketExpr = true;
                    continue;
                }

                if ( *currChar == L']' ) {
                    // Leaving character class [___]
                    inCharacterClass = false;
                    continue;
                }
            } // inCharacterClass
            else { // ! inCharacterClass
                switch (*currChar) {
                    case '[' :
                        // Entering a character class [___]
                        startedCharClass = true;
                        inCharacterClass = true;
                        continue;
#ifdef TSM_PCRE2
                    case '*' :
                        if ( prevChar != '(' ) continue;
                        // Entering a PCRE2 directive (*___)
                        inDirective = true;
                        continue;
                    case '?' :
                        if ( prevChar != '(' ) continue;
                        if ( *nextChar != '<' ) continue;
                        // Entering PCRE2 group name (?<___>)
                        inGroupName = true;
                        continue;
#endif // TSM_PCRE2
                }
#ifdef TSM_PCRE2
                if ( inDirective ) {
                    // Is it time to leave PCRE2 directive (*___) ?
                    if ( *currChar == ')' ) inDirective = false;
                    continue;
                }
                if ( inGroupName ) {
                    // Is it time to leave PCRE2 group name (?<___>) ?
                    if ( *currChar == '>' ) inGroupName = false;
                    continue;
                }
#endif // TSM_PCRE2
            } // ! inCharacterClass

            // If we reach here, *currChar will be converted
        } // isRegex

#ifdef TEQUIVCHARS_HASH_LOOKUP
        try {
            *currChar = p->EquivalentsTable.at(*currChar);
        }
        catch(const std::exception&) {
            // No need to "handle" exception, *currChar was not changed
        }
#else // TEQUIVCHARS_HASH_LOOKUP => Use binary search
        *currChar = p->lookup(*currChar);
#endif // TEQUIVCHARS_HASH_LOOKUP
    }

#ifdef verbose
    fwprintf( stderr, L"Wide string after: \t%ls\n\n", tempWstring );
#endif // verbose

    //--- Convert the work string back to a multibyte string. The buffer has one
    //--- spare, zero-filled byte so that the result is always terminated.
    std::vector<char> mbBuffer( numBytesMax + 1, '\0' );
    const wchar_t *p_tempWstring = tempWstring; // wcsrtombs() advances this pointer
    mbs = mbstate_t(); // Restart from the initial conversion state
    const size_t numBytesOut =
        wcsrtombs( mbBuffer.data(), &p_tempWstring, numBytesMax, &mbs );
    if ( numBytesOut == static_cast<size_t>(-1) || p_tempWstring != nullptr )
        return inputQstring;

    return TQString::fromLocal8Bit( mbBuffer.data() );
}

@ -0,0 +1,38 @@
#ifndef TEQUIVCHARS_H
#define TEQUIVCHARS_H
#include "tdelibs_export.h"
#include <tqstring.h>
/**
* Class representing a mapping of each alphanumeric character to its "collating
* equivalent" as defined by the Default Unicode Collation Element Table (DUCET).
* The mapping is limited to single-codepoint characters <= U+FFFF.
*/
class TDECORE_EXPORT TEquivChars
{
public:
TEquivChars();
~TEquivChars();
/**
@return copy of @param inputString modified such that each alphanumeric
character is replaced with it's collating character equivalent. If the
value @param isRegex is true, the input string is treated as a regular
expression and the alphabetical characters inside Posix bracket [::]
expressions are left as-is
*/
TQString replaceChars( TQString inputString, bool isRegex = false );
/**
Alternative implementation of replaceChars function that uses some
"multibyte string" / "wide character" functions defined in wchar.h.
*/
TQString replaceCharsMB( TQString inputString, bool isRegex = false );
private:
class TEquivChars_Private *p;
};
#endif // TEQUIVCHARS_H
Loading…
Cancel
Save