Final commit to branch issue/270/tdelibs-V3. It includes the following files that were mistakenly omitted in that commit: tdecore/CMakeLists.txt, tdecore/README.tdestringmatcher, tdecore/tdeglobal.cpp, tdecore/tdeglobal.h, tdecore/tdestringmatcher.cpp, tdecore/tdestringmatcher.h, tdeio/tdeio/tdefileitem.cpp, tdeio/tdeio/tdefileitem.h. It also includes updates to the following files, some of which are based on recent feedback from @MicheleC: tdecore/tequivchars-mapping.h, tdecore/tequivchars.cpp, tdecore/tequivchars.h. Signed-off-by: Vincent Reher <tde@4reher.org>
issue/270/tdelibs-V4
parent
4c0dae60b2
commit
a39403fb8b
@ -0,0 +1,649 @@
|
||||
#include "tdestringmatcher.h"
|
||||
#include "tequivchars.h"
|
||||
|
||||
#include <tdeglobal.h>
|
||||
#include <tqregexp.h>
|
||||
#include <kdebug.h>
|
||||
|
||||
#include <features.h>
|
||||
#ifdef __GLIBC__
|
||||
#include <fnmatch.h>
|
||||
#pragma message "TSM using GLIBC fnmatch() for wildcard matching"
|
||||
#endif
|
||||
|
||||
//================================================================================================
|
||||
|
||||
namespace TSM {
|
||||
|
||||
class AuxData
|
||||
{
|
||||
public:
|
||||
AuxData();
|
||||
TQString patternConverted; // Pattern converted from original (e.g ANCHandling::EQUIVALENCE)
|
||||
TQRegExp* matchEngine; // Used when PatternType::REGEX
|
||||
#ifdef __GLIBC__
|
||||
int fnmatchFlags; // Used by fnmatch() when PatternType::WILDCARD
|
||||
#endif
|
||||
bool isCaseSensitive; // PatternType::SUBSTRING
|
||||
};
|
||||
|
||||
AuxData::AuxData()
|
||||
{
|
||||
isCaseSensitive = true;
|
||||
#ifdef __GLIBC__
|
||||
fnmatchFlags = FNM_EXTMATCH; // Bash shell option 'extglob'
|
||||
#endif
|
||||
matchEngine = nullptr;
|
||||
patternConverted = "";
|
||||
}
|
||||
|
||||
} // End of namespace TSM
|
||||
|
||||
//================================================================================================
|
||||
|
||||
using namespace TSM;
|
||||
|
||||
typedef TQValueVector<AuxData> AuxDataList;
|
||||
|
||||
class TDEStringMatcher::TDEStringMatcherPrivate {
|
||||
public:
|
||||
|
||||
// Properties that may be set / accessed through the TSM interface
|
||||
TQString m_matchSpecString;
|
||||
MatchSpecList m_matchSpecList;
|
||||
|
||||
// Properties that are internal implementation only
|
||||
AuxDataList m_auxData;
|
||||
void clearAll();
|
||||
};
|
||||
|
||||
void TDEStringMatcher::TDEStringMatcherPrivate::clearAll()
|
||||
{
|
||||
m_matchSpecString = "";
|
||||
m_matchSpecList.clear();
|
||||
for ( size_t index = 0 ; index < m_auxData.count() ; index++ ) {
|
||||
if ( m_auxData[index].matchEngine != nullptr ) {
|
||||
TSMTRACE << "Freeing match engine " << m_auxData[index].matchEngine << endl;
|
||||
delete m_auxData[index].matchEngine;
|
||||
}
|
||||
}
|
||||
m_auxData.clear();
|
||||
}
|
||||
|
||||
//================================================================================================
|
||||
|
||||
/**
 * Create a matcher that holds no match specifications yet.
 */
TDEStringMatcher::TDEStringMatcher()
    : d( new TDEStringMatcherPrivate )
{
    TSMTRACE << "TSM::TDEStringMatcher(): New instance created: " << this << endl;
}

/**
 * Release the regex engines owned by the private data, then the private
 * data itself.
 */
TDEStringMatcher::~TDEStringMatcher()
{
    d->clearAll();  // frees the per-pattern regex engines
    delete d;
    TSMTRACE << "TSM::~TDEStringMatcher(): Instance destroyed: " << this << endl;
}
|
||||
|
||||
|
||||
//================================================================================================
|
||||
// Match specification output functions
|
||||
//================================================================================================
|
||||
|
||||
const TQString TDEStringMatcher::getMatchSpecString() const
|
||||
{
|
||||
return d->m_matchSpecString;
|
||||
}
|
||||
|
||||
const MatchSpecList TDEStringMatcher::getMatchSpecs() const
|
||||
{
|
||||
return d->m_matchSpecList;
|
||||
}
|
||||
|
||||
|
||||
//================================================================================================
|
||||
// Match specification input functions
|
||||
//================================================================================================
|
||||
|
||||
/**
 * Replace the current match specifications with @p newMatchSpecList.
 *
 * Every specification is validated first. The new state is assembled in a
 * scratch object and only copied over the live state once all entries have
 * been accepted, so a rejected list leaves the matcher unchanged.
 *
 * @return true if all specifications were accepted (patternsChanged() is
 *         emitted), false if any of them was rejected.
 */
bool TDEStringMatcher::setMatchSpecs( MatchSpecList newMatchSpecList )
{
    TDEStringMatcherPrivate staged;  // becomes *d on success
    TQStringList encodedParts;       // alternating option string / pattern

    TSMTRACE << "TSM::setPatterns(): validating match specification list" << endl;

    for ( MatchSpec spec : newMatchSpecList ) {

        // A pattern must be non-empty and must not contain the divider that
        // separates the components of the encoded specification string.
        if ( spec.pattern.isEmpty() ) {
            TSMTRACE << " Error: empty pattern!" << endl;
            staged.clearAll();
            return false;
        }
        if ( spec.pattern.find( TQChar(PatterStringDivider) ) >= 0 ) {
            TSMTRACE << " Error: pattern contains reserved separator character" << endl;
            staged.clearAll();
            return false;
        }

        AuxData aux;
        TQString optionCodes;
        aux.patternConverted = spec.pattern;

        // --- Pattern type -------------------------------------------------
        switch ( spec.patternType ) {
            case PatternType::WILDCARD :
                optionCodes += TQChar('w');
#ifndef __GLIBC__
                // Without fnmatch() the wildcard is matched as a regex
                aux.patternConverted = wildcardToRegex( aux.patternConverted );
                TSMTRACE << " Converted wildcard expression '" << spec.pattern << "' to regex '" << aux.patternConverted << "'" << endl;
#endif
                break;
            case PatternType::REGEX :
                optionCodes += TQChar('r');
                break;
            case PatternType::SUBSTRING :
                optionCodes += TQChar('s');
                break;
            default:
                TSMTRACE << " Error: pattern type out of range" << endl;
                staged.clearAll();
                return false;
        }

        // --- Alphanumeric character handling --------------------------------
        const TQString unmapped = aux.patternConverted;
        switch ( spec.ancHandling ) {
            case ANCHandling::CASE_SENSITIVE :
                optionCodes += TQChar('c');
                aux.isCaseSensitive = true;
                break;
            case ANCHandling::CASE_INSENSITIVE :
                optionCodes += TQChar('i');
                aux.isCaseSensitive = false;
#ifdef __GLIBC__
                aux.fnmatchFlags |= FNM_CASEFOLD;
#endif
                break;
            case ANCHandling::EQUIVALENCE :
                optionCodes += TQChar('e');
                aux.isCaseSensitive = true;
                aux.patternConverted = TDEGlobal::equivChars()->replaceChars( aux.patternConverted, true );
                TSMTRACE << " Converted match pattern '" << unmapped << "' to equivalent '" << aux.patternConverted << "'" << endl;
                break;
            default:
                TSMTRACE << " Error: alphabetic character handling specification out of range" << endl;
                staged.clearAll();
                return false;
        }

        // --- Expected outcome ---------------------------------------------
        optionCodes += TQChar( spec.expectMatch ? '=' : '!' );

        // --- Pattern validity ---------------------------------------------
        // Regex patterns always need an engine; wildcards need one only when
        // fnmatch() is unavailable. Substring patterns need no check.
        bool wantsRegexEngine = ( spec.patternType == PatternType::REGEX );

        if ( spec.patternType == PatternType::WILDCARD ) {
#ifdef __GLIBC__
            // Let fnmatch() judge the syntax by matching the pattern against
            // itself; anything but "match" / "no match" is an error.
            const int status = fnmatch(
                aux.patternConverted.local8Bit().data(),
                aux.patternConverted.local8Bit().data(),
                aux.fnmatchFlags
            );
            if ( status != 0 && status != FNM_NOMATCH ) {
                TSMTRACE << " Error: invalid wildcard syntax" << endl;
                staged.clearAll();
                return false;
            }
#else
            wantsRegexEngine = true;
#endif
        }

        if ( wantsRegexEngine ) {
            TQRegExp candidate;
            candidate.setPattern( aux.patternConverted );
            candidate.setCaseSensitive( aux.isCaseSensitive );
            if ( !candidate.isValid() ) {
                TSMTRACE << " Error: invalid regex syntax'" << endl;
                staged.clearAll();
                return false;
            }
            aux.matchEngine = new TQRegExp( candidate );
            TSMTRACE << "AuxData: Allocated regex engine for matching '" << aux.matchEngine->pattern() << "'" << endl;
        }

        // This specification is good: record its encoded form and aux data
        encodedParts.append( optionCodes );
        encodedParts.append( spec.pattern );
        staged.m_auxData.append( aux );
    }

    // Every specification was accepted: switch over to the new state
    staged.m_matchSpecList = newMatchSpecList;
    staged.m_matchSpecString = encodedParts.join( TQChar(PatterStringDivider) );
    d->clearAll();
    *d = staged;

    emit patternsChanged();
    TSMTRACE << "TSM::setPatterns(): Patterns were successfully regenerated from list" << endl << endl;
    return true;
}
|
||||
|
||||
//=================================================================================================
|
||||
|
||||
/**
 * Replace the current match specifications with those encoded in
 * @p newMatchSpecString.
 *
 * The string consists of components separated by PatterStringDivider,
 * alternating between an option string and a pattern. Option characters are
 * 'r'/'w'/'s' (regex, wildcard, substring), 'c'/'i'/'e' (case sensitive,
 * case insensitive, equivalence) and '='/'!' (expect match / non-match).
 * Options carry over from one pattern to the next unless overridden.
 *
 * The new state is assembled in a scratch object and only copied over the
 * live state once every component has been accepted.
 *
 * @return true if the string equals the current one (nothing happens), is
 *         empty (all specifications are cleared) or was fully accepted;
 *         false if any component was rejected, in which case the matcher
 *         is left unchanged.
 */
bool TDEStringMatcher::setMatchSpecs( TQString newMatchSpecString )
{
    if ( newMatchSpecString == d->m_matchSpecString )
        return true;

    TDEStringMatcherPrivate workArea;

    MatchSpec matchSpec = {
        PatternType::DEFAULT,
        ANCHandling::DEFAULT,
        true, // seeking matches, not non-matches
        ""
    };

    TSMTRACE << "TSM::setPatterns: Proposed match specification string: <" << newMatchSpecString << ">" << endl;

    if ( newMatchSpecString.isEmpty() ) {
        TSMTRACE << " Empty pattern string => match specifications will be cleared" << endl;
        // clearAll() also releases the auxiliary data and regex engines that
        // belonged to the specifications being dropped.
        d->clearAll();
        emit patternsChanged();
        return true;
    }

    TQStringList newMatchSpecs = TQStringList::split( PatterStringDivider, newMatchSpecString, true );

    if ( newMatchSpecs.count() % 2 != 0 ) {
        TSMTRACE << " Error: match specification string must contain an even number of components" << endl;
        return false;
    }

    bool processingOptionString = true; // expected format: option string , pattern string, ...

    for ( TQString &specification : newMatchSpecs ) {

        if ( processingOptionString ) {

            specification = specification.lower();
            TSMTRACE << " Processing match option string: '" << specification << "'" << endl;

            const int optionCount = static_cast<int>( specification.length() );
            for ( int i = 0 ; i < optionCount ; i++ ) {

                const TQChar optionChar = specification[i];

                switch ( optionChar.unicode() ) {
                    case 'r' : matchSpec.patternType = PatternType::REGEX ; break;
                    case 'w' : matchSpec.patternType = PatternType::WILDCARD ; break;
                    case 's' : matchSpec.patternType = PatternType::SUBSTRING ; break;
                    case 'c' : matchSpec.ancHandling = ANCHandling::CASE_SENSITIVE ; break;
                    case 'i' : matchSpec.ancHandling = ANCHandling::CASE_INSENSITIVE; break;
                    case 'e' : matchSpec.ancHandling = ANCHandling::EQUIVALENCE ; break;
                    case '=' : matchSpec.expectMatch = true ; break;
                    case '!' : matchSpec.expectMatch = false ; break;
                    default:
                        // We reserve ALL other possible option characters for future use!
                        TSMTRACE << " Error: invalid option character" << endl;
                        workArea.clearAll();
                        return false;
                }
            }

            processingOptionString = false; // next spec should be a pattern string
            continue;
        }

        // ----- Pattern component -----

        TSMTRACE << " Processing match pattern string: '" << specification << "'" << endl;

        if ( specification.isEmpty() ) {
            TSMTRACE << " Error: empty pattern!" << endl;
            workArea.clearAll();
            return false;
        }

        AuxData auxWork;

        // Validate / process PatternType

        auxWork.patternConverted = specification;
        switch ( matchSpec.patternType ) {
            case PatternType::WILDCARD :
#ifndef __GLIBC__
                // Without fnmatch() the wildcard is matched as a regex
                auxWork.patternConverted = wildcardToRegex( specification );
                TSMTRACE << " Converted wildcard expression '" << specification << "' to regex '" << auxWork.patternConverted << "'" << endl;
#endif
                break;
            case PatternType::REGEX :
            case PatternType::SUBSTRING :
                break;
            default :
                // This should never arise since the content of this field was set within this function
                kdWarning() << "Error while processing '" << specification
                            << "' pattern type out of range: " << static_cast<uchar>( matchSpec.patternType )
                            << endl;
                workArea.clearAll();
                return false;
        }

        // Validate / process ANCHandling

        const TQString before = auxWork.patternConverted;
        switch ( matchSpec.ancHandling ) {
            case ANCHandling::CASE_SENSITIVE :
                auxWork.isCaseSensitive = true;
                break;
            case ANCHandling::CASE_INSENSITIVE :
                auxWork.isCaseSensitive = false;
#ifdef __GLIBC__
                auxWork.fnmatchFlags |= FNM_CASEFOLD;
#endif
                break;
            case ANCHandling::EQUIVALENCE :
                auxWork.isCaseSensitive = true;
                auxWork.patternConverted = TDEGlobal::equivChars()->replaceChars( auxWork.patternConverted, true );
                TSMTRACE << " Converted match pattern '" << before << "' to equivalent '" << auxWork.patternConverted << "'" << endl;
                break;
            default:
                // Like the pattern type, this value was set within this
                // function, so reaching here indicates a programming error.
                kdWarning() << "Error while processing '" << specification
                            << "' alphabetic character handling specification out of range: " << static_cast<uchar>( matchSpec.ancHandling )
                            << endl;
                workArea.clearAll();
                return false;
        }

        // Test validity of pattern

        switch ( matchSpec.patternType ) {

            case PatternType::WILDCARD :
#ifdef __GLIBC__ // Test wildcard expression using a subject matter expert
            {
                // Matching the pattern against itself: the outcome is
                // irrelevant, only an error status signals bad syntax.
                const TQCString encodedPattern = auxWork.patternConverted.local8Bit();
                const int result = fnmatch( encodedPattern.data(), encodedPattern.data(), auxWork.fnmatchFlags );
                if ( result != 0 && result != FNM_NOMATCH ) {
                    TSMTRACE << " Error: invalid wildcard syntax" << endl;
                    workArea.clearAll();
                    return false;
                }
                break;
            }
#endif
            // Without glibc the wildcard was converted to a regex: fall through

            case PatternType::REGEX :
            {
                // Prepare regex
                TQRegExp rxWork;
                rxWork.setPattern( auxWork.patternConverted );
                rxWork.setCaseSensitive( auxWork.isCaseSensitive );
                // Test regex
                if ( !rxWork.isValid() ) {
                    TSMTRACE << " Error: invalid regex syntax" << endl;
                    workArea.clearAll();
                    return false;
                }
                auxWork.matchEngine = new TQRegExp( rxWork );
                TSMTRACE << " AuxData: Allocated regex engine " << auxWork.matchEngine << "for pattern: " << auxWork.matchEngine->pattern() << endl;
                break;
            }

            case PatternType::SUBSTRING :
                break; // any non-empty string is a valid substring pattern
        }

        matchSpec.pattern = specification;
        workArea.m_matchSpecList.push_back( matchSpec );
        workArea.m_auxData.append( auxWork );

        processingOptionString = true; // next spec should be an option string
    }

    // Every component was accepted: switch over to the new state
    workArea.m_matchSpecString = newMatchSpecString;
    d->clearAll();
    *d = workArea;
    TSMTRACE << " Final patternString: '" << d->m_matchSpecString << "'" << endl;
    TSMTRACE << " Number of match patterns in list: '" << d->m_matchSpecList.count() << "'" << endl;

    emit patternsChanged();
    TSMTRACE << "TSM::setPatterns(): Patterns were successfully regenerated from string" << endl << endl;
    return true;
}
|
||||
|
||||
//================================================================================================
|
||||
// Match functions
|
||||
//================================================================================================
|
||||
|
||||
bool TDEStringMatcher::matchAny( const TQString& stringToMatch ) const
|
||||
{
|
||||
/* DEBUG
|
||||
TSMTRACE << "TSM:matchAny(): Attempting to match string '" << stringToMatch << "' against stored patterns" << endl;
|
||||
if ( d->m_matchSpecList.isEmpty() ) {
|
||||
//-Debug: TSMTRACE << "Match failed on empty pattern list!" << endl;
|
||||
return false;
|
||||
}
|
||||
*/
|
||||
TQString equivalentString;
|
||||
|
||||
for ( size_t index = 0 ; index < d->m_matchSpecList.count() ; index++ )
|
||||
{
|
||||
TQString matchWhat = stringToMatch;
|
||||
TQString matchThis = d->m_auxData[index].patternConverted;
|
||||
|
||||
if ( d->m_matchSpecList[index].ancHandling == ANCHandling::EQUIVALENCE ) {
|
||||
if ( equivalentString.isEmpty() ) {
|
||||
equivalentString = TDEGlobal::equivChars()->replaceChars( stringToMatch, false ) ;
|
||||
}
|
||||
matchWhat = equivalentString;
|
||||
}
|
||||
|
||||
bool matchFound = false;
|
||||
switch ( d->m_matchSpecList[index].patternType ) {
|
||||
case PatternType::WILDCARD :
|
||||
#ifdef __GLIBC__
|
||||
matchFound = ( fnmatch(
|
||||
matchThis.local8Bit().data(),
|
||||
matchWhat.local8Bit().data(),
|
||||
d->m_auxData[index].fnmatchFlags
|
||||
) == 0 );
|
||||
break;
|
||||
#endif
|
||||
case PatternType::REGEX :
|
||||
matchFound = ( d->m_auxData[index].matchEngine->search( matchWhat ) >= 0 );
|
||||
break;
|
||||
case PatternType::SUBSTRING :
|
||||
matchFound = ( matchWhat.find( matchThis, 0, d->m_auxData[index].isCaseSensitive ) >= 0 );
|
||||
break;
|
||||
}
|
||||
|
||||
if ( matchFound == d->m_matchSpecList[index].expectMatch ) {
|
||||
TSMTRACE << " Success! match of pattern '" << matchThis << "' against '" << matchWhat << "' turned out as expected" << endl;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
TSMTRACE << " Match failed, there were no pattern matches against '" << stringToMatch << "' that turned out as expected" << endl;
|
||||
return false ;
|
||||
}
|
||||
|
||||
bool TDEStringMatcher::matchAll( const TQString& stringToMatch ) const
|
||||
{
|
||||
TSMTRACE << "TSM:matchAll(): Attempting to match string '" << stringToMatch << "' against stored patterns" << endl;
|
||||
if ( d->m_matchSpecList.isEmpty() ) {
|
||||
//-Debug: TSMTRACE << "Match failed on empty pattern list!" << endl;
|
||||
return false;
|
||||
}
|
||||
|
||||
TQString equivalentString;
|
||||
|
||||
for ( size_t index = 0 ; index < d->m_matchSpecList.count() ; index++ )
|
||||
{
|
||||
TQString matchWhat = stringToMatch;
|
||||
TQString matchThis = d->m_auxData[index].patternConverted;
|
||||
|
||||
if ( d->m_matchSpecList[index].ancHandling == ANCHandling::EQUIVALENCE ) {
|
||||
if ( equivalentString.isEmpty() ) {
|
||||
equivalentString = TDEGlobal::equivChars()->replaceChars( stringToMatch, false ) ;
|
||||
}
|
||||
matchWhat = equivalentString;
|
||||
}
|
||||
|
||||
bool matchFound = false;
|
||||
switch ( d->m_matchSpecList[index].patternType ) {
|
||||
case PatternType::WILDCARD :
|
||||
#ifdef __GLIBC__
|
||||
matchFound = ( fnmatch(
|
||||
matchThis.local8Bit().data(),
|
||||
matchWhat.local8Bit().data(),
|
||||
d->m_auxData[index].fnmatchFlags
|
||||
) == 0 );
|
||||
break;
|
||||
#endif
|
||||
case PatternType::REGEX :
|
||||
matchFound = ( d->m_auxData[index].matchEngine->search( matchWhat ) >= 0 );
|
||||
break;
|
||||
case PatternType::SUBSTRING :
|
||||
matchFound = ( matchWhat.find( matchThis, 0, d->m_auxData[index].isCaseSensitive ) >= 0 );
|
||||
break;
|
||||
}
|
||||
|
||||
if ( matchFound != d->m_matchSpecList[index].expectMatch ) {
|
||||
TSMTRACE << " Match of pattern '" << matchThis << "' against '" << matchWhat << "' did not turn out as expected" << endl;
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
TSMTRACE << " Expected pattern matching succeeded" << endl;
|
||||
return true;
|
||||
}
|
||||
|
||||
//================================================================================================
|
||||
// Utility functions
|
||||
//================================================================================================
|
||||
|
||||
/*
|
||||
The following code is a modified copy of that found in tqt3/src/tools/qregexp.cpp.
|
||||
We export this as utility function for applications that wish to convert a basic
|
||||
wildcard expression into a basic regular expression. TSM will not use this unless
|
||||
GLIBC fnmatch() is not available.
|
||||
*/
|
||||
// Convert a basic wildcard (glob) expression into an anchored regular
// expression:
//   '*'  becomes ".*" and '?' becomes "."
//   regex metacharacters outside brackets are backslash-escaped
//   "[!...]" becomes "[^...]" ('!' is the POSIX negation character), while a
//   leading '^' inside brackets is escaped so that it stays a literal
//   a ']' directly after the opening bracket (or after the negation) is kept
//   as a member of the bracket expression rather than closing it
// The result is wrapped in '^' ... '$' because a wildcard pattern must match
// the entire string. Extended glob (extglob) syntax is not supported.
TQString TDEStringMatcher::wildcardToRegex( const TQString& wildcardPattern )
{
    const int wclen = wildcardPattern.length();
    const TQChar *wc = wildcardPattern.unicode();
    TQString rx = TQString::fromLatin1( "" );
    int i = 0;

    while ( i < wclen ) {
        const TQChar c = wc[i++];
        switch ( c.unicode() ) {
            case '*':
                rx += TQString::fromLatin1( ".*" );
                break;
            case '?':
                rx += TQChar( '.' );
                break;
            case '$':
            case '(':
            case ')':
            case '+':
            case '.':
            case '\\':
            case '^':
            case '{':
            case '|':
            case '}':
                rx += TQChar( '\\' );
                rx += c;
                break;
            case '[':
                rx += c;
                // Every look-ahead is bounds-checked: the pattern may end
                // right after the '['.
                if ( i < wclen ) {
                    if ( wc[i] == TQChar('!') ) {
                        rx += TQChar('^');
                        i++;
                    } else if ( wc[i] == TQChar('^') ) {
                        rx += TQChar( '\\' );
                        rx += wc[i++];
                    }
                }
                // A ']' in first position is a literal member of the set.
                // The test must look at the wildcard input (wc), not at the
                // regex output (rx), whose indices do not line up with it.
                if ( i < wclen && wc[i] == TQChar(']') )
                    rx += wc[i++];
                // Copy the rest of the set; the closing ']' is emitted by
                // the default case on the next iteration of the outer loop.
                while ( i < wclen && wc[i] != TQChar(']') ) {
                    if ( wc[i] == TQChar('\\') )
                        rx += TQChar( '\\' );
                    rx += wc[i++];
                }
                break;
            default:
                rx += c;
        }
    }

    /* Wildcard patterns must match entire string */
    return TQChar('^') + rx + TQChar('$');
    /* TBD: Add support for extglob */
}
|
||||
// Return a copy of basicString in which every character that is special in a
// regular expression ( + . ^ ( ) [ ] { } | $ ? * \ ) is preceded by a
// backslash, so that the result matches basicString literally.
//
// This is the definition of the public member declared in
// tdestringmatcher.h; it therefore carries the class qualifier instead of
// being a file-static free function.
TQString TDEStringMatcher::escapeRegexChars( const TQString& basicString )
{
    const int length = basicString.length();
    const TQChar *in = basicString.unicode();
    TQString outputString = TQString::fromLatin1( "" );

    for ( int i = 0 ; i < length ; i++ ) {
        const TQChar c = in[i];
        switch ( c.unicode() ) {
            case '+':
            case '.':
            case '^':
            case '(':
            case ')':
            case '[':
            case ']':
            case '{':
            case '}':
            case '|':
            case '$':
            case '?':
            case '*':
            case '\\':
                outputString += TQChar( '\\' );
                break;
            default:
                break;
        }
        outputString += c;
    }

    return outputString;
}
|
||||
|
||||
//================================================================================================
|
||||
|
||||
#include "tdestringmatcher.moc"
|
@ -0,0 +1,134 @@
|
||||
#ifndef TDESTRINGMATCHER_H
|
||||
#define TDESTRINGMATCHER_H
|
||||
|
||||
#include "tdelibs_export.h"
|
||||
|
||||
#include <tqobject.h>
|
||||
#include <tqvaluevector.h>
|
||||
|
||||
#define TSMTRACE kdWarning() << "<TSMTRACE> "
|
||||
|
||||
namespace TSM
|
||||
{
|
||||
/**
|
||||
* Enumeration used by the TDEStringMatcher class
|
||||
* defining types of patterns to be matched
|
||||
*/
|
||||
enum class PatternType: uchar
|
||||
{
|
||||
REGEX,
|
||||
WILDCARD,
|
||||
SUBSTRING,
|
||||
//OTHER,
|
||||
DEFAULT = REGEX
|
||||
};
|
||||
|
||||
/**
|
||||
* Enumeration used by the TDEStringMatcher class
|
||||
* defining special handling of alphanumeric characters
|
||||
*/
|
||||
enum class ANCHandling: uchar
|
||||
{
|
||||
CASE_SENSITIVE = 0, // No handling, each character distinct
|
||||
CASE_INSENSITIVE = 1, // Alphabetic case variants are same
|
||||
EQUIVALENCE = 2, // Alphanumeric equivalents are same
|
||||
DEFAULT = CASE_SENSITIVE
|
||||
};
|
||||
|
||||
/**
|
||||
* Structure used by the TDEStringMatcher class
|
||||
* representing properties of a single match specification.
|
||||
*/
|
||||
struct MatchSpec
|
||||
{
|
||||
PatternType patternType;
|
||||
ANCHandling ancHandling;
|
||||
bool expectMatch; // "matching" vs. "not matching"
|
||||
TQString pattern;
|
||||
};
|
||||
|
||||
/**
|
||||
* Container used in a TDEStringMatcher object
|
||||
* representing multiple match specifications.
|
||||
*/
|
||||
typedef TQValueVector<MatchSpec> MatchSpecList;
|
||||
|
||||
// Use horizontal tab as m_patternString separator
|
||||
inline constexpr char PatterStringDivider { '\t' };
|
||||
|
||||
} // End of namespace TSM
|
||||
|
||||
|
||||
/**
|
||||
* Generic string matcher class.
|
||||
*/
|
||||
class TDECORE_EXPORT TDEStringMatcher : public TQObject
{
    Q_OBJECT

  public:

    TDEStringMatcher();
    ~TDEStringMatcher();

    /**
     * @return the list of currently defined match specifications.
     */
    const TSM::MatchSpecList getMatchSpecs() const;

    /**
     * @return the string that encodes the currently defined match
     * specifications.
     */
    const TQString getMatchSpecString() const;

    /**
     * Generate the internal list of match specifications used for pattern
     * matching from @p newMatchSpecList.
     *
     * @return true if every specification in the list was accepted,
     * false otherwise.
     */
    bool setMatchSpecs( TSM::MatchSpecList newMatchSpecList );

    /**
     * Generate the internal list of match specifications used for pattern
     * matching from the specially encoded @p newMatchSpecString. Refer to
     * file README.tdestringmatcher in the tdelibs/tdecore source code for
     * more information on how the input string should be formatted.
     *
     * @return true if the string was accepted, false otherwise.
     */
    bool setMatchSpecs( TQString newMatchSpecString );

    /**
     * @return whether or not @p stringToMatch matches any of the current
     * match specifications.
     */
    bool matchAny( const TQString& stringToMatch ) const;

    /**
     * @return whether or not @p stringToMatch matches all of the current
     * match specifications.
     */
    bool matchAll( const TQString& stringToMatch ) const;

    /**
     * @return a basic regular expression formed by converting the basic
     * wildcard expression in @p wildcardPattern.
     */
    TQString wildcardToRegex( const TQString& wildcardPattern );

    /**
     * @return a copy of @p basicString with all special regular expression
     * characters escaped. Useful for regular expression engines that do
     * not support literal quoting (backslash-Q ... backslash-E).
     */
    TQString escapeRegexChars( const TQString& basicString );

  signals:

    /**
     * Emitted after the match specifications have been replaced or cleared.
     */
    void patternsChanged();

  private:

    class TDEStringMatcherPrivate;
    TDEStringMatcherPrivate *d;

};
|
||||
|
||||
#endif
|
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,241 @@
|
||||
// Build-time switches for this file.
//
// REGEXP_IS_PCRE2       - when defined, PCRE2-specific regex constructs are
//                         recognized and left unconverted. Explicitly
//                         disabled here; the name must be spelled exactly as
//                         in the #ifdef tests that follow.
// OPTIMIZE_ASCII_LOOKUP - when defined, ASCII characters are handled
//                         separately instead of via the lookup table.
#undef REGEXP_IS_PCRE2
#define OPTIMIZE_ASCII_LOOKUP

#ifdef REGEXP_IS_PCRE2
#pragma message "############ Assuming regular expressions are PCRE2 ############"
#endif

#ifdef OPTIMIZE_ASCII_LOOKUP
#pragma message "############ ASCII characters will be processed separately ############"
#endif
|
||||
|
||||
#include "tequivchars.h"
|
||||
|
||||
//typedef wchar_t CHAR16;
|
||||
//typedef unsigned short CHAR16;
|
||||
// 16-bit character type used by the equivalence table
using CHAR16 = TQChar;

/**
 * Private data of TEquivChars: the table of character equivalents and the
 * number of rows it contains.
 */
class TEquivChars_Private
{
  public:

    // One table row: a character and the character it is treated as.
    struct defaultCollation {
        CHAR16 character;
        CHAR16 collatesTo;
    };

    // The rest of this declaration (through the terminating ';') is
    // supplied by the included mapping file.
    const defaultCollation EquivalentsTable // terminating ';' is provided in include file
    #include "tequivchars-mapping.h"

    // Number of rows in EquivalentsTable
    uint EquivTableROWS = sizeof(EquivalentsTable) / sizeof(defaultCollation);
};
|
||||
|
||||
/**
 * Allocate the private data object.
 */
TEquivChars::TEquivChars()
{
    p = new TEquivChars_Private;
}

/**
 * Free the private data object allocated by the constructor.
 */
TEquivChars::~TEquivChars()
{
    delete p;
}
|
||||
|
||||
TQString TEquivChars::replaceChars( const TQString &inputString, bool isRegex )
|
||||
{
|
||||
int inStrLen = inputString.length();
|
||||
TQString outString = TQString::fromLatin1( "" );
|
||||
outString.reserve( inStrLen );
|
||||
const TQChar *char16 = inputString.unicode();
|
||||
|
||||
bool backSlashed = false; // \_
|
||||
bool startedCharClass = false; // Previous character was starting '[' of character class
|
||||
bool inCharacterClass = false; // [___]
|
||||
bool inPosixBracketExpr = false; // [:___:]
|
||||
#ifdef REGEXP_IS_PCRE2
|
||||
bool quoteLiteral = false; // \Q___\E
|
||||
bool inBraceExpr = false; // \c{___} where 'c' is any of: 'x' 'o' 'p' 'P' 'N' 'g'
|
||||
bool inDirective = false; // (*___)
|
||||
bool inGroupName = false; // (?<___>
|
||||
#endif // REGEXP_IS_PCRE2
|
||||
CHAR16 currChar = 0;
|
||||
CHAR16 prevChar = 0;
|
||||
CHAR16 nextChar = 0;
|
||||
|
||||
for ( int i = 0 ; i < inStrLen ; outString[i] = CHAR16(currChar), i++ ) {
|
||||
|
||||
prevChar = currChar;
|
||||
currChar = char16[i].unicode();
|
||||
|
||||
if ( isRegex ) {
|
||||
|
||||
/*
|
||||
Look for regex characters and character sequences
|
||||
that should never be converted to an equivalent.
|
||||
*/
|
||||
|
||||
if ( i < ( inStrLen - 1 ) )
|
||||
nextChar = char16[i+1].unicode();
|
||||
else
|
||||
nextChar = 0;
|
||||
|
||||
if ( currChar == '\\' ) {
|
||||
backSlashed = true;
|
||||
continue;
|
||||
}
|
||||
|
||||
// Don't convert backSlashed characters
|
||||
if ( backSlashed ) {
|
||||
#ifdef REGEXP_IS_PCRE2
|
||||
switch (currChar) {
|
||||
case 'Q' : quoteLiteral = true; break; // Entering literal \Q___\E
|
||||
case 'E' : quoteLiteral = false; break; // Leaving literal \Q___\E
|
||||
case 'N' : // Entering Unicode codepoint specification \N{U+___} ?
|
||||
case 'P' : // Entering (negated) Unicode property specification \p{} ?
|
||||
case 'p' : // Entering Unicode property specification \p{} ?
|
||||
case 'g' : // Entering a named backreference \g{___} ?
|
||||
if ( nextChar == '{' ) inBraceExpr = true;
|
||||
break;
|
||||
}
|
||||
#endif // REGEXP_IS_PCRE2
|
||||
backSlashed = false;
|
||||
continue;
|
||||
}
|
||||
|
||||
#ifdef REGEXP_IS_PCRE2
|
||||
if ( quoteLiteral )
|
||||
continue;
|
||||
|
||||
if ( inBraceExpr ) {
|
||||
// Is it time to leave brace expression {___} ?
|
||||
if ( nextChar == '}' ) inBraceExpr = true;
|
||||
continue;
|
||||
}
|
||||
#endif // REGEXP_IS_PCRE2
|
||||
|
||||
if ( startedCharClass ) {
|
||||
switch (currChar) {
|
||||
case '^' : // Negated character class, proceed to next character
|
||||
continue; // Bypass converting this special character
|
||||
case ']' : // Treat as part of character class, not as a closure
|
||||
case ':' : // Treat as part of character class, not as start of bracket expression
|
||||
startedCharClass = false;
|
||||
continue; // Bypass converting these special characters
|
||||
}
|
||||
startedCharClass = false;
|
||||
} // startedCharClass
|
||||
|
||||
if ( inCharacterClass ) {
|
||||
|
||||
if ( inPosixBracketExpr ) {
|
||||
// Is it time to leave POSIX bracket expression [:___:] ?
|
||||
if ( currChar == ':' && nextChar == ']' ) inPosixBracketExpr = false;
|
||||
continue;
|
||||
} // inPosixBracketExpr
|
||||
|
||||
else { // ! inPosixBracketExpr
|
||||
|
||||
if ( prevChar == '[' && currChar == ':' ) {
|
||||
// Enter POSIX bracket expression [:___:]
|
||||
inPosixBracketExpr = true;
|
||||
continue;
|
||||
}
|
||||
|
||||
if ( currChar == ']' ) {
|
||||
// Leaving character class [___]
|
||||
inCharacterClass = false;
|
||||
continue;
|
||||
}
|
||||
|
||||
} // ! inPosixBracketExpr
|
||||
|
||||
} // inCharacterClass
|
||||
|
||||
else { // ! inCharacterClass
|
||||
|
||||
switch (currChar) {
|
||||
|
||||
case '[' :
|
||||
// Entering a character class [___]
|
||||
startedCharClass = true;
|
||||
inCharacterClass = true;
|
||||
continue;
|
||||
break;
|
||||
#ifdef REGEXP_IS_PCRE2
|
||||
case '*' :
|
||||
if ( prevChar != '(' ) continue;
|
||||
// Entering a PCRE2 directive (*___)
|
||||
inDirective = true;
|
||||
continue;
|
||||
break;
|
||||
|
||||
case '?' :
|
||||
if ( prevChar != '(' ) continue;
|
||||
if ( nextChar != '<' ) continue;
|
||||
// Entering PCRE2 group name (?<___>)
|
||||
inGroupName = true;
|
||||
continue;
|
||||
break;
|
||||
#endif // REGEXP_IS_PCRE2
|
||||
}
|
||||
#ifdef REGEXP_IS_PCRE2
|
||||
if ( inDirective ) {
|
||||
// Is it time to leave PCRE2 directive (*___) ?
|
||||
if (currChar == ')' ) inDirective = false;
|
||||
continue;
|
||||
}
|
||||
|
||||
if ( inGroupName ) {
|
||||
// Is it time to leave PCRE2 group name (?<___>) ?
|
||||
if (currChar == '>' ) inGroupName = false;
|
||||
continue;
|
||||
}
|
||||
#endif // REGEXP_IS_PCRE2
|
||||
} // ! inCharacterClass
|
||||
|
||||
/*
|
||||
If we have reached here, this regex character is a
|
||||
candidate for potential conversion to an equivalent.
|
||||
*/
|
||||
|
||||
} // isRegex
|
||||
|
||||
//-Debug: std::cerr << "Converting '" << TQString(currChar).utf8().data() << "' to '";
|
||||
|
||||
#ifdef OPTIMIZE_ASCII_LOOKUP
|
||||
// We can process ASCII quickly without using lookup table
|
||||
unsigned short codepoint = currChar.unicode();
|
||||
if ( codepoint < 128 ) {
|
||||
if ( codepoint > 64 && codepoint < 91 ) // convert upper case ASCII
|
||||
currChar = TQChar(codepoint + 32 ); // to corresponding lower case
|
||||
// All other ASCII characters are equivalent to themselves
|
||||
//-Debug: std::cerr << TQString(currChar).utf8().data() << "' (ascii)" << std::endl;
|
||||
continue;
|
||||
}
|
||||
#endif
|
||||
|
||||
// Use a simple binary search to look up an equivalent character
|
||||
int low = 0;
|
||||
int high = p->EquivTableROWS - 1;
|
||||
while (low <= high) {
|
||||
int mid = low + (high - low) / 2;
|
||||
if ( currChar == p->EquivalentsTable[mid].character ) {
|
||||
// Found equivalent character, use it instead
|
||||
currChar = p->EquivalentsTable[mid].collatesTo;
|
||||
break;
|
||||
}
|
||||
if ( p->EquivalentsTable[mid].character < currChar )
|
||||
low = mid + 1;
|
||||
else
|
||||
high = mid - 1;
|
||||
}
|
||||
//-Debug: std::cerr << TQString(currChar).utf8().data() << "'" << std::endl;
|
||||
|
||||
/* FIXME: Possible ideas for optimizing table lookup speed
|
||||
(1) Detect & handle ASCII (<128) characters separately. *DONE*
|
||||
(2) Split table into multiple lookup tables and search each
|
||||
in order of descending likelihood of character match.
|
||||
*/
|
||||
|
||||
}
|
||||
|
||||
return outString;
|
||||
}
|
@ -0,0 +1,34 @@
|
||||
#ifndef TEQUIVCHARS_H
|
||||
#define TEQUIVCHARS_H
|
||||
|
||||
#include "tdelibs_export.h"
|
||||
|
||||
#include <tqstring.h>
|
||||
|
||||
/**
|
||||
* Class representing a mapping of each alphanumeric character to its "collating
|
||||
* equivalent" as defined by the Default Unicode Collation Element Table (DUCET).
|
||||
* The mapping is limited to single-codepoint characters <= U+FFFF.
|
||||
*/
|
||||
class TDECORE_EXPORT TEquivChars
|
||||
{
|
||||
public:
|
||||
TEquivChars();
|
||||
~TEquivChars();
|
||||
|
||||
/**
|
||||
Returns a copy of a string in which each alphanumeric character has been
|
||||
replaced with its collating character equivalent.
|
||||
|
||||
@param inputString the string to convert; it is passed by const reference
|
||||
and the result is returned as a separate string
|
||||
|
||||
@param isRegex if true, the input string is treated as a regular
|
||||
expression and the alphabetical characters inside POSIX bracket
|
||||
expressions [:___:] are left as-is; defaults to false
|
||||
|
||||
@return the converted copy of @p inputString
|
||||
|
||||
NOTE(review): the implementation also appears to leave backslash-escaped
|
||||
characters unconverted when @p isRegex is true -- confirm and document.
|
||||
*/
|
||||
TQString replaceChars( const TQString &inputString, bool isRegex = false );
|
||||
|
||||
|
||||
private:
|
||||
|
||||
class TEquivChars_Private *p; // Private implementation data; the type is not defined in this header (presumably holds the equivalents lookup table -- TODO confirm)
|
||||
};
|
||||
|
||||
#endif // TEQUIVCHARS_H
|
Loading…
Reference in new issue