You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
tdelibs/tdecore/tdestringmatcher.cpp

515 lines
16 KiB
C++

#include "tdestringmatcher.h"
#include <tqregexp.h>
#include <kdebug.h>
// Regex objects stored by the matcher; setMatchSpecs() appends one entry per
// accepted match specification, so indices line up with the spec list.
using RegexList = TQValueVector<TQRegExp>;

// Private data holder for TDEStringMatcher.
class TDEStringMatcher::TDEStringMatcherPrivate {
public:
    // State that can be set / read through the TDEStringMatcher interface
    TQString m_matchSpecString;
    MatchSpecList m_matchSpecList;

    // State used by the implementation only
    RegexList m_regexList;
    /* A pattern type that does not need a regex engine for matching does
       not use its TQRegExp object for matching, but the object's pattern()
       field may still be "borrowed" to store a "converted" version of the
       pattern.
    */
};
// Creates a matcher with no match specifications and allocates its private data.
TDEStringMatcher::TDEStringMatcher()
{
    TSMTRACE << "TDEStringMatcher::TDEStringMatcher: New instance created: " << this << endl;

    // Private state lives on the heap and is released by the destructor.
    this->p = new TDEStringMatcherPrivate;
}
// Destroys the matcher and releases its private data.
TDEStringMatcher::~TDEStringMatcher()
{
    // Deleting the private object destroys the lists it holds, so they do
    // not need to be cleared one by one beforehand.
    delete p;
    TSMTRACE << "TDEStringMatcher::~TDEStringMatcher: Instance destroyed: " << this << endl;
}
//================================================================================================
// Match specification output functions
//================================================================================================
// Returns a copy of the currently stored match specification string.
TQString TDEStringMatcher::getMatchSpecString()
{
    const TQString &storedSpecString = p->m_matchSpecString;
    return storedSpecString;
}
// Returns a copy of the currently stored match specification list.
MatchSpecList TDEStringMatcher::getMatchSpecs()
{
    const MatchSpecList &storedSpecs = p->m_matchSpecList;
    return storedSpecs;
}
//================================================================================================
// Match specification input functions
//================================================================================================
/*
  Replaces the stored match specifications with newMatchSpecList.

  Every entry is validated first: the pattern must be non-empty, must not
  contain the reserved separator character SEP, must have a known pattern
  type and alphabetic-case handling, and (for regex and wildcard patterns)
  must be accepted by TQRegExp. If any entry is rejected the function
  returns false and the stored state is left untouched.

  On success the spec list, the regex list and the equivalent match
  specification string (option string and pattern joined with SEP, for each
  entry) are all replaced, patternsChanged() is emitted and true is returned.
*/
bool TDEStringMatcher::setMatchSpecs( MatchSpecList newMatchSpecList )
{
    RegexList newRegexList;
    TQStringList newMatchSpecs;
    TQString optionString = "rc"; // start with defaults
    TQRegExp rxWork;

    TSMTRACE << "TDEStringMatcher::setMatchSpecs: validating match specification list" << endl;

    for ( const MatchSpec &matchSpec : newMatchSpecList ) {

        if ( matchSpec.pattern.isEmpty() ) {
            TSMTRACE << " Error: empty pattern!" << endl;
            return false;
        }

        if ( matchSpec.pattern.find( TQChar(SEP) ) >= 0 ) {
            TSMTRACE << " Error: pattern contains reserved separator character" << endl;
            return false;
        }

        switch ( matchSpec.patternType ) {
            // The following pattern types will be using TQRegExp functions for matching
            case PatternType::REGEX :
                optionString += TQChar('r');
                rxWork.setPattern( matchSpec.pattern );
                break;
            case PatternType::WILDCARD :
                optionString += TQChar('w');
                rxWork.setPattern( wildcardToRegex( matchSpec.pattern ) );
                break;
            // The following pattern types will be using TQString functions for matching
            case PatternType::SUBSTRING :
                optionString += TQChar('s');
                rxWork.setPattern( matchSpec.pattern ); // we will "borrow" this field
                break;
            default:
                TSMTRACE << " Error: pattern type out of range" << endl;
                return false;
        }

        switch ( matchSpec.ancHandling ) {
            case ANCHandling::CASE_SENSITIVE :
                optionString += TQChar('c');
                rxWork.setCaseSensitive( true );
                break;
            case ANCHandling::CASE_INSENSITIVE :
                optionString += TQChar('i');
                rxWork.setCaseSensitive( false );
                break;
            case ANCHandling::EQUIVALENCE :
                optionString += TQChar('e');
                // NOTE(review): the TQString overload of setMatchSpecs() uses
                // setCaseSensitive( false ) for EQUIVALENCE; confirm which
                // of the two settings is intended.
                rxWork.setCaseSensitive( true );
                // FIXME TBD: This is where we will be converting each (unescaped)
                // alphanumeric character in rxWork.pattern to its "least" equivalent.
                break;
            default:
                TSMTRACE << " Error: alphabetic character handling specification out of range" << endl;
                return false;
        }

        optionString += TQChar( matchSpec.wantMatch ? '=' : '!' );

        // A substring pattern only uses rxWork for storage, so it must not be
        // rejected merely because it is not valid regex syntax (e.g. "a(b").
        if ( matchSpec.patternType != PatternType::SUBSTRING && ! rxWork.isValid() ) {
            TSMTRACE << " Error: invalid pattern syntax'" << endl;
            return false;
        }

        // This particular match specification is good
        newMatchSpecs.append( optionString );
        newMatchSpecs.append( matchSpec.pattern );
        newRegexList.append( rxWork );
        optionString = "";
    }

    // All proposed match specifications are good, update everything accordingly
    p->m_matchSpecList = newMatchSpecList;
    p->m_regexList = newRegexList;
    p->m_matchSpecString = newMatchSpecs.join( TQChar(SEP) );
    emit patternsChanged();
    return true;
}
//=================================================================================================
/*
  Replaces the stored match specifications with those described by
  newMatchSpecString.

  The string is split on SEP (empty entries are kept) and must yield an even
  number of components that alternate "option string, pattern, option
  string, pattern, ...". Option strings are lower-cased before parsing and
  may contain:
      r / w / s   pattern type: regex, wildcard, substring
      c / i / e   alphabetic handling: case sensitive, case insensitive,
                  equivalence
      = / !       look for a match / look for a non-match
  Any other option character is an error. Options not mentioned in an option
  string keep the value left by the previous specification (the first one
  starts from the DEFAULT pattern type and handling, seeking matches).

  Returns true without emitting anything if the string equals the one
  already stored. An empty string clears all specifications, emits
  patternsChanged() and returns true. Otherwise returns false, leaving the
  stored state untouched, if any component is invalid; on success the stored
  state is replaced, patternsChanged() is emitted and true is returned.
*/
bool TDEStringMatcher::setMatchSpecs( TQString newMatchSpecString )
{
    if ( newMatchSpecString == p->m_matchSpecString )
        return true;

    TSMTRACE << "TDEStringMatcher::setMatchSpecs: Proposed match specification string: <" << newMatchSpecString << ">" << endl;

    if ( newMatchSpecString.isEmpty() ) {
        TSMTRACE << " Empty pattern string => match specifications will be cleared" << endl;
        p->m_matchSpecList.clear();
        p->m_regexList.clear();
        p->m_matchSpecString = "";
        emit patternsChanged();
        return true;
    }

    TQStringList newMatchSpecs = TQStringList::split( SEP, newMatchSpecString, true );
    if ( newMatchSpecs.count() % 2 != 0 ) {
        TSMTRACE << " Error: match specification string must contain an even number of components" << endl;
        return false;
    }
    TSMTRACE << newMatchSpecs.count() << endl;

    MatchSpecList newMatchSpecList;
    RegexList newRegexList;
    TQRegExp rxWork; // single working copy == each pattern inherits previous options
    MatchSpec matchSpec = {
        PatternType::DEFAULT,
        ANCHandling::DEFAULT,
        true, // seeking matches, not non-matches
        ""
    };

    // Components produced by split() cannot contain SEP, so no per-component
    // check for the separator is needed.
    bool processingPattern = false; // expected format: option string , pattern string, ...
    for ( const TQString &specification : newMatchSpecs ) {

        if ( ! processingPattern ) {
            const TQString optionString = specification.lower();
            TSMTRACE << " Processing match option string: '" << optionString << "'" << endl;

            const int optionCount = static_cast<int>( optionString.length() );
            for ( int i = 0 ; i < optionCount ; i++ ) {
                const TQChar optionChar = optionString[i];
                TSMTRACE << " Option character: '" << optionChar << "'" << endl;
                switch ( optionChar.unicode() ) {
                    case 'r' : matchSpec.patternType = PatternType::REGEX ; break;
                    case 'w' : matchSpec.patternType = PatternType::WILDCARD ; break;
                    case 's' : matchSpec.patternType = PatternType::SUBSTRING ; break;
                    case 'c' : matchSpec.ancHandling = ANCHandling::CASE_SENSITIVE ; break;
                    case 'i' : matchSpec.ancHandling = ANCHandling::CASE_INSENSITIVE; break;
                    case 'e' : matchSpec.ancHandling = ANCHandling::EQUIVALENCE ; break;
                    case '=' : matchSpec.wantMatch = true ; break;
                    case '!' : matchSpec.wantMatch = false ; break;
                    default:
                        // We reserve ALL other possible option characters for future use!
                        TSMTRACE << " Error: invalid option character" << endl;
                        return false;
                }
            }
            processingPattern = true; // next spec should be a pattern string
            continue;
        }

        TSMTRACE << " Processing match pattern string: '" << specification << "'" << endl;
        if ( specification.isEmpty() ) {
            TSMTRACE << " Error: empty patterns are not allowed" << endl;
            return false;
        }

        // Prepare regex
        switch ( matchSpec.patternType ) {
            // The following pattern types will be using TQRegExp functions for matching
            case PatternType::REGEX :
                rxWork.setPattern( specification );
                break;
            case PatternType::WILDCARD :
                rxWork.setPattern( wildcardToRegex( specification ) );
                break;
            // The following pattern types will be using TQString functions for matching
            case PatternType::SUBSTRING :
                rxWork.setPattern( specification ); // used for storage only
                break;
            default:
                // Should not arise. Reject rather than skip, so the
                // option/pattern alternation cannot get out of step.
                return false;
        }

        switch ( matchSpec.ancHandling ) {
            case ANCHandling::CASE_SENSITIVE :
                rxWork.setCaseSensitive( true );
                break;
            case ANCHandling::CASE_INSENSITIVE :
                rxWork.setCaseSensitive( false );
                break;
            case ANCHandling::EQUIVALENCE :
                rxWork.setCaseSensitive( false );
                // FIXME TBD: This is where we will be converting each (unescaped)
                // alphanumeric character in rxWork.pattern to its "least" equivalent.
                break;
            default:
                // Should not arise; see above.
                return false;
        }

        // Test regex. A substring pattern only uses rxWork for storage, so it
        // must not be rejected for not being valid regex syntax (e.g. "a(b").
        if ( matchSpec.patternType != PatternType::SUBSTRING && ! rxWork.isValid() ) {
            TSMTRACE << " Error: invalid pattern syntax'" << endl;
            return false;
        }

        TSMTRACE << " Final Wildcard/CaseSensitive settings: " << rxWork.wildcard() << "/" << rxWork.caseSensitive() << endl;
        matchSpec.pattern = specification;
        newMatchSpecList.push_back( matchSpec );
        newRegexList.append( rxWork );
        processingPattern = false; // next spec should be an option string
    }

    // Everything validated: commit the new state in one go.
    p->m_matchSpecList = newMatchSpecList;
    p->m_regexList = newRegexList;
    p->m_matchSpecString = newMatchSpecString;

    TSMTRACE << " Final patternString: '" << p->m_matchSpecString << "'" << endl;
    TSMTRACE << " Number of regex match patterns in list: '" << p->m_regexList.count() << "'" << endl;
    TSMTRACE << " Notifying slots of pattern change" << endl;
    emit patternsChanged();
    TSMTRACE << " All slots have been notified" << endl;
    TSMTRACE << "TDEStringMatcher::setMatchSpecs: Patterns were successfully regenerated" << endl << endl;
    return true;
}
//================================================================================================
// Match functions
//================================================================================================
bool TDEStringMatcher::matchAny( const TQString& stringToMatch )
{
TSMTRACE << "Attempting to match string '" << stringToMatch << "' against stored patterns" << endl;
if ( p->m_matchSpecList.isEmpty() ) {
//-Debug: TSMTRACE << "Match failed on empty pattern list!" << endl;
return false; //FIXME: or should that be true per MicheleC's comment?
}
TQString equivalentString;
for ( size_t index = 0 ; index < p->m_matchSpecList.count() ; index++ )
{
TQString matchThis = stringToMatch;
if ( p->m_matchSpecList[index].ancHandling == ANCHandling::EQUIVALENCE )
{
if ( equivalentString.isNull() ) {
// FIXME TBD: This is where we will be converting each alphanumeric
// character in stringToMatch to its "least" equivalent and storing
// the result in equivalentString. Until then, we'll just do:
equivalentString = stringToMatch;
}
matchThis = equivalentString;
}
switch ( p->m_matchSpecList[index].patternType ) {
case PatternType::REGEX :
case PatternType::WILDCARD :
if (
( p->m_regexList[index].search( matchThis ) >= 0 ) // was there a match?
== p->m_matchSpecList[index].wantMatch // is that what we were looking for?
) {
TSMTRACE << "Match succeeded with regex pattern: '" << p->m_regexList[index].pattern() << "'" << endl;
return true;
}
break;
case PatternType::SUBSTRING :
bool cs = ! (bool) p->m_matchSpecList[index].ancHandling;
if (
( matchThis.find( p->m_matchSpecList[index].pattern, 0, cs ) >= 0 ) // was there a match?
== p->m_matchSpecList[index].wantMatch // is that what we were looking for?
) {
TSMTRACE << "Match succeeded with substring: '" << p->m_matchSpecList[index].pattern << "'" << endl;
return true;
}
break;
}
}
//-Debug: TSMTRACE << "Match failed, no pattern matched!" << endl;
return false ;
}
bool TDEStringMatcher::matchAll( const TQString& stringToMatch )
{
//-Debug: TSMTRACE << "Attempting to match string '" << stringToMatch << "' against stored patterns" << endl;
if ( p->m_matchSpecList.isEmpty() ) {
//-Debug: TSMTRACE << "Match failed on empty pattern list!" << endl;
return false; //FIXME: or should that be true per MicheleC's comment?
}
TQString equivalentString;
for ( size_t index = 0 ; index < p->m_matchSpecList.count() ; index++ )
{
TQString matchThis = stringToMatch;
if ( p->m_matchSpecList[index].ancHandling == ANCHandling::EQUIVALENCE )
{
if ( equivalentString.isNull() ) {
// FIXME TBD: This is where we will be converting each alphanumeric
// character in stringToMatch to its "least" equivalent and storing
// the result in equivalentString. Until then, we'll just do:
equivalentString = stringToMatch;
}
matchThis = equivalentString;
}
if (
( p->m_regexList[index].search( matchThis ) < 0 ) // was there no match?
!= p->m_matchSpecList[index].wantMatch // is that what we were looking for?
) {
//-Debug: TSMTRACE << "String fail3ed to matching pattern: '" << rxPattern->pattern() << "'" << endl;
return false;
}
if ( p->m_regexList[index].search( matchThis ) < 0 ) {
//-Debug: TSMTRACE << "String failed to match pattern: '" << rxPattern->pattern() << "'" << endl;
return false;
}
}
//-Debug: TSMTRACE << "Match succeeded, all patterns matched!" << endl;
return true;
}
//================================================================================================
// Utility functions
//================================================================================================
/*
The following code is a modified copy of that found in tqt3/src/tools/qregexp.cpp.
*/
/*
  Converts a shell-style wildcard pattern into an equivalent regular
  expression string:
    - '*' becomes ".*" and '?' becomes "."
    - regex metacharacters that have no wildcard meaning are backslash-escaped
    - in a bracket set, a leading '!' (negation) becomes '^', a leading
      '^' is escaped so that it stays literal, a ']' directly after the
      opening '[' or "[!" is kept as a member of the set, and backslashes
      inside the set are doubled
  The result is wrapped in '^' ... '$' so that it must match the entire
  string.
*/
TQString TDEStringMatcher::wildcardToRegex( const TQString& wildcardPattern )
{
    const int wclen = wildcardPattern.length();
    const TQChar *wc = wildcardPattern.unicode();
    TQString rx = TQString::fromLatin1( "" );
    int i = 0;

    while ( i < wclen ) {
        const TQChar c = wc[i++];
        switch ( c.unicode() ) {
            case '*':
                rx += TQString::fromLatin1( ".*" );
                break;
            case '?':
                rx += TQChar( '.' );
                break;
            case '$':
            case '(':
            case ')':
            case '+':
            case '.':
            case '\\':
            case '^':
            case '{':
            case '|':
            case '}':
                rx += TQChar( '\\' );
                rx += c;
                break;
            case '[': {
                rx += c;
                // True while the next character is still the first member
                // position of the set, where ']' is an ordinary member.
                bool atFirstMember = true;
                // Only look at wc[i] if the pattern does not end with '['.
                if ( i < wclen ) {
                    if ( wc[i] == TQChar('!') ) {
                        // POSIX states that the negation character is '!'
                        rx += TQChar('^');
                        i++;
                    } else if ( wc[i] == TQChar('^') ) {
                        // A leading '^' is a literal in a wildcard set
                        rx += TQChar( '\\' );
                        rx += wc[i++];
                        atFirstMember = false;
                    }
                }
                if ( i < wclen ) {
                    if ( atFirstMember && wc[i] == TQChar(']') )
                        rx += wc[i++];
                    // Copy the set's members; the closing ']' itself is
                    // emitted by the default branch on the next pass.
                    while ( i < wclen && wc[i] != TQChar(']') ) {
                        if ( wc[i] == TQChar('\\') )
                            rx += TQChar( '\\' );
                        rx += wc[i++];
                    }
                }
                break;
            }
            default:
                rx += c;
        }
    }

    /* Wildcard patterns must match entire string */
    return TQChar('^') + rx + TQChar('$');
    /* TBD: Add support for extglob */
}
/*
  Returns a copy of basicString in which each of the characters
  + . ^ ( ) [ ] { } | $ ? * \ is preceded by a backslash; all other
  characters are copied unchanged.
*/
static TQString escapeRegexChars( const TQString& basicString )
{
    const int charCount = basicString.length();
    const TQChar *chars = basicString.unicode();
    TQString escaped = TQString::fromLatin1( "" );

    for ( int pos = 0 ; pos < charCount ; ++pos ) {
        const TQChar ch = chars[pos];
        switch ( ch.unicode() ) {
            case '+': case '.': case '^':
            case '(': case ')':
            case '[': case ']':
            case '{': case '}':
            case '|': case '$':
            case '?': case '*':
            case '\\':
                // Special character: emit the escaping backslash first
                escaped += TQChar( '\\' );
                break;
            default:
                break;
        }
        escaped += ch;
    }
    return escaped;
}
//================================================================================================
#include "tdestringmatcher.moc"