You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
kchmviewer/lib/libchmfile/libchmfile_search.cpp

290 lines
8.9 KiB

/***************************************************************************
* Copyright (C) 2004-2007 by Georgy Yunaev, gyunaev@ulduzsoft.com *
* Please do not use email address above for bug reports; see *
* the README file *
* *
* This program is free software; you can redistribute it and/or modify *
* it under the terms of the GNU General Public License as published by *
* the Free Software Foundation; either version 2 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU General Public License for more details. *
* *
* You should have received a copy of the GNU General Public License *
* along with this program; if not, write to the *
* Free Software Foundation, Inc., *
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. *
***************************************************************************/
#include <tqregexp.h>
#include "libchmfile.h"
#include "libchmfileimpl.h"
//#define DEBUG_SEARCH(A) tqDebug A
#define DEBUG_SEARCH(A)
static inline void validateWord ( TQString & word, bool & query_valid )
{
TQRegExp rxvalid ("[^\\d\\w_\\.]+");
TQString orig = word;
word.remove ( rxvalid );
if ( word != orig )
query_valid = false;
}
static inline void validateWords ( TQStringList & wordlist, bool & query_valid )
{
TQRegExp rxvalid ("[^\\d\\w_\\.]+");
for ( unsigned int i = 0; i < wordlist.size(); i++ )
validateWord ( wordlist[i], query_valid );
}
inline static void mergeResults ( LCHMSearchProgressResults & results, const LCHMSearchProgressResults & src, bool add )
{
if ( results.empty() && add )
{
results = src;
return;
}
for ( unsigned int s1 = 0; s1 < results.size(); s1++ )
{
bool found = false;
for ( unsigned int s2 = 0; s2 < src.size(); s2++ )
{
if ( results[s1].urloff == src[s2].urloff )
{
found = true;
break;
}
}
// If we're adding, we only add the items found (i.e. any item, which is not found, is removed.
// But if we're removing, we only remove the items found.
if ( (found && !add) || (!found && add) )
{
results.erase ( results.begin() + s1 );
s1--;
}
}
}
static inline void findNextWords ( QT34VECTOR<u_int64_t> & src, const QT34VECTOR<u_int64_t> & needle )
{
for ( unsigned int s1 = 0; s1 < src.size(); s1++ )
{
bool found = false;
u_int64_t target_offset = src[s1] + 1;
DEBUG_SEARCH (("Offset loop: offset at %u is %u, target %u", (unsigned int) s1,
(unsigned int) src[s1], (unsigned int) target_offset));
// Search in the offsets list in attempt to find next word
for ( unsigned int s2 = 0; s2 < needle.size(); s2++ )
{
if ( needle[s2] == target_offset )
{
found = true;
break;
}
}
if ( !found )
{
// Remove this offset, we don't need it anymore
DEBUG_SEARCH (("Offset loop failed: offset %u not found", (unsigned int) target_offset));
src.erase ( src.begin() + s1 );
s1--;
}
else
{
DEBUG_SEARCH (("Offset loop succeed: offset %u found", (unsigned int) target_offset));
src[s1]++;
}
}
}
inline bool searchPhrase( LCHMFileImpl * impl, const TQStringList & phrase, LCHMSearchProgressResults & results )
{
// Accumulate the phrase data here.
LCHMSearchProgressResults phrasekeeper;
// On the first word, just fill the phrasekeeper with every occupence of the first word
DEBUG_SEARCH (("Search word(0): '%s'", phrase[0].ascii()));
if ( !impl->searchWord ( phrase[0], true, false, phrasekeeper, true ) )
return false; // the word not found, so the whole phrase is not found either.
for ( unsigned int i = 1; i < phrase.size(); i++ )
{
LCHMSearchProgressResults srchtmp;
DEBUG_SEARCH (("Search word(%d): '%s'", i, phrase[i].ascii()));
if ( !impl->searchWord ( phrase[i], true, false, srchtmp, true ) )
return false; // the ith word not found, so the whole phrase is not found either.
// Iterate the both arrays, and remove every word in phrasekeeper, which is not found
// in the srchtmp, or is found on a different position.
for ( unsigned int p1 = 0; p1 < phrasekeeper.size(); p1++ )
{
bool found = false;
DEBUG_SEARCH (("Ext loop (it %d): urloff %d", p1, phrasekeeper[p1].urloff));
for ( unsigned int p2 = 0; p2 < srchtmp.size(); p2++ )
{
// look up for words on the the same page
if ( srchtmp[p2].urloff != phrasekeeper[p1].urloff )
continue;
// Now check every offset to find the one which is 1 bigger than the
findNextWords ( phrasekeeper[p1].offsets, srchtmp[p2].offsets );
// If at least one next word is found, we leave the block intact, otherwise remove it.
if ( !phrasekeeper[p1].offsets.empty() )
found = true;
}
if ( !found )
{
DEBUG_SEARCH (("Ext loop: this word not found on %d, remove it", phrasekeeper[p1].urloff));
phrasekeeper.erase ( phrasekeeper.begin() + p1 );
p1--;
}
}
}
for ( unsigned int o = 0; o < phrasekeeper.size(); o++ )
results.push_back ( LCHMSearchProgressResult (phrasekeeper[o].titleoff, phrasekeeper[o].urloff) );
return !results.empty();
}
bool LCHMFile::searchQuery( const TQString& inquery, TQStringList * searchresults, unsigned int limit )
{
TQStringList words_must_exist, words_must_not_exist, words_highlight;
QT34VECTOR<TQStringList> phrases_must_exist;
TQString query = inquery;
bool query_valid = true;
LCHMSearchProgressResults results;
int pos;
unsigned int i;
/*
* Parse the search query with a simple state machine.
* Query should consist of one of more words separated by a space with a possible prefix.
* A prefix may be:
* + indicates that the word is required; any page without this word is excluded from the result.
* - indicates that the word is required to be absent; any page with this word is excluded from
* the result.
* "." indicates a phrase. Anything between quotes indicates a phrase, which is set of space-separated
* words. Will be in result only if the words in phrase are in page in the same sequence, and
* follow each other.
* If there is no prefix, the word considered as required.
*/
TQRegExp rxphrase( "\"(.*)\"" );
TQRegExp rxword( "([^\\s]+)" );
rxphrase.setMinimal( TRUE );
// First, get the phrase queries
while ( (pos = rxphrase.search (query, 0)) != -1 )
{
// A phrase query found. Locate its boundaries, and parse it.
TQStringList plist = TQStringList::split ( TQRegExp ("\\s+"), rxphrase.cap ( 1 ));
validateWords ( plist, query_valid );
if ( plist.size() > 0 )
phrases_must_exist.push_back( plist );
query.remove (pos, rxphrase.matchedLength());
}
// Then, parse the rest query
while ( (pos = rxword.search (query, 0)) != -1 )
{
// A phrase query found. Locate its boundaries, and parse it.
TQString word = rxword.cap ( 1 );
TQChar type = '+';
if ( word[0] == '-' || word[0] == '+' )
{
type = word[0];
word.remove (0, 1);
}
validateWord ( word, query_valid );
if ( type == '-' )
words_must_not_exist.push_back ( word );
else
words_must_exist.push_back ( word );
query.remove (pos, rxword.matchedLength());
}
#if defined (DUMP_SEARCH_QUERY)
// Dump the search query
TQString qdump;
for ( i = 0; i < phrases_must_exist.size(); i++ )
qdump += TQString(" \"") + phrases_must_exist[i].join (" ") + TQString ("\"");
for ( i = 0; i < words_must_not_exist.size(); i++ )
qdump += TQString (" -") + words_must_not_exist[i];
for ( i = 0; i < words_must_exist.size(); i++ )
qdump += TQString (" +") + words_must_exist[i];
tqDebug ("Search query dump: %s", qdump.ascii());
#endif
// First search for phrases
if ( phrases_must_exist.size() > 0 )
{
LCHMSearchProgressResults tempres;
for ( i = 0; i < phrases_must_exist.size(); i++ )
{
if ( !searchPhrase ( impl(), phrases_must_exist[i], tempres ) )
return false;
mergeResults ( results, tempres, true );
}
}
for ( i = 0; i < words_must_exist.size(); i++ )
{
LCHMSearchProgressResults tempres;
if ( !m_impl->searchWord ( words_must_exist[i], true, false, tempres, false ) )
return false;
mergeResults ( results, tempres, true );
}
for ( i = 0; i < words_must_not_exist.size(); i++ )
{
LCHMSearchProgressResults tempres;
m_impl->searchWord ( words_must_not_exist[i], true, false, tempres, false );
mergeResults ( results, tempres, false );
}
m_impl->getSearchResults( results, searchresults, limit );
return true;
}