You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
kvirc/src/modules/help/index.cpp

855 lines
12 KiB

#include "index.h"
#include "kvi_file.h"
#include <tqdir.h>
#include <tqstringlist.h>
#include "kvi_pointerhashtable.h"
#include <tqapplication.h>
#include <tqtextstream.h>
#include <ctype.h>
// Three-way comparison of two terms by their frequency field.
// Returns 0 when the frequencies are equal, -1 when p1's is smaller, 1 otherwise.
int kvi_compare(const Term * p1,const Term * p2)
{
    return ( p1->frequency == p2->frequency )
        ? 0
        : ( ( p1->frequency < p2->frequency ) ? -1 : 1 );
}
// Deserializes a Document: document number first, then frequency.
TQDataStream &operator>>( TQDataStream &s, Document &l )
{
    return s >> l.docNumber >> l.frequency;
}
// Serializes a Document as two TQ_INT16 values: document number, then frequency.
TQDataStream &operator<<( TQDataStream &s, const Document &l )
{
    return s << (TQ_INT16)l.docNumber << (TQ_INT16)l.frequency;
}
// Builds an index over the documents found in directory dp.
// The document list is not known yet (alreadyHaveDocList is FALSE), so
// makeIndex() will call setupDocumentList() first.
// hp is accepted but not used by this constructor.
Index::Index( const TQString &dp, const TQString &hp )
    : TQObject( 0, 0 ), dict( 8999 ), docPath( dp )
{
    lastWindowClosed = FALSE;
    alreadyHaveDocList = FALSE;

    // Lets makeIndex() abort once the application's last window is closed.
    connect( tqApp, TQT_SIGNAL( lastWindowClosed() ), this, TQT_SLOT( setLastWinClosed() ) );
}
// Builds an index over an explicit list of documents dl.
// alreadyHaveDocList is set, so makeIndex() will not call setupDocumentList().
// hp is accepted but not used by this constructor.
Index::Index( const TQStringList &dl, const TQString &hp )
    : TQObject( 0, 0 ), dict( 8999 )
{
    lastWindowClosed = FALSE;
    alreadyHaveDocList = TRUE;
    docList = dl;

    // Lets makeIndex() abort once the application's last window is closed.
    connect( tqApp, TQT_SIGNAL( lastWindowClosed() ), this, TQT_SLOT( setLastWinClosed() ) );
}
void Index::setLastWinClosed()
{
lastWindowClosed = TRUE;
}
void Index::setDictionaryFile( const TQString &f )
{
dictFile = f;
}
void Index::setDocListFile( const TQString &f )
{
docListFile = f;
}
int Index::makeIndex()
{
if ( !alreadyHaveDocList )
setupDocumentList();
if ( docList.isEmpty() )
return 1;
dict.clear();
TQStringList::Iterator it = docList.begin();
int steps = docList.count() / 100;
if ( !steps )
steps++;
int prog = 0;
for ( int i = 0; it != docList.end(); ++it, ++i ) {
if ( lastWindowClosed ) {
return -1;
}
parseDocument( *it, i );
if ( i%steps == 0 ) {
prog++;
emit indexingProgress( prog );
}
}
return 0;
}
void Index::setupDocumentList()
{
docList.clear();
titleList.clear();
TQDir d( docPath );
TQString szCur;
TQStringList lst = d.entryList( "*.html" );
TQStringList::ConstIterator it = lst.begin();
for ( ; it != lst.end(); ++it )
{
szCur=docPath + "/" + *it;
docList.append( szCur );
titleList.append(getDocumentTitle( szCur ));
}
}
void Index::insertInDict( const TQString &str, int docNum )
{
if ( strcmp( str, "amp" ) == 0 || strcmp( str, "nbsp" ) == 0 )
return;
Entry *e = 0;
if ( dict.count() )
e = dict[ str ];
if ( e ) {
if ( e->documents.first().docNumber != docNum )
e->documents.prepend( Document( docNum, 1 ) );
else
e->documents.first().frequency++;
} else {
dict.insert( str, new Entry( docNum ) );
}
}
void Index::parseDocument( const TQString &filename, int docNum )
{
KviFile file( filename );
if ( !file.openForReading() ) {
tqWarning( "can not open file %s", filename.ascii() );
return;
}
TQTextStream s( &file );
TQString text = s.read();
if (text.isNull())
return;
bool valid = TRUE;
const TQChar *buf = text.unicode();
TQChar str[64];
TQChar c = buf[0];
int j = 0;
int i = 0;
while ( (uint)j < text.length() ) {
if ( c == '<' || c == '&' ) {
valid = FALSE;
if ( i > 1 )
insertInDict( TQString(str,i), docNum );
i = 0;
c = buf[++j];
continue;
}
if ( ( c == '>' || c == ';' ) && !valid ) {
valid = TRUE;
c = buf[++j];
continue;
}
if ( !valid ) {
c = buf[++j];
continue;
}
if ( ( c.isLetterOrNumber() || c == '_' ) && i < 63 ) {
str[i] = c.lower();
++i;
} else {
if ( i > 1 )
insertInDict( TQString(str,i), docNum );
i = 0;
}
c = buf[++j];
}
if ( i > 1 )
insertInDict( TQString(str,i), docNum );
file.close();
}
void Index::writeDict()
{
KviPointerHashTableIterator<TQString,Entry> it( dict );
KviFile f( dictFile );
if ( !f.openForWriting() )
return;
TQDataStream s( &f );
for( ; it.current(); ++it ) {
Entry *e = it.current();
s << it.currentKey();
s << e->documents;
}
f.close();
writeDocumentList();
}
void Index::writeDocumentList()
{
KviFile f( docListFile );
if ( !f.openForWriting() )
return;
TQTextStream s( &f );
TQString docs = docList.join("[#item#]");
s << docs;
KviFile f1( docListFile+".titles" );
if ( !f1.openForWriting() )
return;
TQTextStream s1( &f1 );
docs = titleList.join("[#item#]");
s1 << docs;
}
void Index::readDict()
{
KviFile f( dictFile );
if ( !f.openForReading() )
return;
dict.clear();
TQDataStream s( &f );
TQString key;
KviValueList<Document> docs;
while ( !s.atEnd() ) {
s >> key;
s >> docs;
dict.insert( key, new Entry( docs ) );
}
f.close();
readDocumentList();
}
void Index::readDocumentList()
{
//reading docs
KviFile f( docListFile );
if ( !f.openForReading() )
return;
TQTextStream s( &f );
docList = TQStringList::split("[#item#]",s.read());
//reading titles
KviFile f1( docListFile+".titles" );
if ( !f1.openForReading() )
return;
TQTextStream s1( &f1 );
titleList = TQStringList::split("[#item#]",s1.read());
// debug(titleList);
}
// Returns the paths of the documents that contain every term in `terms`.
//
// terms    - words to look up; a word containing '*' is expanded through
//            getWildcardTerms()/setupDummyTerm().
// termSeq  - phrases; when not empty, each candidate document is also
//            passed to searchForPattern() and kept only if that succeeds.
// seqWords - the words the phrases are made of, forwarded to
//            searchForPattern().
//
// An empty list is returned when `terms` is empty or when any non-wildcard
// term is missing from the dictionary.
TQStringList Index::query( const TQStringList &terms, const TQStringList &termSeq, const TQStringList &seqWords )
{
    TermList termList;
    TQStringList::ConstIterator it;
    for ( it = terms.begin(); it != terms.end(); ++it ) {
        if ( (*it).contains( '*' ) ) {
            KviValueList<Document> wcts = setupDummyTerm( getWildcardTerms( *it ) );
            termList.append( new Term( "dummy", wcts.count(), wcts ) );
            continue;
        }
        // Single dictionary lookup per term.
        Entry *e = dict[ *it ];
        if ( !e )
            return TQStringList();
        termList.append( new Term( *it, e->documents.count(), e->documents ) );
    }
    if ( !termList.count() )
        return TQStringList();

    termList.sort();

    // Copy the first term's documents BEFORE removing the term from the list:
    // if the list owns its items (not visible from this file), removeFirst()
    // destroys the Term and reading it afterwards would be a use-after-free.
    KviValueList<Document> minDocs = termList.first()->documents;
    termList.removeFirst();

    // Intersect with the documents of every remaining term, summing frequencies.
    KviValueList<Document>::iterator C;
    KviValueList<Document>::ConstIterator It;
    for ( Term *t = termList.first(); t; t = termList.next() ) {
        KviValueList<Document> docs = t->documents;
        C = minDocs.begin();
        while ( C != minDocs.end() ) {
            bool found = FALSE;
            for ( It = docs.begin(); It != docs.end(); ++It ) {
                if ( (*C).docNumber == (*It).docNumber ) {
                    (*C).frequency += (*It).frequency;
                    found = TRUE;
                    break;
                }
            }
            if ( !found )
                C = minDocs.remove( C );
            else
                ++C;
        }
    }

    TQStringList results;
#ifndef COMPILE_USE_QT4
    qHeapSort( minDocs );
#endif

    const bool phraseSearch = !termSeq.isEmpty();
    const int docCount = (int)docList.count();
    TQString fileName;
    for ( C = minDocs.begin(); C != minDocs.end(); ++C ) {
        const int docNum = (int)(*C).docNumber;
        // The dictionary and the document list are loaded from separate
        // files; skip numbers that do not index into docList.
        if ( docNum < 0 || docNum >= docCount )
            continue;
        fileName = docList[ docNum ];
        if ( !phraseSearch || searchForPattern( termSeq, seqWords, fileName ) )
            results << fileName;
    }
    return results;
}
// Returns the text between the first "<title>" and the following "</title>"
// in the given file (tags matched case-insensitively).
//
// Returns fileName itself when the file cannot be opened, and tr("Untitled")
// when there is no opening tag, no closing tag after it, or the title is empty.
TQString Index::getDocumentTitle( const TQString &fileName )
{
    KviFile file( fileName );
    if ( !file.openForReading() ) {
        tqWarning( "cannot open file %s", fileName.ascii() );
        return fileName;
    }
    TQTextStream s( &file );
    TQString text = s.read();

    // find() returns -1 when the tag is absent; that must be checked before
    // the tag length is added, otherwise a bogus offset of 6 would be used.
    const int tagPos = text.find( "<title>", 0, FALSE );
    if ( tagPos == -1 )
        return tr("Untitled");

    const int start = tagPos + 7;   // 7 == length of "<title>"
    // Only a closing tag located after the opening one is meaningful.
    const int end = text.find( "</title>", start, FALSE );
    if ( end - start <= 0 )
        return tr("Untitled");

    return text.mid( start, end - start );
}
// Returns every dictionary key that matches the wildcard expression `term`.
// The expression is first broken up by split() into literal pieces and "*"
// markers; each dictionary key is then tested against that sequence.
TQStringList Index::getWildcardTerms( const TQString &term )
{
TQStringList lst;
TQStringList terms = split( term );
#ifdef COMPILE_USE_QT4
TQStringList::Iterator iter;
#else
KviValueList<TQString>::iterator iter;
#endif
KviPointerHashTableIterator<TQString,Entry> it( dict );
for( ; it.current(); ++it ) {
// index: offset in the key from which the next literal piece is searched.
int index = 0;
bool found = FALSE;
TQString text( it.currentKey() );
for ( iter = terms.begin(); iter != terms.end(); ++iter ) {
// A "*" marker matches anything; move on to the next piece.
if ( *iter == "*" ) {
found = TRUE;
continue;
}
// A literal first piece must begin with the key's first character.
if ( iter == terms.begin() && (*iter)[0] != text[0] ) {
found = FALSE;
break;
}
index = text.find( *iter, index );
// For a piece equal to the last one (compared by value, not by position)
// the key has to END with that piece: re-search from the back and
// require the hit to sit exactly at the tail of the key.
if ( *iter == terms.last() && index != (int)text.length()-1 ) {
index = text.findRev( *iter );
if ( index != (int)text.length() - (int)(*iter).length() ) {
found = FALSE;
break;
}
}
// Piece located: continue searching right after it. Otherwise no match.
if ( index != -1 ) {
found = TRUE;
index += (*iter).length();
continue;
} else {
found = FALSE;
break;
}
}
if ( found )
lst << text;
}
return lst;
}
// Breaks a wildcard expression into literal pieces and "*" markers,
// e.g. "foo*bar" -> ("foo", "*", "bar") and "*foo" -> ("*", "foo").
//
// A leading '*' is kept as a marker, so that "*foo" is not reduced to the
// plain word "foo". Runs of consecutive '*' after a literal piece collapse
// into the single marker that follows that piece.
TQStringList Index::split( const TQString &str )
{
    TQStringList lst;
    int j = 0;                      // start of the current literal piece
    int i = str.find( '*', j );     // position of the next '*', or -1

    // The loop below only emits "*" after a non-empty literal piece, which
    // would lose a wildcard in first position.
    if ( i == 0 )
        lst << "*";

    while ( i != -1 ) {
        if ( i > j ) {
            lst << str.mid( j, i - j );
            lst << "*";
        }
        j = i + 1;
        i = str.find( '*', j );
    }

    // Whatever follows the last '*' (or the whole string when there is none).
    const TQString tail = str.mid( j );
    if ( tail.length() > 0 )
        lst << tail;
    return lst;
}
// Builds the union of the document lists of all given words.
// Words missing from the dictionary are ignored. The result starts as the
// document list of the last term after sorting; documents of the other
// terms are appended when not already present. Returns an empty list when
// none of the words is in the dictionary.
KviValueList<Document> Index::setupDummyTerm( const TQStringList &terms )
{
    TermList termList;
    for ( TQStringList::ConstIterator word = terms.begin(); word != terms.end(); ++word ) {
        Entry *entry = dict[ *word ];
        if ( entry )
            termList.append( new Term( *word, entry->documents.count(), entry->documents ) );
    }
    termList.sort();

    KviValueList<Document> maxList;
    if ( termList.count() == 0 )
        return maxList;

    maxList = termList.last()->documents;
    termList.removeLast();

    for ( Term *t = termList.first(); t; t = termList.next() ) {
        KviValueList<Document> docs = t->documents;
        KviValueList<Document>::iterator docIt;
        for ( docIt = docs.begin(); docIt != docs.end(); ++docIt ) {
            if ( maxList.findIndex( *docIt ) == -1 )
                maxList.append( *docIt );
        }
    }
    return maxList;
}
void Index::buildMiniDict( const TQString &str )
{
if ( miniDict[ str ] )
miniDict[ str ]->positions.append( wordNum );
++wordNum;
}
bool Index::searchForPattern( const TQStringList &patterns, const TQStringList &words, const TQString &fileName )
{
KviFile file( fileName );
if ( !file.openForReading() ) {
tqWarning( "cannot open file %s", fileName.ascii() );
return FALSE;
}
wordNum = 3;
miniDict.clear();
TQStringList::ConstIterator cIt = words.begin();
for ( ; cIt != words.end(); ++cIt )
miniDict.insert( *cIt, new PosEntry( 0 ) );
TQTextStream s( &file );
TQString text = s.read();
bool valid = TRUE;
const TQChar *buf = text.unicode();
TQChar str[64];
TQChar c = buf[0];
int j = 0;
int i = 0;
while ( (uint)j < text.length() ) {
if ( c == '<' || c == '&' ) {
valid = FALSE;
if ( i > 1 )
buildMiniDict( TQString(str,i) );
i = 0;
c = buf[++j];
continue;
}
if ( ( c == '>' || c == ';' ) && !valid ) {
valid = TRUE;
c = buf[++j];
continue;
}
if ( !valid ) {
c = buf[++j];
continue;
}
if ( ( c.isLetterOrNumber() || c == '_' ) && i < 63 ) {
str[i] = c.lower();
++i;
} else {
if ( i > 1 )
buildMiniDict( TQString(str,i) );
i = 0;
}
c = buf[++j];
}
if ( i > 1 )
buildMiniDict( TQString(str,i) );
file.close();
TQStringList::ConstIterator patIt = patterns.begin();
TQStringList wordLst;
KviValueList<uint> a, b;
KviValueList<uint>::iterator aIt;
for ( ; patIt != patterns.end(); ++patIt ) {
wordLst = TQStringList::split( ' ', *patIt );
a = miniDict[ wordLst[0] ]->positions;
for ( int j = 1; j < (int)wordLst.count(); ++j ) {
b = miniDict[ wordLst[j] ]->positions;
aIt = a.begin();
while ( aIt != a.end() ) {
if ( b.find( *aIt + 1 ) != b.end() ) {
(*aIt)++;
++aIt;
} else {
aIt = a.remove( aIt );
}
}
}
}
if ( a.count() )
return TRUE;
return FALSE;
}
#include "index.moc"