You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
919 lines
23 KiB
919 lines
23 KiB
3 years ago
|
//
|
||
|
// parser.cc
|
||
|
//
|
||
|
// parser: Parses a boolean expression tree, retrieving and scoring
|
||
|
// the resulting document list
|
||
|
//
|
||
|
// Part of the ht://Dig package <http://www.htdig.org/>
|
||
|
// Copyright (c) 1995-2004 The ht://Dig Group
|
||
|
// For copyright details, see the file COPYING in your distribution
|
||
|
// or the GNU Library General Public License (LGPL) version 2 or later
|
||
|
// <http://www.gnu.org/copyleft/lgpl.html>
|
||
|
//
|
||
|
// $Id: parser.cc,v 1.36 2004/06/11 16:50:33 grdetil Exp $
|
||
|
//
|
||
|
|
||
|
#ifdef HAVE_CONFIG_H
|
||
|
#include "htconfig.h"
|
||
|
#endif /* HAVE_CONFIG_H */
|
||
|
|
||
|
#include "parser.h"
|
||
|
#include "HtPack.h"
|
||
|
#include "Collection.h"
|
||
|
#include "Dictionary.h"
|
||
|
#include "QuotedStringList.h"
|
||
|
|
||
|
#define WORD 1000
|
||
|
#define DONE 1001
|
||
|
|
||
|
QuotedStringList boolean_syntax_errors;
|
||
|
enum ErrorIndices { EXPECTED, SEARCH_WORD, AT_END, INSTEAD_OF, END_OF_EXPR, QUOTE };
|
||
|
|
||
|
//*****************************************************************************
|
||
|
//
// Construct a parser with no token list attached.  The word list handle
// is built from the global configuration; setCollection() opens it.
//
Parser::Parser() :
    words(*(HtConfiguration::config()))
{
    tokens = 0;
    result = 0;
    current = 0;
    valid = 1;
    // These two were previously left uninitialised.  parse() dereferences
    // `collection` and setError() reads `lookahead`, so give both a
    // well-defined starting value.
    collection = 0;
    lookahead = DONE;
}
|
||
|
|
||
|
|
||
|
//*****************************************************************************
|
||
|
// int Parser::checkSyntax(List *tokenList)
|
||
|
// As the name of the function implies, we will only perform a syntax check
|
||
|
// on the list of tokens.
|
||
|
//
|
||
|
int
|
||
|
Parser::checkSyntax(List *tokenList)
|
||
|
{
|
||
|
HtConfiguration* config= HtConfiguration::config();
|
||
|
void reportError(char *);
|
||
|
// Load boolean_syntax_errors from configuration
|
||
|
// they should be placed in this order:
|
||
|
// 0 1 2 3 4
|
||
|
// Expected "a search word" "at the end" "instead of" "end of expression"
|
||
|
// 5
|
||
|
// "a closing quote"
|
||
|
boolean_syntax_errors.Destroy();
|
||
|
boolean_syntax_errors.Create(config->Find("boolean_syntax_errors"), "| \t\r\n\001");
|
||
|
if (boolean_syntax_errors.Count() == 5)
|
||
|
{ // for backward compatibility
|
||
|
boolean_syntax_errors.Add (new String ("a closing quote"));
|
||
|
if (debug)
|
||
|
cerr << "Parser::checkSyntax() : boolean_syntax_errors should have six entries\n";
|
||
|
} else if (boolean_syntax_errors.Count() != 6)
|
||
|
reportError("boolean_syntax_errors attribute should have six entries");
|
||
|
tokens = tokenList;
|
||
|
valid = 1;
|
||
|
fullexpr(0);
|
||
|
return valid;
|
||
|
}
|
||
|
|
||
|
//*****************************************************************************
|
||
|
/* Called by: Parser::parse(List*, ResultList&), checkSyntax(List*) */
|
||
|
/* Inputs: output -- if zero, simply check syntax */
|
||
|
/* otherwise, list matching documents in head of "stack" */
|
||
|
void
|
||
|
Parser::fullexpr(int output)
|
||
|
{
|
||
|
tokens->Start_Get();
|
||
|
lookahead = lexan();
|
||
|
expr(output);
|
||
|
if (valid && lookahead != DONE)
|
||
|
{
|
||
|
setError(boolean_syntax_errors[END_OF_EXPR]);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
//*****************************************************************************
|
||
|
int
|
||
|
Parser::lexan()
|
||
|
{
|
||
|
current = (WeightWord *) tokens->Get_Next();
|
||
|
if (!current)
|
||
|
return DONE;
|
||
|
else if (mystrcasecmp((char*)current->word, "&") == 0)
|
||
|
return '&';
|
||
|
else if (mystrcasecmp((char*)current->word, "|") == 0)
|
||
|
return '|';
|
||
|
else if (mystrcasecmp((char*)current->word, "!") == 0)
|
||
|
return '!';
|
||
|
else if (mystrcasecmp((char*)current->word, "(") == 0)
|
||
|
return '(';
|
||
|
else if (mystrcasecmp((char*)current->word, ")") == 0)
|
||
|
return ')';
|
||
|
else if (mystrcasecmp((char*)current->word, "\"") == 0)
|
||
|
return '"';
|
||
|
else
|
||
|
return WORD;
|
||
|
}
|
||
|
|
||
|
//*****************************************************************************
|
||
|
// Attempt to deal with expressions in the form
|
||
|
// term | term | term ...
|
||
|
/* Called by: Parser::fullexpr(int), factor(int) */
|
||
|
/* Inputs: output -- if zero, simply check syntax */
|
||
|
void
|
||
|
Parser::expr(int output)
|
||
|
{
|
||
|
term(output);
|
||
|
while (1)
|
||
|
{
|
||
|
if (match('|'))
|
||
|
{
|
||
|
term(output);
|
||
|
if (output)
|
||
|
{
|
||
|
if(debug) cerr << "or--" << endl;
|
||
|
perform_or();
|
||
|
if(debug) cerr << "stack:" << stack.Size() << endl;
|
||
|
}
|
||
|
}
|
||
|
else
|
||
|
break;
|
||
|
}
|
||
|
if (valid && lookahead == WORD)
|
||
|
{
|
||
|
String expected = "'";
|
||
|
expected << boolean_keywords[AND] << "' "<< boolean_keywords[OR] <<" '"
|
||
|
<< boolean_keywords[OR] << "'";
|
||
|
setError(expected.get());
|
||
|
}
|
||
|
}
|
||
|
|
||
|
//*****************************************************************************
|
||
|
// Attempt to deal with terms in the form
|
||
|
// factor & factor & factor ...
|
||
|
/* Called by: Parser::expr(int) */
|
||
|
/* Inputs: output -- if zero, simply check syntax */
|
||
|
void
|
||
|
Parser::term(int output)
|
||
|
{
|
||
|
|
||
|
factor(output);
|
||
|
if(debug) cerr << "term:factor" << endl;
|
||
|
while (1)
|
||
|
{
|
||
|
if(match('&'))
|
||
|
{
|
||
|
factor(output);
|
||
|
if(output)
|
||
|
{
|
||
|
if(debug) cerr << "and--" << endl;
|
||
|
perform_and();
|
||
|
if(debug) cerr << "stack:" << stack.Size() << endl;
|
||
|
}
|
||
|
}
|
||
|
else if(match('!'))
|
||
|
{
|
||
|
factor(output);
|
||
|
if(output)
|
||
|
{
|
||
|
if(debug) cerr << "not--" << endl;
|
||
|
perform_not();
|
||
|
if(debug) cerr << "stack:" << stack.Size() << endl;
|
||
|
}
|
||
|
}
|
||
|
else
|
||
|
{
|
||
|
break;
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
//*****************************************************************************
|
||
|
/* Gather and score a (possibly bracketed) boolean expression */
|
||
|
/* Called by: Parser::term(int) */
|
||
|
/* Inputs: output -- if zero, simply check syntax */
|
||
|
void
|
||
|
Parser::factor(int output)
|
||
|
{
|
||
|
if(match('"'))
|
||
|
{
|
||
|
phrase(output);
|
||
|
}
|
||
|
else if (match('('))
|
||
|
{
|
||
|
expr(output);
|
||
|
if (match(')'))
|
||
|
{
|
||
|
return;
|
||
|
}
|
||
|
else
|
||
|
{
|
||
|
setError("')'");
|
||
|
}
|
||
|
}
|
||
|
else if (lookahead == WORD)
|
||
|
{
|
||
|
if (output)
|
||
|
{
|
||
|
perform_push();
|
||
|
}
|
||
|
lookahead = lexan();
|
||
|
}
|
||
|
else
|
||
|
{
|
||
|
setError(boolean_syntax_errors[SEARCH_WORD]);
|
||
|
// setError("a search word, a quoted phrase, a boolean expression between ()");
|
||
|
}
|
||
|
}
|
||
|
|
||
|
//*****************************************************************************
|
||
|
/* Gather and score a quoted phrase */
|
||
|
/* Called by: Parser::factor(int) */
|
||
|
/* Inputs: output -- if zero, simply check syntax */
|
||
|
void
|
||
|
Parser::phrase(int output)
|
||
|
{
|
||
|
List *wordList = 0;
|
||
|
double weight = 1.0;
|
||
|
|
||
|
while (1)
|
||
|
{
|
||
|
if (match('"'))
|
||
|
{
|
||
|
if (output)
|
||
|
{
|
||
|
if(!wordList) wordList = new List;
|
||
|
if(debug) cerr << "scoring phrase" << endl;
|
||
|
score(wordList, weight, FLAGS_MATCH_ONE); // look in all fields
|
||
|
}
|
||
|
break;
|
||
|
}
|
||
|
else if (lookahead == WORD)
|
||
|
{
|
||
|
weight *= current->weight;
|
||
|
if (output)
|
||
|
perform_phrase(wordList);
|
||
|
|
||
|
lookahead = lexan();
|
||
|
}
|
||
|
else if (lookahead == DONE)
|
||
|
{
|
||
|
setError(boolean_syntax_errors[QUOTE]);
|
||
|
break;
|
||
|
}
|
||
|
else
|
||
|
{
|
||
|
// skip '&' '|' and '!' in the phrase
|
||
|
current->isIgnore = 1;
|
||
|
if (output)
|
||
|
perform_phrase(wordList);
|
||
|
lookahead = lexan ();
|
||
|
}
|
||
|
} // end while
|
||
|
if(wordList) delete wordList;
|
||
|
}
|
||
|
|
||
|
//*****************************************************************************
|
||
|
int
|
||
|
Parser::match(int t)
|
||
|
{
|
||
|
if (lookahead == t)
|
||
|
{
|
||
|
lookahead = lexan();
|
||
|
return 1;
|
||
|
}
|
||
|
else
|
||
|
return 0;
|
||
|
}
|
||
|
|
||
|
//*****************************************************************************
|
||
|
void
|
||
|
Parser::setError(char *expected)
|
||
|
{
|
||
|
if (valid)
|
||
|
{
|
||
|
valid = 0;
|
||
|
error = 0;
|
||
|
error << boolean_syntax_errors[EXPECTED] << ' ' << expected;
|
||
|
if (lookahead == DONE || !current)
|
||
|
{
|
||
|
error << ' ' << boolean_syntax_errors[AT_END];
|
||
|
}
|
||
|
else
|
||
|
{
|
||
|
error << ' ' << boolean_syntax_errors[INSTEAD_OF] << " '"
|
||
|
<< current->word.get() << "'";
|
||
|
switch (lookahead)
|
||
|
{
|
||
|
case '&': error << ' ' << boolean_keywords[OR] << " '"
|
||
|
<< boolean_keywords[AND] << "'";
|
||
|
break;
|
||
|
case '|': error << ' ' << boolean_keywords[OR] << " '"
|
||
|
<< boolean_keywords[OR] << "'";
|
||
|
break;
|
||
|
case '!': error << ' ' << boolean_keywords[OR] << " '"
|
||
|
<< boolean_keywords[NOT] << "'";
|
||
|
break;
|
||
|
}
|
||
|
}
|
||
|
if (debug) cerr << "Syntax error: " << error << endl;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
//*****************************************************************************
|
||
|
// Perform a lookup of the current word and push the result onto the stack
|
||
|
//
|
||
|
void
|
||
|
Parser::perform_push()
|
||
|
{
|
||
|
HtConfiguration* config= HtConfiguration::config();
|
||
|
static int maximum_word_length = config->Value("maximum_word_length", 12);
|
||
|
String temp = current->word.get();
|
||
|
char *p;
|
||
|
|
||
|
if(debug)
|
||
|
cerr << "perform_push @"<< stack.Size() << ": " << temp << endl;
|
||
|
|
||
|
String wildcard = config->Find("prefix_match_character");
|
||
|
if (!wildcard.get())
|
||
|
wildcard = "*";
|
||
|
if (temp == wildcard)
|
||
|
{
|
||
|
if (debug) cerr << "Wild card search\n";
|
||
|
ResultList *list = new ResultList;
|
||
|
String doc_db = config->Find("doc_db");
|
||
|
DocumentDB docdb;
|
||
|
docdb.Read(doc_db);
|
||
|
List *docs = docdb.DocIDs();
|
||
|
|
||
|
//
|
||
|
// Traverse all the known documents
|
||
|
//
|
||
|
DocumentRef *ref;
|
||
|
IntObject *id;
|
||
|
DocMatch *dm;
|
||
|
docs->Start_Get();
|
||
|
while ((id = (IntObject *) docs->Get_Next()))
|
||
|
{
|
||
|
ref = docdb[id->Value()];
|
||
|
if (debug)
|
||
|
cerr << (ref ? "Wildcard match" : "Wildcard empty") << endl;
|
||
|
if (ref)
|
||
|
{
|
||
|
dm = new DocMatch;
|
||
|
dm->score = current->weight;
|
||
|
dm->id = ref->DocID();
|
||
|
dm->orMatches = 1;
|
||
|
dm->anchor = 0;
|
||
|
list->add(dm);
|
||
|
}
|
||
|
delete ref;
|
||
|
}
|
||
|
delete docs;
|
||
|
stack.push(list);
|
||
|
|
||
|
return;
|
||
|
}
|
||
|
|
||
|
// Must be after wildcard: "*" is "isIgnore" because it is too short.
|
||
|
if (current->isIgnore)
|
||
|
{
|
||
|
if(debug) cerr << "ignore: " << temp << " @" << stack.Size() << endl;
|
||
|
//
|
||
|
// This word needs to be ignored. Make it so.
|
||
|
//
|
||
|
ResultList *list = new ResultList;
|
||
|
list->isIgnore = 1;
|
||
|
stack.push(list);
|
||
|
return;
|
||
|
}
|
||
|
|
||
|
temp.lowercase();
|
||
|
p = temp.get();
|
||
|
if (temp.length() > maximum_word_length)
|
||
|
p[maximum_word_length] = '\0';
|
||
|
|
||
|
List* result = words[p];
|
||
|
score(result, current->weight, current->flags);
|
||
|
delete result;
|
||
|
}
|
||
|
|
||
|
//*****************************************************************************
|
||
|
// BUG: Phrases containing "bad words" can have *any* "bad word" in that
|
||
|
// position. Words less than minimum_word_length ignored entirely,
|
||
|
// as they are not indexed.
|
||
|
void
|
||
|
Parser::perform_phrase(List * &oldWords)
|
||
|
{
|
||
|
HtConfiguration* config= HtConfiguration::config();
|
||
|
static int maximum_word_length = config->Value("maximum_word_length", 12);
|
||
|
String temp = current->word.get();
|
||
|
char *p;
|
||
|
List *newWords = 0;
|
||
|
HtWordReference *oldWord, *newWord;
|
||
|
|
||
|
// how many words ignored since last checked word?
|
||
|
static int ignoredWords = 0;
|
||
|
|
||
|
// if the query is empty, no further effort is needed
|
||
|
if(oldWords && oldWords->Count() == 0)
|
||
|
{
|
||
|
if(debug) cerr << "phrase not found, skip" << endl;
|
||
|
return;
|
||
|
}
|
||
|
|
||
|
if(debug) cerr << "phrase current: " << temp << endl;
|
||
|
if (current->isIgnore)
|
||
|
{
|
||
|
//
|
||
|
// This word needs to be ignored. Make it so.
|
||
|
//
|
||
|
if (temp.length() >= config->Value ("minimum_word_length") && oldWords)
|
||
|
ignoredWords++;
|
||
|
if(debug) cerr << "ignoring: " << temp << endl;
|
||
|
return;
|
||
|
}
|
||
|
|
||
|
temp.lowercase();
|
||
|
p = temp.get();
|
||
|
if (temp.length() > maximum_word_length)
|
||
|
p[maximum_word_length] = '\0';
|
||
|
|
||
|
newWords = words[p];
|
||
|
if(debug) cerr << "new words count: " << newWords->Count() << endl;
|
||
|
|
||
|
// If we don't have a prior list of words, we want this one...
|
||
|
if (!oldWords)
|
||
|
{
|
||
|
oldWords = new List;
|
||
|
if(debug) cerr << "phrase adding first: " << temp << endl;
|
||
|
newWords->Start_Get();
|
||
|
while ((newWord = (HtWordReference *) newWords->Get_Next()))
|
||
|
{
|
||
|
oldWords->Add(newWord);
|
||
|
}
|
||
|
if(debug) cerr << "old words count: " << oldWords->Count() << endl;
|
||
|
return;
|
||
|
}
|
||
|
|
||
|
// OK, now we have a previous list in wordList and a new list
|
||
|
List *results = new List;
|
||
|
|
||
|
Dictionary newDict(5000);
|
||
|
|
||
|
String nid;
|
||
|
newWords->Start_Get();
|
||
|
while ((newWord = (HtWordReference *) newWords->Get_Next()))
|
||
|
{
|
||
|
nid = "";
|
||
|
int did = newWord->DocID();
|
||
|
nid << did;
|
||
|
nid << "-";
|
||
|
int loc = newWord->Location();
|
||
|
nid << loc;
|
||
|
if (! newDict.Exists(nid)) {
|
||
|
newDict.Add(nid, (Object *)newWord);
|
||
|
} else {
|
||
|
// cerr << "perform_phrase: NewWords Duplicate: " << nid << "\n";
|
||
|
// Double addition is a problem if you don't want your original objects deleted
|
||
|
}
|
||
|
}
|
||
|
|
||
|
String oid;
|
||
|
oldWords->Start_Get();
|
||
|
while ((oldWord = (HtWordReference *) oldWords->Get_Next()))
|
||
|
{
|
||
|
oid = "";
|
||
|
int did = oldWord->DocID();
|
||
|
oid << did;
|
||
|
oid << "-";
|
||
|
int loc = oldWord->Location();
|
||
|
oid << loc + ignoredWords+1;
|
||
|
if (newDict.Exists(oid))
|
||
|
{
|
||
|
newWord = (HtWordReference *)newDict.Find(oid);
|
||
|
|
||
|
HtWordReference *result = new HtWordReference(*oldWord);
|
||
|
|
||
|
result->Flags(oldWord->Flags() & newWord->Flags());
|
||
|
result->Location(newWord->Location());
|
||
|
|
||
|
results->Add(result);
|
||
|
}
|
||
|
}
|
||
|
ignoredWords = 0; // most recent word is not a non-ignored word
|
||
|
|
||
|
newDict.Release();
|
||
|
|
||
|
if(debug) cerr << "old words count: " << oldWords->Count() << endl;
|
||
|
if(debug) cerr << "results count: " << results->Count() << endl;
|
||
|
oldWords->Destroy();
|
||
|
results->Start_Get();
|
||
|
while ((newWord = (HtWordReference *) results->Get_Next()))
|
||
|
{
|
||
|
oldWords->Add(newWord);
|
||
|
}
|
||
|
if(debug) cerr << "old words count: " << oldWords->Count() << endl;
|
||
|
results->Release();
|
||
|
delete results;
|
||
|
|
||
|
newWords->Destroy();
|
||
|
delete newWords;
|
||
|
|
||
|
}
|
||
|
|
||
|
//*****************************************************************************
|
||
|
// Allocate scores based on words in wordList.
|
||
|
// Fields within which the word must appear are specified in flags
|
||
|
// (see HtWordReference.h).
|
||
|
void
|
||
|
Parser::score(List *wordList, double weight, unsigned int flags)
|
||
|
{
|
||
|
HtConfiguration* config= HtConfiguration::config();
|
||
|
DocMatch *dm;
|
||
|
HtWordReference *wr;
|
||
|
static double text_factor = config->Double("text_factor", 1);
|
||
|
static double caps_factor = config->Double("caps_factor", 1);
|
||
|
static double title_factor = config->Double("title_factor", 1);
|
||
|
static double heading_factor = config->Double("heading_factor", 1);
|
||
|
static double keywords_factor = config->Double("keywords_factor", 1);
|
||
|
static double meta_description_factor = config->Double("meta_description_factor", 1);
|
||
|
static double author_factor = config->Double("author_factor", 1);
|
||
|
static double description_factor = config->Double("description_factor", 1);
|
||
|
double wscore;
|
||
|
int docanchor;
|
||
|
int word_count;
|
||
|
|
||
|
if (!wordList || wordList->Count() == 0)
|
||
|
{
|
||
|
// We can't score an empty list, so push a null pointer...
|
||
|
if(debug) cerr << "score: empty list, push 0 @" << stack.Size() << endl;
|
||
|
|
||
|
stack.push(0);
|
||
|
return;
|
||
|
}
|
||
|
|
||
|
ResultList *list = new ResultList;
|
||
|
if(debug) cerr << "score: push @" << stack.Size() << endl;
|
||
|
stack.push(list);
|
||
|
// We're now guaranteed to have a non-empty list
|
||
|
// We'll use the number of occurences of this word for scoring
|
||
|
word_count = wordList->Count();
|
||
|
|
||
|
wordList->Start_Get();
|
||
|
while ((wr = (HtWordReference *) wordList->Get_Next()))
|
||
|
{
|
||
|
//
|
||
|
// ******* Compute the score for the document
|
||
|
//
|
||
|
|
||
|
// If word not in one of the required fields, skip the entry.
|
||
|
// Plain text sets no flag in dbase, so treat it separately.
|
||
|
if (!(wr->Flags() & flags) && (wr->Flags() || !(flags & FLAG_PLAIN)))
|
||
|
{
|
||
|
if (debug > 2)
|
||
|
cerr << "Flags " << wr->Flags() << " lack " << flags << endl;
|
||
|
continue;
|
||
|
}
|
||
|
|
||
|
wscore = 0.0;
|
||
|
if (wr->Flags() == FLAG_TEXT) wscore += text_factor;
|
||
|
if (wr->Flags() & FLAG_CAPITAL) wscore += caps_factor;
|
||
|
if (wr->Flags() & FLAG_TITLE) wscore += title_factor;
|
||
|
if (wr->Flags() & FLAG_HEADING) wscore += heading_factor;
|
||
|
if (wr->Flags() & FLAG_KEYWORDS) wscore += keywords_factor;
|
||
|
if (wr->Flags() & FLAG_DESCRIPTION) wscore += meta_description_factor;
|
||
|
if (wr->Flags() & FLAG_AUTHOR) wscore += author_factor;
|
||
|
if (wr->Flags() & FLAG_LINK_TEXT) wscore += description_factor;
|
||
|
wscore *= weight;
|
||
|
wscore = wscore / (double)word_count;
|
||
|
docanchor = wr->Anchor();
|
||
|
dm = list->find(wr->DocID());
|
||
|
if (dm)
|
||
|
{
|
||
|
wscore += dm->score;
|
||
|
if (dm->anchor < docanchor)
|
||
|
docanchor = dm->anchor;
|
||
|
// We wish to *update* this, not add a duplicate
|
||
|
list->remove(wr->DocID());
|
||
|
}
|
||
|
|
||
|
dm = new DocMatch;
|
||
|
dm->id = wr->DocID();
|
||
|
dm->score = wscore;
|
||
|
dm->orMatches = 1; // how many "OR" terms this doc has
|
||
|
dm->anchor = docanchor;
|
||
|
list->add(dm);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
|
||
|
//*****************************************************************************
|
||
|
// The top two entries in the stack need to be ANDed together.
|
||
|
//
|
||
|
// a b a and b
|
||
|
// 0 0 0
|
||
|
// 0 1 0
|
||
|
// 0 x 0
|
||
|
// 1 0 0
|
||
|
// 1 1 intersect(a,b)
|
||
|
// 1 x a
|
||
|
// x 0 0
|
||
|
// x 1 b
|
||
|
// x x x
|
||
|
//
|
||
|
void
|
||
|
Parser::perform_and()
|
||
|
{
|
||
|
ResultList *l1 = (ResultList *) stack.pop();
|
||
|
ResultList *l2 = (ResultList *) stack.pop();
|
||
|
int i;
|
||
|
DocMatch *dm, *dm2, *dm3;
|
||
|
HtVector *elements;
|
||
|
|
||
|
if(!(l2 && l1))
|
||
|
{
|
||
|
if(debug) cerr << "and: at least one empty operator, pushing 0 @" << stack.Size() << endl;
|
||
|
stack.push(0);
|
||
|
if(l1) delete l1;
|
||
|
if(l2) delete l2;
|
||
|
return;
|
||
|
}
|
||
|
|
||
|
//
|
||
|
// If either of the arguments is set to be ignored, we will use the
|
||
|
// other as the result.
|
||
|
// remember l2 and l1, l2 not l1
|
||
|
|
||
|
if (l1->isIgnore && l2->isIgnore)
|
||
|
{
|
||
|
if(debug) cerr << "and: ignoring all, pushing ignored list @" << stack.Size() << endl;
|
||
|
ResultList *result = new ResultList;
|
||
|
result->isIgnore = 1;
|
||
|
delete l1; delete l2;
|
||
|
stack.push(result);
|
||
|
return;
|
||
|
}
|
||
|
else if (l1->isIgnore)
|
||
|
{
|
||
|
if(debug) cerr << "and: ignoring l1, pushing l2 @" << stack.Size() << endl;
|
||
|
stack.push(l2);
|
||
|
delete l1;
|
||
|
return;
|
||
|
}
|
||
|
else if (l2->isIgnore)
|
||
|
{
|
||
|
if(debug) cerr << "and: ignoring l2, pushing l2 @" << stack.Size() << endl;
|
||
|
stack.push(l1);
|
||
|
delete l2;
|
||
|
return;
|
||
|
}
|
||
|
|
||
|
ResultList *result = new ResultList;
|
||
|
stack.push(result);
|
||
|
elements = l2->elements();
|
||
|
|
||
|
if(debug)
|
||
|
cerr << "perform and: " << elements->Count() << " " << l1->elements()->Count() << " ";
|
||
|
|
||
|
for (i = 0; i < elements->Count(); i++)
|
||
|
{
|
||
|
dm = (DocMatch *) (*elements)[i];
|
||
|
dm2 = l1->find(dm->id);
|
||
|
if (dm2)
|
||
|
{
|
||
|
//
|
||
|
// Duplicate document. Add scores and average "OR-matches" count
|
||
|
//
|
||
|
dm3 = new DocMatch;
|
||
|
// "if (dm2)" means "?:" operator not needed...
|
||
|
// dm3->score = dm->score + (dm2 ? dm2->score : 0);
|
||
|
// dm3->orMatches = (dm->orMatches + (dm2 ? dm2->orMatches : 0))/2;
|
||
|
dm3->score = dm->score + dm2->score;
|
||
|
dm3->orMatches = (dm->orMatches + dm2->orMatches)/2;
|
||
|
dm3->id = dm->id;
|
||
|
dm3->anchor = dm->anchor;
|
||
|
// if (dm2 && dm2->anchor < dm3->anchor)
|
||
|
if (dm2->anchor < dm3->anchor)
|
||
|
dm3->anchor = dm2->anchor;
|
||
|
result->add(dm3);
|
||
|
}
|
||
|
}
|
||
|
if(debug)
|
||
|
cerr << result->elements()->Count() << endl;
|
||
|
|
||
|
elements->Release();
|
||
|
delete elements;
|
||
|
delete l1;
|
||
|
delete l2;
|
||
|
}
|
||
|
|
||
|
// a b a not b
|
||
|
// 0 0 0
|
||
|
// 0 1 0
|
||
|
// 0 x 0
|
||
|
// 1 0 a
|
||
|
// 1 1 intersect(a,not b)
|
||
|
// 1 x a
|
||
|
// x 0 x
|
||
|
// x 1 x
|
||
|
// x x x
|
||
|
void
|
||
|
Parser::perform_not()
|
||
|
{
|
||
|
ResultList *l1 = (ResultList *) stack.pop();
|
||
|
ResultList *l2 = (ResultList *) stack.pop();
|
||
|
int i;
|
||
|
DocMatch *dm, *dm2, *dm3;
|
||
|
HtVector *elements;
|
||
|
|
||
|
|
||
|
if(!l2)
|
||
|
{
|
||
|
if(debug) cerr << "not: no positive term, pushing 0 @" << stack.Size() << endl;
|
||
|
// Should probably be interpreted as "* not l1"
|
||
|
stack.push(0);
|
||
|
if(l1) delete l1;
|
||
|
return;
|
||
|
}
|
||
|
if(!l1 || l1->isIgnore || l2->isIgnore)
|
||
|
{
|
||
|
if(debug) cerr << "not: no negative term, pushing positive @" << stack.Size() << endl;
|
||
|
stack.push(l2);
|
||
|
if(l1) delete l1;
|
||
|
return;
|
||
|
}
|
||
|
|
||
|
ResultList *result = new ResultList;
|
||
|
if(debug) cerr << "not: pushing result @" << stack.Size() << endl;
|
||
|
stack.push(result);
|
||
|
elements = l2->elements();
|
||
|
|
||
|
if(debug)
|
||
|
cerr << "perform not: " << elements->Count() << " " << l1->elements()->Count() << " ";
|
||
|
|
||
|
for (i = 0; i < elements->Count(); i++)
|
||
|
{
|
||
|
dm = (DocMatch *) (*elements)[i];
|
||
|
dm2 = l1->find(dm->id);
|
||
|
if (!dm2)
|
||
|
{
|
||
|
//
|
||
|
// Duplicate document.
|
||
|
//
|
||
|
dm3 = new DocMatch;
|
||
|
dm3->score = dm->score;
|
||
|
dm3->orMatches = dm->orMatches;
|
||
|
dm3->id = dm->id;
|
||
|
dm3->anchor = dm->anchor;
|
||
|
result->add(dm3);
|
||
|
}
|
||
|
}
|
||
|
if(debug)
|
||
|
cerr << result->elements()->Count() << endl;
|
||
|
|
||
|
elements->Release();
|
||
|
delete elements;
|
||
|
delete l1;
|
||
|
delete l2;
|
||
|
}
|
||
|
|
||
|
//*****************************************************************************
|
||
|
// The top two entries in the stack need to be ORed together.
|
||
|
//
|
||
|
void
|
||
|
Parser::perform_or()
|
||
|
{
|
||
|
ResultList *l1 = (ResultList *) stack.pop();
|
||
|
ResultList *result = (ResultList *) stack.peek();
|
||
|
int i;
|
||
|
DocMatch *dm, *dm2;
|
||
|
HtVector *elements;
|
||
|
|
||
|
//
|
||
|
// If either of the arguments is not present, we will use the other as
|
||
|
// the results.
|
||
|
//
|
||
|
if (!l1 && result)
|
||
|
{
|
||
|
if(debug) cerr << "or: no 2nd operand" << endl;
|
||
|
return; // result in top of stack
|
||
|
}
|
||
|
else if (l1 && !result)
|
||
|
{
|
||
|
if(debug) cerr << "or: no 1st operand" << endl;
|
||
|
stack.pop();
|
||
|
stack.push(l1);
|
||
|
return;
|
||
|
}
|
||
|
else if (!l1 && !result)
|
||
|
{
|
||
|
if(debug) cerr << "or: no operands" << endl;
|
||
|
stack.pop();
|
||
|
stack.push(0); // empty result
|
||
|
return;
|
||
|
}
|
||
|
|
||
|
//
|
||
|
// If either of the arguments is set to be ignored, we will use the
|
||
|
// other as the result.
|
||
|
//
|
||
|
if (l1->isIgnore)
|
||
|
{
|
||
|
delete l1;
|
||
|
return;
|
||
|
}
|
||
|
else if (result->isIgnore)
|
||
|
{
|
||
|
result = (ResultList *) stack.pop();
|
||
|
stack.push(l1);
|
||
|
delete result;
|
||
|
return;
|
||
|
}
|
||
|
|
||
|
elements = l1->elements();
|
||
|
if(debug)
|
||
|
cerr << "perform or: " << elements->Count() << " " << result->elements()->Count() << " ";
|
||
|
for (i = 0; i < elements->Count(); i++)
|
||
|
{
|
||
|
dm = (DocMatch *) (*elements)[i];
|
||
|
dm2 = result->find(dm->id);
|
||
|
if (dm2)
|
||
|
{
|
||
|
//
|
||
|
// Update document. Add scores and add "OR-match" counts
|
||
|
//
|
||
|
dm2->score += dm->score;
|
||
|
dm2->orMatches += dm->orMatches;
|
||
|
if (dm->anchor < dm2->anchor)
|
||
|
dm2->anchor = dm->anchor;
|
||
|
}
|
||
|
else
|
||
|
{
|
||
|
dm2 = new DocMatch;
|
||
|
dm2->score = dm->score;
|
||
|
dm2->orMatches = dm->orMatches;
|
||
|
dm2->id = dm->id;
|
||
|
dm2->anchor = dm->anchor;
|
||
|
result->add(dm2);
|
||
|
}
|
||
|
}
|
||
|
if(debug)
|
||
|
cerr << result->elements()->Count() << endl;
|
||
|
elements->Release();
|
||
|
delete elements;
|
||
|
delete l1;
|
||
|
}
|
||
|
|
||
|
//*****************************************************************************
|
||
|
// void Parser::parse(List *tokenList, ResultList &resultMatches)
|
||
|
//
|
||
|
void
|
||
|
Parser::parse(List *tokenList, ResultList &resultMatches)
|
||
|
{
|
||
|
HtConfiguration* config= HtConfiguration::config();
|
||
|
tokens = tokenList;
|
||
|
DocumentRef *ref = NULL;
|
||
|
|
||
|
fullexpr(1);
|
||
|
|
||
|
ResultList *result = (ResultList *) stack.pop();
|
||
|
if (!result) // Ouch!
|
||
|
{
|
||
|
// It seems we now end up here on a syntax error, so don't clear anything!
|
||
|
// valid = 0;
|
||
|
// error = 0;
|
||
|
// error << "Expected to have something to parse!";
|
||
|
return;
|
||
|
}
|
||
|
HtVector *elements = result->elements();
|
||
|
DocMatch *dm;
|
||
|
|
||
|
// multimatch_factor gives extra weight to matching documents which
|
||
|
// contain more than one "OR" term. This is applied after the whole
|
||
|
// document is parsed, so multiple matches don't give exponentially
|
||
|
// increasing weights
|
||
|
double multimatch_factor = config->Double("multimatch_factor");
|
||
|
|
||
|
for (int i = 0; i < elements->Count(); i++)
|
||
|
{
|
||
|
dm = (DocMatch *) (*elements)[i];
|
||
|
ref = collection->getDocumentRef(dm->GetId());
|
||
|
if(ref && ref->DocState() == Reference_normal)
|
||
|
{
|
||
|
dm->collection = collection; // back reference
|
||
|
if (dm->orMatches > 1)
|
||
|
dm->score *= 1+multimatch_factor;
|
||
|
resultMatches.add(dm);
|
||
|
}
|
||
|
}
|
||
|
elements->Release();
|
||
|
result->Release();
|
||
|
delete elements;
|
||
|
delete result;
|
||
|
}
|
||
|
|
||
|
void
|
||
|
Parser::setCollection(Collection *coll)
|
||
|
{
|
||
|
if (coll)
|
||
|
words.Open(coll->getWordFile(), O_RDONLY);
|
||
|
collection = coll;
|
||
|
}
|
||
|
|