You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
216 lines
5.4 KiB
216 lines
5.4 KiB
//
|
|
// HtURLSeedScore.cc
|
|
//
|
|
// URLSeedScore:
|
|
// Holds a list of configured adjustments to be applied on a given
|
|
// score and given URL.
|
|
//
|
|
// Part of the ht://Dig package <http://www.htdig.org/>
|
|
// Copyright (c) 2000-2004 The ht://Dig Group
|
|
// For copyright details, see the file COPYING in your distribution
|
|
// or the GNU Library General Public License (LGPL) version 2 or later
|
|
// <http://www.gnu.org/copyleft/lgpl.html>
|
|
//
|
|
// $Id: HtURLSeedScore.cc,v 1.6 2004/05/28 13:15:24 lha Exp $
|
|
|
|
#ifdef HAVE_CONFIG_H
|
|
#include "htconfig.h"
|
|
#endif /* HAVE_CONFIG_H */
|
|
|
|
#include "StringList.h"
|
|
#include "HtRegex.h"
|
|
#include "HtURLSeedScore.h"
|
|
|
|
#include <stdio.h>
|
|
#include <ctype.h>
|
|
|
|
// This class is only used in private members of URLSeedScore.
|
|
// The OO-right thing would be to nest this inside the private
|
|
// declaration of HtURLSeedScore, but that would cause portability
|
|
// problems according to
|
|
// <URL:http://www.mozilla.org/hacking/portable-cpp.html#inner_classes>.
|
|
|
|
class ScoreAdjustItem : public Object
|
|
{
|
|
public:
|
|
// Construct from a string applicable to StringMatch, and a string to
|
|
// parse for a formula.
|
|
ScoreAdjustItem(String &, String &);
|
|
|
|
~ScoreAdjustItem();
|
|
|
|
// Does this item match?
|
|
inline bool Match(const String &s) { return match.match(s, 1, 0) != 0; }
|
|
|
|
// Return the argument adjusted according to this item.
|
|
double adjust_score(double orig)
|
|
{ return orig*my_mul_factor + my_add_constant; }
|
|
|
|
// Error in parsing? Message given here if non-empty string.
|
|
String& ErrMsg() { return myErrMsg; }
|
|
|
|
private:
|
|
double my_add_constant;
|
|
double my_mul_factor;
|
|
HtRegex match;
|
|
|
|
static String myErrMsg;
|
|
|
|
// These member functions are not supposed to be implemented, but
|
|
// mentioned here as private so the compiler will not generate them if
|
|
// someone puts in buggy code that would use them.
|
|
ScoreAdjustItem();
|
|
ScoreAdjustItem(const ScoreAdjustItem &);
|
|
void operator= (const ScoreAdjustItem &);
|
|
};
|
|
|
|
// Definition of myErrMsg.
|
|
// myErrMsg is declared static in the class, so this one String is shared
// by every ScoreAdjustItem.  It starts out empty; users of ErrMsg() treat
// an empty string as "no parse error".
String ScoreAdjustItem::myErrMsg("");
|
|
|
|
// Construct an adjustment item.
//
//   url_regex  '|'-separated list of URL patterns; handed to the HtRegex
//              member through setEscaped().
//   formula    adjustment in the form  ([*]N[ ]*)?[+]?M  where N is the
//              multiplicative factor and M the additive constant.  At
//              least one of the two parts must be present.
//
// On a parse error a message is stored in myErrMsg (readable through
// ErrMsg()) and the item is left as the identity adjustment (factor 1,
// constant 0) instead of holding indeterminate values.
ScoreAdjustItem::ScoreAdjustItem(String &url_regex, String &formula)
    : my_add_constant(0), my_mul_factor(1)
{
    double mul_factor = 1;
    double add_constant = 0;
    bool factor_found = false;
    bool constant_found = false;
    int chars_so_far;

    // myErrMsg is shared by all items (it is static); clear it so that a
    // failure recorded for an earlier item is not mistaken for a failure
    // of this one.
    myErrMsg = "";

    StringList l(url_regex.get(), '|');
    match.setEscaped(l);

    // FIXME: Missing method to check if the regex was in error.
    //  myErrMsg = form("%s is not a valid regex", url_regex.get());

    char *s = formula.get();

    // Parse the ([*]N[ ]*)?[+]?M format.
    if (s[0] == '*')
    {
        // Skip past the '*'.
        s++;

        // There is a mul_factor.  Let's parse it.
        chars_so_far = 0;
        sscanf(s, "%lf%n", &mul_factor, &chars_so_far);

        // If '%lf' failed to match, then it will show up as either no
        // assignment to chars_so_far, or as writing 0 there.
        if (chars_so_far == 0)
        {
            // Report the complete formula, like the other messages do,
            // rather than the remainder after the '*'.
            myErrMsg = form("%s is not a valid adjustment formula",
                            formula.get());
            return;
        }

        // Skip past the number.
        s += chars_so_far;

        // Skip any whitespaces.  The cast is required: passing a negative
        // plain char to isspace() is undefined behaviour.
        while (isspace(static_cast<unsigned char>(*s)))
            s++;

        // Eat any plus-sign; it's redundant if alone, and may come before
        // a minus.
        if (*s == '+')
            s++;

        factor_found = true;
    }

    // If there's anything here, it must be the additive constant.
    if (*s)
    {
        chars_so_far = 0;
        sscanf(s, "%lf%n", &add_constant, &chars_so_far);

        // If '%lf' failed to match, then it will show up as either no
        // assignment to chars_so_far, or as writing 0 there.
        // We also need to check that it was the end of the input.
        if (chars_so_far == 0 || s[chars_so_far] != 0)
        {
            myErrMsg = form("%s is not a valid adjustment formula",
                            formula.get());
            return;
        }

        constant_found = true;
    }

    // Either part must be there.
    if (!factor_found && !constant_found)
    {
        myErrMsg = form("%s is not a valid formula", formula.get());
        return;
    }

    // Only a completely valid formula changes the adjustment.
    my_add_constant = add_constant;
    my_mul_factor = mul_factor;
}
|
|
|
|
// Nothing to release explicitly; the destructor body is empty.
ScoreAdjustItem::~ScoreAdjustItem() {}
|
|
|
|
// Build the adjustment list from the "url_seed_score" configuration
// attribute.
//
// The attribute value is split on whitespace and read as consecutive
// (URL pattern, formula) pairs; each pair becomes one ScoreAdjustItem
// appended to myAdjustmentList.  On the first problem (odd number of
// items, or a pair that fails to parse) a message is stored in myErrMsg
// and construction stops; items parsed before the error stay in the list.
URLSeedScore::URLSeedScore(Configuration &config)
{
    // A string literal must not be bound to a non-const char pointer.
    const char *const config_item = "url_seed_score";

    StringList sl(config[config_item], "\t \r\n");

    myAdjustmentList = new List();

    const int item_count = sl.Count();

    if (item_count % 2)
    {
        myErrMsg = form("%s is not a list of pairs (odd number of items)",
                        config_item);

        // We *could* continue, but that just means the error will be harder
        // to find, unless someone actually sees the error message.
        return;
    }

    // Parse each as in TemplateList::createFromString.
    for (int i = 0; i < item_count; i += 2)
    {
        String url_regex = sl[i];
        String adjust_formula = sl[i+1];

        ScoreAdjustItem *adjust_item
            = new ScoreAdjustItem(url_regex, adjust_formula);

        if (adjust_item->ErrMsg().length() != 0)
        {
            // No point in continuing beyond the error; we might just
            // overwrite the first error.
            myErrMsg = form("While parsing %s: %s",
                            config_item,
                            adjust_item->ErrMsg().get());

            // The rejected item never reaches the list, so nothing else
            // would free it.  The message has already been copied above.
            delete adjust_item;
            return;
        }

        myAdjustmentList->Add(adjust_item);
    }
}
|
|
|
|
// Release the adjustment list allocated by the constructor.
URLSeedScore::~URLSeedScore()
{
    delete myAdjustmentList;
}
|
|
|
|
double
|
|
URLSeedScore::noninline_adjust_score(double orig_score, const String &url)
|
|
{
|
|
List *adjlist = myAdjustmentList;
|
|
ScoreAdjustItem *adjust_item;
|
|
|
|
adjlist->Start_Get();
|
|
|
|
while ((adjust_item = (ScoreAdjustItem *) adjlist->Get_Next()))
|
|
{
|
|
// Use the first match only.
|
|
if (adjust_item->Match(url))
|
|
return adjust_item->adjust_score(orig_score);
|
|
}
|
|
|
|
// We'll get here if no match was found.
|
|
return orig_score;
|
|
}
|