You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

548 lines
21 KiB

//
// DocumentRef.cc
//
// DocumentRef: Reference to an indexed document. Keeps track of all
// information stored on the document, either by the dig
// or temporary search information.
//
// Part of the ht://Dig package <http://www.htdig.org/>
// Copyright (c) 1995-2004 The ht://Dig Group
// For copyright details, see the file COPYING in your distribution
// or the GNU Library General Public License (LGPL) version 2 or later
// <http://www.gnu.org/copyleft/lgpl.html>
//
// $Id: DocumentRef.cc,v 1.53 2004/05/28 13:15:12 lha Exp $
//
#ifdef HAVE_CONFIG_H
#include "htconfig.h"
#endif /* HAVE_CONFIG_H */
#include "DocumentRef.h"
#include "good_strtok.h"
#include "WordRecord.h"
#include "HtConfiguration.h"
#include "HtURLCodec.h"
#include "WordType.h"
#include "HtWordReference.h"
#include <stdlib.h>
#include <ctype.h>
#ifdef HAVE_STD
#include <fstream>
#ifdef HAVE_NAMESPACES
using namespace std;
#endif
#else
#include <fstream.h>
#endif /* HAVE_STD */
// extern HtConfiguration config;
//*****************************************************************************
// DocumentRef::DocumentRef()
//
// Builds an empty reference by putting every field into the state that
// Clear() establishes.
DocumentRef::DocumentRef()
{
    this->Clear();
}
//*****************************************************************************
// DocumentRef::~DocumentRef()
//
// No explicit cleanup is performed here; the body is intentionally empty.
DocumentRef::~DocumentRef() {}
//*****************************************************************************
// void DocumentRef::Clear()
//
// Resets this reference to its default (empty) state: every field is set
// to 0, docState goes back to Reference_normal, and both lists are
// destroyed.  Called by the constructor and at the start of Deserialize().
void DocumentRef::Clear()
{
    // Identity, state and bookkeeping values.
    docID        = 0;
    docState     = Reference_normal;
    docTime      = 0;
    docAccessed  = 0;
    docSize      = 0;
    docSig       = 0;
    docHopCount  = 0;
    docLinks     = 0;
    docBackLinks = 0;

    // URL and the text fields handled by the string (de)serializers.
    docURL          = 0;
    docTitle        = 0;
    docMetaDsc      = 0;
    docHead         = 0;
    docHeadIsSet    = 0;
    docEmail        = 0;
    docNotification = 0;
    docSubject      = 0;

    // Lists.
    descriptions.Destroy();
    docAnchors.Destroy();

    // Fields that Serialize() never writes out.
    docScore  = 0;
    docAnchor = 0;
}
//*****************************************************************************
// void DocumentRef::DocState(int s)
//
// Sets docState from its integer form: 0 = normal, 1 = not found,
// 2 = noindex, 3 = obsolete.  Any other value leaves docState unchanged.
void DocumentRef::DocState(int s)
{
    // A plain cast from int is awkward here, so map each value explicitly.
    if (s == 0)
        docState = Reference_normal;
    else if (s == 1)
        docState = Reference_not_found;
    else if (s == 2)
        docState = Reference_noindex;
    else if (s == 3)
        docState = Reference_obsolete;
}
// Field ids used as the tag byte of each serialized record.  The numeric
// values are part of the stored format, so they are spelled out explicitly.
enum
{
    DOC_ID           = 0,
    DOC_TIME         = 1,
    DOC_ACCESSED     = 2,
    DOC_STATE        = 3,
    DOC_SIZE         = 4,
    DOC_LINKS        = 5,
    DOC_IMAGESIZE    = 6,   // No longer used
    DOC_HOPCOUNT     = 7,
    DOC_URL          = 8,
    DOC_HEAD         = 9,
    DOC_TITLE        = 10,
    DOC_DESCRIPTIONS = 11,
    DOC_ANCHORS      = 12,
    DOC_EMAIL        = 13,
    DOC_NOTIFICATION = 14,
    DOC_SUBJECT      = 15,
    DOC_STRING       = 16,
    DOC_METADSC      = 17,
    DOC_BACKLINKS    = 18,
    DOC_SIG          = 19
};
// Size-marker flags OR'ed into a serialized tag byte.  Each must be a
// power of two that is never reached by the DOC_... enums.
#define CHARSIZE_MARKER_BIT 0x40
#define SHORTSIZE_MARKER_BIT 0x80
//*****************************************************************************
// void DocumentRef::Serialize(String &s)
// Convert all the data in the object to a string.
// Each value in the string is tagged with its DOC_... field identifier.
//
// Appends this reference's non-default fields to s as a sequence of
// records.  Every record starts with one tag byte: a DOC_... id,
// optionally OR'ed with CHARSIZE_MARKER_BIT or SHORTSIZE_MARKER_BIT when
// the number (or string length / list count) that follows is stored in
// one or two bytes rather than at its full width.  Values are appended as
// raw in-memory bytes, so the output depends on the byte order and type
// sizes of the writing host.
//
// docHead, docScore and docAnchor are not written by this function.
void DocumentRef::Serialize(String &s)
{
    int length;
    String *str;

    //
    // The following macros make the serialization process a little easier
    // to follow.  Note that if an object to be serialized has the default
    // value for this class, it is NOT serialized.  This means that
    // storage will be saved...
    //

    // addnum: write a numeric field.  Only strictly positive values may use
    // the compact one- or two-byte encodings; a negative value would pass
    // the "<=" size tests and be truncated, so it is always written at its
    // full width instead.
#define addnum(id, out, var) \
    if (var != 0) \
    { \
        if (var > 0 && var <= (unsigned char) ~1) \
        { \
            unsigned char _tmp = var; \
            out << (char) (id | CHARSIZE_MARKER_BIT); \
            out.append((char *) &_tmp, sizeof(_tmp)); \
        } \
        else if (var > 0 && var <= (unsigned short int) ~1) \
        { \
            unsigned short int _tmp = var; \
            out << (char) (id | SHORTSIZE_MARKER_BIT); \
            out.append((char *) &_tmp, sizeof(_tmp)); \
        } \
        else \
        { \
            out << (char) id; \
            out.append((char *) &var, sizeof(var)); \
        } \
    }

    // addstring: write a non-empty string as tag, length (in the smallest
    // of the three encodings that fits) and then the string bytes.
#define addstring(id, out, str) \
    if (str.length()) \
    { \
        length = str.length(); \
        if (length <= (unsigned char) ~1) \
        { \
            unsigned char _tmp = length; \
            out << (char) (id | CHARSIZE_MARKER_BIT); \
            out.append((char *) &_tmp, sizeof(_tmp)); \
        } \
        else if (length <= (unsigned short int) ~1) \
        { \
            unsigned short int _tmp = length; \
            out << (char) (id | SHORTSIZE_MARKER_BIT); \
            out.append((char *) &_tmp, sizeof(_tmp)); \
        } \
        else \
        { \
            out << (char) id; \
            out.append((char *) &length, sizeof(length)); \
        } \
        out.append(str); \
    }

    // To keep compatibility with old databases, don't bother
    // with long lists at all.  Bloat the size for long strings with
    // one char to just keep a ~1 marker since we don't know the
    // endianness; we don't know where to put an endian-safe
    // size-marker, and we probably rather want the full char to
    // keep the length.  Only strings shorter than (unsigned char) ~1
    // will be "optimized"; trying to optimize strings that fit in
    // (unsigned short) does not seem to give anything substantial.
    //
    // addlist: write a non-empty list of String objects.  When the element
    // count fits in one or two bytes, each element is preceded by a one-byte
    // length, or by the escape byte (unsigned char) ~1 followed by a
    // full-width length.  Otherwise the count and every element length are
    // written at full width.
#define addlist(id, out, list) \
    if (list.Count()) \
    { \
        length = list.Count(); \
        if (length <= (unsigned short int) ~1) \
        { \
            if (length <= (unsigned char) ~1) \
            { \
                unsigned char _tmp = length; \
                out << (char) (id | CHARSIZE_MARKER_BIT); \
                out.append((char *) &_tmp, sizeof(_tmp)); \
            } \
            else \
            { \
                unsigned short int _tmp = length; \
                out << (char) (id | SHORTSIZE_MARKER_BIT); \
                out.append((char *) &_tmp, sizeof(_tmp)); \
            } \
            list.Start_Get(); \
            while ((str = (String *) list.Get_Next())) \
            { \
                length = str->length(); \
                if (length < (unsigned char) ~1) \
                { \
                    unsigned char _tmp = length; \
                    out.append((char*) &_tmp, sizeof(_tmp)); \
                } \
                else \
                { \
                    unsigned char _tmp = ~1; \
                    out.append((char*) &_tmp, sizeof(_tmp)); \
                    out.append((char*) &length, sizeof(length)); \
                } \
                out.append(*str); \
            } \
        } \
        else \
        { \
            out << (char) id; \
            out.append((char *) &length, sizeof(length)); \
            list.Start_Get(); \
            while ((str = (String *) list.Get_Next())) \
            { \
                length = str->length(); \
                out.append((char*) &length, sizeof(length)); \
                out.append(*str); \
            } \
        } \
    }

    // Numeric fields.
    addnum(DOC_ID, s, docID);
    addnum(DOC_TIME, s, docTime);
    addnum(DOC_ACCESSED, s, docAccessed);
    addnum(DOC_STATE, s, docState);
    addnum(DOC_SIZE, s, docSize);
    addnum(DOC_LINKS, s, docLinks);
    addnum(DOC_BACKLINKS, s, docBackLinks);
    addnum(DOC_HOPCOUNT, s, docHopCount);
    addnum(DOC_SIG, s, docSig);

    // Use a temporary since the addstring macro will evaluate
    // this multiple times.
    String tmps = HtURLCodec::instance()->encode(docURL);
    addstring(DOC_URL, s, tmps);

    // This is done in the DocumentDB code through the excerpt database
    // addstring(DOC_HEAD, s, docHead);
    addstring(DOC_METADSC, s, docMetaDsc);
    addstring(DOC_TITLE, s, docTitle);
    addlist(DOC_DESCRIPTIONS, s, descriptions);
    addlist(DOC_ANCHORS, s, docAnchors);
    addstring(DOC_EMAIL, s, docEmail);
    addstring(DOC_NOTIFICATION, s, docNotification);
    addstring(DOC_SUBJECT, s, docSubject);
}
//*****************************************************************************
// void DocumentRef::Deserialize(String &stream)
// Extract the contents of our private variables from the given
// character string. The character string is expected to have been
// created using the Serialize member.
//
// Resets this object with Clear() and then decodes tagged records from
// stream until the data is exhausted.  Each record is one tag byte (a
// DOC_... id, possibly OR'ed with CHARSIZE_MARKER_BIT or
// SHORTSIZE_MARKER_BIT) followed by the payload for that field.
//
// Decoding stops early, after a message on cerr, when an unknown tag is
// met or when a record claims more bytes than remain in the stream (or a
// negative length).  Fields decoded before that point keep their values.
void DocumentRef::Deserialize(String &stream)
{
    Clear();

    char *s = stream.get();
    char *end = s + stream.length();
    int length;
    int count;
    int i;
    int x;
    int throwaway;      // As the name sounds--used for old fields
    String *str;

    // Guard used before every read: give up on the whole record stream
    // unless n (which must not be negative) more bytes are available
    // between the cursor and `end`.  Without it a truncated or corrupt
    // record would be read past the end of the buffer, and a negative
    // length would move the cursor backwards.
#define NEED_BYTES(in, n) \
    do { \
        if ((long) (n) < 0 || (long) (end - (in)) < (long) (n)) \
        { \
            cerr << "TRUNCATED SERIALIZED DATA" << endl; \
            return; \
        } \
    } while (0)

    // There is a problem with getting a numeric value into a
    // numeric unknown type that may be an enum (the other way
    // around is simply by casting (int)).
    // Supposedly the enum incarnates as a simple type, so we can
    // just check the size and copy the bits.
#define MEMCPY_ASSIGN(to, from, type) \
    do { \
        type _tmp = (type) (from); \
        memcpy((char *) &(to), (char *) &_tmp, sizeof(to)); \
    } while (0)

#define NUM_ASSIGN(to, from) \
    do { \
        if (sizeof(to) == sizeof(unsigned long int)) \
            MEMCPY_ASSIGN(to, from, unsigned long int); \
        else if (sizeof(to) == sizeof(unsigned int)) \
            MEMCPY_ASSIGN(to, from, unsigned int); \
        else if (sizeof(to) == sizeof(unsigned short int)) \
            MEMCPY_ASSIGN(to, from, unsigned short int); \
        else if (sizeof(to) == sizeof(unsigned char)) \
            MEMCPY_ASSIGN(to, from, unsigned char); \
        /* else fatal error here? */ \
    } while (0)

    // getnum: read a number stored in one byte, two bytes or at the full
    // width of var, as selected by the marker bits in type.  This must
    // stay a single if/else statement: getlist uses it as an else branch.
#define getnum(type, in, var) \
    if (type & CHARSIZE_MARKER_BIT) \
    { \
        NEED_BYTES(in, sizeof(unsigned char)); \
        NUM_ASSIGN(var, *(unsigned char *) in); \
        in += sizeof(unsigned char); \
    } \
    else if (type & SHORTSIZE_MARKER_BIT) \
    { \
        unsigned short int _tmp0; \
        NEED_BYTES(in, sizeof(unsigned short int)); \
        memcpy((char *) &_tmp0, (char *) (in), sizeof(unsigned short)); \
        NUM_ASSIGN(var, _tmp0); \
        in += sizeof(unsigned short int); \
    } \
    else \
    { \
        NEED_BYTES(in, sizeof(var)); \
        memcpy((char *) &var, in, sizeof(var)); \
        in += sizeof(var); \
    }

    // getstring: read a length, then replace str with that many bytes.
#define getstring(type, in, str) \
    getnum(type, in, length); \
    NEED_BYTES(in, length); \
    str = 0; \
    str.append(in, length); \
    in += length

    // getlist: read an element count, then that many length-prefixed
    // strings, appending each one to list as a new String.  In the compact
    // form every element has a one-byte length, with (unsigned char) ~1
    // acting as an escape that is followed by a full-width length.
#define getlist(type, in, list) \
    getnum(type, in, count); \
    if (type & (CHARSIZE_MARKER_BIT | SHORTSIZE_MARKER_BIT)) \
    { \
        for (i = 0; i < count; i++) \
        { \
            NEED_BYTES(in, sizeof(unsigned char)); \
            unsigned char _tmp = *(unsigned char *) in; \
            in += sizeof(_tmp); \
            if (_tmp < (unsigned char) ~1) \
                length = _tmp; \
            else \
                getnum(~(CHARSIZE_MARKER_BIT | SHORTSIZE_MARKER_BIT), in, \
                       length); \
            NEED_BYTES(in, length); \
            str = new String; \
            str->append(in, length); \
            list.Add(str); \
            in += length; \
        } \
    } \
    else \
    { \
        for (i = 0; i < count; i++) \
        { \
            getnum(~(CHARSIZE_MARKER_BIT | SHORTSIZE_MARKER_BIT), in, \
                   length); \
            NEED_BYTES(in, length); \
            str = new String; \
            str->append(in, length); \
            list.Add(str); \
            in += length; \
        } \
    }

    while (s < end)
    {
        // Tag byte: field id in the low bits, size markers in the top two.
        x = (unsigned char) *s++;
        switch (x & ~(CHARSIZE_MARKER_BIT | SHORTSIZE_MARKER_BIT))
        {
        case DOC_ID:
            getnum(x, s, docID);
            break;
        case DOC_TIME:
            getnum(x, s, docTime);
            break;
        case DOC_ACCESSED:
            getnum(x, s, docAccessed);
            break;
        case DOC_STATE:
            getnum(x, s, docState);
            break;
        case DOC_SIZE:
            getnum(x, s, docSize);
            break;
        case DOC_IMAGESIZE:             // No longer used
            // Read only to step over the stored value.
            getnum(x, s, throwaway);
            break;
        case DOC_LINKS:
            getnum(x, s, docLinks);
            break;
        case DOC_HOPCOUNT:
            getnum(x, s, docHopCount);
            break;
        case DOC_BACKLINKS:
            getnum(x, s, docBackLinks);
            break;
        case DOC_SIG:
            getnum(x, s, docSig);
            break;
        case DOC_URL:
            {
                // Use a temporary since the getstring macro will evaluate
                // this multiple times.
                String tmps;
                getstring(x, s, tmps);
                docURL = HtURLCodec::instance()->decode(tmps);
            }
            break;
        case DOC_HEAD:
            getstring(x, s, docHead); docHeadIsSet = 1;
            break;
        case DOC_METADSC:
            getstring(x, s, docMetaDsc);
            break;
        case DOC_TITLE:
            getstring(x, s, docTitle);
            break;
        case DOC_DESCRIPTIONS:
            getlist(x, s, descriptions);
            break;
        case DOC_ANCHORS:
            getlist(x, s, docAnchors);
            break;
        case DOC_EMAIL:
            getstring(x, s, docEmail);
            break;
        case DOC_NOTIFICATION:
            getstring(x, s, docNotification);
            break;
        case DOC_SUBJECT:
            getstring(x, s, docSubject);
            break;
        case DOC_STRING:
            // This is just a debugging string.  Ignore it.
            // NOTE(review): only the tag byte is skipped; if a payload
            // ever follows this tag it will be parsed as further tags.
            break;
        default:
            cerr << "BAD TAG IN SERIALIZED DATA: " << x << endl;
            return;
        }
    }
}
//*****************************************************************************
// void DocumentRef::AddDescription(const char *d, HtWordList &words)
//
// Records the description text d for this document.
//
// d is stripped of leading whitespace and trailing spaces/tabs.  Every
// word in it that is at least "minimum_word_length" characters long is
// put into words, flagged FLAG_LINK_TEXT and carrying this document's
// docID, and words is then flushed.  The text itself is appended to
// descriptions only if fewer than "max_descriptions" entries are stored
// and no existing entry matches it case-insensitively.
//
// A null, empty or all-whitespace d is ignored.  Both configuration
// values are read once, on the first call, and cached in statics.
void DocumentRef::AddDescription(const char *d, HtWordList &words)
{
    if (!d)
        return;

    // isspace() is only defined for values representable as unsigned char
    // (or EOF), so a plain char with the high bit set must be converted
    // before the call.
    while (isspace((unsigned char) *d))
        d++;
    if (!*d)
        return;

    String desc = d;
    desc.chop(" \t");

    // Add the description text to the word database with proper factor
    // Do this first because we may have reached the max_description limit
    // This also ensures we keep the proper weight on descriptions
    // that occur many times

    // Parse words.
    char *p = desc;
    HtConfiguration* config= HtConfiguration::config();
    static int minimum_word_length = config->Value("minimum_word_length", 3);
    static int max_descriptions = config->Value("max_descriptions", 5);

    String word;
    HtWordReference wordRef;
    wordRef.Flags(FLAG_LINK_TEXT);
    wordRef.DocID(docID);

    while (*p)
    {
        // Reset contents before adding chars each round.
        word = 0;
        while (*p && HtIsWordChar(*p))
            word << *p++;
        HtStripPunctuation(word);

        if (word.length() >= minimum_word_length) {
            // The wordlist takes care of lowercasing; just add it.
            // Location: current offset into desc minus the word's length.
            wordRef.Location((p - (char*)desc) - word.length());
            wordRef.Word(word);
            words.Replace(wordRef);
        }

        // Skip ahead to the start of the next word.
        while (*p && !HtIsStrictWordChar(*p))
            p++;
    }

    // Push the collected words out to the word list.
    words.Flush();

    // Now are we at the max_description limit?
    if (descriptions.Count() >= max_descriptions)
        return;

    // Do not store the same description twice (case-insensitive match).
    descriptions.Start_Get();
    String *description;
    while ((description = (String *) descriptions.Get_Next()))
    {
        if (mystrcasecmp(description->get(), (char*)desc) == 0)
            return;
    }
    descriptions.Add(new String(desc));
}
//*****************************************************************************
// void DocumentRef::AddAnchor(const char *a)
//
void DocumentRef::AddAnchor(const char *a)
{
if (a)
docAnchors.Add(new String(a));
}