// // DocumentRef.cc // // DocumentRef: Reference to an indexed document. Keeps track of all // information stored on the document, either by the dig // or temporary search information. // // Part of the ht://Dig package // Copyright (c) 1995-2004 The ht://Dig Group // For copyright details, see the file COPYING in your distribution // or the GNU Library General Public License (LGPL) version 2 or later // // // $Id: DocumentRef.cc,v 1.53 2004/05/28 13:15:12 lha Exp $ // #ifdef HAVE_CONFIG_H #include "htconfig.h" #endif /* HAVE_CONFIG_H */ #include "DocumentRef.h" #include "good_strtok.h" #include "WordRecord.h" #include "HtConfiguration.h" #include "HtURLCodec.h" #include "WordType.h" #include "HtWordReference.h" #include #include #ifdef HAVE_STD #include #ifdef HAVE_NAMESPACES using namespace std; #endif #else #include #endif /* HAVE_STD */ // extern HtConfiguration config; //***************************************************************************** // DocumentRef::DocumentRef() // DocumentRef::DocumentRef() { Clear(); } //***************************************************************************** // DocumentRef::~DocumentRef() // DocumentRef::~DocumentRef() { } //***************************************************************************** // void DocumentRef::Clear() // void DocumentRef::Clear() { docID = 0; docURL = 0; docTime = 0; docAccessed = 0; docHead = 0; docHeadIsSet = 0; docMetaDsc = 0; docTitle = 0; descriptions.Destroy(); docState = Reference_normal; docSize = 0; docLinks = 0; docBackLinks = 0; docAnchors.Destroy(); docHopCount = 0; docSig = 0; docEmail = 0; docNotification = 0; docSubject = 0; docScore = 0; docAnchor = 0; } //***************************************************************************** // void DocumentRef::DocState(int s) // void DocumentRef::DocState(int s) { // You can't easily do this with a cast, so we'll use a switch switch(s) { case 0: docState = Reference_normal; break; case 1: docState = Reference_not_found; break; case 2: docState = Reference_noindex; break; case 3: docState = Reference_obsolete; break; } } enum { DOC_ID, // 0 DOC_TIME, // 1 DOC_ACCESSED, // 2 DOC_STATE, // 3 DOC_SIZE, // 4 DOC_LINKS, // 5 DOC_IMAGESIZE, // 6 -- No longer used DOC_HOPCOUNT, // 7 DOC_URL, // 8 DOC_HEAD, // 9 DOC_TITLE, // 10 DOC_DESCRIPTIONS, // 11 DOC_ANCHORS, // 12 DOC_EMAIL, // 13 DOC_NOTIFICATION, // 14 DOC_SUBJECT, // 15 DOC_STRING, // 16 DOC_METADSC, // 17 DOC_BACKLINKS, // 18 DOC_SIG // 19 }; // Must be powers of two never reached by the DOC_... enums. #define CHARSIZE_MARKER_BIT 64 #define SHORTSIZE_MARKER_BIT 128 //***************************************************************************** // void DocumentRef::Serialize(String &s) // Convert all the data in the object to a string. // The data is in the string is tagged with // void DocumentRef::Serialize(String &s) { int length; String *str; // // The following macros make the serialization process a little easier // to follow. Note that if an object to be serialized has the default // value for this class, it it NOT serialized. This means that // storage will be saved... // #define addnum(id, out, var) \ if (var != 0) \ { \ if (var <= (unsigned char) ~1) \ { \ unsigned char _tmp = var; \ out << (char) (id | CHARSIZE_MARKER_BIT); \ out.append((char *) &_tmp, sizeof(_tmp)); \ } \ else if (var <= (unsigned short int) ~1) \ { \ unsigned short int _tmp = var; \ out << (char) (id | SHORTSIZE_MARKER_BIT); \ out.append((char *) &_tmp, sizeof(_tmp)); \ } \ else \ { \ out << (char) id; \ out.append((char *) &var, sizeof(var)); \ } \ } #define addstring(id, out, str) \ if (str.length()) \ { \ length = str.length(); \ if (length <= (unsigned char) ~1) \ { \ unsigned char _tmp = length; \ out << (char) (id | CHARSIZE_MARKER_BIT); \ out.append((char *) &_tmp, sizeof(_tmp)); \ } \ else if (length <= (unsigned short int) ~1) \ { \ unsigned short int _tmp = length; \ out << (char) (id | SHORTSIZE_MARKER_BIT); \ out.append((char *) &_tmp, sizeof(_tmp)); \ } \ else \ { \ out << (char) id; \ out.append((char *) &length, sizeof(length)); \ } \ out.append(str); \ } // To keep compatibility with old databases, don't bother // with long lists at all. Bloat the size for long strings with // one char to just keep a ~1 marker since we don't know the // endianness; we don't know where to put a endian-safe // size-marker, and we probably rather want the full char to // keep the length. Only strings shorter than (unsigned char) ~1 // will be "optimized"; trying to optimize strings that fit in // (unsigned short) does not seem to give anything substantial. #define addlist(id, out, list) \ if (list.Count()) \ { \ length = list.Count(); \ if (length <= (unsigned short int) ~1) \ { \ if (length <= (unsigned char) ~1) \ { \ unsigned char _tmp = length; \ out << (char) (id | CHARSIZE_MARKER_BIT); \ out.append((char *) &_tmp, sizeof(_tmp)); \ } \ else \ { \ unsigned short int _tmp = length; \ out << (char) (id | SHORTSIZE_MARKER_BIT); \ out.append((char *) &_tmp, sizeof(_tmp)); \ } \ list.Start_Get(); \ while ((str = (String *) list.Get_Next())) \ { \ length = str->length(); \ if (length < (unsigned char) ~1) \ { \ unsigned char _tmp = length; \ out.append((char*) &_tmp, sizeof(_tmp)); \ } \ else \ { \ unsigned char _tmp = ~1; \ out.append((char*) &_tmp, sizeof(_tmp)); \ out.append((char*) &length, sizeof(length)); \ } \ out.append(*str); \ } \ } \ else \ { \ out << (char) id; \ out.append((char *) &length, sizeof(length)); \ list.Start_Get(); \ while ((str = (String *) list.Get_Next())) \ { \ length = str->length(); \ out.append((char*) &length, sizeof(length)); \ out.append(*str); \ } \ } \ } addnum(DOC_ID, s, docID); addnum(DOC_TIME, s, docTime); addnum(DOC_ACCESSED, s, docAccessed); addnum(DOC_STATE, s, docState); addnum(DOC_SIZE, s, docSize); addnum(DOC_LINKS, s, docLinks); addnum(DOC_BACKLINKS, s, docBackLinks); addnum(DOC_HOPCOUNT, s, docHopCount); addnum(DOC_SIG, s, docSig); // Use a temporary since the addstring macro will evaluate // this multiple times. String tmps = HtURLCodec::instance()->encode(docURL); addstring(DOC_URL, s, tmps); // This is done in the DocumentDB code through the excerpt database // addstring(DOC_HEAD, s, docHead); addstring(DOC_METADSC, s, docMetaDsc); addstring(DOC_TITLE, s, docTitle); addlist(DOC_DESCRIPTIONS, s, descriptions); addlist(DOC_ANCHORS, s, docAnchors); addstring(DOC_EMAIL, s, docEmail); addstring(DOC_NOTIFICATION, s, docNotification); addstring(DOC_SUBJECT, s, docSubject); } //***************************************************************************** // void DocumentRef::Deserialize(String &stream) // Extract the contents of our private variables from the given // character string. The character string is expected to have been // created using the Serialize member. // void DocumentRef::Deserialize(String &stream) { Clear(); char *s = stream.get(); char *end = s + stream.length(); int length; int count; int i; int x; int throwaway; // As the name sounds--used for old fields String *str; // There is a problem with getting a numeric value into a // numeric unknown type that may be an enum (the other way // around is simply by casting (int)). // Supposedly the enum incarnates as a simple type, so we can // just check the size and copy the bits. #define MEMCPY_ASSIGN(to, from, type) \ do { \ type _tmp = (type) (from); \ memcpy((char *) &(to), (char *) &_tmp, sizeof(to)); \ } while (0) #define NUM_ASSIGN(to, from) \ do { \ if (sizeof(to) == sizeof(unsigned long int)) \ MEMCPY_ASSIGN(to, from, unsigned long int); \ else if (sizeof(to) == sizeof(unsigned int)) \ MEMCPY_ASSIGN(to, from, unsigned int); \ else if (sizeof(to) == sizeof(unsigned short int)) \ MEMCPY_ASSIGN(to, from, unsigned short int); \ else if (sizeof(to) == sizeof(unsigned char)) \ MEMCPY_ASSIGN(to, from, unsigned char); \ /* else fatal error here? */ \ } while (0) #define getnum(type, in, var) \ if (type & CHARSIZE_MARKER_BIT) \ { \ NUM_ASSIGN(var, *(unsigned char *) in); \ in += sizeof(unsigned char); \ } \ else if (type & SHORTSIZE_MARKER_BIT) \ { \ unsigned short int _tmp0; \ memcpy((char *) &_tmp0, (char *) (in), sizeof(unsigned short)); \ NUM_ASSIGN(var, _tmp0); \ in += sizeof(unsigned short int); \ } \ else \ { \ memcpy((char *) &var, in, sizeof(var)); \ in += sizeof(var); \ } #define getstring(type, in, str) \ getnum(type, in, length); \ str = 0; \ str.append(in, length); \ in += length #define getlist(type, in, list) \ getnum(type, in, count); \ if (type & (CHARSIZE_MARKER_BIT | SHORTSIZE_MARKER_BIT)) \ { \ for (i = 0; i < count; i++) \ { \ unsigned char _tmp = *(unsigned char *) in; \ in += sizeof(_tmp); \ if (_tmp < (unsigned char) ~1) \ length = _tmp; \ else \ getnum(~(CHARSIZE_MARKER_BIT | SHORTSIZE_MARKER_BIT), in, \ length); \ str = new String; \ str->append(in, length); \ list.Add(str); \ in += length; \ } \ } \ else \ { \ for (i = 0; i < count; i++) \ { \ getnum(~(CHARSIZE_MARKER_BIT | SHORTSIZE_MARKER_BIT), in, \ length); \ str = new String; \ str->append(in, length); \ list.Add(str); \ in += length; \ } \ } while (s < end) { x = (unsigned char) *s++; switch (x & ~(CHARSIZE_MARKER_BIT | SHORTSIZE_MARKER_BIT)) { case DOC_ID: getnum(x, s, docID); break; case DOC_TIME: getnum(x, s, docTime); break; case DOC_ACCESSED: getnum(x, s, docAccessed); break; case DOC_STATE: getnum(x, s, docState); break; case DOC_SIZE: getnum(x, s, docSize); break; case DOC_IMAGESIZE: // No longer used getnum(x, s, throwaway); break; case DOC_LINKS: getnum(x, s, docLinks); break; case DOC_HOPCOUNT: getnum(x, s, docHopCount); break; case DOC_BACKLINKS: getnum(x, s, docBackLinks); break; case DOC_SIG: getnum(x, s, docSig); break; case DOC_URL: { // Use a temporary since the addstring macro will evaluate // this multiple times. String tmps; getstring(x, s, tmps); docURL = HtURLCodec::instance()->decode(tmps); } break; case DOC_HEAD: getstring(x, s, docHead); docHeadIsSet = 1; break; case DOC_METADSC: getstring(x, s, docMetaDsc); break; case DOC_TITLE: getstring(x, s, docTitle); break; case DOC_DESCRIPTIONS: getlist(x, s, descriptions); break; case DOC_ANCHORS: getlist(x, s, docAnchors); break; case DOC_EMAIL: getstring(x, s, docEmail); break; case DOC_NOTIFICATION: getstring(x, s, docNotification); break; case DOC_SUBJECT: getstring(x, s, docSubject); break; case DOC_STRING: // This is just a debugging string. Ignore it. break; default: cerr << "BAD TAG IN SERIALIZED DATA: " << x << endl; return; } } } //***************************************************************************** // void DocumentRef::AddDescription(char *d, HtWordList &words) // void DocumentRef::AddDescription(const char *d, HtWordList &words) { if (!d || !*d) return; while (isspace(*d)) d++; if (!d || !*d) return; String desc = d; desc.chop(" \t"); // Add the description text to the word database with proper factor // Do this first because we may have reached the max_description limit // This also ensures we keep the proper weight on descriptions // that occur many times // Parse words. char *p = desc; HtConfiguration* config= HtConfiguration::config(); static int minimum_word_length = config->Value("minimum_word_length", 3); static int max_descriptions = config->Value("max_descriptions", 5); String word; HtWordReference wordRef; wordRef.Flags(FLAG_LINK_TEXT); wordRef.DocID(docID); while (*p) { // Reset contents before adding chars each round. word = 0; while (*p && HtIsWordChar(*p)) word << *p++; HtStripPunctuation(word); if (word.length() >= minimum_word_length) { // The wordlist takes care of lowercasing; just add it. wordRef.Location((p - (char*)desc) - word.length()); wordRef.Word(word); words.Replace(wordRef); } while (*p && !HtIsStrictWordChar(*p)) p++; } // And let's flush the words! (nice comment hu :-) words.Flush(); // Now are we at the max_description limit? if (descriptions.Count() >= max_descriptions) return; descriptions.Start_Get(); String *description; while ((description = (String *) descriptions.Get_Next())) { if (mystrcasecmp(description->get(), (char*)desc) == 0) return; } descriptions.Add(new String(desc)); } //***************************************************************************** // void DocumentRef::AddAnchor(char *a) // void DocumentRef::AddAnchor(const char *a) { if (a) docAnchors.Add(new String(a)); }