extra-dependencies/debian/htdig/htdig-3.2.0b6/htcommon/DocumentRef.cc

//
// DocumentRef.cc
//
// DocumentRef: Reference to an indexed document. Keeps track of all
//              information stored on the document, either by the dig
//              or temporary search information.
//
// Part of the ht://Dig package   <http://www.htdig.org/>
// Copyright (c) 1995-2004 The ht://Dig Group
// For copyright details, see the file COPYING in your distribution
// or the GNU Library General Public License (LGPL) version 2 or later
// <http://www.gnu.org/copyleft/lgpl.html>
//
// $Id: DocumentRef.cc,v 1.53 2004/05/28 13:15:12 lha Exp $
//

#ifdef HAVE_CONFIG_H
#include "htconfig.h"
#endif /* HAVE_CONFIG_H */

#include "DocumentRef.h"
#include "good_strtok.h"
#include "WordRecord.h"
#include "HtConfiguration.h"
#include "HtURLCodec.h"
#include "WordType.h"
#include "HtWordReference.h"
#include <stdlib.h>
#include <ctype.h>

#ifdef HAVE_STD
#include <fstream>
#ifdef HAVE_NAMESPACES
using namespace std;
#endif
#else
#include <fstream.h>
#endif /* HAVE_STD */

// extern HtConfiguration config;

//*****************************************************************************
// DocumentRef::DocumentRef()
//
DocumentRef::DocumentRef()
{
    Clear();
}


//*****************************************************************************
// DocumentRef::~DocumentRef()
//
DocumentRef::~DocumentRef()
{
}


//*****************************************************************************
// void DocumentRef::Clear()
//
void DocumentRef::Clear()
{
  docID = 0;
  docURL = 0;
  docTime = 0;
  docAccessed = 0;
  docHead = 0;
  docHeadIsSet = 0;
  docMetaDsc = 0;
  docTitle = 0;
  descriptions.Destroy();
  docState = Reference_normal;
  docSize = 0;
  docLinks = 0;
  docBackLinks = 0;
  docAnchors.Destroy();
  docHopCount = 0;
  docSig = 0;
  docEmail = 0;
  docNotification = 0;
  docSubject = 0;
  docScore = 0;
  docAnchor = 0;
}

//*****************************************************************************
// void DocumentRef::DocState(int s)
//
void DocumentRef::DocState(int s)
{
  // You can't easily do this with a cast, so we'll use a switch
  switch(s)
    {
      case 0:
	docState = Reference_normal;
	break;
      case 1:
	docState = Reference_not_found;
	break;
      case 2:
	docState = Reference_noindex;
	break;
      case 3:
	docState = Reference_obsolete;
	break;
    }
}


enum
{
    DOC_ID,				// 0
    DOC_TIME,				// 1
    DOC_ACCESSED,			// 2
    DOC_STATE,				// 3
    DOC_SIZE,				// 4
    DOC_LINKS,				// 5
    DOC_IMAGESIZE,			// 6 -- No longer used
    DOC_HOPCOUNT,			// 7
    DOC_URL,				// 8
    DOC_HEAD,				// 9
    DOC_TITLE,				// 10
    DOC_DESCRIPTIONS,	        	// 11
    DOC_ANCHORS,			// 12
    DOC_EMAIL,				// 13
    DOC_NOTIFICATION,		        // 14
    DOC_SUBJECT,			// 15
    DOC_STRING,                         // 16
    DOC_METADSC,                        // 17
    DOC_BACKLINKS,                      // 18
    DOC_SIG                             // 19
};

// Must be powers of two never reached by the DOC_... enums.
#define CHARSIZE_MARKER_BIT 64
#define SHORTSIZE_MARKER_BIT 128

//*****************************************************************************
// void DocumentRef::Serialize(String &s)
//   Convert all the data in the object to a string.
//   The data is in the string is tagged with
//
void DocumentRef::Serialize(String &s)
{
    int		length;
    String	*str;

//
// The following macros make the serialization process a little easier
// to follow.  Note that if an object to be serialized has the default
// value for this class, it it NOT serialized.  This means that
// storage will be saved...
//
#define addnum(id, out, var) \
 if (var != 0)                                                        \
 {                                                                    \
   if (var <= (unsigned char) ~1)                                     \
   {                                                                  \
     unsigned char _tmp = var;                                        \
     out << (char) (id | CHARSIZE_MARKER_BIT);                        \
     out.append((char *) &_tmp, sizeof(_tmp));                        \
   }                                                                  \
   else if (var <= (unsigned short int) ~1)                           \
   {                                                                  \
     unsigned short int _tmp = var;                                   \
     out << (char) (id | SHORTSIZE_MARKER_BIT);                       \
     out.append((char *) &_tmp, sizeof(_tmp));                        \
   }                                                                  \
   else                                                               \
   {                                                                  \
     out << (char) id;                                                \
     out.append((char *) &var, sizeof(var));                          \
   }                                                                  \
 }

#define	addstring(id, out, str)	\
 if (str.length())                                                    \
 {                                                                    \
   length = str.length();                                             \
   if (length <= (unsigned char) ~1)                                  \
   {                                                                  \
     unsigned char _tmp = length;                                     \
     out << (char) (id | CHARSIZE_MARKER_BIT);                        \
     out.append((char *) &_tmp, sizeof(_tmp));                        \
   }                                                                  \
   else if (length <= (unsigned short int) ~1)                        \
   {                                                                  \
     unsigned short int _tmp = length;                                \
     out << (char) (id | SHORTSIZE_MARKER_BIT);                       \
     out.append((char *) &_tmp, sizeof(_tmp));                        \
   }                                                                  \
   else                                                               \
   {                                                                  \
     out << (char) id;                                                \
     out.append((char *) &length, sizeof(length));                    \
   }                                                                  \
   out.append(str);                                                   \
 }

// To keep compatibility with old databases, don't bother
// with long lists at all.  Bloat the size for long strings with
// one char to just keep a ~1 marker since we don't know the
// endianness; we don't know where to put a endian-safe
// size-marker, and we probably rather want the full char to
// keep the length.  Only strings shorter than (unsigned char) ~1
// will be "optimized"; trying to optimize strings that fit in
// (unsigned short) does not seem to give anything substantial.
#define	addlist(id, out, list) \
 if (list.Count())                                                    \
 {                                                                    \
   length = list.Count();                                             \
   if (length <= (unsigned short int) ~1)                             \
   {                                                                  \
     if (length <= (unsigned char) ~1)                                \
     {                                                                \
       unsigned char _tmp = length;                                   \
       out << (char) (id | CHARSIZE_MARKER_BIT);                      \
       out.append((char *) &_tmp, sizeof(_tmp));                      \
     }                                                                \
     else                                                             \
     {                                                                \
       unsigned short int _tmp = length;                              \
       out << (char) (id | SHORTSIZE_MARKER_BIT);                     \
       out.append((char *) &_tmp, sizeof(_tmp));                      \
     }                                                                \
     list.Start_Get();                                                \
     while ((str = (String *) list.Get_Next()))		              \
     {                                                                \
       length = str->length();                                        \
       if (length < (unsigned char) ~1)                               \
       {                                                              \
         unsigned char _tmp = length;                                 \
         out.append((char*) &_tmp, sizeof(_tmp));                     \
       }                                                              \
       else                                                           \
       {                                                              \
         unsigned char _tmp = ~1;                                     \
         out.append((char*) &_tmp, sizeof(_tmp));                     \
         out.append((char*) &length, sizeof(length));                 \
       }                                                              \
       out.append(*str);                                              \
     }                                                                \
   }                                                                  \
   else                                                               \
   {                                                                  \
     out << (char) id;                                                \
     out.append((char *) &length, sizeof(length));                    \
     list.Start_Get();                                                \
     while ((str = (String *) list.Get_Next()))                       \
     {                                                                \
       length = str->length();                                        \
       out.append((char*) &length, sizeof(length));                   \
       out.append(*str);                                              \
     }                                                                \
   }                                                                  \
 }

    addnum(DOC_ID, s, docID);
    addnum(DOC_TIME, s, docTime);
    addnum(DOC_ACCESSED, s, docAccessed);
    addnum(DOC_STATE, s, docState);
    addnum(DOC_SIZE, s, docSize);
    addnum(DOC_LINKS, s, docLinks);
    addnum(DOC_BACKLINKS, s, docBackLinks);
    addnum(DOC_HOPCOUNT, s, docHopCount);
    addnum(DOC_SIG, s, docSig);

    // Use a temporary since the addstring macro will evaluate
    // this multiple times.
    String tmps = HtURLCodec::instance()->encode(docURL);
    addstring(DOC_URL, s, tmps);
    // This is done in the DocumentDB code through the excerpt database
    //    addstring(DOC_HEAD, s, docHead);
    addstring(DOC_METADSC, s, docMetaDsc);
    addstring(DOC_TITLE, s, docTitle);

    addlist(DOC_DESCRIPTIONS, s, descriptions);
    addlist(DOC_ANCHORS, s, docAnchors);

    addstring(DOC_EMAIL, s, docEmail);
    addstring(DOC_NOTIFICATION, s, docNotification);
    addstring(DOC_SUBJECT, s, docSubject);
}


//*****************************************************************************
// void DocumentRef::Deserialize(String &stream)
//   Extract the contents of our private variables from the given
//   character string.  The character string is expected to have been
//   created using the Serialize member.
//
void DocumentRef::Deserialize(String &stream)
{
    Clear();
    char	*s = stream.get();
    char	*end = s + stream.length();
    int		length;
    int		count;
    int		i;
    int		x;
    int		throwaway; // As the name sounds--used for old fields
    String	*str;

// There is a problem with getting a numeric value into a
// numeric unknown type that may be an enum (the other way
// around is simply by casting (int)).
//  Supposedly the enum incarnates as a simple type, so we can
// just check the size and copy the bits.
#define MEMCPY_ASSIGN(to, from, type) \
 do {                                                                 \
   type _tmp = (type) (from);                                         \
   memcpy((char *) &(to), (char *) &_tmp, sizeof(to));                \
 } while (0)

#define NUM_ASSIGN(to, from) \
 do {                                                                 \
   if (sizeof(to) == sizeof(unsigned long int))                       \
     MEMCPY_ASSIGN(to, from, unsigned long int);                      \
   else if (sizeof(to) == sizeof(unsigned int))                       \
     MEMCPY_ASSIGN(to, from, unsigned int);                           \
   else if (sizeof(to) == sizeof(unsigned short int))                 \
     MEMCPY_ASSIGN(to, from, unsigned short int);                     \
   else if (sizeof(to) == sizeof(unsigned char))                      \
     MEMCPY_ASSIGN(to, from, unsigned char);                          \
   /* else fatal error here? */                                       \
 } while (0)

#define	getnum(type, in, var) \
 if (type & CHARSIZE_MARKER_BIT)                                      \
 {                                                                    \
   NUM_ASSIGN(var, *(unsigned char *) in);                            \
   in += sizeof(unsigned char);                                       \
 }                                                                    \
 else if (type & SHORTSIZE_MARKER_BIT)                                \
 {                                                                    \
   unsigned short int _tmp0;                                          \
   memcpy((char *) &_tmp0, (char *) (in), sizeof(unsigned short));    \
   NUM_ASSIGN(var, _tmp0);                                            \
   in += sizeof(unsigned short int);                                  \
 }                                                                    \
 else                                                                 \
 {                                                                    \
   memcpy((char *) &var, in, sizeof(var));                            \
   in += sizeof(var);                                                 \
 }

#define	getstring(type, in, str) \
 getnum(type, in, length);                                            \
 str = 0;                                                             \
 str.append(in, length);                                              \
 in += length

#define	getlist(type, in, list) \
 getnum(type, in, count);                                             \
 if (type & (CHARSIZE_MARKER_BIT | SHORTSIZE_MARKER_BIT))             \
 {                                                                    \
   for (i = 0; i < count; i++)                                        \
   {                                                                  \
     unsigned char _tmp = *(unsigned char *) in;                      \
     in += sizeof(_tmp);                                              \
     if (_tmp < (unsigned char) ~1)                                   \
       length = _tmp;                                                 \
     else                                                             \
       getnum(~(CHARSIZE_MARKER_BIT | SHORTSIZE_MARKER_BIT), in,      \
              length);                                                \
     str = new String;                                                \
     str->append(in, length);                                         \
     list.Add(str);                                                   \
     in += length;                                                    \
   }                                                                  \
 }                                                                    \
 else                                                                 \
 {                                                                    \
   for (i = 0; i < count; i++)                                        \
   {                                                                  \
     getnum(~(CHARSIZE_MARKER_BIT | SHORTSIZE_MARKER_BIT), in,        \
            length);                                                  \
     str = new String;                                                \
     str->append(in, length);                                         \
     list.Add(str);                                                   \
     in += length;                                                    \
   }                                                                  \
 }

    while (s < end)
    {
        x = (unsigned char) *s++;
        switch (x & ~(CHARSIZE_MARKER_BIT | SHORTSIZE_MARKER_BIT))
        {
        case DOC_ID:
            getnum(x, s, docID);
            break;
        case DOC_TIME:
            getnum(x, s, docTime);
            break;
        case DOC_ACCESSED:
            getnum(x, s, docAccessed);
            break;
        case DOC_STATE:
            getnum(x, s, docState);
            break;
        case DOC_SIZE:
            getnum(x, s, docSize);
            break;
        case DOC_IMAGESIZE: // No longer used
	    getnum(x, s, throwaway);
	    break;
        case DOC_LINKS:
            getnum(x, s, docLinks);
            break;
        case DOC_HOPCOUNT:
            getnum(x, s, docHopCount);
            break;
	case DOC_BACKLINKS:
	    getnum(x, s, docBackLinks);
	    break;
	case DOC_SIG:
	    getnum(x, s, docSig);
	    break;
        case DOC_URL:
	    {
	      // Use a temporary since the addstring macro will evaluate
	      // this multiple times.
	      String tmps;
	      getstring(x, s, tmps);

	      docURL = HtURLCodec::instance()->decode(tmps);
	    }
	    break;
        case DOC_HEAD:
            getstring(x, s, docHead); docHeadIsSet = 1;
            break;
	case DOC_METADSC:
	    getstring(x, s, docMetaDsc);
	    break;
        case DOC_TITLE:
            getstring(x, s, docTitle);
            break;
        case DOC_DESCRIPTIONS:
            getlist(x, s, descriptions);
            break;
        case DOC_ANCHORS:
            getlist(x, s, docAnchors);
            break;
        case DOC_EMAIL:
            getstring(x, s, docEmail);
            break;
        case DOC_NOTIFICATION:
            getstring(x, s, docNotification);
            break;
        case DOC_SUBJECT:
            getstring(x, s, docSubject);
            break;
	case DOC_STRING:
	  // This is just a debugging string. Ignore it.
	    break;
        default:
            cerr << "BAD TAG IN SERIALIZED DATA: " << x << endl;
            return;
        }
    }
}


//*****************************************************************************
// void DocumentRef::AddDescription(char *d, HtWordList &words)
//
void DocumentRef::AddDescription(const char *d, HtWordList &words)
{
    if (!d || !*d)
        return;

    while (isspace(*d))
        d++;

   if (!d || !*d)
     return;

    String	desc = d;
    desc.chop(" \t");

    // Add the description text to the word database with proper factor
    // Do this first because we may have reached the max_description limit
    // This also ensures we keep the proper weight on descriptions
    // that occur many times

    // Parse words.
    char         *p                   = desc;
    HtConfiguration* config= HtConfiguration::config();
    static int    minimum_word_length = config->Value("minimum_word_length", 3);
    static int    max_descriptions    = config->Value("max_descriptions", 5);

    String word;
    HtWordReference wordRef;
    wordRef.Flags(FLAG_LINK_TEXT);
    wordRef.DocID(docID);

    while (*p)
    {
      // Reset contents before adding chars each round.
      word = 0;

      while (*p && HtIsWordChar(*p))
        word << *p++;

      HtStripPunctuation(word);

      if (word.length() >= minimum_word_length) {
        // The wordlist takes care of lowercasing; just add it.
	wordRef.Location((p - (char*)desc) - word.length());
	wordRef.Word(word);
        words.Replace(wordRef);
      }

      while (*p && !HtIsStrictWordChar(*p))
        p++;
    }

    // And let's flush the words! (nice comment hu :-)
    words.Flush();

    // Now are we at the max_description limit?
    if (descriptions.Count() >= max_descriptions)
  	return;

    descriptions.Start_Get();
    String	*description;
    while ((description = (String *) descriptions.Get_Next()))
    {
        if (mystrcasecmp(description->get(), (char*)desc) == 0)
            return;
    }
    descriptions.Add(new String(desc));
}


//*****************************************************************************
// void DocumentRef::AddAnchor(char *a)
//
void DocumentRef::AddAnchor(const char *a)
{
    if (a)
    	docAnchors.Add(new String(a));
}