You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
317 lines
7.7 KiB
317 lines
7.7 KiB
3 years ago
|
//
|
||
|
// BasicDocument.cc
|
||
|
//
|
||
|
// 2/6/2002 created for libhtdig to simplify & mimic Document.cc
|
||
|
//
|
||
|
// Neal Richter nealr@rightnow.com
|
||
|
//
|
||
|
//
|
||
|
// BasicDocument: This class holds everything there is to know about a document.
|
||
|
// The actual contents of the document may or may not be present at
|
||
|
// all times for memory conservation reasons.
|
||
|
//
|
||
|
// This is a basic extensable container for plain text holding documents.
|
||
|
//
|
||
|
// Uses any Parser with parse method handling this class.
|
||
|
//
|
||
|
// Part of the ht://Dig package <http://www.htdig.org/>
|
||
|
// Copyright (c) 1995-2004 The ht://Dig Group
|
||
|
// For copyright details, see the file COPYING in your distribution
|
||
|
// or the GNU Library General Public License (LGPL) version 2 or later or later
|
||
|
// <http://www.gnu.org/copyleft/lgpl.html>
|
||
|
//
|
||
|
// $Id: BasicDocument.cc,v 1.3 2004/05/28 13:15:28 lha Exp $
|
||
|
//
|
||
|
//--------------------------------------------------------------------
|
||
|
|
||
|
#ifdef HAVE_CONFIG_H
|
||
|
#include "htconfig.h"
|
||
|
#endif /* HAVE_CONFIG_H */
|
||
|
|
||
|
#include <signal.h>
|
||
|
#include <sys/types.h>
|
||
|
#include <sys/stat.h>
|
||
|
#include <ctype.h>
|
||
|
|
||
|
#include "BasicDocument.h"
|
||
|
#include "TextCollector.h"
|
||
|
#include "StringList.h"
|
||
|
#include "htdig.h"
|
||
|
#include "Plaintext.h"
|
||
|
#include "HTML.h"
|
||
|
#include "ExternalParser.h"
|
||
|
#include "lib.h"
|
||
|
|
||
|
#include "defaults.h"
|
||
|
|
||
|
#if 1
|
||
|
typedef void (*SIGNAL_HANDLER) (...);
|
||
|
#else
|
||
|
typedef SIG_PF SIGNAL_HANDLER;
|
||
|
#endif
|
||
|
|
||
|
//*****************************************************************************
|
||
|
// BasicDocument::BasicDocument(char *loc)
|
||
|
// Initialize with the given loc-parameter as the location for this document.
|
||
|
// If the max_size is given, use that for size, otherwise use the
|
||
|
// config value.
|
||
|
//
|
||
|
BasicDocument::BasicDocument(char *loc, int suggested_size)
|
||
|
{
|
||
|
int temp_size = 0;
|
||
|
|
||
|
id = 0;
|
||
|
location = 0;
|
||
|
title = 0;
|
||
|
metacontent = 0;
|
||
|
contents = 0;
|
||
|
document_length = 0;
|
||
|
|
||
|
|
||
|
HtConfiguration *config = HtConfiguration::config();
|
||
|
|
||
|
//We probably need to move assignment of max_doc_size, according
|
||
|
//to a configuration value.
|
||
|
|
||
|
if (suggested_size > 0)
|
||
|
temp_size = suggested_size;
|
||
|
else
|
||
|
temp_size = config->Value("max_doc_size");
|
||
|
|
||
|
contents.allocate(temp_size + 100);
|
||
|
|
||
|
contentType = "";
|
||
|
|
||
|
if (loc)
|
||
|
{
|
||
|
Location(loc);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
|
||
|
//*****************************************************************************
|
||
|
// BasicDocument::~BasicDocument()
|
||
|
//
|
||
|
BasicDocument::~BasicDocument()
|
||
|
{
|
||
|
// We delete only the derived class objects
|
||
|
|
||
|
#if MEM_DEBUG
|
||
|
char *p = new char;
|
||
|
cout << "==== BasicDocument deleted: " << this << " new at " << ((void *) p) << endl;
|
||
|
delete p;
|
||
|
#endif
|
||
|
}
|
||
|
|
||
|
|
||
|
//*****************************************************************************
|
||
|
// void BasicDocument::Reset()
|
||
|
// Restore the BasicDocument object to an initial state.
|
||
|
//
|
||
|
void
|
||
|
BasicDocument::Reset()
|
||
|
{
|
||
|
|
||
|
id = 0;
|
||
|
location = 0;
|
||
|
title = 0;
|
||
|
metacontent = 0;
|
||
|
contents = 0;
|
||
|
|
||
|
contentType = 0;
|
||
|
document_length = 0;
|
||
|
|
||
|
}
|
||
|
|
||
|
//*****************************************************************************
|
||
|
// void BasicDocument::Length()
|
||
|
// Return/Calc length of BasicDocument... icummulative size of the Strings
|
||
|
//
|
||
|
int
|
||
|
BasicDocument::Length()
|
||
|
{
|
||
|
if (document_length < 0)
|
||
|
{
|
||
|
document_length = 0;
|
||
|
document_length += location.length();
|
||
|
document_length += title.length();
|
||
|
document_length += metacontent.length();
|
||
|
document_length += contents.length();
|
||
|
document_length += id.length();
|
||
|
}
|
||
|
|
||
|
return (document_length);
|
||
|
}
|
||
|
|
||
|
|
||
|
//*****************************************************************************
|
||
|
// Parsable *BasicDocument::getParsable()
|
||
|
// Given the content-type of a document, returns a document parser.
|
||
|
// This will first look through the list of user supplied parsers and
|
||
|
// then at our (limited) builtin list of parsers. The user supplied
|
||
|
// parsers are external programs that will be used.
|
||
|
|
||
|
Parsable *
|
||
|
BasicDocument::getParsable()
|
||
|
{
|
||
|
static HTML *html = 0;
|
||
|
static Plaintext *plaintext = 0;
|
||
|
static ExternalParser *externalParser = 0;
|
||
|
|
||
|
Parsable *parsable = 0;
|
||
|
|
||
|
if (ExternalParser::canParse(contentType))
|
||
|
{
|
||
|
if (externalParser)
|
||
|
{
|
||
|
delete externalParser;
|
||
|
}
|
||
|
externalParser = new ExternalParser(contentType);
|
||
|
parsable = externalParser;
|
||
|
}
|
||
|
else if (mystrncasecmp((char *) contentType, "text/html", 9) == 0)
|
||
|
{
|
||
|
if (!html)
|
||
|
html = new HTML();
|
||
|
parsable = html;
|
||
|
}
|
||
|
else if (mystrncasecmp((char *) contentType, "text/plain", 10) == 0)
|
||
|
{
|
||
|
if (!plaintext)
|
||
|
plaintext = new Plaintext();
|
||
|
parsable = plaintext;
|
||
|
}
|
||
|
else if (mystrncasecmp((char *) contentType, "text/css", 8) == 0)
|
||
|
{
|
||
|
return NULL;
|
||
|
}
|
||
|
else if (mystrncasecmp((char *) contentType, "text/", 5) == 0)
|
||
|
{
|
||
|
if (!plaintext)
|
||
|
plaintext = new Plaintext();
|
||
|
parsable = plaintext;
|
||
|
if (debug > 1)
|
||
|
{
|
||
|
cout << '"' << contentType << "\" not a recognized type. Assuming text/plain\n";
|
||
|
}
|
||
|
}
|
||
|
else
|
||
|
{
|
||
|
if (debug > 1)
|
||
|
{
|
||
|
cout << '"' << contentType << "\" not a recognized type. Ignoring\n";
|
||
|
}
|
||
|
return NULL;
|
||
|
}
|
||
|
|
||
|
parsable->setContents(contents.get(), contents.length());
|
||
|
return parsable;
|
||
|
}
|
||
|
|
||
|
//*****************************************************************************
|
||
|
//
|
||
|
// Test for self parseaable
|
||
|
//
|
||
|
int
|
||
|
BasicDocument::SelfParseable()
|
||
|
{
|
||
|
|
||
|
if (mystrncasecmp((char *) contentType, "text/vnd.customdocument", 10) == 0)
|
||
|
{
|
||
|
return (TRUE);
|
||
|
}
|
||
|
else
|
||
|
return (FALSE);
|
||
|
|
||
|
}
|
||
|
|
||
|
|
||
|
//*****************************************************************************
|
||
|
// Parsable *BasicDocument::internalParser()
|
||
|
int
|
||
|
BasicDocument::internalParser(TextCollector & textcollector)
|
||
|
{
|
||
|
HtConfiguration* config= HtConfiguration::config();
|
||
|
char *position = NULL;
|
||
|
static int minimumWordLength = config->Value("minimum_word_length", 3);
|
||
|
int wordIndex = 1;
|
||
|
String word;
|
||
|
int letter_count = 0;
|
||
|
|
||
|
//First Process Title
|
||
|
textcollector.got_title((char *) title);
|
||
|
|
||
|
//Next Process Contents
|
||
|
position = contents;
|
||
|
|
||
|
while (*position)
|
||
|
{
|
||
|
word = 0;
|
||
|
|
||
|
if (HtIsStrictWordChar(*position))
|
||
|
{
|
||
|
//
|
||
|
// Start of a word. Try to find the whole thing
|
||
|
//
|
||
|
//TODO NEAL RICHTER Imposed a 50-letter word length limit here
|
||
|
//
|
||
|
while (*position && HtIsWordChar(*position) && (letter_count < 50))
|
||
|
{
|
||
|
word << *position;
|
||
|
position++;
|
||
|
letter_count++;
|
||
|
}
|
||
|
|
||
|
letter_count = 0;
|
||
|
if (word.length() >= minimumWordLength)
|
||
|
{
|
||
|
textcollector.got_word((char *) word, wordIndex++, 0);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
if (*position)
|
||
|
position++;
|
||
|
|
||
|
}//end while
|
||
|
|
||
|
textcollector.got_head((char*) contents);
|
||
|
|
||
|
//Third, Process MetaContent
|
||
|
position = metacontent;
|
||
|
textcollector.got_meta_dsc(metacontent);
|
||
|
|
||
|
|
||
|
//max_meta_description_length???
|
||
|
|
||
|
while (*position)
|
||
|
{
|
||
|
word = 0;
|
||
|
|
||
|
if (HtIsStrictWordChar(*position))
|
||
|
{
|
||
|
//
|
||
|
// Start of a word. Try to find the whole thing
|
||
|
//
|
||
|
while (*position && HtIsWordChar(*position) && (letter_count < 50))
|
||
|
{
|
||
|
word << *position;
|
||
|
position++;
|
||
|
letter_count++;
|
||
|
}
|
||
|
|
||
|
letter_count = 0;
|
||
|
|
||
|
if (word.length() >= minimumWordLength)
|
||
|
{
|
||
|
textcollector.got_word((char *) word, wordIndex++, 9);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
if (*position)
|
||
|
position++;
|
||
|
|
||
|
}//end while
|
||
|
|
||
|
return(1);
|
||
|
}
|