blob: d44869a6ee40043f4cf1f36ea72d7b7e774728d6 (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
|
//--------------------------------------------------------------------
//
// TextCollector.h
//
// 2/6/2002 created for libhtdig
//
// Neal Richter nealr@rightnow.com
//
// TextCollector:
// General Purpose Text Document Indexer.
// Calls appropriate parsers.
// The parser notifies the TextCollector object that it got something
// (got_* functions) and the TextCollector object feeds the databases
// and statistics accordingly.
//
// Part of the ht://Dig package <http://www.htdig.org/>
// Copyright (c) 1995-2004 The ht://Dig Group
// For copyright details, see the file COPYING in your distribution
// or the GNU Library General Public License (LGPL) version 2 or later
// <http://www.gnu.org/copyleft/lgpl.html>
//
// $Id: TextCollector.h,v 1.4 2004/05/28 13:15:29 lha Exp $
//
//--------------------------------------------------------------------
#ifndef _TextCollector_h_
#define _TextCollector_h_
#include "BasicDocument.h"
#include "DocumentRef.h"
#include "Dictionary.h"
#include "Queue.h"
#include "HtWordReference.h"
#include "List.h"
#include "StringList.h"
#include "DocumentDB.h"
class Document;
class HtWordList;
//
// Flag values accepted by the TextCollector constructor (which defaults
// to TextCollector_noLog). The numeric values are written out
// explicitly; they are the same ones implicit numbering would assign.
//
enum TextCollectorLog {
    TextCollector_noLog   = 0,
    TextCollector_logUrl  = 1,
    TextCollector_Restart = 2
};
//
// TextCollector: general purpose text document indexer.
// Per the file header, a parser reports what it found through the
// got_* callbacks declared below, and this object feeds the databases
// and statistics accordingly.
//
// NOTE(review): the class stores raw pointers (base, current_ref, doc,
// d_md5) and declares no copy constructor or assignment operator, so any
// compiler-generated copy would copy those pointers memberwise.
// Ownership of the pointed-to objects is not visible in this header --
// confirm against the implementation before copying instances.
//
class TextCollector
{
public:
//
// Construction/Destruction
//
// `flags` defaults to TextCollector_noLog when omitted.
TextCollector(TextCollectorLog flags = TextCollector_noLog);
virtual ~TextCollector();
// Presumably indexes the document `adoc`; the meaning of the int result
// is defined by the implementation -- TODO confirm and document.
int IndexDoc(BasicDocument & adoc);
// Presumably flushes the word database (see `words`); the meaning of
// the int result is defined by the implementation -- TODO confirm.
int FlushWordDB();
//
// Report statistics about the parser
//
void ReportStatistics(const String& name);
//
// These are the callbacks that we need to write code for
//
// Each got_* function is invoked by a parser to report one item it
// found in the document (see the file header).
// NOTE(review): `location` and `heading` are presumably the word's
// position and heading level (cf. the `factor` weights below) -- confirm.
void got_word(const char *word, int location, int heading);
// `hops` defaults to 1 when omitted.
void got_href(URL &url, const char *description, int hops = 1);
void got_title(const char *title);
void got_time(const char *time);
void got_head(const char *head);
void got_meta_dsc(const char *md);
void got_anchor(const char *anchor);
void got_image(const char *src);
// NOTE(review): the three callbacks below leave their parameter unnamed
// in this declaration; what each string carries is presumably indicated
// by the function name -- confirm against the implementation.
void got_meta_email(const char *);
void got_meta_notification(const char *);
void got_meta_subject(const char *);
void got_noindex();
private:
//
// A hash to keep track of what we've seen
//
Dictionary visited;
// State for the document currently being processed (judging by the
// `current_` prefixes; lifetime and ownership of the pointers are set by
// the implementation -- TODO confirm).
URL *base;
String current_title;
String current_head;
String current_meta_dsc;
time_t current_time;
int current_id;
DocumentRef *current_ref;
int current_anchor_number;
int trackWords;
int n_links;
HtWordReference word_context;
// NOTE(review): HtWordList is only forward-declared in this header, yet
// it is held by value here, which requires the complete type. Presumably
// one of the included headers defines it -- confirm.
HtWordList words;
// NOTE(review): ints that look like on/off switches for duplicate
// detection by MD5 / date -- confirm against the implementation.
int check_unique_md5;
int check_unique_date;
TextCollectorLog log;
//
// These are weights for the words. The index is the heading level.
//
// 11 entries, i.e. valid indices are 0 through 10.
long int factor[11];
int currenthopcount;
//
// For efficiency reasons, we will only use one document object which
// we reuse.
//
BasicDocument *doc;
// NOTE(review): presumably the database backing check_unique_md5 --
// confirm.
Database *d_md5;
// Some useful constants
// NOTE(review): declared as a plain (non-const) int despite the heading
// above; presumably set once at construction -- confirm.
int minimumWordLength;
//
// Helper routines
//
void RetrievedDocument(DocumentRef *ref);
// NOTE(review): this is a data member, not a helper routine; its purpose
// is not visible from this header -- confirm.
int temp_doc_count;
};
#endif
|