debian/htdig/htdig-3.2.0b6/libhtdig/TextCollector.h


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133

//--------------------------------------------------------------------
//
// TextCollector.h
//
// 2/6/2002 created for libhtdig
//
// Neal Richter nealr@rightnow.com
//
// TextCollector:
//            General Purpose Text Document Indexer.
//            Calls appropriate parsers. 
//            The parser notifies the TextCollector object that it got something
//            (got_* functions) and the TextCollector object feeds the databases
//            and statistics accordingly.
//
// Part of the ht://Dig package   <http://www.htdig.org/>
// Copyright (c) 1995-2004 The ht://Dig Group
// For copyright details, see the file COPYING in your distribution
// or the GNU Library General Public License (LGPL) version 2 or later
// <http://www.gnu.org/copyleft/lgpl.html>
//
// $Id: TextCollector.h,v 1.4 2004/05/28 13:15:29 lha Exp $
//
//--------------------------------------------------------------------


#ifndef _TextCollector_h_
#define _TextCollector_h_

#include "BasicDocument.h"
#include "DocumentRef.h"
#include "Dictionary.h"
#include "Queue.h"
#include "HtWordReference.h"
#include "List.h"
#include "StringList.h"
#include "DocumentDB.h"

class Document;
class HtWordList;

//
// Flag values accepted by the TextCollector constructor's `flags` argument.
// The enumerators are numbered explicitly; the values are the same ones the
// compiler would assign implicitly (0, 1, 2).
//
enum TextCollectorLog
{
    TextCollector_noLog   = 0,    // the constructor's default argument
    TextCollector_logUrl  = 1,
    TextCollector_Restart = 2
};

//
// TextCollector
//
// General purpose text document indexer.  A parser reports what it finds
// through the got_* callbacks declared below, and this object feeds the
// databases and statistics accordingly.
//
class TextCollector
{
public:
    // ---- Construction / destruction -------------------------------------

    // `flags` defaults to TextCollector_noLog.
    TextCollector(TextCollectorLog flags = TextCollector_noLog);
    virtual ~TextCollector();

    // ---- Indexing entry points ------------------------------------------

    // Both return an int; the meaning of the value is defined by the
    // implementation file, not by this header.
    int IndexDoc(BasicDocument & adoc);
    int FlushWordDB();

    // ---- Statistics -----------------------------------------------------

    // Report statistics about the parser.
    void ReportStatistics(const String& name);

    // ---- Parser callbacks -----------------------------------------------

    // The got_* family: called to notify this object that a particular
    // piece of a document was encountered.
    void got_word(const char *word, int location, int heading);
    void got_href(URL &url, const char *description, int hops = 1);
    void got_title(const char *title);
    void got_time(const char *time);
    void got_head(const char *head);
    void got_meta_dsc(const char *md);
    void got_anchor(const char *anchor);
    void got_image(const char *src);
    void got_meta_email(const char *);
    void got_meta_notification(const char *);
    void got_meta_subject(const char *);
    void got_noindex();

private:
    // Hash that keeps track of what has already been seen.
    Dictionary visited;

    // State for the document currently being processed.
    // NOTE(review): who owns `base` and `current_ref` is not visible in
    // this header -- confirm against the implementation.
    URL *base;
    String current_title;
    String current_head;
    String current_meta_dsc;
    time_t current_time;
    int current_id;
    DocumentRef *current_ref;
    int current_anchor_number;
    int trackWords;
    int n_links;
    HtWordReference word_context;
    HtWordList words;

    // NOTE(review): presumably switches for duplicate detection by MD5 and
    // by date -- verify where they are set.
    int check_unique_md5;
    int check_unique_date;

    TextCollectorLog log;

    // Word weights; the array index is the heading level.
    long factor[11];
    int currenthopcount;

    // For efficiency a single document object is kept and reused.
    BasicDocument *doc;

    Database *d_md5;

    // Some useful constants.
    int minimumWordLength;

    // ---- Helper routines ------------------------------------------------

    void RetrievedDocument(DocumentRef *ref);

    int temp_doc_count;
};

#endif