// // htstat.cc // // htstat: A utility to give statistics on the contents of the word and doc DB. // // Part of the ht://Dig package // Copyright (c) 1999-2004 The ht://Dig Group // For copyright details, see the file COPYING in your distribution // or the GNU Library General Public License (LGPL) version 2 or later // // // $Id: htstat.cc,v 1.6 2004/05/28 13:15:25 lha Exp $ // #ifdef HAVE_CONFIG_H #include "htconfig.h" #endif /* HAVE_CONFIG_H */ #include "WordContext.h" #include "HtURLCodec.h" #include "HtWordList.h" #include "HtConfiguration.h" #include "DocumentDB.h" #include "defaults.h" #include #ifndef _MSC_VER /* _WIN32 */ #include #endif // If we have this, we probably want it. #ifdef HAVE_GETOPT_H #include #elif HAVE_GETOPT_LOCAL #include #endif int verbose = 0; void usage(); void reportError(char *msg); //***************************************************************************** // int main(int ac, char **av) // int main(int ac, char **av) { int alt_work_area = 0; int url_list = 0; String configfile = DEFAULT_CONFIG_FILE; int c; extern char *optarg; while ((c = getopt(ac, av, "vc:au")) != -1) { switch (c) { case 'c': configfile = optarg; break; case 'v': verbose++; break; case 'a': alt_work_area++; break; case 'u': url_list++; break; case '?': usage(); break; } } HtConfiguration* config= HtConfiguration::config(); config->Defaults(&defaults[0]); if (access((char*)configfile, R_OK) < 0) { reportError(form("Unable to find configuration file '%s'", configfile.get())); } config->Read(configfile); // // Check url_part_aliases and common_url_parts for // errors. String url_part_errors = HtURLCodec::instance()->ErrMsg(); if (url_part_errors.length() != 0) reportError(form("Invalid url_part_aliases or common_url_parts: %s", url_part_errors.get())); // We may need these through the methods we call if (alt_work_area != 0) { String configValue; configValue = config->Find("word_db"); if (configValue.length() != 0) { configValue << ".work"; config->Add("word_db", configValue); } configValue = config->Find("doc_db"); if (configValue.length() != 0) { configValue << ".work"; config->Add("doc_db", configValue); } configValue = config->Find("doc_index"); if (configValue.length() != 0) { configValue << ".work"; config->Add("doc_index", configValue); } configValue = config->Find("doc_excerpt"); if (configValue.length() != 0) { configValue << ".work"; config->Add("doc_excerpt", configValue); } } DocumentDB docs; if (docs.Read(config->Find("doc_db"), config->Find("doc_index"), config->Find("doc_excerpt")) == OK) { List *urls = docs.URLs(); cout << "htstat: Total documents: " << urls->Count() << endl; if (url_list) { // Spit out the list of URLs too String *url; cout << "htstat: URLs in database: " << endl; urls->Start_Get(); while ((url = (String *) urls->Get_Next())) { cout << "\t" << url->get() << endl; } } delete urls; docs.Close(); } // Initialize htword WordContext::Initialize(*config); HtWordList words(*config); if(words.Open(config->Find("word_db"), O_RDONLY) == OK) { cout << "htstat: Total words: " << words.WordRefs()->Count() << endl; cout << "htstat: Total unique words: " << words.Words()->Count() << endl; words.Close(); } return 0; } //***************************************************************************** // void usage() // Display program usage information // void usage() { cout << "usage: htstat [-v][-a][-c configfile][-u]\n"; cout << "This program is part of ht://Dig " << VERSION << "\n\n"; cout << "Options:\n"; cout << "\t-v\tVerbose mode. This increases the verbosity of the\n"; cout << "\t\tprogram. Using more than 2 is probably only useful\n"; cout << "\t\tfor debugging purposes. The default verbose mode\n"; cout << "\t\tgives a progress on what it is doing and where it is.\n\n"; cout << "\t-a\tUse alternate work files.\n"; cout << "\t\tTells htstat to append .work to the database files \n"; cout << "\t\tallowing it to operate on a second set of databases.\n"; cout << "\t-c configfile\n"; cout << "\t\tUse the specified configuration file instead on the\n"; cout << "\t\tdefault.\n\n"; cout << "\t-u\tGive a list of URLs in the document database.\n\n"; exit(0); } //***************************************************************************** // Report an error and die // void reportError(char *msg) { cout << "htstat: " << msg << "\n\n"; exit(1); }