//
// $Id: defaults.cc,v 1.112 2004/06/12 13:39:12 lha Exp $
//
#ifdef HAVE_CONFIG_H
#include "htconfig.h"
#endif /* HAVE_CONFIG_H */
#include "HtConfiguration.h"
// Fields and their values:
// Attribute name
// Default value ("" becomes "no default" in .html docs)
// Type (boolean, number, integer, string, string list, quoted string list,
// pattern list)
// Commands using attribute (all, htdig, htsearch, htfuzzy,
// htdump, htload, htnotify, htpurge)
// Block (Global, Server, URL)
// Versions for which attribute is present
// Class (Extra Output, External:Parsers, External:Protocols,
// File Layout,
// Indexing:Connection, Indexing:Out, Indexing:What, Indexing:Where,
// Presentation:Files, Presentation:How, Presentation:Text,
// Searching:Method, Searching:Ranking, Searching:UI,
// URLs)
// Example
// Description
ConfigDefaults defaults[] =
{
{ "accents_db", "${database_base}.accents.db", \
"string", "htfuzzy htsearch", "", "all", "File Layout", "accents_db: ${database_base}.uml.db", " \
The database file used for the fuzzy \"accents\" search \
algorithm. This database is created by \
htfuzzy and used by \
htsearch. \
" }, \
{ "accept_language", "", \
"string list", "htdig", "Server", "3.2.0b4", "Indexing:Out", "accept_language: en-us en it", " \
This attribute allows you to restrict the set of natural languages \
that are preferred as a response to an HTTP request performed by the \
digger. This can be done by putting one or more language tags \
(as defined by RFC 1766) in the preferred order, separated by spaces. \
By doing this, when the server performs a content negotiation based \
on the 'accept-language' given by the HTTP user agent, a different \
content can be shown depending on the value of this attribute. If \
set to an empty list, no language will be sent and the server default \
will be returned. \
" }, \
{ "add_anchors_to_excerpt", "true", \
"boolean", "htsearch", "", "3.1.0", "Presentation:How", "add_anchors_to_excerpt: no", " \
If set to true, the first occurrence of each matched \
word in the excerpt will be linked to the closest \
anchor in the document. This only has effect if the \
EXCERPT variable is used in the output \
template and the excerpt is actually going to be displayed. \
" }, \
{ "allow_double_slash", "false", \
"boolean", "htdig", "", "3.2.0b4", "Indexing:Out", "allow_double_slash: true", " \
If set to true, strings of multiple slashes ('/') in URL paths \
will be left intact, rather than being collapsed. This is necessary \
for some search engine URLs which use slashes to separate fields rather \
than to separate directory components. However, it can lead to multiple database \
entries referring to the same file, and it causes '/foo//../' to \
be equivalent to '/foo/', rather than to '/'. \
" }, \
{ "allow_in_form", "", \
"string list", "htsearch", "", "3.1.0", "Searching:UI", "allow_in_form: search_algorithm search_results_header", " \
Allows the specified config file attributes to be specified \
in search forms as separate fields. This could be used to \
allow form writers to design their own headers and footers \
and specify them in the search form. Another example would \
be to offer a menu of search_algorithms in the form. \
\
\
\
\
<SELECT NAME=\"search_algorithm\"> \
<OPTION VALUE=\"exact:1 prefix:0.6 synonyms:0.5 endings:0.1\" SELECTED>fuzzy \
<OPTION VALUE=\"exact:1\">exact \
</SELECT> \
| \
\
\
The general idea behind this is to make an input parameter out \
of any configuration attribute that's not already automatically \
handled by an input parameter. You can even make up your own \
configuration attribute names, for purposes of passing data from \
the search form to the results output. You're not restricted to \
the existing attribute names. The attributes listed in the \
allow_in_form list will be settable in the search form using \
input parameters of the same name, and will be propagated to \
the follow-up search form in the results template using template \
variables of the same name in upper-case. \
You can also make select lists out of any of these input \
parameters, in the follow-up search form, using the \
build_select_lists \
configuration attribute. \
WARNING: Extreme care should be taken with this option, as \
allowing CGI scripts to set file names can open security holes.\
" }, \
{ "allow_numbers", "false", \
"boolean", "htdig htsearch", "", "all", "Indexing:What", "allow_numbers: true", " \
If set to true, numbers are considered words. This \
means that searches can be done on strings of digits as well as \
regular words. All the same rules apply to numbers as \
to words. This does not cause numbers containing a decimal point or \
commas to be treated as a single entity. \
When allow_numbers is false, words are still \
allowed to contain digits, but they must also contain at \
least one alphabetic character or \
extra word character. \
To disallow digits in words, add the digits to \
valid_punctuation. \
" }, \
{ "allow_space_in_url", "false", \
"boolean", "htdig", "", "3.2.0b6", "Indexing:Where", "allow_space_in_url: true", " \
If set to true, htdig will handle URLs that contain \
embedded spaces. Technically, this is a violation of \
RFC 2396, which says spaces should be stripped out \
(as htdig does by default). However, many web browsers \
and HTML code generators violate this standard already, \
so enabling this attribute allows htdig to handle these \
non-compliant URLs. Even with this attribute set, htdig \
still strips out all white space (leading, trailing and \
embedded), except that space characters embedded within \
the URL will be encoded as %20. \
" }, \
{ "allow_virtual_hosts", "true", \
"boolean", "htdig", "", "3.0.8b2", "Indexing:Where", "allow_virtual_hosts: false", " \
If set to true, htdig will index virtual web sites as \
expected. If false, all URL host names will be \
normalized into whatever the DNS server claims the IP \
address to map to. If this option is set to false, \
there is no way to index either \"soft\" or \"hard\" \
virtual web sites. \
" }, \
{ "anchor_target", "", \
"string", "htsearch", "", "3.1.6", "Presentation:How", "anchor_target: body", " \
When the first matched word in the excerpt is linked \
to the closest anchor in the document, this string \
can be set to specify a target in the link so the \
resulting page is displayed in the desired frame. \
This value will only be used if the \
add_anchors_to_excerpt \
attribute is set to true, the EXCERPT \
variable is used in the output template and the \
excerpt is actually displayed with a link. \
" }, \
{ "any_keywords", "false", \
"boolean", "htsearch", "", "3.2.0b2", "Searching:Method", "any_keywords: yes", " \
If set to true, the words in the keywords \
input parameter in the search form will be joined with logical \
ORs rather than ANDs, so that any of the words provided will do. \
Note that this has nothing to do with limiting the search to \
words in META keywords tags. See the \
search form documentation for details on this. \
" }, \
{ "author_factor", "1", \
"number", "htsearch", "", "3.2.0b4", "Searching:Ranking", "author_factor: 1", " \
Weighting applied to words in a <meta name=\"author\" ... > \
tag.
\
See also heading_factor. \
" }, \
{ "authorization", "", \
"string", "htdig", "URL", "3.1.4", "Indexing:Out", "authorization: myusername:mypassword", " \
This tells htdig to send the supplied \
username:password with each HTTP request. \
The credentials will be encoded using the \"Basic\" authentication \
scheme. There must be a colon (:) between the username and \
password.
\
This attribute can also be specified on htdig's command line using \
the -u option, and will be blotted out so it won't show up in a \
process listing. If you use it directly in a configuration file, \
be sure to protect it so it is readable only by you, and do not \
use that same configuration file for htsearch. \
" }, \
{ "backlink_factor", "0.1", \
"number", "htsearch", "", "3.1.0", "Searching:Ranking", "backlink_factor: 501.1", " \
This is a weight of \"how important\" a page is, based on \
the number of URLs pointing to it. It's actually \
multiplied by the ratio of the incoming URLs (backlinks) \
and outgoing URLs (links on the page), to balance out pages \
with lots of links to pages that link back to them. The ratio \
gives lower weight to \"link farms\", which often have many \
links to them. This factor can \
be changed without changing the database in any way. \
However, setting this value to something other than 0 \
incurs a slowdown on search results. \
" }, \
{ "bad_extensions", ".wav .gz .z .sit .au .zip .tar .hqx .exe .com .gif .jpg .jpeg .aiff .class .map .ram .tgz .bin .rpm .mpg .mov .avi .css", \
"string list", "htdig", "URL", "all", "Indexing:Where", "bad_extensions: .foo .bar .bad", " \
This is a list of extensions on URLs which are \
considered non-parsable. This list is used mainly to \
supplement the MIME-types that the HTTP server provides \
with documents. Some HTTP servers do not have a correct \
list of MIME-types and so can advertise certain \
documents as text while they are some binary format. \
If the list is empty, then all extensions are acceptable, \
provided they pass other criteria for acceptance or rejection. \
See also valid_extensions. \
" }, \
{ "bad_local_extensions", ".php .shtml .cgi", \
"string list", "htdig", "URL", "all", "Indexing:Where", "bad_local_extensions: .foo .bar .bad", " \
This is a list of extensions on URLs which must be retrieved \
using the URL's true transport mechanism (such as HTTP). \
If local_urls is specified, URLs not \
ending with these extensions may instead be retrieved through \
the local filesystem for efficiency. \
" },
{ "bad_querystr", "", \
"pattern list", "htdig", "URL", "3.1.0", "Indexing:Where", "bad_querystr: forum=private section=topsecret&passwd=required", " \
This is a list of CGI query strings to be excluded from \
indexing. This can be used in conjunction with CGI-generated \
portions of a website to control which pages are \
indexed. \
" }, \
{ "bad_word_list", "${common_dir}/bad_words", \
"string", "htdig htsearch", "", "all", "Indexing:What,Searching:Method", "bad_word_list: ${common_dir}/badwords.txt", " \
This specifies a file which contains words which should \
be excluded when digging or searching. This list should \
include the most common words or other words that you \
don't want to be able to search on (things like \
sex or smut are examples of these.)
\
The file should contain one word per line. A sample \
bad words file is located in the contrib/examples
\
directory. \
" }, \
{ "bin_dir", BIN_DIR, \
"string", "all", "", "all", "File Layout", "bin_dir: /usr/local/bin", " \
This is the directory in which the executables \
related to ht://Dig are installed. It is never used \
directly by any of the programs, but other attributes \
can be defined in terms of this one. \
\
The default value of this attribute is determined at \
compile time. \
\
" }, \
{ "boolean_keywords", "and or not", \
"string list", "htsearch", "", "3.1.6", "Presentation:How", "boolean_keywords: et ou non", " \
These three strings are used as the keywords used in \
constructing the \
LOGICAL_WORDS \
template variable, \
and in parsing the words input \
parameter when the method \
parameter or match_method attribute \
is set to boolean
. \
See also the \
boolean_syntax_errors attribute. \
" },
{ "boolean_syntax_errors", "Expected \
'a search word, a quoted phrase or a boolean expression between ()' \
'at the end' 'instead of' 'end of expression' quotes", \
"quoted string list", "htsearch", "", "3.1.6", "Presentation:How",
"boolean_syntax_errors: Attendait \"un mot\" \"à la fin\" \
\"au lieu de\" \"fin d'expression\" \"guillemet\"", " \
These six strings are used as the keywords used to \
construct various syntax error messages for errors encountered in \
parsing the words input \
parameter when the method parameter \
or match_method attribute \
is set to boolean
. \
They are used in conjunction with the \
boolean_keywords attribute, and \
comprise all \
English-specific parts of these error messages. The order in which \
the strings are put together may not be ideal, or even gramatically \
correct, for all languages, but they can be used to make fairly \
intelligible messages in many languages. \
" },
{ "build_select_lists", "", \
"quoted string list", "htsearch", "", "3.2.0b1", "Searching:UI", "build_select_lists: \
MATCH_LIST matchesperpage matches_per_page_list \\ \
\
1 1 1 matches_per_page \"Previous Amount\" \\ \
\
RESTRICT_LIST,multiple restrict restrict_names 2 1 2 restrict \"\" \\ \
\
FORMAT_LIST,radio format template_map 3 2 1 template_name \"\"", " \
This list allows you to define any htsearch input parameter as \
a select list for use in templates, provided you also define \
the corresponding name list attribute which enumerates all the \
choices to put in the list. It can be used for existing input \
parameters, as well as any you define using the \
allow_in_form \
attribute. The entries in this list each consist of an octuple, \
a set of eight strings defining the variables and how they are to \
be used to build a select list. The attribute can contain many \
of these octuples. The strings in the string list are merely \
taken eight at a time. For each octuple of strings specified in \
build_select_lists, the elements have the following meaning: \
\
- the name of the template variable to be defined as a list, \
optionally followed by a comma and the type of list, and \
optional formatting codes \
- the input parameter name that the select list will set \
- the name of the user-defined attribute containing the \
name list \
- the tuple size used in the name list above \
- the index into a name list tuple for the value \
- the index for the corresponding label on the selector \
- the configuration attribute where the default value for \
this input parameter is defined \
- the default label, if not an empty string, which will be \
used as the label for an additional list item for the current \
input parameter value if it doesn't match any value in the \
given list \
\
See the select list documentation \
for more information on this attribute. \
" }, \
{ "caps_factor", "1", \
"number", "htsearch", "", "??", "Searching:Ranking", "caps_factor: 1", " \
TO BE COMPLETED
\
See also heading_factor. \
" }, \
{ "case_sensitive", "true", \
"boolean", "htdig", "", "3.1.0b2", "Indexing:Where", "case_sensitive: false", " \
This specifies whether ht://Dig should consider URLs \
case-sensitive or not. If your server is case-insensitive, \
you should probably set this to false.
\
Even if this is false, \
common_url_parts, \
url_part_aliases and \
url_rewrite_rules \
are all still case sensitive, and \
server_aliases \
is still case insensitive. \
" }, \
{ "check_unique_date", "false", \
"boolean", "htdig", "Global", "3.2.0b3", "", "check_unique_date: false", " \
Include the modification date of the page in the MD5 hash, to reduce the \
problem with identical but physically separate pages in different parts of the tree pointing to \
different pages. \
" }, \
{ "check_unique_md5", "false", \
"boolean", "htdig", "Global", "3.2.0b3", "", "check_unique_md5: false", " \
Uses the MD5 hash of pages to reject aliases, prevents multiple entries \
in the index caused by such things as symbolic links \
Note: May not do the right thing for incremental update \
" }, \
{ "collection_names", "", \
"string list", "htsearch", "", "3.2.0b2", "", "collection_names: htdig_docs htdig_bugs", " \
This is a list of config file names that are used for searching multiple databases. \
Simply put, htsearch will loop through the databases specified by each of these config \
files and present the result of the search on all of the databases. \
The corresponding config files are looked up in the config_dir directory. \
Each listed config file must exist, as well as the corresponding databases. \
" }, \
{ "common_dir", COMMON_DIR, \
"string", "all", "", "all", "File Layout", "common_dir: /tmp", " \
Specifies the directory for files that will or can be \
shared among different search databases. The default \
value for this attribute is defined at compile time. \
" }, \
{ "common_url_parts", "http:// http://www. ftp:// ftp://ftp. /pub/ .html .htm .shtml /index.html /index.htm .com/ .com mailto:", \
"string list", "all", "", "3.1.0", "URLs", "common_url_parts: http://www.htdig.org/ml/ \\
\
.html \\
\
http://dev.htdig.org/ \\
\
http://www.htdig.org/", " \
Sub-strings often found in URLs stored in the \
database. These are replaced in the database by an \
internal space-saving encoding. If a string \
specified in url_part_aliases, \
overlaps any string in common_url_parts, the \
common_url_parts string is ignored.
\
Note that when this attribute is changed, the \
database should be rebuilt, unless the effect of \
\"changing\" the affected URLs in the database is \
wanted.
\
" }, \
{ "compression_level", "6", \
"integer", "htdig", "", "3.1.0", "Indexing:How", "compression_level: 0", " \
If non-zero and the \
zlib \
compression library was available when compiled, \
this attribute controls the amount of compression used in the \
doc_excerpt file. \
This must be in the range 0-9, and must be non-zero when \
wordlist_compress_zlib \
is used. \
" }, \
{ "config", "", \
"string", "all", "", "??", "File Layout", "", " \
Name of configuration file to load. \
For security reasons, restrictions are placed on the values which \
can be specified on the command line to \
htsearch. \
The default value of this attribute is determined at \
compile time. \
" }, \
{ "config_dir", CONFIG_DIR, \
"string", "all", "", "all", "File Layout", "config_dir: /var/htdig/conf", " \
This is the directory which contains all configuration \
files related to ht://Dig. It is never used \
directly by any of the programs, but other attributes \
or the include directive \
can be defined in terms of this one. \
\
The default value of this attribute is determined at \
compile time. \
\
" },
{ "content_classifier", "${bin_dir}/HtFileType", \
"string", "htdig", "", "3.2.0b4", "Indexing:What", "content_classifier: file -i -b", " \
When ht://Dig can't determine the type of a file://
\
URL from its extension, this program is used to determine the type. \
The program is called with one argument, the name of (possibly a \
temporary copy of) the file. \
\
See also mime_types.\
\
" }, \
{ "cookies_input_file", "", \
"string", "htdig", "", "3.2.0b4", "Indexing:Connection", "cookies_input_file: ${common_dir}/cookies.txt", " \
Specifies the location of the file used for importing cookies \
for the crawl. These cookies will be preloaded into htdig's \
in-memory cookie jar, but aren't written back to the file. \
Cookies are specified according to Netscape's format \
(tab-separated fields). If this attribute is left blank, \
no cookie file will be read. \
For more information, see the sample cookies.txt file in the \
ht://Dig source distribution. \
" }, \
{ "create_image_list", "false", \
"boolean", "htdig", "", "all", "Extra Output", "create_image_list: yes", " \
If set to true, a file with all the image URLs that \
were seen will be created, one URL per line. This list \
will not be in any order and there will be lots of \
duplicates, so after htdig has completed, it should be \
piped through sort -u
to get a unique list. \
" }, \
{ "create_url_list", "false", \
"boolean", "htdig", "", "all", "Extra Output", "create_url_list: yes", " \
If set to true, a file with all the URLs that were seen \
will be created, one URL per line. This list will not \
be in any order and there will be lots of duplicates, \
so after htdig has completed, it should be piped \
through sort -u
to get a unique list. \
" }, \
{ "database_base", "${database_dir}/db", \
"string", "all", "", "all", "File Layout", "database_base: ${database_dir}/sales", " \
This is the common prefix for files that are specific \
to a search database. Many different attributes use \
this prefix to specify filenames. Several search \
databases can share the same directory by just changing \
this value for each of the databases. \
" }, \
{ "database_dir", DATABASE_DIR, \
"string", "all", "", "all", "File Layout", "database_dir: /var/htdig", " \
This is the directory which contains all database and \
other files related to ht://Dig. It is never used \
directly by any of the programs, but other attributes \
are defined in terms of this one. \
\
The default value of this attribute is determined at \
compile time. \
\
" }, \
{ "date_factor", "0", \
"number", "htsearch", "", "3.1.0", "Searching:Ranking", "date_factor: 0.35", " \
This factor gives higher \
rankings to newer documents and lower rankings to older \
documents. Before setting this factor, it's advised to \
make sure your servers are returning accurate dates \
(check the dates returned in the long format). \
Additionally, setting this to a nonzero value incurs a \
small performance hit on searching. \
" }, \
{ "date_format", "", \
"string", "htsearch", "", "3.1.2", "Presentation:How", "date_format: %Y-%m-%d", " \
This format string determines the output format for \
modification dates of documents in the search results. \
It is interpreted by your system's strftime \
function. Please refer to your system's manual page \
for this function, for a description of available \
format codes. If this format string is empty, as it \
is by default, \
htsearch \
will pick a format itself. In this case, the iso_8601 attribute can be used \
to modify the appearance of the date. \
" }, \
{ "description_factor", "150", \
"number", "htsearch", "", "3.1.0b3", "Searching:Ranking", "description_factor: 350", " \
Plain old \"descriptions\" are the text of a link pointing \
to a document. This factor gives weight to the words of \
these descriptions of the document. Not surprisingly, \
these can be pretty accurate summaries of a document's \
content. See also heading_factor \
and meta_description_factor. \
" }, \
{ "description_meta_tag_names", "description", \
"string list", "htdig", "", "3.1.6", "Searching:Ranking", "description_meta_tag_names: \"description htdig-description\"", " \
The words in this list are used to search for descriptions in HTML \
META tags. This list can contain any number of strings \
that each will be seen as the name for whatever description \
convention is used. While words in any of the specified \
description contents will be indexed, only the last meta tag \
containing a description will be kept for the \
\
variable in search results. The order in \
which the names are specified in this configuration attribute \
is irrelevant, as it is the order in which the tags appear in \
the documents that matters.
The META tags have the \
following format:
\
<META name=\"somename\" \
content=\"somevalue\">
\
See also meta_description_factor. \
" }, \
{ "disable_cookies", "true", \
"boolean", "htdig", "Server", "3.2.0b4", "Indexing:Connection", "disable_cookies: true", " \
This option, if set to true, will disable HTTP cookies. \
" }, \
{ "doc_db", "${database_base}.docdb", \
"string", "all", "", "all", "File Layout", "doc_db: ${database_base}documents.db", " \
This file will contain a Berkeley database of documents \
indexed by document number. It contains all the information \
gathered for each document, except the document excerpts \
which are stored in the \
doc_excerpt file. \
" }, \
{ "doc_excerpt", "${database_base}.excerpts", \
"string", "all", "", "3.2.0b1", "File Layout", "doc_excerpt: ${database_base}excerpts.db", " \
This file will contain a Berkeley database of document excerpts \
indexed by document number. It contains all the text \
gathered for each document, so this file can become \
rather large if \
max_head_length is set to a large value. \
The size can be reduced by setting the \
compression_level, \
if supported on your system. \
" }, \
{ "doc_index", "${database_base}.docs.index", \
"string", "htdig", "", "all", "File Layout", "doc_index: documents.index.db", " \
This file contains a mapping of document numbers to URLs and is \
used by htdig during indexing. It is used on updates if it exists. \
" }, \
{ "doc_list", "${database_base}.docs", \
"string", "htdig htdump htload", "", "all", "File Layout", "doc_list: /tmp/documents.text", " \
This file is basically a text version of the file \
specified in doc_db. Its \
only use is to have a human readable database of all \
documents. The file is easy to parse with tools like \
perl or tcl. \
" }, \
{ "endday", "", \
"integer", "htsearch", "", "3.1.6", "Searching:Method", "endday: 31", " \
Day component of last date allowed as last-modified date \
of returned documents. \
This is most usefully specified as a \
CGI argument. \
See also startyear. \
" }, \
{ "end_ellipses", " ...
", \
"string", "htsearch", "", "all", "Presentation:Text", "end_ellipses: ...", " \
When excerpts are displayed in the search output, this \
string will be appended to the excerpt if there is text \
following the text displayed. This is just a visual \
reminder to the user that the excerpt is only part of \
the complete document. \
" }, \
{ "end_highlight", "", \
"string", "htsearch", "", "3.1.4", "Presentation:Text", "end_highlight: </font>", " \
When excerpts are displayed in the search output, matched \
words will be highlighted using \
start_highlight and this string. \
You should ensure that highlighting tags are balanced, \
that is, this string should close any formatting \
tag opened by start_highlight. \
" }, \
{ "endings_affix_file", "${common_dir}/english.aff", \
"string", "htfuzzy", "", "all", "File Layout", "endings_affix_file: /var/htdig/affix_rules", " \
Specifies the location of the file which contains the \
affix rules used to create the endings search algorithm \
databases. Consult the documentation on \
htfuzzy for more information on the \
format of this file. \
" }, \
{ "endings_dictionary", "${common_dir}/english.0", \
"string", "htfuzzy", "", "all", "File Layout", "endings_dictionary: /var/htdig/dictionary", " \
Specifies the location of the file which contains the \
dictionary used to create the endings search algorithm \
databases. Consult the documentation on \
htfuzzy for more information on the \
format of this file. \
" }, \
{ "endings_root2word_db", "${common_dir}/root2word.db", \
"string", "htfuzzy htsearch", "", "all", "File Layout", "endings_root2word_db: /var/htdig/r2w.db", " \
This attributes specifies the database filename to be \
used in the 'endings' fuzzy search algorithm. The \
database maps word roots to all legal words with that \
root. For more information about this and other fuzzy \
search algorithms, consult the \
htfuzzy documentation.
\
Note that the default value uses the \
common_dir attribute instead of the \
database_dir attribute. \
This is because this database can be shared with \
different search databases. \
" }, \
{ "endings_word2root_db", "${common_dir}/word2root.db", \
"string", "htfuzzy htsearch", "", "all", "File Layout", "endings_word2root_db: /var/htdig/w2r.bm", " \
This attributes specifies the database filename to be \
used in the 'endings' fuzzy search algorithm. The \
database maps words to their root. For more information \
about this and other fuzzy search algorithms, consult \
the htfuzzy \
documentation.
\
Note that the default value uses the \
common_dir attribute instead of the \
database_dir attribute. \
This is because this database can be shared with \
different search databases. \
" }, \
{ "endmonth", "", \
"integer", "htsearch", "", "3.1.6", "Searching:Method", "endmonth: 12", " \
Month component of last date allowed as last-modified date \
of returned documents. \
This is most usefully specified as a \
CGI argument. \
See also startyear. \
" }, \
{ "endyear", "", \
"integer", "htsearch", "", "3.1.6", "Searching:Method", "endyear: 2002", " \
Year component of last date allowed as last-modified date \
of returned documents. \
This is most usefully specified as a \
CGI argument. \
See also startyear. \
" }, \
{ "excerpt_length", "300", \
"integer", "htsearch", "", "all", "Presentation:How", "excerpt_length: 500", " \
This is the maximum number of characters the displayed \
excerpt will be limited to. The first matched word will \
be highlighted in the middle of the excerpt so that there is \
some surrounding context.
\
The \
start_ellipses and \
end_ellipses are used to \
indicate that the document contains text before and \
after the displayed excerpt respectively. \
The start_highlight and \
end_highlight are used to \
specify what formatting tags are used to highlight matched words. \
" }, \
{ "excerpt_show_top", "false", \
"boolean", "htsearch", "", "all", "Presentation:How", "excerpt_show_top: yes", " \
If set to true, the excerpt of a match will always show \
the top of the matching document. If it is false (the \
default), the excerpt will attempt to show the part of \
the document that actually contains one of the words. \
" }, \
{ "exclude", "", \
"pattern list", "htsearch", "", "3.2.0b4", "Searching:Method", "exclude: myhost.com/mailarchive/", " \
If a URL contains any of the space separated patterns, it will be \
discarded in the searching phase. This is used to exclude certain \
URLs from search results. The list can be specified from within \
the configuration file, and can be overridden with the \"exclude\" \
input parameter in the search form. \
" }, \
{ "exclude_urls", "/cgi-bin/ .cgi", \
"pattern list", "htdig", "URL", "all", "Indexing:Where", "exclude_urls: students.html cgi-bin", " \
If a URL contains any of the space separated patterns, \
it will be rejected. This is used to exclude such \
common things such as an infinite virtual web-tree \
which start with cgi-bin. \
" }, \
{ "external_parsers", "", \
"quoted string list", "htdig", "", "3.0.7", "External:Parsers", "external_parsers: text/html /usr/local/bin/htmlparser \\
\
application/pdf /usr/local/bin/parse_doc.pl \\
\
application/msword->text/plain \"/usr/local/bin/mswordtotxt -w\" \\
\
application/x-gunzip->user-defined /usr/local/bin/ungzipper", " \
This attribute is used to specify a list of \
content-type/parsers that are to be used to parse \
documents that cannot by parsed by any of the internal \
parsers. The list of external parsers is examined \
before the builtin parsers are checked, so this can be \
used to override the internal behavior without \
recompiling htdig.
\
The external parsers are specified as pairs of \
strings. The first string of each pair is the \
content-type that the parser can handle while the \
second string of each pair is the path to the external \
parsing program. If quoted, it may contain parameters, \
separated by spaces.
\
External parsing can also be done with external \
converters, which convert one content-type to \
another. To do this, instead of just specifying \
a single content-type as the first string \
of a pair, you specify two types, in the form \
type1->type2, \
as a single string with no spaces. The second \
string will define an external converter \
rather than an external parser, to convert \
the first type to the second. If the second \
type is user-defined, then \
it's up to the converter script to put out a \
\"Content-Type: type\" header followed \
by a blank line, to indicate to htdig what type it \
should expect for the output, much like what a CGI \
script would do. The resulting content-type must \
be one that htdig can parse, either internally, \
or with another external parser or converter.
\
Only one external parser or converter can be \
specified for any given content-type. However, \
an external converter for one content-type can be \
chained to the internal parser for the same type, \
by appending -internal to the \
second type string (e.g. text/html->text/html-internal) \
to perform external preprocessing on documents of \
this type before internal parsing. \
There are two internal parsers, for text/html and \
text/plain. \
The parser program takes four command-line \
parameters, not counting any parameters already \
given in the command string:
\
infile content-type URL configuration-file
\
\
\
Parameter | \
Description | \
Example | \
\
\
infile | \
A temporary file with the contents to be parsed. | \
/var/tmp/htdext.14242 | \
\
\
content-type | \
The MIME-type of the contents. | \
text/html | \
\
\
URL | \
The URL of the contents. | \
http://www.htdig.org/attrs.html | \
\
\
configuration-file | \
The configuration-file in effect. | \
/etc/htdig/htdig.conf | \
\
\
The external parser is to write information for \
htdig on its standard output. Unless it is an \
external converter, which will output a document \
of a different content-type, then its output must \
follow the format described here.
\
The output consists of records, each record terminated \
with a newline. Each record is a series of (unless \
expressively allowed to be empty) non-empty tab-separated \
fields. The first field is a single character \
that specifies the record type. The rest of the fields \
are determined by the record type. \
\
\
Record type | \
Fields | \
Description | \
\
\
w | \
word | \
A word that was found in the document. | \
\
\
location | \
\
A number indicating the normalized location of \
the word within the document. The number has to \
fall in the range 0-1000 where 0 means the top of \
the document. \
| \
\
\
heading level | \
\
A heading level that is used to compute the \
weight of the word depending on its context in \
the document itself. The level is in the range of \
0-11 and are defined as follows: \
\
- 0
- Normal text
\
- 1
- Title text
\
- 2
- Heading 1 text
\
- 3
- Heading 2 text
\
- 4
- Heading 3 text
\
- 5
- Heading 4 text
\
- 6
- Heading 5 text
\
- 7
- Heading 6 text
\
- 8
- text alternative to images
\
- 9
- Keywords
\
- 10
- Meta-description
\
- 11
- Author
\
\
| \
\
\
u | \
document URL | \
\
A hyperlink to another document that is \
referenced by the current document. It must be \
complete and non-relative, using the URL parameter to \
resolve any relative references found in the document. \
| \
\
\
hyperlink description | \
\
For HTML documents, this would be the text \
between the <a href...> and </a> \
tags. \
| \
\
\
t | \
title | \
The title of the document | \
\
\
h | \
head | \
\
The top of the document itself. This is used to \
build the excerpt. This should only contain \
normal ASCII text \
| \
\
\
a | \
anchor | \
\
The label that identifies an anchor that can be \
used as a target in an URL. This really only \
makes sense for HTML documents. \
| \
\
\
i | \
image URL | \
\
An URL that points at an image that is part of \
the document. \
| \
\
\
m | \
http-equiv | \
\
The HTTP-EQUIV attribute of a \
META tag. \
May be empty. \
| \
\
\
name | \
\
The NAME attribute of this \
META tag. \
May be empty. \
| \
\
\
contents | \
\
The CONTENTS attribute of this \
META tag. \
May be empty. \
| \
\
\
See also FAQ questions 4.8 and \
4.9 for more examples.
\
" }, \
/* external_protocols entry. The doubled backslash in the example field is a literal backslash for the reader and needs its own continuation backslash after it, so it does not merge with the following line. */ \
{ "external_protocols", "", \
"quoted string list", "htdig", "", "3.2.0b1", "External:Protocols", "external_protocols: https /usr/local/bin/handler.pl \\ \
\
ftp /usr/local/bin/ftp-handler.pl", " \
This attribute is a bit like \
external_parsers since it specifies \
a list of protocols/handlers that are used to download documents \
that cannot be retrieved using the internal methods. This enables \
htdig to index documents with URL schemes it does not understand, \
or to use more advanced authentication for the documents it is \
retrieving. This list is checked before HTTP or other methods, \
so this can override the internal behavior without writing additional \
code for htdig. \
\
The external protocols are specified as pairs of strings, the first \
being the URL scheme that the script can handle while the second \
is the path to the script itself. If the second is \
quoted, then additional command-line arguments may be given. \
\
If the external protocol does not contain a colon (:), it is assumed \
to have the standard format \
\"protocol://[usr[:password]@]address[:port]/path\". \
If it ends with a colon, then it is assumed to have the simpler format \
\"protocol:path\". If it ends with \"://\" then the standard form is \
again assumed. \
\
The program takes three command-line parameters, not counting any \
parameters already given in the command string: \
\
protocol URL configuration-file \
\
\
\
Parameter | \
Description | \
Example | \
\
\
protocol | \
The URL scheme to be used. | \
https | \
\
\
URL | \
The URL to be retrieved. | \
https://www.htdig.org:8008/attrs.html | \
\
\
configuration-file | \
The configuration-file in effect. | \
/etc/htdig/htdig.conf | \
\
\
The external protocol script is to write information for htdig on the \
standard output. The output must follow the form described here. The \
output consists of a header followed by a blank line, followed by \
the contents of the document. Each record in the header is terminated \
with a newline. Each record is a series of (unless expressly \
allowed to be empty) non-empty tab-separated fields. The first field \
is a single character that specifies the record type. The rest of \
the fields are determined by the record type. \
\
\
Record type | \
Fields | \
Description | \
\
\
s | \
status code | \
\
An HTTP-style status code, e.g. 200, 404. Typical codes include: \
\
- 200 \
\
- Successful retrieval \
\
- 304 \
\
- \
Not modified (for example, if the document hasn\'t \
changed since the last dig) \
\
- 301 \
\
- Redirect (to another URL) \
\
- 401 \
\
- Not authorized \
\
- 404 \
\
- Not found \
\
\
| \
\
\
r | \
reason | \
\
A text string describing the status code, \
e.g. \"Redirect\" or \"Not Found.\" \
| \
\
\
m | \
modification time | \
\
The modification time of this document. While the code is \
fairly flexible about the time/date formats it accepts, it \
is recommended to use something standard, like \
RFC1123: Sun, 06 Nov 1994 08:49:37 GMT, or \
ISO-8601: 1994-11-06 08:49:37 GMT. \
| \
\
\
t | \
content-type | \
\
A valid MIME type for the document, like text/html or text/plain. \
| \
\
\
l | \
content-length | \
\
The length of the document on the server, which may not \
necessarily be the length of the buffer returned. \
| \
\
\
u | \
url | \
\
The URL of the document, or in the case of a redirect, the \
URL that should be indexed as a result of the redirect. \
| \
\
\
" }, \
{ "extra_word_characters", "", \
"string", "htdig htsearch", "", "3.1.2", "Indexing:What", "extra_word_characters: _", " \
These characters are considered part of a word. \
In contrast to the characters in the \
valid_punctuation \
attribute, they are treated just like letter \
characters. See also the allow_numbers\
attribute.
\
Note that the locale attribute \
is normally used to configure which characters \
constitute letter characters.
\
Note also that it is an error to have characters in both \
extra_word_characters and \
valid_punctuation. \
To add one of the characters in the default valid_punctuation to \
extra_word_characters, an explicit valid_punctuation entry must be \
added to the configuration file.
\
See also the comments about special characters at \
valid_punctuation. \
" }, \
{ "head_before_get", "true", \
"boolean", "htdig", "Server", "3.2.0b1", "Indexing:Connection", "head_before_get: false", " \
If set to true, an HTTP/1.1 HEAD \
call is made in order to retrieve header information about a document. \
If the status code and the content-type returned show that the \
document is parsable, then a subsequent 'GET' call is made. In \
general, it is recommended that this attribute be set to 'true', \
as it can really improve performance (especially when used with \
persistent connections). This is particularly so during an \
incremental dig, since in this case 'htdig' can ask the server if the \
document has been modified since last dig. However there are a few \
cases when it is better to switch it off: \
\
- the majority of documents are parsable (HTML or a type for which \
an external parser has been provided) and must be retrieved anyway \
(initial dig);
\
- the server does not support the HEAD method or it is \
disabled;
\
- in some cases persistent_connections may \
not work properly and either the 'head_before_get' attribute or the \
'persistent_connections' attribute must be turned off.
\
\
" }, \
{ "heading_factor", "5", \
"number", "htsearch", "", "3.2.0b1", "Searching:Ranking", "heading_factor: 20", " \
This is a factor which will be used to multiply the \
weight of words between <h1> and </h1> \
tags, as well as headings of levels <h2> through \
<h6>. It is used to assign the level of importance \
to headings. Setting a factor to 0 will cause words \
in these headings to be ignored. The number may be a \
floating point number. See also \
author_factor, \
backlink_factor, \
caps_factor, \
date_factor, \
description_factor, \
keywords_factor, \
meta_description_factor, \
text_factor, \
title_factor, and \
url_text_factor. \
" }, \
{ "htnotify_prefix_file", "", \
"string", "htnotify", "", "3.2.0b3", "Extra Output", "htnotify_prefix_file: ${common_dir}/notify_prefix.txt", " \
Specifies the file containing text to be inserted in each mail \
message sent by htnotify before the list of expired webpages. If omitted, \
nothing is inserted. \
" }, \
{ "htnotify_replyto", "", \
"string", "htnotify", "", "3.2.0b3", "Extra Output", "htnotify_replyto: design-group@foo.com", " \
This specifies the email address that htnotify email messages \
include in the Reply-to: field. \
" }, \
{ "htnotify_sender", "webmaster@www", \
"string", "htnotify", "", "all", "Extra Output", "htnotify_sender: bigboss@yourcompany.com", " \
This specifies the email address that htnotify email \
messages get sent out from. The address is forged using \
/usr/lib/sendmail. Check htnotify/htnotify.cc for \
detail on how this is done. \
" }, \
{ "htnotify_suffix_file", "", \
"string", "htnotify", "", "3.2.0b3", "Extra Output", "htnotify_suffix_file: ${common_dir}/notify_suffix.txt", " \
Specifies the file containing text to be inserted in each mail message \
sent by htnotify after the list of expired webpages. If omitted, htnotify \
will insert a standard message. \
" }, \
{ "htnotify_webmaster", "ht://Dig Notification Service", \
"string", "htnotify", "", "3.2.0b3", "Extra Output", "htnotify_webmaster: Notification Service", " \
This provides a name for the From field, in addition to the email \
address for the email messages sent out by htnotify. \
" }, \
{ "http_proxy", "", \
"string", "htdig", "URL", "3.0", "Indexing:Connection", "http_proxy: http://proxy.bigbucks.com:3128", " \
When this attribute is set, all HTTP document \
retrievals will be done using the HTTP-PROXY protocol. \
The URL specified in this attribute points to the host \
and port where the proxy server resides.
\
Later, this should be able to be overridden by the \
http_proxy
environement variable, but it currently cannot.\
The use of a proxy server greatly improves performance \
of the indexing process.
\
See also \
http_proxy_authorization and \
#http_proxy_exclude. \
" }, \
{ "http_proxy_authorization", "", \
"string", "htdig", "URL", "3.2.0b4", "Indexing:Connection", "http_proxy_authorization: myusername:mypassword", " \
This tells htdig to send the supplied \
username:password with each HTTP request, \
when using a proxy with authorization requested. \
The credentials will be encoded using the \"Basic\" authentication \
scheme. There must be a colon (:) between the username and \
password.
\
If you use this option, be sure to protect the configuration file \
so it is readable only by you, and do not \
use that same configuration file for htsearch. \
" }, \
{ "http_proxy_exclude", "", \
"pattern list", "htdig", "", "3.1.0b3", "Indexing:Connection", "http_proxy_exclude: http://intranet.foo.com/", " \
When this is set, URLs matching this will not use the \
proxy. This is useful when you have a mixture of sites \
near to the digging server and far away. \
" }, \
{ "ignore_alt_text", "false", \
"boolean", "htdig", "", "3.1.6", "Indexing:What", "ignore_alt_text: true", " \
If set, this causes the text of the ALT field in an <IMG...> tag \
not to be indexed as part of the text of the document, nor included in \
excerpts. \
" }, \
/* ignore_dead_servers entry. Double quotes that belong to the description text are escaped so they do not terminate the string literal. */ \
{ "ignore_dead_servers", "true", \
"boolean", "htdig", "", "3.1.6", "Indexing:Connection", "ignore_dead_servers: false", " \
Determines whether htdig will continue to index URLs from a \
server after an attempted connection to the server fails as \
\"no host found\" or \"host not found (port).\" If \
set to false, htdig will try every URL from that server. \
" }, \
{ "image_list", "${database_base}.images", \
"string", "htdig", "", "all", "Extra Output", "image_list: allimages", " \
This is the file that a list of image URLs gets written \
to by htdig when the \
create_image_list is set to \
true. As image URLs are seen, they are just appended to \
this file, so after htdig finishes it is probably a \
good idea to run sort -u
on the file to \
eliminate duplicates from the file. \
" }, \
{ "image_url_prefix", IMAGE_URL_PREFIX, \
"string", "htsearch", "", "all", "Presentation:Text", "image_url_prefix: /images/htdig", " \
This specifies the directory portion of the URL used \
to display star images. This attribute isn't directly \
used by htsearch, but is used in the default URL for \
the star_image and \
star_blank attributes, and \
other attributes may be defined in terms of this one. \
\
The default value of this attribute is determined at \
compile time. \
\
" }, \
{ "include", "", \
"string", "all", "", "3.1.0", "", "include: ${config_dir}/htdig.conf", " \
This is not quite a configuration attribute, but \
rather a directive. It can be used within one \
configuration file to include the definitions of \
another file. The last definition of an attribute \
is the one that applies, so after including a file, \
any of its definitions can be overridden with \
subsequent definitions. This can be useful when \
setting up many configurations that are mostly the \
same, so all the common attributes can be maintained \
in a single configuration file. The include directives \
can be nested, but watch out for nesting loops. \
" }, \
{ "iso_8601", "false", \
"boolean", "htsearch htnotify", "", "3.1.0b2", "Presentation:How,Extra Output", "iso_8601: true", " \
This sets whether dates should be output in ISO 8601 \
format. For example, this was written on: 1998-10-31 11:28:13 EST. \
See also the date_format attribute, which \
can override any date format that \
htsearch \
picks by default.
\
This attribute also affects the format of the date \
htnotify expects to find \
in a htdig-notification-date field. \
" }, \
{ "keywords", "", \
"string list", "htsearch", "", "??", "Searching:Method", "keywords: documentation", " \
Keywords which must be found on all pages returned, \
even if the \"or\" (\"Any\") method is \
selected. \
" }, \
{ "keywords_factor", "100", \
"number", "htsearch", "", "all", "Searching:Ranking", "keywords_factor: 12", " \
This is a factor which will be used to multiply the \
weight of words in the list of \
meta keywords of a document. \
The number may be a floating point number. See also the \
heading_factor attribute. \
" }, \
/* keywords_meta_tag_names entry. Each line of the description literal ends with a continuation backslash. */ \
{ "keywords_meta_tag_names", "keywords htdig-keywords", \
"string list", "htdig", "", "3.0.6", "Indexing:What", "keywords_meta_tag_names: keywords description", " \
The words in this list are used to search for keywords \
in HTML META tags. This list can contain any \
number of strings that each will be seen as the name \
for whatever keyword convention is used. \
\
The META tags have the following format: \
\
\
<META name=\"somename\" content=\"somevalue\"> \
\
" }, \
{ "limit_normalized", "", \
"pattern list", "htdig", "", "3.1.0b2", "Indexing:Where", "limit_normalized: http://www.mydomain.com", " \
This specifies a set of patterns that all URLs have to \
match against in order for them to be included in the \
search. Unlike the limit_urls_to attribute, this is done \
after the URL is normalized and the \
server_aliases \
attribute is applied. This allows filtering after any \
hostnames and DNS aliases are resolved. Otherwise, this \
attribute is the same as the limit_urls_to attribute. \
" }, \
{ "limit_urls_to", "${start_url}", \
"pattern list", "htdig", "", "all", "Indexing:Where", "limit_urls_to: .sdsu.edu kpbs [.*\\.html]", " \
This specifies a set of patterns that all URLs have to \
match against in order for them to be included in the \
search. Any number of strings can be specified, \
separated by spaces. If multiple patterns are given, at \
least one of the patterns has to match the URL.
\
Matching, by default, is a case-sensitive string match on the URL \
to be used, unless the case_sensitive \
attribute is false. The match will be performed after \
the relative references have been converted to a valid \
URL. This means that the URL will always start \
with a transport specifier (http://
if none is \
specified).
\
Granted, this is not the perfect way of doing this, \
but it is simple enough and it covers most cases.
\
To limit URLs in htsearch, use \
restrict. \
" }, \
{ "local_default_doc", "index.html", \
"string list", "htdig", "Server", "3.0.8b2", "Indexing:Where", "local_default_doc: default.html default.htm index.html index.htm", " \
Set this to the default documents in a directory used by the \
server. This is used for local filesystem access, \
using local_urls, to \
translate URLs like http://foo.com/ into something like \
/home/foo.com/index.html \
(see also remove_default_doc). \
The list should only contain names that the local server \
recognizes as default documents for directory URLs, as defined \
by the DirectoryIndex setting in Apache's srm.conf, for example. \
As of version 3.1.5, this can be a string list rather than a single \
name, and htdig will use the first name that works. Since this \
requires a loop, setting the most common name first will improve \
performance. Special characters can be embedded in these names \
using %xx hex encoding. \
" }, \
{ "local_urls", "", \
"string list", "htdig", "", "3.0.8b2", "Indexing:Where", "local_urls: http://www.foo.com/=/usr/www/htdocs/", " \
Set this to tell ht://Dig to access certain URLs through \
local filesystems. At first ht://Dig will try to access \
pages with URLs matching the patterns through the \
filesystems specified. If it cannot find the file, or \
if it doesn't recognize the file name extension, it will \
try the URL through HTTP instead. Note the example--the \
equal sign and the final slashes in both the URL and the \
directory path are critical. \
The fallback to HTTP can be disabled by setting the \
local_urls_only attribute to true. \
To access user directory URLs through the local filesystem, \
set local_user_urls. \
File types which need processing by the HTTP server may be \
specified by the \
bad_local_extensions \
attribute. \
As of version 3.1.5, you can provide multiple mappings of a given \
URL to different directories, and htdig will use the first \
mapping that works. \
Special characters can be embedded in these names using %xx hex encoding. \
For example, you can use %3D to embed an \"=\" sign in an URL pattern. \
\
See also local_default_doc. \
" }, \
{ "local_urls_only", "false", \
"boolean", "htdig", "", "3.1.4", "Indexing:Where", "local_urls_only: true", " \
Set this to tell ht://Dig to access files only through the \
local filesystem, for URLs matching the patterns in the \
local_urls or \
local_user_urls attribute. If it \
cannot find the file, it will give up rather than trying HTTP or \
another protocol. With this option, even file://
urls \
are not retrieved, except throught the local_urls mechanism.\
" }, \
{ "local_user_urls", "", \
"string list", "htdig", "", "3.0.8b2", "Indexing:Where", "local_user_urls: http://www.my.org/=/home/,/www/", " \
Set this to access user directory URLs through the local \
filesystem. If you leave the \"path\" portion out, it will \
look up the user's home directory in /etc/passwd (or NIS \
or whatever). As with local_urls, \
if the files are not found, ht://Dig will try with HTTP or the \
appropriate protocol. Again, note the \
example's format. To map http://www.my.org/~joe/foo/bar.html \
to /home/joe/www/foo/bar.html, try the example below. \
The fallback to HTTP can be disabled by setting the \
local_urls_only attribute to true. \
As of version 3.1.5, you can provide multiple mappings of a given \
URL to different directories, and htdig will use the first \
mapping that works. \
Special characters can be embedded in these names using %xx hex encoding. \
For example, you can use %3D to embed an \"=\" sign in an URL pattern. \
" }, \
{ "locale", "C", \
"string", "htdig", "", "3.0", "Indexing:What,Presentation:How", "locale: en_US", " \
Set this to whatever locale you want your search \
database to cover. It affects the way international \
characters are dealt with. On most systems a list of \
legal locales can be found in /usr/lib/locale. Also \
check the setlocale(3C) man page. \
Note that depending on the locale you choose, and whether \
your system's locale implementation affects floating \
point input, you may need to specify the decimal point \
as a comma rather than a period. This will affect \
settings of search_algorithm \
and any of the scoring factors. \
" }, \
{ "logging", "false", \
"boolean", "htsearch", "", "3.1.0b2", "Extra Output", "logging: true", " \
This sets whether htsearch should use the syslog() to log \
search requests. If set, this will log requests with a \
default level of LOG_INFO and a facility of LOG_LOCAL5. For \
details on redirecting the log into a separate file or other \
actions, see the syslog.conf(5) man \
page. To set the level and facility used in logging, change \
LOG_LEVEL and LOG_FACILITY in the include/htconfig.h file \
before compiling. \
\
- \
Each line logged by htsearch contains the following: \
\
- \
REMOTE_ADDR [config] (match_method) [words] \
[logicalWords] (matches/matches_per_page) - \
page, HTTP_REFERER \
\
\
where any of the above are null or empty, it \
either puts in '-' or 'default' (for config). \
" }, \
{ "maintainer", "bogus@unconfigured.htdig.user", \
"string", "htdig", "Server", "all", "Indexing:Out", "maintainer: ben.dover@uptight.com", " \
This should be the email address of the person in \
charge of the digging operation. This string is added \
to the user-agent: field when the digger sends a \
request to a server. \
" }, \
{ "match_method", "and", \
"string", "htsearch", "", "3.0", "Searching:Method", "match_method: boolean", " \
This is the default method for matching that htsearch \
uses. The valid choices are: and, or, boolean. \
\
This attribute will only be used if the HTML form that \
calls htsearch didn't have the \
method value set. \
" }, \
{ "matches_per_page", "10", \
"integer", "htsearch", "", "3.0", "Searching:Method", "matches_per_page: 999", " \
If this is set to a relatively small number, the \
matches will be shown in pages instead of all at once. \
This attribute will only be used if the HTML form that \
calls htsearch didn't have the \
matchesperpage value set. \
" }, \
{ "max_connection_requests", "-1", \
"integer", "htdig", "", "3.2.0b1", "Indexing:Connection", "max_connection_requests: 100", " \
This attribute tells htdig to limit the number of requests it will \
send to a server using a single, persistent HTTP connection. This \
only applies when the \
persistent_connections \
attribute is set. You may set the limit as high as you want, \
but it must be at least 1. A value of -1 specifies no limit. \
Requests in the queue for a server will be combined until either \
the limit is reached, or the queue is empty. \
" }, \
{ "max_description_length", "60", \
"integer", "htdig", "", "all", "Indexing:What", "max_description_length: 40", " \
While gathering descriptions of URLs, \
htdig will only record \
up to this many bytes of hyperlink descriptions for use in the \
DESCRIPTION template \
variable. This is used mostly to deal with broken HTML. (If a \
hyperlink is not terminated with a </a> the \
description will go on until the end of the document.) \
" }, \
{ "max_descriptions", "5", \
"integer", "htdig", "", "all", "Indexing:What", "max_descriptions: 1", " \
While gathering descriptions of \
URLs for the \
DESCRIPTIONS template \
variable, htdig will only record up to this \
number of descriptions, in the order in which it encounters \
them. This is used to prevent the database entry for a document \
from growing out of control if the document has a huge number \
of links to it.
\
Note that all descriptions are used for indexing. \
" }, \
{ "max_doc_size", "100000", \
"integer", "htdig", "URL", "3.0", "Indexing:What", "max_doc_size: 5000000", " \
This is the upper limit to the amount of data retrieved \
for documents (in bytes). This is mainly used to prevent \
unreasonable memory consumption since each document \
will be read into memory by \
htdig. \
" }, \
{ "max_excerpts", "1", \
"integer", "htsearch", "URL", "3.1.6", "Presentation:How", "max_excerpts: 10", " \
This value determines the maximum number of excerpts \
that can be displayed for one matching document in the \
search results. \
" }, \
{ "max_head_length", "512", \
"integer", "htdig", "", "all", "Indexing:How", "max_head_length: 50000", " \
For each document retrieved, the top of the document is \
stored. This attribute determines the size of this \
block (in bytes). The text that will be stored is only the text; \
no markup is stored.
\
We found that storing 50,000 bytes will store about \
95% of all the documents completely. This really \
depends on how much storage is available and how much \
you want to show. Currently, this is must not be 0. \
" }, \
{ "max_hop_count", "999999", \
"integer", "htdig", "", "all", "Indexing:Where", "max_hop_count: 4", " \
Instead of limiting the indexing process by URL \
pattern, it can also be limited by the number of hops \
or clicks a document is removed from the starting URL. \
\
The starting page or pages will have hop count 0. \
" }, \
{ "max_keywords", "-1", \
"integer", "htdig", "", "3.2.0b1", "Indexing:What", "max_keywords: 10", " \
This attribute can be used to limit the number of keywords \
per document that htdig will accept from meta keywords tags. \
A value of -1 or less means no limit. This can help combat meta \
keyword spamming, by limiting the amount of keywords that will be \
indexed, but it will not completely prevent irrelevant matches \
in a search if the first few keywords in an offending document \
are not relevant to its contents. \
" }, \
{ "max_meta_description_length", "512", \
"integer", "htdig", "", "3.1.0b1", "Indexing:How", "max_meta_description_length: 1000", " \
While gathering descriptions from meta description tags, \
htdig will only store up to \
this much of the text (in bytes) for each document to fill the \
METADESCRIPTION \
template variable. All words in the meta description are still \
used for indexing. \
" }, \
{ "max_prefix_matches", "1000", \
"integer", "htsearch", "", "3.1.0b1", "Searching:Method", "max_prefix_matches: 100", " \
The Prefix fuzzy algorithm \
could potentially match a \
very large number of words. This value limits the \
number of words each prefix can match. Note \
that this does not limit the number of documents that \
are matched in any way. \
" }, \
{ "max_retries", "3", \
"integer", "htdig", "", "3.2.0b1", "Indexing:Connection", "max_retries: 6", " \
This option sets the maximum number of retries when retrieving a document \
fails (mainly for reasons of connection). \
" }, \
{ "max_stars", "4", \
"integer", "htsearch", "", "all", "Presentation:How", "max_stars: 6", " \
When stars are used to display the score of a match, \
this value determines the maximum number of stars that \
can be displayed. \
" }, \
{ "maximum_page_buttons", "${maximum_pages}", \
"integer", "htsearch", "", "3.2.0b3", "Presentation:How", "maximum_page_buttons: 20", " \
This value limits the number of page links that will be \
included in the page list at the bottom of the search \
results page. By default, it takes on the value of the \
maximum_pages \
attribute, but you can set it to something lower to allow \
more pages than buttons. In this case, pages above this \
number will have no corresponding button. \
" }, \
{ "maximum_pages", "10", \
"integer", "htsearch", "", "all", "Presentation:How", "maximum_pages: 20", " \
This value limits the number of page links that will be \
included in the page list at the bottom of the search \
results page. As of version 3.1.4, this will limit the \
total number of matching documents that are shown. \
You can make the number of page buttons smaller than the \
number of allowed pages by setting the \
maximum_page_buttons \
attribute. \
" }, \
{ "maximum_word_length", "32", \
"integer", "htdig htsearch htfuzzy", "", "3.1.3", "Indexing:What", "maximum_word_length: 15", " \
This sets the maximum length of words that will be \
indexed. Words longer than this value will be silently \
truncated when put into the index, or searched in the \
index. \
" }, \
{ "md5_db", "${database_base}.md5hash.db", \
"string", "htdig", "", "3.2.0b3", "File Layout", "md5_db: ${database_base}.md5.db", " \
This file holds a database of md5 and date hashes of pages to \
catch and eliminate duplicates of pages. See also the \
check_unique_md5 and \
check_unique_date attributes. \
" }, \
{ "meta_description_factor", "50", \
"number", "htsearch", "", "3.1.0b1", "Searching:Ranking", "meta_description_factor: 20", " \
This is a factor which will be used to multiply the \
weight of words in any META description tags in a document. \
The number may be a floating point number. See also the \
heading_factor attribute and the \
description_factor attribute. \
" }, \
{ "metaphone_db", "${database_base}.metaphone.db", \
"string", "htfuzzy htsearch", "", "all", "File Layout", "metaphone_db: ${database_base}.mp.db", " \
The database file used for the fuzzy \"metaphone\" search \
algorithm. This database is created by \
htfuzzy and used by \
htsearch. \
" }, \
{ "method_names", "and All or Any boolean Boolean", \
"quoted string list", "htsearch", "", "all", "Searching:UI", "method_names: or Or and And", " \
These values are used to create the \
method menu. It consists of pairs. The first \
element of each pair is one of the known methods, the \
second element is the text that will be shown in the \
menu for that method. This text needs to be quoted if \
it contains spaces. \
See the select list documentation \
for more information on how this attribute is used. \
" }, \
{ "mime_types", "${config_dir}/mime.types", \
"string", "htdig", "", "3.2.0b1", "Indexing:Where", "mime_types: /etc/mime.types", " \
This file is used by htdig for local file access and resolving \
file:// URLs to ensure the files are parsable. If you are running \
a webserver with its own MIME file, you should set this attribute \
to point to that file. \
\
See also content_classifier.\
"}, \
{ "minimum_prefix_length", "1", \
"integer", "htsearch", "", "3.1.0b1", "Searching:Method", "minimum_prefix_length: 2", " \
This sets the minimum length of prefix matches used by the \
\"prefix\" fuzzy matching algorithm. Words shorter than this \
will not be used in prefix matching. \
" }, \
{ "minimum_speling_length", "5", \
"integer", "htsearch", "", "3.2.0b1", "Searching:Method", "minimum_speling_length: 3", " \
This sets the minimum length of words used by the \
\"speling\" fuzzy matching algorithm. Words shorter than this \
will not be used in this fuzzy matching. \
" }, \
{ "minimum_word_length", "3", \
"integer", "htdig htsearch", "", "all", "Indexing:What", "minimum_word_length: 2", " \
This sets the minimum length of words that will be \
indexed. Words shorter than this value will be silently \
ignored but still put into the excerpt.
\
Note that by making this value less than 3, a lot more \
words that are very frequent will be indexed. It might \
be advisable to add some of these to the \
bad_words list. \
" }, \
{ "multimatch_factor", "1", \
"number", "htsearch", "", "3.1.6", "Searching:Ranking", "multimatch_factor: 1000", " \
This factor gives higher rankings to documents that have more than \
one matching search word when the or \
match_method is used. \
In version 3.1.6, the matching words' combined scores were multiplied \
by this factor for each additional matching word. Currently, this \
multiplier is applied at most once. \
" },
{ "next_page_text", "[next]", \
"string", "htsearch", "", "3.1.0", "Presentation:Text", "next_page_text: <img src=\"/htdig/buttonr.gif\">", " \
The text displayed in the hyperlink to go to the next \
page of matches. \
" }, \
{ "no_excerpt_show_top", "false", \
"boolean", "htsearch", "", "3.1.0b3", "Presentation:How", "no_excerpt_show_top: yes", " \
If no excerpt is available, this option will act the \
same as excerpt_show_top, that is, \
it will show the top of the document. \
" }, \
{ "no_excerpt_text", "(None of the search words were found in the top of this document.)", \
"string", "htsearch", "", "3.0", "Presentation:Text", "no_excerpt_text:", " \
This text will be displayed in place of the excerpt if \
there is no excerpt available. If this attribute is set \
to nothing (blank), the excerpt label will not be \
displayed in this case. \
" }, \
{ "no_next_page_text", "${next_page_text}", \
"string", "htsearch", "", "3.0", "Presentation:Text", "no_next_page_text:", " \
The text displayed where there would normally be a \
hyperlink to go to the next page of matches. \
" }, \
{ "no_page_list_header", "", \
"string", "htsearch", "", "3.0", "Presentation:Text", "no_page_list_header: <hr noshade size=2>All results on this page.<br>", " \
This text will be used as the value of the PAGEHEADER \
variable, for use in templates or the \
search_results_footer \
file, when all search results fit on a single page. \
" }, \
{ "no_page_number_text", "", \
"quoted string list", "htsearch", "", "3.0", "Presentation:Text", "no_page_number_text: \
<strong>1</strong> <strong>2</strong> \\ \
\
<strong>3</strong> <strong>4</strong> \\ \
\
<strong>5</strong> <strong>6</strong> \\ \
\
<strong>7</strong> <strong>8</strong> \\ \
\
<strong>9</strong> <strong>10</strong> \
", " \
The text strings in this list will be used when putting \
together the PAGELIST variable, for use in templates or \
the search_results_footer \
file, when search results fit on more than one page. The PAGELIST \
is the list of links at the bottom of the search results page. \
There should be as many strings in the list as there are \
pages allowed by the maximum_page_buttons \
attribute. If there are not enough, or the list is empty, \
the page numbers alone will be used as the text for the links. \
An entry from this list is used for the current page, as the \
current page is shown in the page list without a hypertext link, \
while entries from the \
page_number_text list are used for the links to other pages. \
The text strings can contain HTML tags to highlight page numbers \
or embed images. The strings need to be quoted if they contain \
spaces. \
" }, \
{ "no_prev_page_text", "${prev_page_text}", \
"string", "htsearch", "", "3.0", "Presentation:Text", "no_prev_page_text:", " \
The text displayed where there would normally be a \
hyperlink to go to the previous page of matches. \
" }, \
{ "no_title_text", "filename", \
"string", "htsearch", "", "3.1.0", "Presentation:Text", "no_title_text: \"No Title Found\"", " \
This specifies the text to use in search results when no \
title is found in the document itself. If it is set to \
filename, htsearch will use the name of the file itself, \
enclosed in brackets (e.g. [index.html]). \
" }, \
{ "noindex_end", " ", \
"quoted string list", "htdig", "", "3.1.0", "Indexing:What", "noindex_end: </SCRIPT>", " \
This string marks the end of a section of an HTML file that should be \
completely ignored when indexing. Note that text between noindex_start \
and noindex_end isn't even counted as white space; the text \
\"foosomethingbar\" \
matches the word \"foobar\", not the phrase \"foo bar\". White space \
following noindex_end is counted as white space. See also \
noindex_start. \
" }, \
{ "noindex_start", "