#
# Part of the ht://Dig package
# Copyright (c) 1999-2004 The ht://Dig Group
# For copyright details, see the file COPYING in your distribution
# or the GNU Library General Public License (LGPL) version 2 or later
#
# $Id: t_parsing,v 1.4 2004/05/28 13:15:30 lha Exp $
#
# Tests (or should eventually test) the following config attributes:
#	description_meta_tag_names
#	ignore_alt_text
#	max_doc_size
#	max_keywords
#	max_meta_description_length
#	max_description_length		(May put in t_templates)
#	max_descriptions		(May put in t_templates)
#	max_head_length
#	noindex_end
#	noindex_start
#	external_parsers		(TODO)
#	external_protocols
#	use_meta_description
#
# The helpers `fail` and `set_attr`, and the variables $testdir, $htdig,
# $htsearch and $htpurge, are not defined in this file; they are expected
# to come from ./test_functions, which is sourced below.

test_functions_action=--start-apache
. ./test_functions

config=$testdir/conf/htdig.conf.tmp

# Scratch file holding the most recent htsearch output.  Prefer an
# unpredictable name from mktemp; fall back to the historical PID-based
# name on systems that do not provide mktemp.
tmp=$(mktemp "${TMPDIR:-/tmp}/t_parsing.XXXXXX" 2>/dev/null) \
    || tmp=/tmp/t_htsearch$$

# set up config file with chosen non-default values
cp "$testdir/conf/htdig.conf" "$config"

#
# try COMMENT QUERY PATTERN...
#
# Runs htsearch with QUERY against $config, saving its output in $tmp,
# then checks that every PATTERN (a grep regular expression) matches
# somewhere in that output.  On the first PATTERN that does not match,
# the search is re-run with -v (its standard output discarded), the
# missing pattern is echoed, and `fail` is called with a message that
# includes COMMENT.
#
# Sets the global variables comment, query and pattern.
#
try() {
    comment="$1"
    shift
    query="$1"
    shift

    # $htsearch is deliberately left unquoted: it may expand to a command
    # plus arguments.
    $htsearch -c "$config" "$query" > "$tmp"

    for pattern
    do
	# -e keeps a pattern that starts with '-' from being read as an option.
	grep -e "$pattern" "$tmp" > /dev/null && continue

	$htsearch -v -c "$config" "$query" > /dev/null
	echo "Output doesn't match \"$pattern\""
	# The message names $tmp; presumably the file is left behind on
	# failure so the output can be inspected -- hence no cleanup here.
	fail "$htsearch -c $config '$query' >> $tmp -- $comment"
    done
}


#
# First dig: default attribute values.
#
$htdig "$@" -t -i -c "$config" || fail "Couldn't do first dig"
$htpurge -c "$config" || fail "Couldn't do first purge"

try "Search for alt text 'earth'" \
    "words=earth" \
    '1 matches' 'site3.html'

try "'claims and collections', unlimited doc size" \
    "words=%22claims+and+collections%22" \
    '1 matches' 'site4.html'

try "Search for keyword 'martial', default max_keywords" \
    "words=martial" \
    '1 matches' 'site2.html'

try "Search for 'technical', default noindex_start/end" \
    "words=technical" \
    '1 matches' 'site%201.html'

set_attr use_meta_description true
try "Search for 'call handling' with default max_meta_description_length" \
    "words=%22call+handling%22" \
    '1 matches' 'script.html' 'call handling.*signalling'


#
# Second dig: non-default values for the attributes under test.
#
set_attr ignore_alt_text true
set_attr max_doc_size 15112
set_attr max_keywords 5
set_attr noindex_start "'Software Distribution'"
set_attr noindex_end "'Contact Information'"
set_attr max_meta_description_length 80
set_attr description_meta_tag_names "description generator"
set_attr max_head_length 30

$htdig "$@" -t -i -c "$config" || fail "Couldn't do second dig"
$htpurge -c "$config" || fail "Couldn't do second purge"

try "Search for alt text 'earth' with ignore_alt_text=true" \
    "words=earth" \
    'No matches'

try "'claims and collections', max_doc_size 15112" \
    "words=%22claims+and+collections%22" \
    '1 matches' 'site4.html'

# (Martial is 6th keyword listed in site 2, but "Fu" is too short and omitted.)
try "Search for keyword 'martial', max_keywords = 5" \
    "words=martial" \
    'No matches'

# Only occurrence of "technical" is between noindex_start and _end in site 1
try "Search for 'technical', noindex_start=Software Distribution, noindex_end=Contact Information" \
    "words=technical" \
    'No matches'

# Visitor occurs after noindex_end
try "Search for 'visitor', noindex_start=Software Distribution, noindex_end=Contact Information" \
    "words=visitor" \
    '2 matches' 'site%201.html' 'site3.html'

# Displaying meta description instead of excerpt, check it is truncated
# NOTE(review): the last pattern ends with an embedded newline; grep treats
# newlines in a pattern as pattern separators -- confirm this is intended.
try "Search for 'call handling' with max_meta_description_length=80" \
    "words=%22call+handling%22" \
    '1 matches' 'script.html' 'means of
'

# Check that a meta tag named in description_meta_tag_names ('generator')
# counts as a description
try "Search for 'category', description_meta_tag_names includes 'generator'" \
    "words=category" \
    '1 matches' 'site3.html' 'FrontPage'

# Check that only specified number of bytes of header is stored.
# Header size is rounded up to contain the whole of the last word.
# NOTE(review): the last pattern ends with an embedded newline; grep treats
# newlines in a pattern as pattern separators -- confirm this is intended.
try "Search for 'also', max_head_length=30" \
    "words=also" \
    '4 matches' 'bad_local.htm' 'site2.html' 'script.html' 'site4.html' \
    'WHERE.*Copyright
'


#
# Third dig: smaller max_doc_size, one more keyword, lower-case noindex
# markers.
#
set_attr max_doc_size 15042
set_attr max_keywords 6
set_attr noindex_start "'software distribution'"
set_attr noindex_end "'contact information'"

$htdig "$@" -t -i -c "$config" || fail "Couldn't do third dig"
$htpurge -c "$config" || fail "Couldn't do third purge"

try "Search for keyword 'martial', max_keywords = 6" \
    "words=martial" \
    '1 matches' 'site2.html'

try "'claims and collections', max_doc_size 15042" \
    "words=%22claims+and+collections%22" \
    'No matches'

# Check noindex_start/end are case-insensitive
try "Search for 'technical', noindex_start=software distribution, noindex_end=contact information" \
    "words=technical" \
    'No matches'


#
# Fourth dig: external protocol handler.  The handler is a small script
# written into the current directory; it prints a status line, a
# content-type line, a blank line, and then its second argument.
#
PROTOCOL=my-protocol
cat > "$PROTOCOL" <<'EOF'
#!/bin/sh
echo "s 200"
echo "t text/html"
echo
echo "$2"
EOF
chmod 755 "$PROTOCOL"

set_attr external_protocols "echo: $PWD/$PROTOCOL"
set_attr start_url "echo:foo.html"

$htdig "$@" -t -i -c "$config" || fail "Couldn't do fourth dig"

try "trying external protocol echo" \
    "words=foo" \
    "1 matches" "echo:foo.html"


test_functions_action=--stop-apache
. ./test_functions

# Only reached when every check above passed.
rm -f -- "$tmp" "$PROTOCOL"
exit 0