#!/usr/local/bin/perl
#########################################################
# part of the HTML Dictionary
# Distributed under the GNU copyleft (any version of your choice)
# No part of these documents may be printed in any for-profit publication
# copyleft sunil@magnetic.demon.co.uk
#########################################################
#########################################################
# this script makes use of Glimpse, developed by
#   Udi Manber, Burra Gopal: University of Arizona
#   Sun Wu : National Chung-Cheng University, Taiwan
#########################################################
#########################################################
# CONFIGURE THESE
#########################################################
#########################################################
# this little script assumes that the glimpse binaries and
# databases are stored under $glimpse_dir
#
#                      .....
#                        |
#                   glimpse_dir
#                        |
#             +----------+----------+
#            bin                databases
#                                   |
#                    +------+-------+-------+
#                    |      |       |       |
#                  this   that    other   misc
#                           |
#                     +-----+----+
#                     |          |
#                    fat        thin
#                     |          |
#           .glimpse_index    .glimpse_index
#           .glimpse_stat..   .glimpse...
#           ....
#
# the fat index should have been indexed using
#       glimpseindex -o ....
#       glimpseindex -B -f -s ....
#########################################################
#########################################################
# *** EXAMPLE ***
#$glimpse_dir="/usr/local/lib/glimpse";
#$db_name="all";
#$title="search my server";
#$doc_root="/usr/share/htdocs/";
#$doc_server="http://my_server:my_port/";
#$default_search_type = "fat";
#########################################################
$glimpse_dir="/where/everything/is_kept";
$db_name="index_to_search";
$title="title of search screen";
$doc_root="/physical_location_of_docs/";
$doc_server="http://my_server:my_port/";
$default_search_type = "fat";       #or thin
$default_case_sensitive = 0;        #or 1

#########################################################
#
# Nothing to configure below here
#
#########################################################
$default_max_hits = 20;
$glimpse_bin="$glimpse_dir/bin";
$glimpse_dbs="$glimpse_dir/databases";
$glimpse_delim=": ";                # separates file name from the rest of a glimpse output line

# HTML/CGI helpers (PRINT_HEADER, h2, form, get_tag, ...) come from here.
require "www_lib.pl";

%FIELDS=&GET_FIELDS();
$my_url=&get_this_URL();            #this may fail on cern httpd

$fat_db_dir="$glimpse_dbs/$db_name/fat";
$thin_db_dir="$glimpse_dbs/$db_name/thin";

# names of the form fields
$input_field="input_field";
$result_field="result_field";
$match_field="match_field";
$word_field="word_field";
$error_field="error_field";
$case_field="case_field";

@result_set = ();                   # glimpse output lines, filled by process_query
$this_is_a_subset = 0;              # set when output was cut off at the hit limit

%word_options= (
    "Match whole words",     "whole",
    "Allow fuzzy searching", "partial");

@case_options=( "Yes", "No" );

# only the digits of the chosen label are used; no digits means "no limit"
@match_options= (
    "a ridiculously tiny 10",
    "a modest 20",
    "an exceptionally normal 30",
    "a larger than life 40",
    "I dont care give me the lot");

%result_options = (
    "Just the number of matches", "thin",
    "with contextual text",       "fat");

@error_options = (0,1,3,5,8);

%TITLE_LIST = ();                   # how often each document title has been shown

$| = 1;                             #no buffering

#########################################################
# show_query_form
#   Prints the search form followed by usage notes.
#   Reached whenever no search expression was submitted;
#   if other fields did arrive the heading says so.
#   Reads %FIELDS, $title, $my_url and the option tables.
#########################################################
sub show_query_form
{
    local (@keys);

    &PRINT_HEADER ("Search the $title");

    @keys = keys %FIELDS;
    if (@keys) {
        &h2 ("Search Expression - You didnt enter an expression");
    } else {
        &h2 ("Search Expression");
    }

    #-------------------------------------------------------------
    &form ($my_url);
    # NOTE(review): the input/submit markup is reconstructed; the only
    # hard requirement is a text field named $input_field, which main reads.
    print "Enter the search expression <INPUT NAME=\"$input_field\" SIZE=40>";

    &h3("options");
    &ul;
    &li("");
    &gen_labelled_select ( "Result type", $result_field,
        keys (%result_options));
    &li("");
    &gen_labelled_select ( "number of matches", $match_field,
        @match_options);
    &li("");
    &gen_labelled_select ( "Search Type", $word_field,
        keys (%word_options));
    &li("");
    &gen_labelled_select ( "match case?", $case_field,
        @case_options);
    &li("");
    # NOTE(review): assumed to be disabled by the "too slow" remark - confirm.
    #too slow!!! &gen_labelled_select ( "errors allowed", $error_field, @error_options);
    &_ul;

    # The original had the no-op statement "$_form;" here, so the form was
    # never closed.  The closing tag is printed literally.
    print "<INPUT TYPE=\"submit\" VALUE=\"Search\">\n</FORM>\n";

    #-------------------------------------------------------------
    &h2 ("Notes");
    print "The search engine supports regular expressions";

    &dl;
    &dt; &bold; print "Special characters"; &_bold;
    &dd;
    print " The following characters are reserved to the search engine. They should be escaped by preceeding with a back-slash if you wish to search for them.
<PRE>
 ^   \$   *   [   ]   |   (   )   !   \\   ;   ,   #   &lt;   &gt;   -   .

^     matches the beginning of a line
\$     matches the end of a line
.     matches any single character
#     matches any number of characters
*     matches any number of the previous character
</PRE>";

    &p;
    &dt; &bold; print "Sets"; &_bold;
    &dd;
    print " a set of characters inside [] matches any of the characters in that set.

";

    &p;
    &dt; &bold; print "Complex operations"; &_bold;
    &dd;
    print " You can contruct boolean expressions using \"AND\" and \"OR\". Complex expressions can be built by surrounding patterns with curly brackets {}.
<PRE>
'{political OR computer} AND science
</PRE>
will match 'political science' or 'computer science'.";

    &p;
    &dt; &bold; print "exact matches"; &_bold;
    &dd;
    print " the default behaviour is to allow mistakes in the words being searched for. Surrounding an expression in angle brackets &lt; &gt; forces an exact match on that part of the expression.

";

    &p;
    &dt; &bold; print "(ir)regular expressions"; &_bold;
    &dd;
    print " ";
    &_dl;

    &PRINT_FOOTER;
}

#########################################################
# process_query (expression)
#   Translates the user's expression into glimpse syntax,
#   runs glimpse through a forked pipe, collects up to
#   the requested number of output lines in @result_set
#   and prints the result page.
#   Reads %FIELDS for the form options; sets
#   $this_is_a_subset when the output was truncated.
#########################################################
sub process_query
{
    local ($expression) = @_;
    local ($glimpse_expression, $item, @glimpse_cmd, $glimpse_pid);
    local ($max_hits, $search_type, $sensitive, $match_words, $allowed_errs);

    $glimpse_expression = "";

    #----------------preamble--------------------------------
    # Only the digits of the chosen label count; a label without digits
    # leaves $max_hits empty, which means "no limit".
    $max_hits = $FIELDS{$match_field};
    if ( $max_hits ) {
        $max_hits =~ s/\D//g;
    } else {
        $max_hits = $default_max_hits;
    }
    #- - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    $search_type = $FIELDS{$result_field};
    if ( $search_type ) {
        $search_type = $result_options{$search_type};
    }
    # missing field or unknown label: use the configured default
    if ( ! $search_type ) {
        $search_type = $default_search_type;
    }
    #- - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    $sensitive = $FIELDS{$case_field};
    if ( $sensitive ) {
        if ($sensitive eq "Yes") {
            $sensitive = 1;
        } else {
            $sensitive = 0;
        }
    } else {
        $sensitive = $default_case_sensitive;
    }
    #- - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    $match_words = $FIELDS{$word_field};
    $match_words = ( $word_options{$match_words} eq "whole" );

    # The value becomes part of a command-line option, so keep digits only.
    $allowed_errs = $FIELDS{$error_field};
    $allowed_errs =~ s/\D//g;

    #----------------convert into glimpse format--------------
    $expression =~ s/\{/ { /g;
    $expression =~ s/\}/ } /g;
    $expression = &trim ($expression);

    # NOTE(review): the words are joined without separators, so a phrase
    # such as "political science" reaches glimpse as "politicalscience".
    # Kept as found.
    foreach $item ( split (/\s/, $expression )) {
        # the on-screen notes advertise "AND"/"OR", so ignore case
        $item =~ s/^and$/\;/i;
        $item =~ s/^or$/\,/i;
        $glimpse_expression .= "$item";
    }

    #----------------build command ------------------------------
    push ( @glimpse_cmd, "$glimpse_bin/glimpse" );

    if ($match_words ) {
        push ( @glimpse_cmd, "-w");
    }

    # was testing the never-assigned $errs_allowed, so this never fired
    if ($allowed_errs > 0 ) {
        push ( @glimpse_cmd, "-${allowed_errs}");
    }

    if ($search_type eq "thin") {
        push ( @glimpse_cmd, "-H" , $thin_db_dir);
        push ( @glimpse_cmd, "-c");
    } else {
        push ( @glimpse_cmd, "-H" , $fat_db_dir);
    }

    if (! $sensitive) {
        push (@glimpse_cmd, "-i");
    }

    # NOTE(review): an expression starting with "-" would be read by
    # glimpse as an option - confirm whether that needs guarding.
    push ( @glimpse_cmd, "-y", $glimpse_expression);

    #----------------Execute------------------------------
    # Fork with the child's stdout piped to GLIMPSE.  open returns the
    # child's pid in the parent, 0 in the child and undef if the fork failed.
    $glimpse_pid = open (GLIMPSE, "-|");

    if (! defined ($glimpse_pid)) {
        &PRINT_HEADER ("Results of searching the $title");
        &h2 ("The search could not be started");
        print "Unable to run the search engine: $!";
        &PRINT_FOOTER;
        return;
    }

    if ($glimpse_pid == 0) {
        # child: list-form exec, so no shell sees the user's expression
        exec (@glimpse_cmd);
        print STDERR "cannot exec $glimpse_cmd[0]: $!\n";
        exit (1);
    }

    &unbuffer ( GLIMPSE);

    while (<GLIMPSE>) {
        if ( $max_hits && $. > $max_hits ) {
            $this_is_a_subset = 1;
            # a positive signal targets the child itself; the original
            # "kill -9" addressed a process group instead
            kill ('KILL', $glimpse_pid);
            last;
        }

        $item = $_;
        $item =~ s/\n$//;

        # drop the leading document root (plain string compare, not a regex)
        if (index ($item, $doc_root) == 0) {
            $item = substr ($item, length ($doc_root));
        }
        push (@result_set, $item);
    }
    close (GLIMPSE);

    #--------------------heading on results --------------------------
    &PRINT_HEADER ("Results of searching the $title");

    if ( $this_is_a_subset ) {
        &h2("first $max_hits results");
        &hr;
        print "The results only show the first $max_hits hits.";
        &ul;
        &li(" This is not the same as the best $max_hits matches as it is dependant on the order the documents were initially indexed.");
        if ($search_type eq "fat" ) {
            &li(" This will not show the first ten documents in which matches were found. For that you needed to select the show \"just the number of matches\" option on the search screen.");
        }
        &_ul;
        &hr;
    } else {
        &h2("results");
    }

    # make both forms of the expression safe to embed in HTML
    $expression =~ s/&/&amp;/g;
    $expression =~ s/</&lt;/g;
    $expression =~ s/>/&gt;/g;
    $glimpse_expression =~ s/&/&amp;/g;
    $glimpse_expression =~ s/</&lt;/g;
    $glimpse_expression =~ s/>/&gt;/g;

    #--------------------display --------------------------
    if ($search_type eq "thin") {
        &show_thin_results;
    } else {
        &show_fat_results;
    }

    #---------------------------------------------------------
    &PRINT_FOOTER;
}

#########################################################
# show_link (fname)
#   Prints a hyperlink to the document fname (a path
#   relative to $doc_root), labelled with the document's
#   TITLE.  When the same title has already been shown,
#   the document's H1 is appended to tell them apart.
#   Counts titles in %TITLE_LIST.
#########################################################
sub show_link
{
    local ($fname) = @_;
    local ($hot_title, $subtitle);

    #-----------find title------------------------------------
    $hot_title = &get_tag ("$doc_root/$fname","TITLE");
    $hot_title = &unwebify ($hot_title);

    if ($hot_title) {
        #- - - - - seen before ? - - - - - - - - - - -
        if ($TITLE_LIST{$hot_title}++ > 0) {
            $subtitle = &get_tag("$doc_root/$fname","H1");
            #$subtitle = &unwebify($subtitle);
        }
    } else {
        $hot_title = "No title found in $fname";
    }

    #--------------show link---------------------------
    &href ( "${doc_server}$fname", "$hot_title");
    if ( "$subtitle" ) {
        print "- $subtitle";
    }
    &newline;
}

#########################################################
# sort_files_by_hits
#   sort comparator for show_thin_results: highest hit
#   count first, ties broken by file name.  Reads the
#   caller's (dynamically scoped) %hits_for_file.
#########################################################
sub sort_files_by_hits
{
    ($hits_for_file{$b} <=> $hits_for_file{$a}) || ($a cmp $b);
}

#########################################################
# show the dcuments in order
#   show_thin_results
#   Expects @result_set to hold "file: count" lines and
#   prints a table of documents, most matches first.
#########################################################
sub show_thin_results
{
    local ($item, %hits_for_file, $file, $value, $colon);

    foreach $item (@result_set) {
        # the count is the last field, so split on the final delimiter
        $colon = rindex ($item, $glimpse_delim);
        next if ($colon < 0);           # not a "file: count" line

        $file  = substr ($item, 0, $colon);
        $value = substr ($item, $colon + length ($glimpse_delim));

        # keyed by file: keying by count (as before) silently dropped
        # every document that shared its count with another one
        $hits_for_file{$file} = $value;
    }

    &centre;
    # NOTE(review): the table markup is reconstructed.
    print "<TABLE BORDER>
<TR><TH>No.<BR>matches</TH><TH>name</TH></TR>
";
    foreach $file ( sort sort_files_by_hits (keys %hits_for_file)) {
        print "<TR><TD>$hits_for_file{$file}</TD><TD>";
        &show_link ($file);
        print "</TD></TR>\n";
    }
    print "</TABLE>
";
    &_centre;
}

#########################################################
# show_fat_results
#   Expects @result_set to hold "file: matching line"
#   entries and prints them as a definition list: one
#   link per document followed by its matching lines.
#########################################################
sub show_fat_results
{
    local ($item, $last_fname, $colon, $fname, $value);

    $last_fname = "";
    &dl;
    foreach $item (@result_set) {
        $colon = index ( $item, $glimpse_delim);
        next if ($colon < 0);           # no file name on this line

        $fname = substr ($item, 0, $colon);
        # skip over the delimiter so it is not shown with the text
        $value = substr ($item, $colon + length ($glimpse_delim));

        if ($fname ne $last_fname) {
            &p;
            &dt;
            &show_link ($fname);
            $last_fname = $fname;
        }

        $value = &unwebify($value);
        print "<DD>$value\n";
    }
    &_dl;
    &newline;
}

#########################################################
# unbuffer (filehandle)
#   Turns on autoflush ($|) for the given file handle and
#   restores the previously selected handle afterwards.
#########################################################
sub unbuffer
{
    local ($new_fh) = @_;
    local ($old_fh);

    $old_fh = select ($new_fh);
    $| = 1;
    select ($old_fh);
}

#########################################################
# main
#   Runs the search when an expression was submitted,
#   otherwise shows the query form.
#########################################################
sub main
{
    local ($query);

    $query = $FIELDS{$input_field};
    if ($query) {
        &process_query($query);
    } else {
        &show_query_form;
    }
}

&main;