#!/usr/bin/perl -w
#<html><pre>

#baseir.pl, a Baseline Information Retrieval engine
#by David A. Herrera and Nigel Ward, April 2003.
#University of Texas at El Paso

#This is a simple, basic information retrieval engine, 
#suitable for student modification and extension 
#as an NLP class assignment 

#to run, type something like
#    %perl basier.pl *html

#This code does the following:
#1 -    Create an array of hashes from documents passed to it
#       each document has its own entry in array the AoH, and
#       each entry is itself a hash of terms and number of occurrences
#       for that term in that document.
#2 -    Read user query, and return a list of the most relevant documents,
#       using an ad hoc measure.  This could be improved by following
#       section 17.3 of Jurafsky and Martin, 2000
 
%hash = ();                             #initialize hash
$multiline_html_tag = 0;                #initially false
while ( <> ) {                          #read from all files
        #filter the input
        s|\s+| |g;                      #remove extra white spaces
        s|<.*?>| |g;                    #remove html tags
        if (s|\A\s*<.*||g) {$multiline_html_tag=1};  #start of multiline html tag
        if (s|.*>$||g)     {$multiline_html_tag=0};  #end multiline html tag         
        if ($multiline_html_tag) {   
                s|.*||g;                #erase the whole line
        }
        s|&.*?[\s\n]| |g;               #remove html vars like &nbsp
        s|\s*["\(]+| |g;                #remove punctuation preceeding term
        s|[\?,"\)\.:;]+?[\s\n\t]| |g;   #remove punctuation tailing term

        #add term counts into hash
        for $token ( split ) {                          #split on white spaces
                $token = lc( $token );                  #change to lowercase
                if ( exists ( $hash{$token} )){         #check for an occurrence         
                        $hash{$token}++ ;               #and increment its count
                } else {                                #else create a new entry                
                        %hash = (%hash, $token, 1);     #and add it to the current hash
                }#if
        }#for
        if (eof) {                                      #finish-up processing for this doc 
                push @docnames, $ARGV;                  #  save the document name
                push @AoH, { %hash };                   #  add this hash to the array
                %hash = ();                             #  clear hash table for next doc
                $multiline_html_tag = 0;
         }#if
}#while

print "please enter a query:\n   ";

#loop: read user queries and print matches
while ( defined ( $line = <> )) {       
        @words = split(' ', $line );            #create a vector of query terms
        if ( !scalar( @words )) { 
                print "blank query line: goodbye\n"; 
                exit;   
        }
        for $doci ( 0 .. $#AoH ) {              #check each document
                $count = 0;                     
                foreach $term ( @words ) {      #iterate over @words in the query
                        if ( exists ( $AoH[$doci]{$term} )) {
                                push @matchvector, $AoH[$doci]{$term};
                                $count += $AoH[$doci]{$term};
                        }#if
                        else { push @matchvector, 0; }
                }#foreach
                push @matchvector, $count;           #tack hit count to the end of the vector and
                push @matchvector, $docnames[$doci]; #tack on document name to simplify sort
                push @results, [ @matchvector ];     #add this vector to my @results    
                @matchvector = ();                   #clear the vector
        }#for
        
        #output three best (most relevant) documents 
        #recommend documents where the first query term matches best (most), but
	# if equal, also look at the second query term
        @sorted = sort {$b->[0] <=> $a->[0] || $b->[1] <=> $a->[1]} @results;   
        print "the following documents best match your query, \"@words\"\n";
        for $i ( 0 .. 2 ) {                             #not all $#sorted documents
            printf("%-20s ", $sorted[$i][$#{ $sorted[$i] }]);    # document name
            for $j ( 0 .. $#{ $sorted[$i] }-2 ) {
                printf("%.2f \t",  $sorted[$i][$j]);
            }#for $j
                print "\n";
        }#for $i
        @results = ();                          #empty arrays to handle next query
        @words = ();
        @sorted = ();
	print "please enter a query:\n   ";
}#while     
print "goodbye\n";