#!/usr/bin/perl -w #
#baseir.pl, a Baseline Information Retrieval engine
#by David A. Herrera and Nigel Ward, April 2003.
#University of Texas at El Paso
#This is a simple, basic information retrieval engine,
#suitable for student modification and extension
#as an NLP class assignment
#to run, type something like
# %perl basier.pl *html
#This code does the following:
#1 - Create an array of hashes from documents passed to it
# each document has its own entry in array the AoH, and
# each entry is itself a hash of terms and number of occurrences
# for that term in that document.
#2 - Read user query, and return a list of the most relevant documents,
# using an ad hoc measure. This could be improved by following
# section 17.3 of Jurafsky and Martin, 2000
%hash = (); #initialize hash
$multiline_html_tag = 0; #initially false
while ( <> ) { #read from all files
#filter the input
s|\s+| |g; #remove extra white spaces
s|<.*?>| |g; #remove html tags
if (s|\A\s*<.*||g) {$multiline_html_tag=1}; #start of multiline html tag
if (s|.*>$||g) {$multiline_html_tag=0}; #end multiline html tag
if ($multiline_html_tag) {
s|.*||g; #erase the whole line
}
s|&.*?[\s\n]| |g; #remove html vars like  
s|\s*["\(]+| |g; #remove punctuation preceeding term
s|[\?,"\)\.:;]+?[\s\n\t]| |g; #remove punctuation tailing term
#add term counts into hash
for $token ( split ) { #split on white spaces
$token = lc( $token ); #change to lowercase
if ( exists ( $hash{$token} )){ #check for an occurrence
$hash{$token}++ ; #and increment its count
} else { #else create a new entry
%hash = (%hash, $token, 1); #and add it to the current hash
}#if
}#for
if (eof) { #finish-up processing for this doc
push @docnames, $ARGV; # save the document name
push @AoH, { %hash }; # add this hash to the array
%hash = (); # clear hash table for next doc
$multiline_html_tag = 0;
}#if
}#while
print "please enter a query:\n ";
#loop: read user queries and print matches
while ( defined ( $line = <> )) {
@words = split(' ', $line ); #create a vector of query terms
if ( !scalar( @words )) {
print "blank query line: goodbye\n";
exit;
}
for $doci ( 0 .. $#AoH ) { #check each document
$count = 0;
foreach $term ( @words ) { #iterate over @words in the query
if ( exists ( $AoH[$doci]{$term} )) {
push @matchvector, $AoH[$doci]{$term};
$count += $AoH[$doci]{$term};
}#if
else { push @matchvector, 0; }
}#foreach
push @matchvector, $count; #tack hit count to the end of the vector and
push @matchvector, $docnames[$doci]; #tack on document name to simplify sort
push @results, [ @matchvector ]; #add this vector to my @results
@matchvector = (); #clear the vector
}#for
#output three best (most relevant) documents
#recommend documents where the first query term matches best (most), but
# if equal, also look at the second query term
@sorted = sort {$b->[0] <=> $a->[0] || $b->[1] <=> $a->[1]} @results;
print "the following documents best match your query, \"@words\"\n";
for $i ( 0 .. 2 ) { #not all $#sorted documents
printf("%-20s ", $sorted[$i][$#{ $sorted[$i] }]); # document name
for $j ( 0 .. $#{ $sorted[$i] }-2 ) {
printf("%.2f \t", $sorted[$i][$j]);
}#for $j
print "\n";
}#for $i
@results = (); #empty arrays to handle next query
@words = ();
@sorted = ();
print "please enter a query:\n ";
}#while
print "goodbye\n";