# score4.py Python 2.5 script (fails on Python 2.4)
# /home/research/isg/speech/social/python/score4.py

# This script scores the quality of matches
# for the Similar Segments in Social Search MediaEval 2013 Task
# Nigel Ward, University of Texas at El Paso, March 2013

# Version 3, May 13, 2013: changed to omit query weighting
# Version 4, July 2013: added more information to the performance summary
#   Changed so that the hunt-back-to-start cost is estimated at 50% of
#   the distance, instead of 100%, assuming people jump back 10 seconds
#   at a time, listen for 4 seconds each time, and jump overhead is 1 second.
#   Then changed so that there is no hunt-back-to-start behavior
#   (plausible especially when the regions are very long).
#   Changed to add the 'naive precision' and the recall.

# Given a query, this looks up the answer set for that query
# and evaluates its quality by comparing it to the reference similarity set.
# It implements the algorithm described at
#   http://www.cs.utep.edu/nigel/ssss/faq.html

# The output is, for each query,
# a description of how well that query was handled,
# and the Searcher Utility Ratio.
# It also prints out the average Ratio over all queries.

# invoke with: python score4.py queries.txt sss.txt answers.txt
# in ../baseline:
#   python ../python/score4.py all-possible-queries2.txt all-similarity-sets-clean2.txt all-infered-answers.txt
#     gives naive precision 9%, recall 27%, utility ratio .31, f-measure of 0.29
#   ditto on random-guesses.txt: 6%, 30%, 24%, 0.27

# The inputs are three files:
#   The list of queries, provided by the organizers.
#   The list of answers, that is, the jump-in points inferred for each
#     of the queries, as generated by a participant system.
#   The reference list of similar-segment sets, provided by the organizers.
#     For the final evaluation set this will not be revealed to the participants.

# Note that all the constants are designed to reflect plausible
# searcher behavior, with the exception of the noResults penalty.
# Real searchers would probably prefer to have zero results than one
# incorrect result, but for purposes of the evaluation, we don't want
# systems to get high scores by answering only the one or two queries
# they're confident they can do well on.
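# For concreteness, a sketch of how this script expects the three files
# to look, as inferred from the parsing code below (the filenames and
# times here are invented, not task data):
#
#   queries file -- one region per line: filename start end
#       talkshow01.wav 105.0 121.5
#
#   similarity-sets file -- sets of mutually similar regions, delimited
#   by 'set:' lines (or blank lines), one region per line:
#       set: 1
#       talkshow01.wav 105.0 121.5
#       talkshow03.wav 88.0 104.0
#
#   answers file -- for each query, an 'input:' line echoing the query
#   region, followed by one jump-in point (filename timepoint) per line:
#       input: talkshow01.wav 105.0 121.5
#       talkshow03.wav 86.5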
import sys

# constants
searchTimePerQuery = 120   # seconds
maxLeftOffset = 5          # seconds
minRightOffset = 3         # seconds; was 1, changed in version 4
falsePositivePenalty = 8   # seconds
#searchBackPenaltyRatio = 2  # considered making it lower


def skippable(line):
    if len(line) == 0:
        return True
    if line[0] == '#':
        return True
    if line.isspace():
        return True
    return False


# Here string1 and string2 both have format: filename start end
# Returns true if they represent the same region
# (regardless of whitespace variation and leading/trailing zeros)
def sameRegion(string1, string2):
    triple1 = string1.split()
    triple2 = string2.split()
    return (triple1[0] == triple2[0] and
            float(triple1[1]) == float(triple2[1]) and
            float(triple1[2]) == float(triple2[2]))


# Return all result-regions corresponding to the specified query, by
# scanning through the answers file to find a line prefixed by "input:"
# that contains the exact same filename, start time and end time,
# and then snatching up all lines after that (up to the next "input:")
def findResultSet(query, lines):
    jumpinset = []
    nlines = len(lines)
    for i in range(nlines):
        if skippable(lines[i]):
            continue
        #print " answerfile has: ", lines[i],
        triple = lines[i].partition(" ")
        firstToken = triple[0]
        if firstToken != 'input:':
            continue
        remainingTokens = triple[2].strip()
        if not sameRegion(query, remainingTokens):
            continue
        # we've found the corresponding set; collect the relevant lines
        i = i + 1
        while i < nlines:
            line = lines[i].strip()
            triple = lines[i].partition(" ")
            firstToken = triple[0]
            if skippable(line) or firstToken == 'input:':
                print ' in answerfile, jumpinset is: ', jumpinset
                return jumpinset   # all done
            jumpinset.append(line)
            i = i + 1
        return jumpinset   # the answers ran to end-of-file
    return jumpinset       # empty: no matching 'input:' line was found


def findSimilarSet(query, sslines):
    similarset = []
    nlines = len(sslines)
    for i in range(nlines):
        ssline = sslines[i]
        if skippable(ssline):
            continue
        if ssline.split()[0] == 'set:':   # a set-header line, not a region
            continue
        if not sameRegion(ssline.strip(), query):
            continue
        # search back collecting all lines until we see a 'set:' token
        j = i - 1
        while j >= 0:   # includes the file's first line
            ssline = sslines[j].strip()
            if skippable(ssline):
                break
            tokens = ssline.split()
            if tokens[0] == 'set:':
                break
            similarset.append(ssline)
            j = j - 1
        # search forward analogously
        k = i + 1
        while k < nlines:
            ssline = sslines[k].strip()
            if skippable(ssline):
                break
            tokens = ssline.split()
            if tokens[0] == 'set:':
                break
            similarset.append(ssline)
            k = k + 1
        if similarset == []:
            print "warning, no similar regions in reference tagsets for", query
        return similarset
    print "warning, no region in reference tagset file matches", query
    return []   # empty set, so the potential value for this query is zero
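# A quick illustration of the matching logic above (invented filenames):
#   sameRegion("talk01.wav 12.0 45.50", "talk01.wav   12 45.5") -> True
#       (whitespace variation and trailing zeros are normalized away)
#   sameRegion("talk01.wav 12.0 45.5", "talk02.wav 12.0 45.5")  -> False
#       (regions in different files never match)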
# A query is a string: a filename, a start time, and an end time.
# We seek a corresponding answer set in the answers file.
# Any answer sets that don't correspond to a query are ignored.
# If no corresponding answer set is found, the benefit for this
# query is zero, and the noResults penalty is applied.
# We also seek a corresponding similarity set in the ss file.
# Finally we call simulateUser to estimate the value of this answer set
# as a set of jump-in points leading to discovery of the similarity set.
def processQuery(query):
    global nqueries
    global cumulativePotential
    nqueries = nqueries + 1
    print "\nProcessing query", query
    similarset = findSimilarSet(query, sslines)      # reference tagset
    resultset = findResultSet(query, answerlines)    # system guesses
    unmaxedPotentialValue = totalSetDuration(similarset)
    potentialValue = min(searchTimePerQuery, unmaxedPotentialValue)
    cumulativePotential = cumulativePotential + potentialValue
    simulateUser(query, resultset, similarset, potentialValue)


def totalSetDuration(similarset):
    total = 0
    for region in similarset:
        triple = region.split()
        start = float(triple[1])
        end = float(triple[2])
        total = total + (end - start)
    return total


# simulateUser: for each putative answer in the set, determine its
# value and cost, and keep processing putative answers until the total
# cost for this query exceeds 120 seconds

# globals shared by the daughter function processResult
thisQueryCost = 0
thisQueryValue = 0

def simulateUser(query, resultset, tagset, potential):
    global cumulativeCost
    global cumulativeBenefit
    global cumulativeNoPutative
    # globals shared by the daughter function processResult
    global thisQueryCost
    global thisQueryValue
    if resultset == []:
        print 'no putative results for', query
        cumulativeNoPutative = cumulativeNoPutative + 1
        return
    thisQueryCost = 0
    thisQueryValue = 0
    liveTagSet = tagset   # regions not already jumped-into by an answer
    for result in resultset:
        print ' processing result: ', result
        liveTagSet = processResult(result, liveTagSet)
        if thisQueryCost >= searchTimePerQuery:
            break   # time's up, stop processing results
    thisQueryValue = min(thisQueryValue, searchTimePerQuery)
    print "for this query, cost is", thisQueryCost, 'and benefit', thisQueryValue, "of a possible", potential
    print "Searcher Utility Ratio for query '", query, "' is",
    print '%.2f' % (thisQueryValue / thisQueryCost)
    cumulativeCost = cumulativeCost + thisQueryCost
    cumulativeBenefit = cumulativeBenefit + thisQueryValue


# There are three cases:
# Case 1, if the answer is within an unused similar segment,
#   and no later than minRightOffset (3) seconds before its end, then
#     value = time from the jump-in point to the end of that segment
#     cost = the same (the searcher simply listens from there to the end;
#            per version 4, there is no hunt-back-to-start cost)
#   and mark that segment as used up
# Case 2, if the answer is no more than maxLeftOffset (5) seconds before
#   an unused similar segment, then
#     value = duration of that segment
#     cost = that duration plus the distance from the jump-in point
#            to the segment onset
#   and mark that segment as used up
# Case 3, if neither of the first two applies, it's a false alarm:
#     cost = falsePositivePenalty (8 seconds)
#     value = 0 seconds
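# To make the case arithmetic concrete (an invented example, not task
# data): suppose the only live similar segment is talk01.wav 100.0 130.0,
# i.e. 30 seconds long. Then a jump-in point in talk01.wav at:
#   110.0 -> Case 1: value = cost = 130.0 - 110.0 = 20.0 seconds
#    97.0 -> Case 2: value = 30.0, cost = 30.0 + (100.0 - 97.0) = 33.0
#    50.0 -> Case 3: value = 0, cost = falsePositivePenalty = 8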
def processResult(result, ltset):
    global thisQueryCost
    global thisQueryValue
    global cumulativeFalseAlarms
    global cumulativeEarlyHits
    global cumulativeLateHits
    global cumulativeEarliness
    global cumulativeLateness
    triple = result.split()
    rfilename = triple[0]
    rtimepoint = float(triple[1])
    # find the first matching item in the liveTagSet
    if not ltset:
        # no (remaining live) regions in the similarity set, so this and
        # any further answers for this query are scored as false alarms
        thisQueryCost = thisQueryCost + falsePositivePenalty
        cumulativeFalseAlarms = cumulativeFalseAlarms + 1
        print ' Case 3: no similar regions (left to try to) match this jump-in point'
        return ltset
    for similarRegion in ltset:
        triple = similarRegion.split()
        sfilename = triple[0]
        sstartpoint = float(triple[1])
        sendpoint = float(triple[2])
        if sfilename != rfilename:
            continue   # keep looking to find a similar region
        if rtimepoint > sstartpoint and rtimepoint <= sendpoint - minRightOffset:
            # Case 1
            #value = sendpoint - sstartpoint
            #cost = value + searchBackPenaltyRatio * (rtimepoint - sstartpoint)
            value = sendpoint - rtimepoint
            cost = value
            thisQueryCost = thisQueryCost + cost
            thisQueryValue = thisQueryValue + value
            cumulativeLateHits = cumulativeLateHits + 1
            cumulativeLateness = cumulativeLateness + (rtimepoint - sstartpoint)
            print " Case 1: jump-in point within similar region"
            ltset.remove(similarRegion)
            return ltset
        if rtimepoint >= sstartpoint - maxLeftOffset and rtimepoint <= sstartpoint:
            # Case 2
            value = sendpoint - sstartpoint
            cost = value + (sstartpoint - rtimepoint)
            thisQueryCost = thisQueryCost + cost
            thisQueryValue = thisQueryValue + value
            cumulativeEarlyHits = cumulativeEarlyHits + 1
            cumulativeEarliness = cumulativeEarliness + (sstartpoint - rtimepoint)
            print " Case 2: jump-in point precedes similar region"
            ltset.remove(similarRegion)
            return ltset
    # Case 3: we've scanned all similar regions but found no match
    thisQueryCost = thisQueryCost + falsePositivePenalty
    cumulativeFalseAlarms = cumulativeFalseAlarms + 1
    print ' Case 3: no similar regions match this jump-in point'
    return ltset
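# The summary metrics computed below, worked through on invented counts:
# suppose 3 hits worth 50 seconds of benefit (all Case 1, where cost
# equals value) plus 7 false alarms, over similarity sets offering 180
# seconds of potential. Then cost = 50 + 7 * 8 = 106 and:
#   naive precision        = 3 / 10   = 30%
#   raw recall             = 50 / 180 = 28%
#   searcher utility ratio = 50 / 106 = 0.47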
# ---- main ----
nargs = len(sys.argv) - 1
print sys.argv
print nargs
if nargs != 3:
    print "Score Error: expected 3 arguments, got", nargs
    print "invoke with 'python score4.py queryfile tagsetfile answerfile'"
    sys.exit(1)
queryfile = sys.argv[1]
ssfile = sys.argv[2]      # similarity-sets file
answerfile = sys.argv[3]

qfp = open(queryfile, 'r')
sfp = open(ssfile, 'r')
afp = open(answerfile, 'r')
ofp = open('performance.txt', 'w')
queries = qfp.readlines()
answerlines = afp.readlines()
sslines = sfp.readlines()

nqueries = 0
cumulativePotential = 0
cumulativeCost = 0
cumulativeBenefit = 0
cumulativeNoPutative = 0
cumulativeFalseAlarms = 0
cumulativeEarlyHits = 0
cumulativeLateHits = 0
cumulativeEarliness = 0.0
cumulativeLateness = 0.0

for query in queries:
    if skippable(query):
        continue
    processQuery(query.strip())
ofp.write('all done processing queries')

totalHits = cumulativeEarlyHits + cumulativeLateHits
totalGuesses = totalHits + cumulativeFalseAlarms
naivePrecision = 1.0 * totalHits / totalGuesses
recall = 1.0 * cumulativeBenefit / cumulativePotential

print '\nSUMMARY:'
print ' processed', nqueries, 'queries'
print ' of which', cumulativeNoPutative, 'lacked answers entirely'
print ' the answers examined (within', searchTimePerQuery, 'sec. per query) included:'
print ' ', cumulativeFalseAlarms, 'false alarms and', totalHits, 'total hits'
print ' ', cumulativeEarlyHits, 'successful and exact-or-early jump-in points'
if cumulativeEarlyHits > 0:
    print '   averaging %.1f' % (cumulativeEarliness / cumulativeEarlyHits), 'seconds early'
print ' ', cumulativeLateHits, 'successful but late jump-in points'
if cumulativeLateHits > 0:
    print '   averaging %.1f' % (cumulativeLateness / cumulativeLateHits), 'seconds late'
print ' naivePrecision = %2.0f%%' % (naivePrecision * 100), "=", totalHits, "/", totalGuesses
print ' '
print ' raw recall = %2.0f%%' % (recall * 100), "=", int(cumulativeBenefit), "/", int(cumulativePotential), "seconds"
searcherUtilityRatio = cumulativeBenefit / cumulativeCost
print ' raw searcher utility ratio: %.3f' % searcherUtilityRatio, " = %.1f" % (1.0 * cumulativeBenefit / nqueries), "/ %.1f" % (1.0 * cumulativeCost / nqueries)
print ' (average benefit / average cost (both per query, both in seconds))'
normalizedSUR = searcherUtilityRatio / 0.290
normalizedR = recall / 0.275
print ' Normalized Searcher Utility Ratio: %0.3f ' % normalizedSUR
print ' Normalized Recall: %0.3f ' % normalizedR
print " F-measure %.2f" % ((10 * normalizedSUR * normalizedR) / (normalizedSUR + 9 * normalizedR))
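# A note on the final line: algebraically it is the weighted F-measure
# F_beta = (1 + beta^2) * P * R / (beta^2 * P + R) with beta^2 = 9,
# where the normalized searcher utility ratio plays the recall role, so
# the combination weights the utility ratio more heavily than recall.
# Sanity check: with normalizedSUR = normalizedR = 1.0, F = 10/10 = 1.00.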