# score4.py Python 2.5 script (fails on Python 2.4)
# /home/research/isg/speech/social/python/score4.py

# This script scores the quality of matches
# for the Similar Segments in Social Search MediaEval 2013 Task
# Nigel Ward, University of Texas at El Paso, March 2013

# Version 3, May 13, 2013: changed to omit query weighting
# Version 4, July 2013: added more information to the performance summary
#   Changed so that the hunt-back-to-start cost is estimated at 50% of
#   the distance, instead of 100%, assuming people jump back 10 seconds
#   at a time, listen for 4 seconds each time, and jump overhead is 1 second.
#   Then changed so that there is no hunt-back-to-start behavior
#   (plausible especially when the regions are very long).
#   Changed to add the 'naive precision' and the recall.

# Given a query, this looks up the answer set for that query
# and evaluates its quality by comparing it to the reference similarity set.
# It implements the algorithm described at
#   http://www.cs.utep.edu/nigel/ssss/faq.html

# The output is, for each query,
# a description of how well that query was handled,
# and the Searcher Utility Ratio.
# It also prints out the average Ratio over all queries.

# invoke with: python score4.py queries.txt sss.txt answers.txt
# in ../baseline:
#   python ../python/score4.py all-possible-queries2.txt all-similarity-sets-clean2.txt all-infered-answers.txt
#     gives naive precision 9%, recall 27%, utility ratio .31, f-measure of 0.29
#   ditto on random-guesses.txt: 6%, 30%, 24%, 0.27

# The inputs are three files:
#   The list of queries, provided by the organizers.
#   The list of answers, that is, the jump-in points inferred for each
#     of the queries, as generated by a participant system.
#   The reference list of similar-segment sets, provided by the organizers.
#     For the final evaluation set this will not be revealed to the participants.

# Note that all the constants are designed to reflect plausible
# searcher behavior, with the exception of the noResults penalty.
# Real searchers would probably prefer to have zero results than one
# incorrect result, but for purposes of the evaluation, we don't want
# systems to get high scores by answering only the one or two queries
# they're confident they can do well on.
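# For concreteness, a sketch of how this script expects the three files
# to look, as inferred from the parsing code below (the filenames and
# times here are invented, not task data):
#
#   queries file -- one region per line: filename start end
#       talkshow01.wav 105.0 121.5
#
#   similarity-sets file -- sets of mutually similar regions, delimited
#   by 'set:' lines (or blank lines), one region per line:
#       set: 1
#       talkshow01.wav 105.0 121.5
#       talkshow03.wav 88.0 104.0
#
#   answers file -- for each query, an 'input:' line echoing the query
#   region, followed by one jump-in point (filename timepoint) per line:
#       input: talkshow01.wav 105.0 121.5
#       talkshow03.wav 86.5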
import sys

# constants
searchTimePerQuery = 120   # seconds
maxLeftOffset = 5          # seconds
minRightOffset = 3         # seconds; was 1, changed in version 4
falsePositivePenalty = 8   # seconds
#searchBackPenaltyRatio = 2  # considered making it lower


def skippable(line):
    if len(line) == 0:
        return True
    if line[0] == '#':
        return True
    if line.isspace():
        return True
    return False


# Here string1 and string2 both have format: filename start end
# Returns true if they represent the same region
# (regardless of whitespace variation and leading/trailing zeros)
def sameRegion(string1, string2):
    triple1 = string1.split()
    triple2 = string2.split()
    return (triple1[0] == triple2[0] and
            float(triple1[1]) == float(triple2[1]) and
            float(triple1[2]) == float(triple2[2]))


# Return all result-regions corresponding to the specified query, by
# scanning through the answers file to find a line prefixed by "input:"
# that contains the exact same filename, start time and end time,
# and then snatching up all lines after that (up to the next "input:")
def findResultSet(query, lines):
    jumpinset = []
    nlines = len(lines)
    for i in range(nlines):
        if skippable(lines[i]):
            continue
        #print " answerfile has: ", lines[i],
        triple = lines[i].partition(" ")
        firstToken = triple[0]
        if firstToken != 'input:':
            continue
        remainingTokens = triple[2].strip()
        if not sameRegion(query, remainingTokens):
            continue
        # we've found the corresponding set; collect the relevant lines
        i = i + 1
        while i < nlines:
            line = lines[i].strip()
            triple = lines[i].partition(" ")
            firstToken = triple[0]
            if skippable(line) or firstToken == 'input:':
                print ' in answerfile, jumpinset is: ', jumpinset
                return jumpinset   # all done
            jumpinset.append(line)
            i = i + 1
        return jumpinset   # the answers ran to end-of-file
    return jumpinset       # empty: no matching 'input:' line was found


def findSimilarSet(query, sslines):
    similarset = []
    nlines = len(sslines)
    for i in range(nlines):
        ssline = sslines[i]
        if skippable(ssline):
            continue
        if ssline.split()[0] == 'set:':   # a set-header line, not a region
            continue
        if not sameRegion(ssline.strip(), query):
            continue
        # search back collecting all lines until we see a 'set:' token
        j = i - 1
        while j >= 0:   # includes the file's first line
            ssline = sslines[j].strip()
            if skippable(ssline):
                break
            tokens = ssline.split()
            if tokens[0] == 'set:':
                break
            similarset.append(ssline)
            j = j - 1
        # search forward analogously
        k = i + 1
        while k < nlines:
            ssline = sslines[k].strip()
            if skippable(ssline):
                break
            tokens = ssline.split()
            if tokens[0] == 'set:':
                break
            similarset.append(ssline)
            k = k + 1
        if similarset == []:
            print "warning, no similar regions in reference tagsets for", query
        return similarset
    print "warning, no region in reference tagset file matches", query
    return []   # empty set, so the potential value for this query is zero
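# A quick illustration of the matching logic above (invented filenames):
#   sameRegion("talk01.wav 12.0 45.50", "talk01.wav   12 45.5") -> True
#       (whitespace variation and trailing zeros are normalized away)
#   sameRegion("talk01.wav 12.0 45.5", "talk02.wav 12.0 45.5")  -> False
#       (regions in different files never match)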
# A query is a string: a filename, a start time, and an end time.
# We seek a corresponding answer set in the answers file.
# Any answer sets that don't correspond to a query are ignored.
# If no corresponding answer set is found, the benefit for this
# query is zero, and the noResults penalty is applied.
# We also seek a corresponding similarity set in the ss file.
# Finally we call simulateUser to estimate the value of this answer set
# as a set of jump-in points leading to discovery of the similarity set.
def processQuery(query):
    global nqueries
    global cumulativePotential
    nqueries = nqueries + 1
    print "\nProcessing query", query
    similarset = findSimilarSet(query, sslines)      # reference tagset
    resultset = findResultSet(query, answerlines)    # system guesses
    unmaxedPotentialValue = totalSetDuration(similarset)
    potentialValue = min(searchTimePerQuery, unmaxedPotentialValue)
    cumulativePotential = cumulativePotential + potentialValue
    simulateUser(query, resultset, similarset, potentialValue)


def totalSetDuration(similarset):
    total = 0
    for region in similarset:
        triple = region.split()
        start = float(triple[1])
        end = float(triple[2])
        total = total + (end - start)
    return total


# simulateUser: for each putative answer in the set, determine its
# value and cost, and keep processing putative answers until the total
# cost for this query exceeds 120 seconds

# globals shared by the daughter function processResult
thisQueryCost = 0
thisQueryValue = 0

def simulateUser(query, resultset, tagset, potential):
    global cumulativeCost
    global cumulativeBenefit
    global cumulativeNoPutative
    # globals shared by the daughter function processResult
    global thisQueryCost
    global thisQueryValue
    if resultset == []:
        print 'no putative results for', query
        cumulativeNoPutative = cumulativeNoPutative + 1
        return
    thisQueryCost = 0
    thisQueryValue = 0
    liveTagSet = tagset   # regions not already jumped-into by an answer
    for result in resultset:
        print ' processing result: ', result
        liveTagSet = processResult(result, liveTagSet)
        if thisQueryCost >= searchTimePerQuery:
            break   # time's up, stop processing results
    thisQueryValue = min(thisQueryValue, searchTimePerQuery)
    print "for this query, cost is", thisQueryCost, 'and benefit', thisQueryValue, "of a possible", potential
    print "Searcher Utility Ratio for query '", query, "' is",
    print '%.2f' % (thisQueryValue / thisQueryCost)
    cumulativeCost = cumulativeCost + thisQueryCost
    cumulativeBenefit = cumulativeBenefit + thisQueryValue


# There are three cases:
# Case 1, if the answer is within an unused similar segment,
#   and no later than minRightOffset (3) seconds before its end, then
#     value = time from the jump-in point to the end of that segment
#     cost = the same (the searcher simply listens from there to the end;
#            per version 4, there is no hunt-back-to-start cost)
#   and mark that segment as used up
# Case 2, if the answer is no more than maxLeftOffset (5) seconds before
#   an unused similar segment, then
#     value = duration of that segment
#     cost = that duration plus the distance from the jump-in point
#            to the segment onset
#   and mark that segment as used up
# Case 3, if neither of the first two applies, it's a false alarm:
#     cost = falsePositivePenalty (8 seconds)
#     value = 0 seconds
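# To make the case arithmetic concrete (an invented example, not task
# data): suppose the only live similar segment is talk01.wav 100.0 130.0,
# i.e. 30 seconds long. Then a jump-in point in talk01.wav at:
#   110.0 -> Case 1: value = cost = 130.0 - 110.0 = 20.0 seconds
#    97.0 -> Case 2: value = 30.0, cost = 30.0 + (100.0 - 97.0) = 33.0
#    50.0 -> Case 3: value = 0, cost = falsePositivePenalty = 8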
def processResult(result, ltset):
    global thisQueryCost
    global thisQueryValue
    global cumulativeFalseAlarms
    global cumulativeEarlyHits
    global cumulativeLateHits
    global cumulativeEarliness
    global cumulativeLateness
    triple = result.split()
    rfilename = triple[0]
    rtimepoint = float(triple[1])
    # find the first matching item in the liveTagSet
    if not ltset:
        # no (remaining live) regions in the similarity set, so this and
        # any further answers for this query are scored as false alarms
        thisQueryCost = thisQueryCost + falsePositivePenalty
        cumulativeFalseAlarms = cumulativeFalseAlarms + 1
        print ' Case 3: no similar regions (left to try to) match this jump-in point'
        return ltset
    for similarRegion in ltset:
        triple = similarRegion.split()
        sfilename = triple[0]
        sstartpoint = float(triple[1])
        sendpoint = float(triple[2])
        if sfilename != rfilename:
            continue   # keep looking to find a similar region
        if rtimepoint > sstartpoint and rtimepoint <= sendpoint - minRightOffset:
            # Case 1
            #value = sendpoint - sstartpoint
            #cost = value + searchBackPenaltyRatio * (rtimepoint - sstartpoint)
            value = sendpoint - rtimepoint
            cost = value
            thisQueryCost = thisQueryCost + cost
            thisQueryValue = thisQueryValue + value
            cumulativeLateHits = cumulativeLateHits + 1
            cumulativeLateness = cumulativeLateness + (rtimepoint - sstartpoint)
            print " Case 1: jump-in point within similar region"
            ltset.remove(similarRegion)
            return ltset
        if rtimepoint >= sstartpoint - maxLeftOffset and rtimepoint <= sstartpoint:
            # Case 2
            value = sendpoint - sstartpoint
            cost = value + (sstartpoint - rtimepoint)
            thisQueryCost = thisQueryCost + cost
            thisQueryValue = thisQueryValue + value
            cumulativeEarlyHits = cumulativeEarlyHits + 1
            cumulativeEarliness = cumulativeEarliness + (sstartpoint - rtimepoint)
            print " Case 2: jump-in point precedes similar region"
            ltset.remove(similarRegion)
            return ltset
    # Case 3: we've scanned all similar regions but found no match
    thisQueryCost = thisQueryCost + falsePositivePenalty
    cumulativeFalseAlarms = cumulativeFalseAlarms + 1
    print ' Case 3: no similar regions match this jump-in point'
    return ltset
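# The summary metrics computed below, worked through on invented counts:
# suppose 3 hits worth 50 seconds of benefit (all Case 1, where cost
# equals value) plus 7 false alarms, over similarity sets offering 180
# seconds of potential. Then cost = 50 + 7 * 8 = 106 and:
#   naive precision        = 3 / 10   = 30%
#   raw recall             = 50 / 180 = 28%
#   searcher utility ratio = 50 / 106 = 0.47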
# ---- main ----
nargs = len(sys.argv) - 1
print sys.argv
print nargs
if nargs != 3:
    print "Score Error: expected 3 arguments, got", nargs
    print "invoke with 'python score4.py queryfile tagsetfile answerfile'"
    sys.exit(1)
queryfile = sys.argv[1]
ssfile = sys.argv[2]      # similarity-sets file
answerfile = sys.argv[3]

qfp = open(queryfile, 'r')
sfp = open(ssfile, 'r')
afp = open(answerfile, 'r')
ofp = open('performance.txt', 'w')
queries = qfp.readlines()
answerlines = afp.readlines()
sslines = sfp.readlines()

nqueries = 0
cumulativePotential = 0
cumulativeCost = 0
cumulativeBenefit = 0
cumulativeNoPutative = 0
cumulativeFalseAlarms = 0
cumulativeEarlyHits = 0
cumulativeLateHits = 0
cumulativeEarliness = 0.0
cumulativeLateness = 0.0

for query in queries:
    if skippable(query):
        continue
    processQuery(query.strip())
ofp.write('all done processing queries')

totalHits = cumulativeEarlyHits + cumulativeLateHits
totalGuesses = totalHits + cumulativeFalseAlarms
naivePrecision = 1.0 * totalHits / totalGuesses
recall = 1.0 * cumulativeBenefit / cumulativePotential

print '\nSUMMARY:'
print ' processed', nqueries, 'queries'
print ' of which', cumulativeNoPutative, 'lacked answers entirely'
print ' the answers examined (within', searchTimePerQuery, 'sec. per query) included:'
print ' ', cumulativeFalseAlarms, 'false alarms and', totalHits, 'total hits'
print ' ', cumulativeEarlyHits, 'successful and exact-or-early jump-in points'
if cumulativeEarlyHits > 0:
    print '   averaging %.1f' % (cumulativeEarliness / cumulativeEarlyHits), 'seconds early'
print ' ', cumulativeLateHits, 'successful but late jump-in points'
if cumulativeLateHits > 0:
    print '   averaging %.1f' % (cumulativeLateness / cumulativeLateHits), 'seconds late'
print ' naivePrecision = %2.0f%%' % (naivePrecision * 100), "=", totalHits, "/", totalGuesses
print ' '
print ' raw recall = %2.0f%%' % (recall * 100), "=", int(cumulativeBenefit), "/", int(cumulativePotential), "seconds"
searcherUtilityRatio = cumulativeBenefit / cumulativeCost
print ' raw searcher utility ratio: %.3f' % searcherUtilityRatio, " = %.1f" % (1.0 * cumulativeBenefit / nqueries), "/ %.1f" % (1.0 * cumulativeCost / nqueries)
print ' (average benefit / average cost (both per query, both in seconds))'
normalizedSUR = searcherUtilityRatio / 0.290
normalizedR = recall / 0.275
print ' Normalized Searcher Utility Ratio: %0.3f ' % normalizedSUR
print ' Normalized Recall: %0.3f ' % normalizedR
print " F-measure %.2f" % ((10 * normalizedSUR * normalizedR) / (normalizedSUR + 9 * normalizedR))
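# A note on the final line: algebraically it is the weighted F-measure
# F_beta = (1 + beta^2) * P * R / (beta^2 * P + R) with beta^2 = 9,
# where the normalized searcher utility ratio plays the recall role, so
# the combination weights the utility ratio more heavily than recall.
# Sanity check: with normalizedSUR = normalizedR = 1.0, F = 10/10 = 1.00.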