# score3.py   Python 2.5 script (fails on Python 2.4)
# This script scores the quality of matches
# for the Similar Segments in Social Speech MediaEval 2013 Task
# Nigel Ward, University of Texas at El Paso, March 2013
# version 3, May 13, 2013: changed to omit query weighting

# Given a query, this looks up the answer set for that query
# and evaluates its quality by comparing it to the reference similarity set.
# It implements the algorithm described at
#   http://www.cs.utep.edu/nigel/ssss/faq.html

# The output is, for each query, a description of how well that query was
# handled and its Searcher Utility Ratio.  It also prints the overall
# Searcher Utility Ratio, computed from the cumulative cost and benefit
# over all queries.

# invoke with:  python score3.py queries.txt sss.txt answers.txt

# The inputs are three files:
#   The list of queries, provided by the organizers.
#   The list of answers, that is, the jump-in points inferred for each
#     of the queries, as generated by a participant system.
#   The reference list of similar-segment sets, provided by the organizers.
#     For the final evaluation set this will not be revealed to the participants.

import sys

searchTimePerQuery = 120     # seconds
maxLeftOffset = 5            # seconds
minRightOffset = 1           # seconds
falsePositivePenalty = 8     # seconds
noResultsPenalty = 10        # seconds
searchBackPenaltyRatio = 2


# True for blank lines and comment lines, which separate or annotate records
def skippable(line):
    if len(line) == 0:
        return True
    if line[0] == '#':
        return True
    if line.isspace():
        return True
    return False


# Here string1 and string2 both have format: filename start end
# Returns true if they represent the same region
# (regardless of whitespace variation)
def sameRegion(string1, string2):
    triple1 = string1.split()
    triple2 = string2.split()
    return triple1 == triple2


# Return all result-regions corresponding to the specified query, by
# scanning through the answers file to find a line prefixed by "input:"
# that contains the exact same filename, start time and end time,
# and then snatching up all lines after that (up to the next "input:")
def findResultSet(query, lines):
    jumpinset = []
    nlines = len(lines)
    for i in range(nlines):
        if skippable(lines[i]):
            continue
        # print "   answerfile has: ", lines[i],
        triple = lines[i].partition(" ")
        firstToken = triple[0]
        if firstToken != 'input:':
            continue
        remainingTokens = triple[2].strip()
        if not sameRegion(query, remainingTokens):
            continue
        # we've found the corresponding set; collect the relevant lines
        i = i + 1
        while i < nlines:
            line = lines[i]
            triple = lines[i].partition(" ")
            firstToken = triple[0]
            if skippable(line) or firstToken == 'input:':
                print '  in answerfile, jumpinset is: ', jumpinset
                return jumpinset       # all done
            jumpinset.append(triple[2].strip())
            i = i + 1
    return jumpinset     # which will be empty if no answer set was found


# Return the set of segments similar to the query, by finding the query's own
# line in the similarity-sets file and collecting the neighboring region
# lines, in both directions, up to the set boundaries.
def findSimilarSet(query, sslines):
    similarset = []
    nlines = len(sslines)
    for i in range(nlines):
        if not sameRegion(sslines[i].strip(), query):
            continue
        # search back, collecting all lines until we see a 'set:' token
        j = i - 1
        while j > 0:
            ssline = sslines[j].strip()
            if skippable(ssline):
                break
            tokens = ssline.split()
            if tokens[0] == 'set:':
                break
            similarset.append(ssline)
            j = j - 1
        # search forward analogously
        k = i + 1
        while k < nlines:
            ssline = sslines[k].strip()
            if skippable(ssline):
                break
            tokens = ssline.split()
            if tokens[0] == 'set:':
                break
            similarset.append(ssline)
            k = k + 1
    print '  similarset', similarset
    return similarset
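
# For orientation, here is a hypothetical fragment of each input file, inferred
# from how the functions above parse them; the filenames, times, set label, and
# the "result:" prefix on answer lines are illustrative assumptions, not an
# official format specification.
#
#   queries.txt:    dev01.sph 132.5 148.0
#
#   sss.txt:        set: 1
#                   dev01.sph 132.5 148.0
#                   dev02.sph 310.0 335.5
#
#   answers.txt:    input: dev01.sph 132.5 148.0
#                   result: dev02.sph 311.2
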
# A query is a string: a filename, a start time, and an end time.
# We seek a corresponding answer set in the answers file.
# Any answer sets that don't correspond to a query are ignored.
# If no corresponding answer set is found, the benefit for this query is zero,
# since we can't evaluate the search results without a reference answer set.
# We also seek the corresponding similarity set in the ss file.
# Finally we call simulateUser to estimate the value of this answer set,
# as a set of jump-in points leading to discovery of the similarity set.
def processQuery(query):
    global nqueries
    nqueries = nqueries + 1
    print "Processing query", query
    similarset = findSimilarSet(query, sslines)
    resultset = findResultSet(query, answerlines)
    simulateUser(query, resultset, similarset)


# simulateUser
# For each answer in the set, determine its value and cost,
# and continue until the total cost for this query exceeds 120 seconds

# three globals shared by the daughter function processResult
liveSimilarSet = []     # regions not already jumped-into by an answer
perQueryCost = 0
perQueryValue = 0


def simulateUser(query, resultset, similarset):
    global cumulativeCost
    global cumulativeBenefit
    # three globals shared by the daughter function processResult
    global liveSimilarSet      # regions not already jumped-into by an answer
    global perQueryCost
    global perQueryValue

    if resultset == []:
        cumulativeCost = cumulativeCost + noResultsPenalty
        print 'no results found for', query, 'penalty is', noResultsPenalty
        return

    perQueryCost = 0
    perQueryValue = 0
    liveSimilarSet = similarset
    for result in resultset:
        print '  processing result: ', result
        processResult(result, liveSimilarSet)
        if perQueryCost > searchTimePerQuery:
            return    # simulated time out
    print "for this query, cost is", perQueryCost, 'and benefit', perQueryValue
    print "Searcher Utility Ratio for query '", query, "' is",
    print '%.2f' % (perQueryValue / perQueryCost)
    print
    cumulativeCost = cumulativeCost + perQueryCost
    cumulativeBenefit = cumulativeBenefit + perQueryValue


# There are three cases:
# First, if the answer is within an unused similar segment,
#   and no later than 1 second before the end, then
#     value = duration of that segment
#     cost  = duration of that segment + twice the distance from
#             the jump-in point back to the start of the segment
#   and that segment is marked as used up.
# Second, if the answer is no more than 5 seconds before an
#   unused similar segment, then
#     value = duration of that segment
#     cost  = duration of that segment + the time distance from the
#             jump-in point to the segment onset
#   and that segment is marked as used up.
# Third, if neither of the first two applies,
#     value = 0 seconds
#     cost  = 8 seconds
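# A worked example with made-up numbers: suppose the similarity set contains a
# segment of a hypothetical file f.wav running from 100.0 to 130.0 seconds
# (duration 30 seconds).
#   A jump-in point at 104.0 falls inside the segment (Case 1):
#     value = 30.0,  cost = 30.0 + 2 * (104.0 - 100.0) = 38.0
#   A jump-in point at 97.0 is within 5 seconds before the onset (Case 2):
#     value = 30.0,  cost = 30.0 + (100.0 - 97.0) = 33.0
#   A jump-in point at 50.0 matches neither case (Case 3):
#     value = 0,     cost = falsePositivePenalty = 8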
def processResult(result, lsset):
    global liveSimilarSet     # regions not already jumped-into by an answer
    global perQueryCost
    global perQueryValue

    triple = result.split()
    rfilename = triple[0]
    rtimepoint = float(triple[1])

    # find the first matching item in the lsset
    if lsset is None:
        print "Scoring Error!!: Query File entry lacking similarity set partner"
        return
    for similarRegion in lsset:
        triple = similarRegion.split()
        sfilename = triple[0]
        sstartpoint = float(triple[1])
        sendpoint = float(triple[2])
        if sfilename != rfilename:
            continue     # keep looking to find a similar region
        if rtimepoint >= sstartpoint and rtimepoint <= sendpoint - minRightOffset:
            # Case 1
            value = sendpoint - sstartpoint
            cost = value + searchBackPenaltyRatio * (rtimepoint - sstartpoint)
            perQueryCost = perQueryCost + cost
            perQueryValue = perQueryValue + value
            lsset.remove(similarRegion)    # lsset is liveSimilarSet; mark this region as used up
            print "    Case 1: jump-in point within similar region"
            return
        if rtimepoint >= sstartpoint - maxLeftOffset and rtimepoint <= sstartpoint:
            # Case 2
            value = sendpoint - sstartpoint
            cost = value + sstartpoint - rtimepoint
            perQueryCost = perQueryCost + cost
            perQueryValue = perQueryValue + value
            lsset.remove(similarRegion)    # mark this region as used up
            print "    Case 2: jump-in point precedes similar region"
            return
    # Case 3: we've scanned all similar regions but found no match
    perQueryCost = perQueryCost + falsePositivePenalty
    print '    Case 3: no similar regions match this jump-in point'


# main
nargs = len(sys.argv) - 1
if nargs != 3:
    print "Score Error: expected 3 arguments, got", nargs
    sys.exit(1)
queryfile = sys.argv[1]
ssfile = sys.argv[2]       # similarity-sets file
answerfile = sys.argv[3]
qfp = open(queryfile, 'r')
sfp = open(ssfile, 'r')
afp = open(answerfile, 'r')
ofp = open('performance.txt', 'w')
queries = qfp.readlines()
answerlines = afp.readlines()
sslines = sfp.readlines()

nqueries = 0
cumulativeCost = 0
cumulativeBenefit = 0
for query in queries:
    if skippable(query):
        continue
    processQuery(query.strip())
ofp.write('all done processing queries')

print 'Summary:'
print '  processed', nqueries, 'queries'
print '  average cost: %.1f' % (cumulativeCost * 1.00 / nqueries)
print '  average benefit: %.1f' % (cumulativeBenefit * 1.00 / nqueries)
print '  Searcher Utility Ratio: %.3f' % (cumulativeBenefit / cumulativeCost)
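
# Note on the summary figure: the Searcher Utility Ratio reported above is the
# ratio of the cumulative benefit to the cumulative cost, not the mean of the
# per-query ratios.  With made-up totals, 600 seconds of similar material found
# at a simulated search cost of 900 seconds gives 600.0 / 900.0 = 0.667.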