# Nigel Ward, October 2018
# Code Fragments SLP Assignment F: Word Vectors

import io
import itertools
import math

import numpy as np

# Word -> vector table used by mostSimilar/printDistance when no explicit
# table is passed.  It is filled in by the demo at the bottom of this file
# when the file is run as a script.
model = None


def loadGloveModel(gloveFile):
    # from Karishma Malkan on stackoverflow
    """Read a GloVe-style text file into a dict of word -> numpy vector.

    Each non-blank line is split on whitespace: the first field is the word
    and the remaining fields are parsed as floats.  Blank lines are skipped.
    The file is decoded as UTF-8 (assumes the usual GloVe distribution
    format -- confirm for other embedding files).

    Prints a progress message before and after loading.

    Raises ValueError if a field after the word is not a valid float.
    """
    print("Loading Glove Model")
    vectors = {}
    # 'with' guarantees the handle is closed even if a line fails to parse.
    with io.open(gloveFile, 'r', encoding='utf-8') as f:
        for line in f:
            splitLine = line.split()
            if not splitLine:
                continue  # tolerate blank lines (e.g. a trailing newline)
            word = splitLine[0]
            vectors[word] = np.array([float(val) for val in splitLine[1:]])
    # Single pre-formatted string so the output is the same text under both
    # the Python 2 print statement and the Python 3 print function.
    print("Done. %d  words loaded!" % len(vectors))
    return vectors


def distance(vec1, vec2):
    """Return the Euclidean distance between two equal-length vectors.

    Accepts numpy arrays or plain sequences of numbers.

    Raises ValueError if the two vectors do not have the same shape, rather
    than silently comparing only a common prefix.
    """
    a = np.asarray(vec1, dtype=float)
    b = np.asarray(vec2, dtype=float)
    if a.shape != b.shape:
        raise ValueError('cannot compare vectors of shape %s and %s'
                         % (a.shape, b.shape))
    diff = a - b
    return math.sqrt(float(np.dot(diff, diff)))


def _currentModel(vectors):
    """Return `vectors` if given, otherwise the module-level `model`."""
    if vectors is not None:
        return vectors
    if model is None:
        raise RuntimeError('no word vectors loaded: pass vectors=... or '
                           'assign the result of loadGloveModel() to model')
    return model


def mostSimilar(query, vectors=None, limit=200000):
    """Print the word whose vector is closest to that of `query`.

    query   -- word to look up; must be a key of the table (else KeyError).
    vectors -- dict of word -> vector; defaults to the module-level `model`.
    limit   -- only the first `limit` words of the table are scanned, to
               keep the search time down; pass None to scan every word.

    The query word itself is never reported as its own nearest neighbour.
    On ties the first word encountered wins.

    Raises ValueError if there is no candidate word other than `query`.
    """
    table = _currentModel(vectors)
    target = table[query]
    scored = [(distance(table[word], target), word)
              for word in itertools.islice(table, limit)
              if word != query]
    if not scored:
        raise ValueError('no candidate words to compare with %r' % (query,))
    minval, best = min(scored, key=lambda pair: pair[0])
    print('for %s: min distance of %.2f found for %s' % (query, minval, best))


def printDistance(word1, word2, vectors=None):
    """Print the Euclidean distance between the vectors of two words.

    vectors -- dict of word -> vector; defaults to the module-level `model`.

    Raises KeyError if either word is missing from the table.
    """
    table = _currentModel(vectors)
    print('distance from %s to %s: %.2f'
          % (word1, word2, distance(table[word1], table[word2])))


#---------------------------------
if __name__ == '__main__':
    model = loadGloveModel('glove.6B.50d.txt')   # 17 seconds on my desktop

    # print(model['texas-el'])
    # print(model['paso'])
    # print(model['pasos'])

    printDistance('rabbit', 'food')
    printDistance('bunny', 'food')

    printDistance('rabbit', 'texas-el')
    printDistance('juarez', 'texas-el')
    printDistance('albuquerque', 'texas-el')
    printDistance('dallas', 'texas-el')
    printDistance('paso', 'texas-el')
    printDistance('chicago', 'texas-el')

    mostSimilar('texas')
    mostSimilar('texas-el')
    mostSimilar('paso')
    mostSimilar('dallas')
    mostSimilar('juarez')
    mostSimilar('computer')