# Hunter Schafer (hschafer)
# This program finds the Wikipedia page most similar to the page for
# a particular name
from collections import Counter

import pandas as pd


def count_words(text):
    """
    Takes the text of a document and converts it to a dictionary
    of word counts. Words are the whitespace-separated tokens of the
    text; no case folding or punctuation stripping is done.
    """
    # Counter does the tallying; convert back so callers get a plain dict
    return dict(Counter(text.split()))


def bag_of_words(data):
    """
    Takes a dataframe with name and text columns and converts each text
    into a word-count dictionary.

    Returns a dictionary with names as keys and values are the word-count
    dictionary for that person's text. If a name appears more than once,
    the last row with that name wins.
    """
    # Walk the two columns directly rather than looking rows up by index
    # label: a label lookup returns a whole DataFrame (not a row) when
    # labels repeat, and is much slower per row.
    return {
        name: count_words(text)
        for name, text in zip(data['name'], data['text'])
    }


def euclidian_distance(d1, d2):
    """
    Takes two bag-of-word dictionaries and computes the Euclidian
    distance between them. The dictionaries are sparse, so any missing
    words have count 0.
    """
    # We need to loop over words in both vectors
    words = d1.keys() | d2.keys()
    # If the word is missing in either vector, its count is 0
    total = sum((d1.get(word, 0) - d2.get(word, 0)) ** 2 for word in words)
    return total ** 0.5


def magnitude(d):
    """
    Returns the magnitude of the given bag-of-words vector. The magnitude
    is defined as the square root of the sum of the components squared.
    """
    return sum(count ** 2 for count in d.values()) ** 0.5


def cosine_distance(d1, d2):
    """
    Computes the cosine distance between the two bag-of-word vectors,
    which is related to the angle between them. Vectors that have a
    small angle between them will have a small distance, while vectors
    with a large angle will have a large distance.

    The angle is undefined when either vector is empty (magnitude 0);
    in that case 1.0 is returned, the same distance as two vectors that
    share no words.
    """
    norm = magnitude(d1) * magnitude(d2)
    if norm == 0:
        # Avoid dividing by zero for documents with no words
        return 1.0

    # Only words present in both vectors contribute to the dot product,
    # so scan the smaller vector and look each word up in the larger one
    if len(d1) > len(d2):
        d1, d2 = d2, d1
    dot = sum(count * d2[word] for word, count in d1.items() if word in d2)
    return 1 - dot / norm


def nearest_neighbor(name, bow, metric=cosine_distance):
    """
    Takes a name and a bag of words (keys are name, values are
    word-count dictionaries) and returns the nearest neighbor of the
    given name in the dataset as a (name, distance) tuple. The name
    itself is not considered in the nearest neighbor computation.

    metric is the function used to compare two word-count dictionaries,
    e.g. cosine_distance (the default) or euclidian_distance.

    Returns (None, None) if the dataset has no other name. Raises a
    KeyError if the given name is not in the dataset. Ties go to the
    name that comes first in the dataset.
    """
    target = bow[name]
    closest_person = None
    closest_distance = None
    for person, counts in bow.items():
        if person == name:
            # Skip the query itself before doing any distance work
            continue
        distance = metric(target, counts)
        if closest_person is None or distance < closest_distance:
            closest_person = person
            closest_distance = distance
    return closest_person, closest_distance


def main():
    """
    Loads people_wiki.csv and prints the nearest neighbor for a couple
    of example names.
    """
    df = pd.read_csv('people_wiki.csv')
    bow = bag_of_words(df)
    print(nearest_neighbor('Barack Obama', bow))
    print(nearest_neighbor('Joe Biden', bow))


if __name__ == "__main__":
    main()