# Design Example
# Exercise 1: Text Analysis


def read_words(filename):
    """Return a dictionary mapping each word in a file to its frequency.

    The file contents are split on whitespace only, so case and
    punctuation are preserved: "Fox," and "fox" count as different words.

    filename -- path of the text file to read.

    Returns a dict of {word: count}. Any error raised by open() or read()
    propagates to the caller.
    """
    # The with-statement closes the file even if read() raises.
    with open(filename) as wordfile:
        word_list = wordfile.read().split()

    wordcounts_dict = {}
    for word in word_list:
        # See the lecture slides for more on setdefault: it returns the
        # current count, inserting 0 first when the word is new.
        count = wordcounts_dict.setdefault(word, 0)
        wordcounts_dict[word] = count + 1
    return wordcounts_dict


def word_count(wordcounts_dict, word):
    """Return the count of the given word in a word -> count dictionary.

    Returns 0 when the word is not present in the dictionary.
    """
    # dict.get works on Python 2 and 3; dict.has_key exists only on Python 2.
    return wordcounts_dict.get(word, 0)


def topk(wordcounts_dict, k=10):
    """Return the k most frequent words as a list of (count, word) tuples.

    The list is sorted from most to least frequent. If there are fewer
    than k unique words in wordcounts_dict, all of them are returned.
    Callers should treat the relative order of words that share a count
    as unspecified (this implementation breaks ties by descending word
    order, because whole tuples are compared).

    Returns at most k entries; a k of zero or less yields an empty list.
    If the k+1th word has the same count as the kth word, which one is
    returned is arbitrary from the caller's point of view.
    """
    # Could also do sorting using itemgetter and multiple sorts. Here a
    # list comprehension builds (count, word) tuples so that a plain
    # descending sort orders by count first.
    counts_with_words = sorted(
        [(c, w) for (w, c) in wordcounts_dict.items()], reverse=True
    )
    # Clamp k: a negative k would otherwise slice off the tail instead of
    # returning nothing.
    return counts_with_words[:max(k, 0)]


def total_words(wordcounts_dict):
    """Return the total number of words used to create the dictionary.

    This is the sum of all counts, not the number of unique words.
    """
    return sum(wordcounts_dict.values())


def main():
    """Run the text-analysis functions on sample_text.txt and print results."""
    wordcounts = read_words("sample_text.txt")
    # Single-argument print(...) with %-formatting emits the same text on
    # Python 2 and Python 3, without needing a __future__ import.
    print("wordcounts : %s" % (wordcounts,))
    k = 1
    print("top %s  : %s" % (k, topk(wordcounts, k)))
    print("top 10 : %s" % (topk(wordcounts),))
    print("total words in file: %s" % (total_words(wordcounts),))
    print("count of fox: %s" % (word_count(wordcounts, "fox"),))
    print("count of zebra: %s" % (word_count(wordcounts, "zebra"),))


if __name__ == '__main__':
    main()