// CSE 143, Winter 2010, Marty Stepp
//
// This program counts the number of words in a large file.
// It demonstrates the use of a Set collection.

import java.io.*;
import java.util.*;

// Timings recorded for different collection choices:
// list    -> 22021 ms
// hashset ->   882 ms
// treeset ->  1051 ms

/**
 * Reads every whitespace-separated token of {@code mobydick.txt} into a
 * {@link Set}, reports how many distinct tokens were seen and how long the
 * read took, then removes the short tokens (length 3 or less) through an
 * {@link Iterator} and reports the size again.
 */
public class SetWordCount {
    /**
     * Runs the word-count demonstration.
     *
     * @param args command-line arguments (unused)
     * @throws FileNotFoundException if {@code mobydick.txt} cannot be opened
     */
    public static void main(String[] args) throws FileNotFoundException {
        // store a set of each word seen so far (faster than a list)
        // List<String> words = new ArrayList<String>();
        Set<String> words = new TreeSet<String>();

        long start = System.currentTimeMillis();

        System.out.println("Reading file...");
        // try-with-resources so the underlying file is closed even if reading fails
        try (Scanner input = new Scanner(new File("mobydick.txt"))) {
            while (input.hasNext()) {
                String word = input.next();

                // if you wanted to remove punctuation and ignore case, you could say:
                // word = word.toLowerCase().replaceAll("[^a-z]", "");

                // don't need to call contains because set already ignores duplicates
                // if (!words.contains(word)) { ...
                words.add(word);
            }
        }

        long end = System.currentTimeMillis();
        long elapsed = end - start;
        System.out.println("The file has " + words.size() + " words.");
        System.out.println("Took " + elapsed + " ms.");

        // sets don't have indexes, so you can't use a "for int i" loop
        // for (int i = 0; i < words.size(); i++) {

        // instead, use a "for-each" loop
        // for (String word : words) {
        //     System.out.println(word);
        // }

        // use an iterator to loop over the elements and filter out some of them
        Iterator<String> itr = words.iterator();
        while (itr.hasNext()) {
            String word = itr.next();
            if (word.length() <= 3) {
                // have to call remove on the iterator, not on the set
                // words.remove(word);
                itr.remove();
            }
        }

        System.out.println("The file has " + words.size() + " words.");
    }
}