/* * FeatureFinder.java * * Andy Menz * Reads all Ling-Spam messages and creates the .arff file used to build a classifier. * Algorithm inspired by: * Feature Selection: http://bluehawk.monmouth.edu/~drucker/SVM_spam_article_compete.PDF * Feature Selection: http://arxiv.org/ftp/cs/papers/0009/0009009.pdf * Word Stemming: http://telemat.det.unifi.it/book/2001/wchange/download/stem_porter.html */ /********************************************************************** * Note1: Stemming is done via Porter's algm (see code for details) * Note2: TF and TF-IDF feature vectors produce discretized data sets ***********************************************************************/ import java.util.*; import java.io.*; import java.math.*; import weka.core.*; import weka.filters.*; //arguments - FeatureFinder // is the type of feature vector to use // -b = (boolean) feature represented by 1/0 (either a word is in a message or it isn't) (default) // -t = (TF) feature vector counts how many times a word is in a message // -td = (TF, Discretized) feature vector counts how many times a word is in a message, output data is two-class descritized // -T = (TF-IDF) feature vector uses TF * IDF (inverse document frequency) // -Td = (TF-IDF, Discretized) feature vector uses TF * IDF (inverse document frequency), output data is two-class descritized // determines whether words are stemmed or not // -s = features (words) are stemmed // -ns = features (words) are NOT stemmed (default) // determines whether stop terms are used or not // -stop = use stop terms // -nstop = dtop terms NOT used // is the number of features to use when creating the .arff file // -# = number of features to use (-500 default) // is the name used for the relation and the .arff file // e.g test123 // note: string must contain no spaces class FeatureFinder{ static final boolean DEBUG = false; static final int BOOL = 0; //feature represented by 1/0 (either a word is in a message or it isn't) static final int TF = 1; //feature vector counts how many times a word is in a message static final int TF_IDF = 2; //feature vector uses TF * IDF (inverse document frequency) static final String directoryName = "Ling-Spam/lingspam_public/bare/"; static final Integer ZERO = new Integer(0); static final Integer ONE = new Integer(1); //used as second index in m_vFeatureCount static final int LEGIT = 0; static final int SPAM = 1; //the number of subdirectories where the messages are stored (should be 10) static final int NUM_SUBDIR = 10; static final String sTempOuputFile = "TempOutputFile"; //default parameters static int m_FeatureVectorType = BOOL; //use boolean feature vectors static boolean m_Stem = false; //determines whether words are stemmed or not, default = no stemming static boolean m_StopTerms = false; //do not use stop terms static int m_iMaxNumFeatures = 500; //the number of features to use in the final feature vector static String m_sOutputFileName = "LingSpam"; //the name used for the output file and relation name static boolean m_bDiscretizeData = false; //whether to discretize the data into 2-value attributes static Diagnostic diagnostic = new Diagnostic(); //create a HashTable to store all words found in the messages //key = feature string (word) //value = feature index (int) static Hashtable m_hFeatures = new Hashtable(); //holds the total number of msgs that containe each feature for each class index //index1 = feature index of m_hFeatures (holds vector) //index2 = (inner vector) class index static Vector m_vFeatureCount = new Vector(); //the MI of all features (index = feature index) static Vector m_vMI = new Vector(); //strings of m_iMaxNumFeatures features with highest MI //index = new feature index static Vector m_vFeatureStrings = new Vector(); //the collection of all feature vectors (one for each msg) //note that the structure of the feature vector is determined by the command line parameter static boolean[][] m_bFeatureVectors; //for boolean feature vector static int[][] m_iTFFeatureVectors; //for TF feature vectors static double[][] m_dTFIDFFeatureVectors; //for TF-IDF feature vectors //counters - all 0 based static int m_iFeatureCount = -1; //counts the total number of features found static int m_iFileCount = -1; //counts the number of files read static int m_iSpamCount = -1; //counts the total number of spam files read static int m_iLegitCount = -1; //counts the total number of legit files read static int m_iNumWordsStemmed = -1; //counts the total number of words that were stemmed; //the main method public static void main(String args[]) { //analyze the arguments and set class variables if(!analyzeArguments(args)) return; //go through all the messages and determine all features //controls m_hFeatures and m_vFeatureCount and several counter variables if(!collectInitialFeatures()) return; //calculate the mutual information for all features calculateAllFeatureMI(); //error checking if(m_iMaxNumFeatures > m_iFeatureCount) { System.out.println(""); System.out.println("Error: The number of features to use must be less then the total number of features."); return; } //get a vector of the top m_iMaxNumFeatures features (sorted greatest to least by feature MI) //each index is a vector of 1. feature index 2. feature's MI Vector vBestFeatures = sortFeaturesByMI(); //debug - print the top features //printTopFeatures(vBestFeatures); //write diagnostic data to a file //writeDiagnosticData(vBestFeatures); saveTopFeatures(vBestFeatures); //create a new hashtable of the best features, and create a vector holding all features as strings (index = new feature index) //(i.e. rebuild m_hFeatures and create m_vFeatureStrings buildNewFeatureStructures(vBestFeatures); //go through all the messages and create a feature vector for each message using the best m_iMaxNumFeatures features (by highgest MI) //determine the type of feature vectors to be created if(m_FeatureVectorType == BOOL) { //create boolean feature vectors createBOOLFeatureVectorForAllMsgs(); } else if((m_FeatureVectorType == TF) || (m_FeatureVectorType == TF_IDF)) { //create term frequency feature vectors createTFFeatureVectorForAllMsgs(); } //use the feature vectors to create a .arff file for use in building a classfier buildARFF(); //write the diagnostic data to a file diagnostic.writeToFile(); System.out.println(""); } //end main //returns the stem of the given word (e.g. "building" becomes "build") //Algm implemented (in part) according to "An algorithm for suffix stripping" by M.F.Porter //http://telemat.det.unifi.it/book/2001/wchange/download/stem_porter.html private static String stem(String sWord) { //don't stem if the word is too short if(sWord.length() <= 4) return sWord; m_iNumWordsStemmed++; sWord = sWord.toLowerCase(); // SSES -> SS // IES -> I // SS -> SS // S -> if(sWord.endsWith("sses")) return (sWord.substring(0,sWord.length() -4) + "ss"); if(sWord.endsWith("ies")) return (sWord.substring(0,sWord.length() -3) + "i"); if(sWord.endsWith("ss")) return sWord; if(sWord.endsWith("s")) return (sWord.substring(0,sWord.length() -1)); //Step 1a //(m>0) EED -> EE feed -> feed //(*v*) ED -> plastered -> plaster //(*v*) ING -> motoring -> motor //determine m //m is the measure of the word, and determined by [C](VC)^m [V], where C = consanat, and V = vowel //get a char[] representation of the word char[] cWord = sWord.toCharArray(); //constuct a binary representation of the word where 0=C, 1=V int iWordLength = sWord.length(); boolean[] bWord = new boolean[iWordLength]; for(int i = 0; i < iWordLength; i++) { char cLetter = cWord[i]; //determine if this is a vowel or consonant if(cLetter == 'a' || cLetter == 'e' || cLetter == 'i' || cLetter == 'o' || cLetter == 'u') { //letter is a vowel bWord[i] = true; } // if the char is 'y', determine if it's a vowel or consonant else if(cLetter == 'y') { //y is only a vowel if preceeded by a consonant if((i == 0) || bWord[i-1] ) { //this 'y' is a consonant bWord[i] = false; } else { //this y is a vowel bWord[i] = true; } } else { //letter is a consonant bWord[i] = false; } } //find the measure of the word by looking for the m in [C](VC)^m [V] //false=0=C, true=1=V int iMeasure = 0; for(int i = 1; i < iWordLength; i++) { //if this letter's a consonant and the previous letter is a vowel, increment iMeasure if(!bWord[i] && bWord[i-1]) { iMeasure++; } } //Step 1b //(m>0) EED -> EE feed -> feed //(*v*) ED -> plastered -> plaster //(*v*) ING -> motoring -> motor //If the second or third of the rules in Step 1b is successful, the following is done: //AT -> ATE conflat(ed) -> conflate //BL -> BLE troubl(ed) -> trouble //IZ -> IZE siz(ed) -> size //(*d and not (*L or *S or *Z)) -> single letter // // hopp(ing) -> hop // tann(ed) -> tan // fall(ing) -> fall // hiss(ing) -> hiss // fizz(ed) -> fizz //(m=1 and *o) -> E fail(ing) -> fail // fil(ing) -> file //NOTE: *v* = the stem contains a vowel // *d - the stem ends with a double consonant (e.g. -TT, -SS) // *S - the stem ends with S (and similarly for the other letters). // *o - the stem ends cvc, where the second c is not W, X or Y (e.g. -WIL, -HOP). if (iMeasure > 0) { if(sWord.endsWith("eed")) return (sWord.substring(0,sWord.length() -3) + "ee"); } //if any char (other than the last 2) is a vowel... (*v*) for(int iCharIndex = 0; iCharIndex <= iWordLength - 3; iCharIndex++) { //if this is a vowel if(bWord[iCharIndex]) { if(sWord.endsWith("ed")) { //strip the "ed" String sTemp = sWord.substring(0,iWordLength -2); if(sTemp.endsWith("at") || sTemp.endsWith("bl") || sTemp.endsWith("iz")) { return (sTemp + "e"); } //(*d and not (*L or *S or *Z)) -> single letter //determine if the last two letter are double consanants (but not ll, ss, or zz) if((iWordLength - 4) >= 0) { if(!bWord[iWordLength -3] && !bWord[iWordLength -4]) { if(!sTemp.endsWith("ll") && !sTemp.endsWith("ss") && !sTemp.endsWith("zz")) { return sTemp.substring(0, sTemp.length() - 1); } } } //(m=1 and *o) -> E if(((iWordLength - 5) >= 0) && (iMeasure == 1)) { if(!bWord[iWordLength -3] && bWord[iWordLength -4] && !bWord[iWordLength -5]) { return (sTemp + "e"); } } } } } //if any char (other than the last 2) is a vowel... (*v*) for(int iCharIndex = 0; iCharIndex <= iWordLength - 4; iCharIndex++) { //if this is a vowel if(bWord[iCharIndex]) { if(sWord.endsWith("ing")) { //strip the "ing" String sTemp = sWord.substring(0,iWordLength -3); if(sTemp.endsWith("at") || sTemp.endsWith("bl") || sTemp.endsWith("iz")) { return (sTemp + "e"); } //(*d and not (*L or *S or *Z)) -> single letter //determine if the last two letter are double consanants (but not ll, ss, or zz) if((iWordLength - 5) >= 0) { if(!bWord[iWordLength -4] && !bWord[iWordLength -5]) { if(!sTemp.endsWith("ll") && !sTemp.endsWith("ss") && !sTemp.endsWith("zz")) { return sTemp.substring(0, sTemp.length() - 1); } } } //(m=1 and *o) -> E if(((iWordLength - 6) >= 0) && (iMeasure == 1)) { if(!bWord[iWordLength -4] && bWord[iWordLength -5] && !bWord[iWordLength -6]) { return (sTemp + "e"); } } } } } //Step 1c //(*v*) Y -> I happy -> happi // sky -> sky for(int iCharIndex = 0; iCharIndex <= iWordLength - 3; iCharIndex++) { //if this is a vowel if(bWord[iCharIndex]) { if(sWord.endsWith("y")) { sWord = sWord.substring(0, iWordLength -1) + "i"; } } } //Step 2 //(m>0) ATIONAL -> ATE relational -> relate //(m>0) TIONAL -> TION conditional -> condition // rational -> rational //(m>0) ENCI -> ENCE valenci -> valence //(m>0) ANCI -> ANCE hesitanci -> hesitance //(m>0) IZER -> IZE digitizer -> digitize //(m>0) ABLI -> ABLE conformabli -> conformable //(m>0) ALLI -> AL radicalli -> radical //(m>0) ENTLI -> ENT differentli -> different //(m>0) ELI -> E vileli - > vile //(m>0) OUSLI -> OUS analogousli -> analogous //(m>0) IZATION -> IZE vietnamization -> vietnamize //(m>0) ATION -> ATE predication -> predicate //(m>0) ATOR -> ATE operator -> operate //(m>0) ALISM -> AL feudalism -> feudal //(m>0) IVENESS -> IVE decisiveness -> decisive //(m>0) FULNESS -> FUL hopefulness -> hopeful //(m>0) OUSNESS -> OUS callousness -> callous //(m>0) ALITI -> AL formaliti -> formal //(m>0) IVITI -> IVE sensitiviti -> sensitive //(m>0) BILITI -> BLE sensibiliti -> sensible if(iMeasure > 0) { if(sWord.endsWith("ational")) return (sWord.substring(0, iWordLength - 7) + "ate"); if(sWord.endsWith("tional")) return (sWord.substring(0, iWordLength - 6) + "tion"); if(sWord.endsWith("enci")) return (sWord.substring(0, iWordLength - 4) + "ence"); if(sWord.endsWith("anci")) return (sWord.substring(0, iWordLength - 4) + "ance"); if(sWord.endsWith("izer")) return sWord.substring(0, iWordLength - 1); if(sWord.endsWith("abli")) return (sWord.substring(0, iWordLength - 4) + "able"); if(sWord.endsWith("alli")) return sWord.substring(0, iWordLength - 2); if(sWord.endsWith("entli")) return sWord.substring(0, iWordLength - 2); if(sWord.endsWith("eli")) return sWord.substring(0, iWordLength - 2); if(sWord.endsWith("ousli")) return sWord.substring(0, iWordLength - 2); if(sWord.endsWith("ization")) return (sWord.substring(0, iWordLength - 5) + "e"); if(sWord.endsWith("ation")) return (sWord.substring(0, iWordLength - 3) + "e"); if(sWord.endsWith("ator")) return (sWord.substring(0, iWordLength - 2) + "e"); if(sWord.endsWith("alism")) return sWord.substring(0, iWordLength - 3); if(sWord.endsWith("iveness")) return sWord.substring(0, iWordLength - 4); if(sWord.endsWith("fulness")) return sWord.substring(0, iWordLength - 4); if(sWord.endsWith("ousness")) return sWord.substring(0, iWordLength - 4); if(sWord.endsWith("aliti")) return sWord.substring(0, iWordLength - 3); if(sWord.endsWith("iviti")) return (sWord.substring(0, iWordLength - 3) + "e"); if(sWord.endsWith("biliti")) return (sWord.substring(0, iWordLength - 5) + "e"); } //Step 3 //(m>0) ICATE -> IC triplicate -> triplic //(m>0) ATIVE -> formative -> form //(m>0) ALIZE -> AL formalize -> formal //(m>0) ICITI -> IC electriciti -> electric //(m>0) ICAL -> IC electrical -> electric //(m>0) FUL -> hopeful -> hope //(m>0) NESS -> goodness -> good if(iMeasure > 0) { if(sWord.endsWith("icate")) return sWord.substring(0, iWordLength - 3); if(sWord.endsWith("ative")) return sWord.substring(0, iWordLength - 5); if(sWord.endsWith("alize")) return sWord.substring(0, iWordLength - 3); if(sWord.endsWith("iciti")) return sWord.substring(0, iWordLength - 3); if(sWord.endsWith("ical")) return sWord.substring(0, iWordLength - 2); if(sWord.endsWith("ful")) return sWord.substring(0, iWordLength - 3); if(sWord.endsWith("ness")) return sWord.substring(0, iWordLength - 4); } //Step 4 //(m>1) AL -> revival -> reviv //(m>1) ANCE -> allowance -> allow //(m>1) ENCE -> inference -> infer //(m>1) ER -> airliner -> airlin //(m>1) IC -> gyroscopic -> gyroscop //(m>1) ABLE -> adjustable -> adjust //(m>1) IBLE -> defensible -> defens //(m>1) ANT -> irritant -> irrit //(m>1) EMENT -> replacement -> replac //(m>1) MENT -> adjustment -> adjust //(m>1) ENT -> dependent -> depend //(m>1 and (*S or *T)) ION -> adoption -> adopt //(m>1) OU -> homologou -> homolog //(m>1) ISM -> communism -> commun //(m>1) ATE -> activate -> activ //(m>1) ITI -> angulariti -> angular //(m>1) OUS -> homologous -> homolog //(m>1) IVE -> effective -> effect //(m>1) IZE -> bowdlerize -> bowdler if(iMeasure > 1) { if(sWord.endsWith("al")) return sWord.substring(0, iWordLength - 2); if(sWord.endsWith("ance")) return sWord.substring(0, iWordLength - 4); if(sWord.endsWith("ence")) return sWord.substring(0, iWordLength - 4); if(sWord.endsWith("er")) return sWord.substring(0, iWordLength - 2); if(sWord.endsWith("ic")) return sWord.substring(0, iWordLength - 2); if(sWord.endsWith("able")) return sWord.substring(0, iWordLength - 4); if(sWord.endsWith("ible")) return sWord.substring(0, iWordLength - 4); if(sWord.endsWith("ant")) return sWord.substring(0, iWordLength - 3); if(sWord.endsWith("ement")) return sWord.substring(0, iWordLength - 4); if(sWord.endsWith("ment")) return sWord.substring(0, iWordLength - 4); if(sWord.endsWith("ent")) return sWord.substring(0, iWordLength - 3); if(sWord.endsWith("sion")) return sWord.substring(0, iWordLength - 3); if(sWord.endsWith("tion")) return sWord.substring(0, iWordLength - 3); if(sWord.endsWith("ou")) return sWord.substring(0, iWordLength - 2); if(sWord.endsWith("ism")) return sWord.substring(0, iWordLength - 3); if(sWord.endsWith("ate")) return sWord.substring(0, iWordLength - 3); if(sWord.endsWith("iti")) return sWord.substring(0, iWordLength - 3); if(sWord.endsWith("ous")) return sWord.substring(0, iWordLength - 3); if(sWord.endsWith("ive")) return sWord.substring(0, iWordLength - 3); if(sWord.endsWith("ize")) return sWord.substring(0, iWordLength - 3); } //Step 5a //(m>1) E -> probate -> probat // rate -> rate //(m=1 and not *o) E -> cease -> ceas if((iMeasure > 1) && sWord.endsWith("e")) return sWord.substring(0, iWordLength -1); if((iMeasure == 1) && (sWord.length() >= 4)) { if(!bWord[iWordLength - 2] && bWord[iWordLength - 3] && !bWord[iWordLength - 4]) { if(sWord.endsWith("e")) return sWord.substring(0, sWord.length() -1); } } //Step 5b //(m > 1 and *d and *L) -> single letter // controll -> control // roll -> roll if((iMeasure > 1) && sWord.endsWith("ll")) { return sWord.substring(0, sWord.length() -1); } //default - if nothing else catches return sWord; } // end stem() //returns true if sWord is a stop term private static boolean stopTerm(String sWord) { if(sWord.equalsIgnoreCase("is") || sWord.equalsIgnoreCase("a")|| sWord.equalsIgnoreCase("of") || sWord.equalsIgnoreCase("the")|| sWord.equalsIgnoreCase("an") || sWord.equalsIgnoreCase("and")) { return true; } return false; } // computes the base 2 log of dNumber private static double logBase2(double dNumber) { double base = 2; return Math.log(dNumber)/Math.log(base); } //debug print statements private static void dprint(String sMsg) { if(DEBUG) { System.out.println(sMsg); } } //sets the class variables dependent on the program arguments private static boolean analyzeArguments(String[] args) { System.out.println(""); System.out.println("**************************************************"); if ((args.length == 0) || (args.length > 5)) { System.out.println("Invalid number of arguments."); System.out.println("java FeatureFinder [] [] [] []"); System.out.println(""); System.out.println("e.g. java FeatureFinder -b -ns -nstop -500 test123"); System.out.println(""); System.out.println("Note: the last 4 parameters are optional - but they must be called in order."); System.out.println(""); System.out.println(" is the type of feature vector to use"); System.out.println(" -b = (boolean) feature represented by 1/0 (either a word is in a message or it isn't) (default)"); System.out.println(" -t = (TF) feature vector counts how many times a word is in a message"); System.out.println(" -td = (TF, 2-Discretized) TF with two-value descritized output data"); System.out.println(" -T = (TF-IDF) feature vector uses TF * IDF (inverse document frequency)"); System.out.println(" -Td = (TF-IDF, 2-Discretized) TF-IDF with two-value descritized output data"); //System.out.println(" -b = feature represented by 1/0 (either a word is in a message or it isn't)"); //System.out.println(" -t = feature vector counts how many times a word is in a message"); //System.out.println(" -T = feature vector uses TF * IDF (inverse document frequency)"); System.out.println(""); System.out.println(" determines whether words are stemmed or not"); System.out.println(" -s = features (words) are stemmed"); System.out.println(" -ns = features (words) are NOT stemmed (default)"); System.out.println(""); System.out.println(" determines whether stop terms are used or not"); System.out.println(" -stop = use stop terms"); System.out.println(" -nstop = stop terms NOT used (default)"); System.out.println(""); System.out.println(" determines how many features are chosen for the feature vector"); System.out.println(" -# = where # is the number of features to use (-500 default)"); System.out.println(""); System.out.println(" is the name used for the relation and the .arff file"); System.out.println(" e.g test123 (default is LingSpam)"); System.out.println(" note: string must contain no spaces"); System.out.println("**************************************************"); return false; } //get the type of feature vector to use if (args.length > 0) { if (args[0].equals("-t")) { m_FeatureVectorType = TF; m_bDiscretizeData = false; System.out.println("Feature Vectors = TF"); System.out.println("2-Discretized = False"); } else if (args[0].equals("-td")) { m_FeatureVectorType = TF; m_bDiscretizeData = true; System.out.println("Feature Vectors = TF"); System.out.println("2-Discretized = True"); } else if (args[0].equals("-T")) { System.out.println("Feature Vectors = TF-IDF"); System.out.println("2-Discretized = False"); m_FeatureVectorType = TF_IDF; m_bDiscretizeData = false; } else if (args[0].equals("-Td")) { System.out.println("Feature Vectors = TF-IDF"); System.out.println("2-Discretized = True"); m_bDiscretizeData = true; m_FeatureVectorType = TF_IDF; } else if (args[0].equals("-b")) { m_bDiscretizeData = false; System.out.println("Feature Vectors = Boolean"); System.out.println("2-Discretized = False"); } else { System.out.println("Invalid argument: " + args[0]); return false; } } //determine if word stemming is used if (args.length > 1) { if (args[1].equals("-s")) { System.out.println("Word Stemming = On"); m_Stem = true; } else if(args[1].equals("-ns")) { System.out.println("Word Stemming = Off"); m_Stem = false; } else { System.out.println("Invalid argument: " + args[1]); return false; } } //determine if stop terms are used if (args.length > 2) { if (args[2].equals("-stop")) { System.out.println("Stop Terms = On"); m_StopTerms = true; } else if (args[2].equals("-nstop")) { System.out.println("Stop Terms = Off"); m_StopTerms = false; } else { System.out.println("Invalid argument: " + args[2]); return false; } } //get the number of features to use if (args.length > 3) { String sNumFeatures = args[3].substring(1); // take off the "-" Integer intNumFeatures; try { intNumFeatures = new Integer(sNumFeatures); } catch (Exception e) { System.out.println("Invalid argument: " + sNumFeatures + " is not a number."); return false; } int iNumFeatures = intNumFeatures.intValue(); if(iNumFeatures <= 0) { System.out.println("Invalid argument: " + sNumFeatures + " is not greater than zero."); return false; } m_iMaxNumFeatures = iNumFeatures; System.out.println("Num Features = " + m_iMaxNumFeatures); } //get the file and relation name if (args.length > 4) { m_sOutputFileName = args[4]; System.out.println("File Name = " + m_sOutputFileName); } if(args.length == 1) { System.out.println("Word Stemming = Off"); System.out.println("Stop Terms = Off"); System.out.println("Num Features = 500"); System.out.println("File Name = LingSpam"); } else if (args.length ==2) { System.out.println("Stop Terms = Off"); System.out.println("Num Features = 500"); System.out.println("File Name = LingSpam"); } else if (args.length ==3) { System.out.println("Num Features = 500"); System.out.println("File Name = LingSpam"); } else if (args.length ==4) { System.out.println("File Name = LingSpam"); } System.out.println("**************************************************"); return true; } // end analyzeArguments() //reads all the email messages and collects the features //controls m_hFeatures and m_vFeatureCount and several counter variables //returns false if an error is encountered, true o/w static boolean collectInitialFeatures() { //read all message files in path ..../pathX/ where 1 <= X <= 10 (10 subdirectories) //and create hash table for all features (words) found String sSubDir = "part"; //for each subdirectory for(int i = 1; i <= NUM_SUBDIR; i++) { Integer intSubDirCount = new Integer(i); String sDir = directoryName + sSubDir + intSubDirCount.toString() + "/"; //create the File object for the directory File fDir = new File(sDir); //determine all files in the current directory String[] sFiles = fDir.list(); //for each possible file in this subdirectory for(int iFileIndex = 0; iFileIndex <= 291; iFileIndex++) //KEEP { try { //will jump to Catch block if index DNE String sFileName = sFiles[iFileIndex]; //get the data from this file as a string String sFileData = Utilities.getFileContents(sDir + sFileName); //convert to lower case sFileData = sFileData.toLowerCase(); //increment the file counter m_iFileCount++; //keep track of the features seen in this message so you don't count a feature twice for the same msg //note that the key's values are not used Hashtable hCurrentMsgFeaturesSeen = new Hashtable(); //determine from the file name whether sFileName is spam int iClassIndex = determineSpam(sFileName); //increment the count of spam/legit msgs if(iClassIndex == SPAM) { m_iSpamCount++; } else { m_iLegitCount++; } StringTokenizer st = new StringTokenizer(sFileData); String sToken; while (st.hasMoreTokens()) { sToken = st.nextToken(); //if stop terms are to be applied and this token is a stop term, go to the next token //or, if the feature is "subject:", go to next token if(! ( (m_StopTerms && stopTerm(sToken)) || sToken.equalsIgnoreCase("subject:")) ) { //check if word stemming is to be applied if (m_Stem) { //find the stem of the word sToken = stem(sToken); } //try to find the feature index of this feature Integer intFeatureIndex = (Integer)(m_hFeatures.get(sToken)); //if the word is in the hashtable (i.e. has a feature index) if(intFeatureIndex != null) { //check if this feature has already been seen in this msg Integer intCurrentFeatureIndex = (Integer)(hCurrentMsgFeaturesSeen.get(sToken)); //if this feature has not yet been seen in this msg... if(intCurrentFeatureIndex == null) { //get this feature's index int iFeatureIndex = intFeatureIndex.intValue(); //add the feature to the hashtable keeping track of which features this msg has seen hCurrentMsgFeaturesSeen.put(sToken, intFeatureIndex); //get the current number of msgs that have this feature for this class (the class of the msg) int iMsgFeatureCount = ((Integer)((Vector)m_vFeatureCount.elementAt(iFeatureIndex)).elementAt(iClassIndex)).intValue(); //increment the count //iMsgFeatureCount++; Integer intIncrement = new Integer(iMsgFeatureCount + 1); //add the incremented count to m_vFeatureCount ((Vector)m_vFeatureCount.elementAt(iFeatureIndex)).setElementAt(intIncrement, iClassIndex); } else { //found a feature twice in the same msg - do nothing } } //else, the word is not in the hashtable (we found a new feature) else { //increment the total feature count by one m_iFeatureCount++; Integer intFeatureCount = new Integer(m_iFeatureCount); //add the new feature to the hashtable with m_iFeatureCount as its value (index) m_hFeatures.put(sToken, intFeatureCount); //add the new feature to the hashtable of features seen in the current msg hCurrentMsgFeaturesSeen.put(sToken, intFeatureCount); //add a new vector to the end of m_vFeatureCount for the new feature //and set this new feature's count to 1 for the appropriate class (SPAM/LEGIT) Vector vNewFeature = new Vector(); if(iClassIndex == LEGIT) { vNewFeature.addElement(ONE); vNewFeature.addElement(ZERO); } else //SPAM { vNewFeature.addElement(ZERO); vNewFeature.addElement(ONE); } m_vFeatureCount.addElement(vNewFeature); } } // end if(!(m_StopTerms && stopTerm(sToken))) } // end while(st.hasMoreTokens()) } //will get here if there aren't enough files in the string[] - sloppy, I know, but it works catch(Exception e) { dprint("Caught Exception: " + e); } } // end for(int iFileIndex = 0; iFileIndex <= 291; iFileIndex++) } // end for(int i = 1; i <= 10; i++) //we now have all information needed to compute the mutual infromation of each feature w/ each class //error checking if(m_vFeatureCount.size() != m_iFeatureCount + 1) { //there was a processing error System.out.println("ERROR: Feature Counts don't match up."); return false; } //signal that no errors occured return true; } //end collectInitialFeatures() //go through all the messages and create a BOOLEAN feature vector for each message using the best m_iMaxNumFeatures features (by highgest MI) //Note: may only be called after the top features have been selected according to MI //Note: feature vector is boolean - only says whether a particular message contains a feature or not (not how many times the feature appears) static void createBOOLFeatureVectorForAllMsgs() { //read all message files in path ..../pathX/ where 1 <= X <= 10 (10 subdirectories) //and create hash table for all features (words) found //set the size for the booelan feature vector //index 0 = msg index (file index) //index 1 = feature index m_bFeatureVectors = new boolean[m_iFileCount + 1][m_iMaxNumFeatures + 1]; String sSubDir = "part"; //used to keep track of which file we're on int iGlobalFileIndex = -1; //for each subdirectory for(int i = 1; i <= NUM_SUBDIR; i++) { Integer intSubDirCount = new Integer(i); String sDir = directoryName + sSubDir + intSubDirCount.toString() + "/"; //create the File object for the directory File fDir = new File(sDir); //determine all files in the current directory String[] sFiles = fDir.list(); //for each possible file in this subdirectory for(int iLocalFileIndex = 0; iLocalFileIndex <= 291; iLocalFileIndex++) { try { //will jump to Catch block if index DNE String sFileName = sFiles[iLocalFileIndex]; //create a feature vector (boolean array) for this msg (initialized to false) //boolean[] bFVector = new boolean[m_iMaxNumFeatures]; //increment the global file index iGlobalFileIndex++; //determine from the file name whether sFileName is spam int iClassIndex = determineSpam(sFileName); //insert the class label into this msg's feature vector if(iClassIndex == SPAM) { m_bFeatureVectors[iGlobalFileIndex][m_iMaxNumFeatures] = true; } else { m_bFeatureVectors[iGlobalFileIndex][m_iMaxNumFeatures] = false; } //get the data from this file as a string String sFileData = Utilities.getFileContents(sDir + sFileName); //convert to lower case sFileData = sFileData.toLowerCase(); StringTokenizer st = new StringTokenizer(sFileData); String sToken; while (st.hasMoreTokens()) { sToken = st.nextToken(); //if stop terms are to be applied and this token is a stop term, go to the next token //or, if the feature is "subject:", go to next token if(!((m_StopTerms && stopTerm(sToken)) || sToken.equalsIgnoreCase("subject:"))) { //check if word stemming is to be applied if (m_Stem) { //find the stem of the word sToken = stem(sToken); } //determine if this token is a feature in the feature vector Integer intFeatureIndex = ((Integer)m_hFeatures.get(sToken)); //if the word is in the hashtable (i.e. has a feature index) if(intFeatureIndex != null) { //indicate in this msg's feature vector that this msg contains this feature int iFeatureIndex = intFeatureIndex.intValue(); m_bFeatureVectors[iGlobalFileIndex][iFeatureIndex] = true; } } // end if(!... } // end while (st.... } //will get here if there's an array OOB on sFiles[] - sloppy, I know, but it works catch(Exception e) { dprint("Caught Exception: " + e); } } // end for(int iFileIndex = 0; iFileIndex <= 291; iFileIndex++) } // end for(int i = 1; i <= 10; i++) } // end createBOOLFeatureVectorForAllMsgs() //go through all the messages and create a TF feature vector for each message using the best m_iMaxNumFeatures features (by highgest MI) //Note: may only be called after the top features have been selected according to MI //Note: feature vector is TF or IDF - counts the number of times each feature appears in a message static void createTFFeatureVectorForAllMsgs() { //read all message files in path ..../pathX/ where 1 <= X <= 10 (10 subdirectories) //and create hash table for all features (words) found //set the size for the TF feature vector //index 0 = msg index (file index) //index 1 = feature index m_iTFFeatureVectors = new int [m_iFileCount + 1][m_iMaxNumFeatures + 1]; String sSubDir = "part"; //used to keep track of which file we're on int iGlobalFileIndex = -1; //for each subdirectory for(int i = 1; i <= NUM_SUBDIR; i++) { Integer intSubDirCount = new Integer(i); String sDir = directoryName + sSubDir + intSubDirCount.toString() + "/"; //create the File object for the directory File fDir = new File(sDir); //determine all files in the current directory String[] sFiles = fDir.list(); //for each possible file in this subdirectory for(int iLocalFileIndex = 0; iLocalFileIndex <= 291; iLocalFileIndex++) { try { //will jump to Catch block if index DNE String sFileName = sFiles[iLocalFileIndex]; //create a feature vector (boolean array) for this msg (initialized to false) //boolean[] bFVector = new boolean[m_iMaxNumFeatures]; //increment the global file index iGlobalFileIndex++; //determine from the file name whether sFileName is spam int iClassIndex = determineSpam(sFileName); //insert the class label into this msg's feature vector if(iClassIndex == SPAM) { m_iTFFeatureVectors[iGlobalFileIndex][m_iMaxNumFeatures] = SPAM; } else { m_iTFFeatureVectors[iGlobalFileIndex][m_iMaxNumFeatures] = LEGIT; } //get the data from this file as a string String sFileData = Utilities.getFileContents(sDir + sFileName); //convert to lower case sFileData = sFileData.toLowerCase(); StringTokenizer st = new StringTokenizer(sFileData); String sToken; while (st.hasMoreTokens()) { sToken = st.nextToken(); //if stop terms are to be applied and this token is a stop term, go to the next token //or, if the feature is "subject:", go to next token if(!((m_StopTerms && stopTerm(sToken)) || sToken.equalsIgnoreCase("subject:"))) { //check if word stemming is to be applied if (m_Stem) { //find the stem of the word sToken = stem(sToken); } //determine if this token is a feature in the feature vector Integer intFeatureIndex = ((Integer)m_hFeatures.get(sToken)); //if the word is in the hashtable (i.e. has a feature index) if(intFeatureIndex != null) { //indicate in this msg's feature vector that this msg contains this feature int iFeatureIndex = intFeatureIndex.intValue(); m_iTFFeatureVectors[iGlobalFileIndex][iFeatureIndex] ++; } } // end if(!... } // end while (st.... } //will get here if there's an array OOB on sFiles[] - sloppy, I know, but it works catch(Exception e) { dprint("Caught Exception: " + e); } } // end for(int iFileIndex = 0; iFileIndex <= 291; iFileIndex++) } // end for(int i = 1; i <= 10; i++) //if using inverse document frequency (IDF), then compute this for each feature vector if(m_FeatureVectorType == TF_IDF) { //Note: IDF(f) = log( |D|/DF(f) ) // where |D| is the number of documents // and DF(f) is the numeber of times feature f appears in all documents //TF-IDF = TF(f) * IDF(f), where TF(f) is the term frequency of f //initialize the TF-IDF feature vectors m_dTFIDFFeatureVectors = new double[m_iFileCount + 1][m_iMaxNumFeatures + 1]; //copy the each msgs's class value to the TFIDF feature vector for(int iFileIndex = 0; iFileIndex <= m_iFileCount; iFileIndex++) { m_dTFIDFFeatureVectors[iFileIndex][m_iMaxNumFeatures] = m_iTFFeatureVectors[iFileIndex][m_iMaxNumFeatures]; } //holds the document frequency for each feature - the number of time each feature appears in all documents double[] dDF = new double[m_iMaxNumFeatures]; //determine the DF for each feature and save it (excluding the class data) for(int iFeatureIndex = 0; iFeatureIndex < m_iMaxNumFeatures; iFeatureIndex++) { for(int iFileIndex = 0; iFileIndex <= m_iFileCount; iFileIndex++) { dDF[iFeatureIndex] += m_iTFFeatureVectors[iFileIndex][iFeatureIndex]; } } //determine the TF-IDF for each feature in each message (excluding class attribute) double dTotalDocumets = m_iFileCount + 1; for(int iFeatureIndex = 0; iFeatureIndex < m_iMaxNumFeatures; iFeatureIndex++) { for(int iFileIndex = 0; iFileIndex <= m_iFileCount; iFileIndex++) { double dIDF = logBase2(dTotalDocumets/dDF[iFeatureIndex]); double dTF = m_iTFFeatureVectors[iFileIndex][iFeatureIndex]; double dTFIDF = dTF * dIDF; m_dTFIDFFeatureVectors[iFileIndex][iFeatureIndex] = dTFIDF; } } //delete the integer feature vector (to clear up memory) m_iTFFeatureVectors = new int[1][1]; } // end if(m_FeatureVectorType == TF_IDF) } // end createTFFeatureVectorForAllMsgs() //calculates the mutual information for all features static void calculateAllFeatureMI() { //holds the MI for each feature, index = feature index m_vMI =new Vector(); /* Below are the probabilities used to calculate the mutual information P_S = P(C=SPAM) P_L = P(C=LEGIT) P_1S = P(X=1, C=SPAM) - X=1 implies the feature is present P_1L = P(X=1, C=LEGIT) P_0S = P(X=0, C=SPAM) - X=0 implies the feature is NOT present P_0L = P(X=0, C=LEGIT) P_1 = P(X=1) - X=1 implies the feature is present P_0 = P(X=0) - X=0 implies the feature is NOT present */ //determine P(C=SPAM), and P(C=LEGIT) - the frequency of msgs that were SPAM and LEGIT, respectively double dTotalSpam = (double)(m_iSpamCount + 1); double dTotalLegit = (double)(m_iLegitCount + 1); double dTotalMsgs = (double)(m_iFileCount + 1); // since m_iFileCoutn is 0 based double P_S = dTotalSpam/dTotalMsgs; double P_L = dTotalLegit/dTotalMsgs; //go through m_vFeatureCount and determine the MI for each feature double dTotalFeatures = (double)(m_iFeatureCount + 1); // since m_iFeatureCoutn is 0 based double dOne = 1; double dZero = 0; for(int iFeatureIndex = 0; iFeatureIndex < m_vFeatureCount.size(); iFeatureIndex++) { double dSpamWithFeature = ((Integer)((Vector)m_vFeatureCount.elementAt(iFeatureIndex)).elementAt(SPAM)).doubleValue(); double dLegitWithFeature = ((Integer)((Vector)m_vFeatureCount.elementAt(iFeatureIndex)).elementAt(LEGIT)).doubleValue(); double P_1 = (dSpamWithFeature + dLegitWithFeature)/dTotalMsgs; double P_0 = dOne - P_1; double P_1S = dSpamWithFeature/dTotalMsgs; double P_1L = dLegitWithFeature/dTotalMsgs; double P_0S = (dTotalSpam - dSpamWithFeature)/dTotalMsgs; double P_0L = (dTotalLegit - dLegitWithFeature)/dTotalMsgs; //error check if((P_1 > 1) || (P_0 > 1) || (P_1S > 1) || (P_1L > 1) || (P_0S > 1) || (P_0L > 1) || (P_S > 1) || (P_L > 1)) { dprint("ERROR: probability calculations yielded value > 1."); } //Note: P_0, P_0S, P_0L, P_1S, P_1L could legitimately be zero double dSum1 = 0; double dSum2 = 0; double dSum3 = 0; double dSum4 = 0; //find P_0S * log( P_0S/(P_0 * P_S) ) if((P_0 != dZero) && (P_0S != dZero)) { dSum1 = P_0S * logBase2(P_0S/(P_0 * P_S)); } //find P_1S * log( P_1S/(P_1 * P_S) ) if(P_1S != dZero) { dSum2 = P_1S * logBase2( P_1S/(P_1 * P_S)); } //find P_0L * log( P_0L/(P_0 * P_L) ) if((P_0 != dZero) && (P_0L != dZero)) { dSum3 = P_0L * logBase2( P_0L/(P_0 * P_L)); } //find P_1L * log( P_1L/(P_1 * P_L) ) if(P_1L != dZero) { dSum4 = P_1L * logBase2( P_1L/(P_1 * P_L)); } double dMI = dSum1 + dSum2 + dSum3 + dSum4; Double dubMI = new Double(dMI); //error check if(dubMI.isNaN()) { dprint(""); dprint("Found a NaN MI"); dprint("Sums on FeatureIndex = " + iFeatureIndex); dprint("P_1 = " + P_1); dprint("P_0 = " + P_0); dprint("P_1S = " + P_1S); dprint("P_1L = " + P_1L); dprint("P_0S = " + P_0S); dprint("P_0L = " + P_0L); dprint("" + dSum1); dprint("" + dSum2); dprint("" + dSum3); dprint("" + dSum4); dprint("MI = " + dubMI); } //add the MI to m_vMI m_vMI.addElement(dubMI); } // end for(int iFeatureIndex = 0; iFeatureIndex < m_vFeatureCount.size(); iFeatureIndex++) } // end calculateAllFeatureMI() //returns a vector with Integers denoting the feature indices of the features with the highest MI's static Vector sortFeaturesByMI() { //determine the X features with the largest MI's (where X = the user defined m_iMaxNumFeatures //these will be used in the feature vectors that will be used to build the classifiers. //contains vectors holding 1. feature index 2. MI for the feature with that index Vector vSortedFeatures = new Vector(); //create the vSortedFeatures vector for(int iFeatureIndex = 0; iFeatureIndex < m_vFeatureCount.size(); iFeatureIndex++) { Vector vInner = new Vector(); Integer intFeatureIndex = new Integer(iFeatureIndex); vInner.addElement(intFeatureIndex); // feature index vInner.addElement(m_vMI.elementAt(iFeatureIndex)); // feature's MI vSortedFeatures.addElement(vInner); } //sort vSortedFeatures by the features' MI's to find the features with the top m_iMaxNumFeatures MI's //goes from left to right, finds max, puts at front, then starts at next pos'n looking for max, repeat... for(int iSortIndex1 = 0; iSortIndex1 < m_iMaxNumFeatures; iSortIndex1++) { double dMaxMI = -1; // min value is 0 int iIndexOfMaxMI = -1; //find the max of the unsorted and insert it at pos'n iSortIndex1 for(int iSortIndex2 = iSortIndex1; iSortIndex2 < vSortedFeatures.size(); iSortIndex2++) { double dCurrentMI = ((Double)((Vector)vSortedFeatures.elementAt(iSortIndex2)).elementAt(1)).doubleValue(); if (dCurrentMI > dMaxMI) { dMaxMI = dCurrentMI; iIndexOfMaxMI = iSortIndex2; } } //get a copy of the min vector Vector vCurrentMin = ((Vector)((Vector)vSortedFeatures.elementAt(iIndexOfMaxMI)).clone()); //delete the vector's previous position vSortedFeatures.removeElementAt(iIndexOfMaxMI); //put the local max feature in the leftmost unsorted slot vSortedFeatures.insertElementAt(vCurrentMin, iSortIndex1); } //clear m_vMI - make memory available m_vMI = new Vector(); //make vBestFeatures an ordered vector of feature indices (the features with the highest MI's) Vector vBestFeatures = new Vector(); for(int iIndex = 0; iIndex < m_iMaxNumFeatures; iIndex++) { //add the feature's index vBestFeatures.addElement(((Vector)vSortedFeatures.elementAt(iIndex)).elementAt(0)); } //return the best X sorted features (X = m_iMaxNumFeatures) return vBestFeatures; } // end sortFeaturesByMI() //determines from the file name whether the given file is spam //returns SPAM (int 1) if the file is spam static private int determineSpam(String sFileName) { if(sFileName.startsWith("spm")) { return SPAM; } else { return LEGIT; } } //saves the features used for creating the .arff file (in order of importance) //if called, must be called before buildNewFeatureStructures() static void saveTopFeatures(Vector vBestFeatures) { //ordered vector of feature strings by MI Vector vTopFeatureStrings = new Vector(); //set vTopFeatureStrings to have size = m_iMaxNumFeatures for(int iIndex = 0; iIndex < m_iMaxNumFeatures; iIndex ++) { vTopFeatureStrings.addElement(""); } //create a vector of the best features' strings Enumeration enumFeatureStrings = m_hFeatures.keys(); while(enumFeatureStrings.hasMoreElements()) { String sFeature = ((String)enumFeatureStrings.nextElement()); //get the index of this feature Integer intFeatureIndex = ((Integer)m_hFeatures.get(sFeature)); //check if this feature is in the list of best features (check by index) for(int iIndex = 0; iIndex < vBestFeatures.size(); iIndex++) { if(intFeatureIndex.equals(((Integer)vBestFeatures.elementAt(iIndex)))) { vTopFeatureStrings.setElementAt(sFeature, iIndex); } } } //print each feature in order of MI dprint("Top " + m_iMaxNumFeatures + " features (ordered by MI)"); for(int iIndex = 0; iIndex < m_iMaxNumFeatures; iIndex++) { dprint(iIndex + " " + ((String)vTopFeatureStrings.elementAt(iIndex))); } //save the top features to the diagnostic diagnostic.vTopFeatures = vTopFeatureStrings; } // end saveTopFeatures //creates a new hashtable of the best features, and creates a vector holding all features as strings (index = new feature index) //(i.e. rebuild m_hFeatures and create m_vFeatureStrings) static void buildNewFeatureStructures(Vector vBestFeatures) { int iBestFeatureCount = -1; // used to determine new feature indices Hashtable hBestFeatures = new Hashtable(); //create a vector of the best features' strings Enumeration enumFeatureStrings = m_hFeatures.keys(); while(enumFeatureStrings.hasMoreElements()) { String sFeature = ((String)enumFeatureStrings.nextElement()); //get the index of this feature Integer intFeatureIndex = ((Integer)m_hFeatures.get(sFeature)); //check if this feature is in the list of best features (check by index) for(int iIndex = 0; iIndex < vBestFeatures.size(); iIndex++) { if(intFeatureIndex.equals(((Integer)vBestFeatures.elementAt(iIndex)))) { iBestFeatureCount++; //add this feature to the feature string (in pos'n iBestFeatureCount) m_vFeatureStrings.addElement(sFeature); Integer intNewFeatureIndex = new Integer(iBestFeatureCount); //add this feature to the new hash table (with key = iBestFeatureCount = new feature index) hBestFeatures.put(sFeature, intNewFeatureIndex); } } } //replace the old feature hash table with the new one m_hFeatures = hBestFeatures; } // end buildNewFeatureStructures() //returns a vector with all feature strings (keys in m_hFeatures) where index = feature index static Vector getFeatureStringVector(Vector vSortedFeatures) { //make a vector with all feature strings where index = feature index Enumeration enumFeatureStrings = m_hFeatures.keys(); Vector vFeatureStrings = new Vector(); while(enumFeatureStrings.hasMoreElements()) { String sFeature = ((String)enumFeatureStrings.nextElement()); vFeatureStrings.addElement(sFeature); } return vFeatureStrings; } //use the feature vectors to create an .arff file (to be used for building a classifier) static void buildARFF() { System.out.println(""); System.out.println("Building the .arff file - this may take a while..."); System.out.println(""); //.arff format: //@relation Nursery //@attribute parents {usual, pretentious, great_pret} //@data //usual,proper,complete,1,convenient,convenient,nonprob,recommended,recommend String sNL = "\r\n"; String sAtt = "@attribute "; //write relation name String sARFF = "@relation " + m_sOutputFileName + sNL + sNL; //write the attribute information for(int iFeatureIndex = 0; iFeatureIndex < m_iMaxNumFeatures; iFeatureIndex++) { String sFeatureName = ((String)m_vFeatureStrings.elementAt(iFeatureIndex)); if(sFeatureName.indexOf("%") != -1) { //replace "%" with "p" - it gives Weka trouble sFeatureName = sFeatureName.replace('%','p'); } if(sFeatureName.indexOf("'") != -1) { //replace "'" with "" - it gives Weka trouble sFeatureName = sFeatureName.replace('\'','_'); } //add the data type (dependan on the feature vector type) if(m_FeatureVectorType == BOOL) { sARFF += sAtt + sFeatureName + " {0,1}" + sNL; } else if(m_FeatureVectorType == TF) { sARFF += sAtt + sFeatureName + " NUMERIC" + sNL; } else if(m_FeatureVectorType == TF_IDF) { sARFF += sAtt + sFeatureName + " NUMERIC" + sNL; } } //add the class relation if(m_FeatureVectorType == TF_IDF) { sARFF += sAtt + "IS_SPAM {0.0,1.0}" + sNL; //sARFF += sAtt + "IS_SPAM {0,1}" + sNL; } else { sARFF += sAtt + "IS_SPAM {0,1}" + sNL; } // add the @data tag sARFF += sNL + "@data" + sNL; //for each file, write the apporpriate line to the arff file string for(int iFileIndex = 0; iFileIndex <= m_iFileCount; iFileIndex++) { String sData = ""; for(int iFeatureIndex = 0; iFeatureIndex < m_iMaxNumFeatures; iFeatureIndex++) { //add the data based on the type of feature vector used if(m_FeatureVectorType == BOOL) { if(m_bFeatureVectors[iFileIndex][iFeatureIndex]) sData += "1,"; else sData += "0,"; } else if(m_FeatureVectorType == TF) { sData += m_iTFFeatureVectors[iFileIndex][iFeatureIndex] + ","; } else if(m_FeatureVectorType == TF_IDF) { sData += m_dTFIDFFeatureVectors[iFileIndex][iFeatureIndex] + ","; } } //add the class label if(m_FeatureVectorType == BOOL) { if(m_bFeatureVectors[iFileIndex][m_iMaxNumFeatures]) sData += "1" + sNL; else sData += "0" + sNL; } else if(m_FeatureVectorType == TF) { sData += m_iTFFeatureVectors[iFileIndex][m_iMaxNumFeatures] + sNL; } else if(m_FeatureVectorType == TF_IDF) { sData += m_dTFIDFFeatureVectors[iFileIndex][m_iMaxNumFeatures] + sNL; } //add this line of data to the arff string sARFF += sData; } //if the feature vector is boolean or discretization is off, write the data to the final .arff file if((m_FeatureVectorType == BOOL) || !m_bDiscretizeData) { System.out.println("Creating final output file: " + m_sOutputFileName + ".arff"); Utilities.writeStringToFile(sARFF,m_sOutputFileName + ".arff"); } //otherwise, write to a temporary file to be 2-value discretized else { try { System.out.println("Creating temporary file: " + sTempOuputFile + ".arff"); //write the temporary arff string to an .arff file with name sTempOuputFile.arff Utilities.writeStringToFile(sARFF,sTempOuputFile + ".arff"); //use this temporary file to create a discretized file // Read all the instances in the temporary file FileReader reader = new FileReader(sTempOuputFile + ".arff"); Instances data = new Instances(reader); // Make the last attribute be the class data.setClassIndex(data.numAttributes() - 1); //discretize all numeric values //Filter filter = new weka.filters.DiscretizeFilter(); weka.filters.DiscretizeFilter filter = new weka.filters.DiscretizeFilter(); //set the input/ouput format filter.inputFormat(data); //default - set bins at num classes (2) //filter.setBins(5); for (int i = 0; i < data.numInstances(); i++) { filter.input(data.instance(i)); } filter.batchFinished(); Instances newData = filter.outputFormat(); Instance processed; while ((processed = filter.output()) != null) { newData.add(processed); } data = newData; //now write the discretized data to the final .arff file //write relation name sARFF = "@relation " + m_sOutputFileName + sNL + sNL; //write the attribute information Enumeration enumAtt = data.enumerateAttributes(); while (enumAtt.hasMoreElements()) { Attribute attr = (Attribute) enumAtt.nextElement(); sARFF += attr.toString() + sNL; } //add the class attribute if(m_FeatureVectorType == TF_IDF) { sARFF += sAtt + "IS_SPAM {0.0,1.0}" + sNL; } else { sARFF += sAtt + "IS_SPAM {0,1}" + sNL; } // add the @data tag sARFF += sNL + "@data" + sNL; //add all the data Enumeration enumData = data.enumerateInstances(); while (enumData.hasMoreElements()) { Instance inst = (Instance) enumData.nextElement(); sARFF += inst.toString() + sNL; } System.out.println("Creating final output file: " + m_sOutputFileName + ".arff"); //write the string to the final .arff file Utilities.writeStringToFile(sARFF,m_sOutputFileName + ".arff"); } catch(Exception e) { System.out.println("ERROR - " + e); } } // end if(m_FeatureVectorType == BOOL) //save the diagnostic data saveDiagnosticData(); } // end buildARFF //saves diagnostic data private static void saveDiagnosticData() { diagnostic.bStopTermsUsed = m_StopTerms; diagnostic.bWordStemmingUsed = m_Stem; diagnostic.endTime = new Date(); diagnostic.iMaxFeatures = m_iMaxNumFeatures; diagnostic.iNumFiles = m_iFileCount+1; diagnostic.iTotalNumFeatures = m_iFeatureCount +1; if(m_FeatureVectorType == BOOL) diagnostic.sFeatureVectorType = "BOOLEAN"; if(m_FeatureVectorType == TF) diagnostic.sFeatureVectorType = "TERM FREQUENCY"; if(m_FeatureVectorType == TF_IDF) diagnostic.sFeatureVectorType = "TF-IDF"; diagnostic.sRelationFileName = m_sOutputFileName; diagnostic.iNumWordsStemmed = m_iNumWordsStemmed; if(m_bDiscretizeData) diagnostic.s2Discretized = "Yes"; } }//end class FeatureFinder //Class of basic utitlities //Note: this was reused from a 572 lab. class Utilities { // In Microsoft's J++, the console window can close up before you get a chance to read it, // so this method can be used to wait until you're ready to proceed. public static void waitHere(String msg) { System.out.println(""); System.out.print(msg); try { System.in.read(); } catch(Exception e) {} // Ignore any errors while reading. } // This method will read the contents of a file, returning it as a string. //Note, this was taken from source code given in Lab 1 public static synchronized String getFileContents(String fileName) { File File = new File(fileName); DataInputStream inputDataStream; String results = null; //debug //System.out.println(""); //System.out.println("Open file " + fileName + " ..."); try { int length = (int)File.length(), bytesRead; byte byteArray[] = new byte[length]; ByteArrayOutputStream bytesBuffer = new ByteArrayOutputStream(length); FileInputStream InputStream = new FileInputStream(File); bytesRead = InputStream.read(byteArray); bytesBuffer.write(byteArray, 0, bytesRead); InputStream.close(); results = bytesBuffer.toString(); } catch(Exception e) { System.out.println("Exception in getFileContents(" + fileName + "), msg=" + e); } return results; } // Note: this does NOT add a final "linefeed" to the file. Might not want to // get that upon a subsequent "read" of the file. static synchronized boolean writeStringToFile(String contents, String fileName) { try { java.io.File File = new java.io.File(fileName); // Want this to be an auto-flush file. java.io.PrintWriter stream = new java.io.PrintWriter(new java.io.FileOutputStream(File), true); stream.print(contents); stream.close(); return true; } catch(Exception ioe) { //Error("Exception writing to " + fileName + ". Error msg: " + ioe); System.out.println("Exception writing to " + fileName + ". Error msg: " + ioe); return false; } } } // end class Utilities //Class of diagnostic measure class Diagnostic { static public Date startTime = new Date(); static public Date endTime; static public int iNumFiles; static public int iTotalNumFeatures; static public int iMaxFeatures; static public int iNumWordsStemmed; static public String sFeatureVectorType; static public boolean bStopTermsUsed; static public boolean bWordStemmingUsed; static public String sRelationFileName; static public String s2Discretized = "No"; static public Vector vTopFeatures; //writes all diagnostic data to file public static void writeToFile() { endTime = new Date(); String sNL = "\r\n"; String sOutput = sRelationFileName + ".arff" + sNL; //basic data sOutput += startTime.toString() + sNL; sOutput += endTime.toString() + sNL; sOutput += (endTime.getTime() - startTime.getTime()) + sNL; sOutput += iNumFiles + sNL; sOutput += iTotalNumFeatures + sNL; //parameters sOutput += sFeatureVectorType + sNL; if(bWordStemmingUsed) sOutput += "Yes" + sNL; else sOutput += "No" + sNL; if(bStopTermsUsed) sOutput += "Yes" + sNL; else sOutput += "No " + sNL; sOutput += iMaxFeatures + sNL; //stemming sOutput += (iNumWordsStemmed +1) + sNL; //discretized sOutput += s2Discretized + sNL; //top features (max to least MI) for(int iIndex = 0; iIndex < iMaxFeatures; iIndex++) { sOutput += ((iIndex +1) + " " + ((String)vTopFeatures.elementAt(iIndex)) + sNL); } System.out.println("Creating diagnostic file: " + sRelationFileName + "_diagnostic.txt"); Utilities.writeStringToFile(sOutput,"diagnostic_" + sRelationFileName + ".txt"); } } // end class Diagnostic