import java.io.*;

/**
 * Reads whitespace-separated words from a character stream while discarding
 * anything that appears between a {@code '<'} and the next {@code '>'}.
 *
 * <p>The underlying {@link StreamTokenizer} is configured so that characters
 * U+0000 through U+0020 are white space, {@code '<'} and {@code '>'} are
 * single-character tokens, and the remaining characters are word
 * constituents. Tag skipping is purely lexical: the first {@code '>'} seen
 * after a {@code '<'} ends the tag, even if it occurs inside a quoted
 * attribute value or an HTML comment.
 */
public class HTMLWordParser {

    /**
     * Start HTML tag character constant.
     */
    protected static final char startHTMLTag = '<';

    /**
     * End HTML tag character constant.
     */
    protected static final char endHTMLTag = '>';

    /**
     * The stream tokenizer constructed from the input.
     */
    protected StreamTokenizer st;

    /**
     * Creates an HTMLWordParser for the specified InputStream. The bytes are
     * decoded with the platform default charset (the behaviour of
     * {@link InputStreamReader#InputStreamReader(InputStream)}) and read
     * through a {@link BufferedReader}. Use {@link #HTMLWordParser(Reader)}
     * to control the character encoding.
     *
     * @param is the input stream to read from.
     */
    public HTMLWordParser(InputStream is) {
        this(new BufferedReader(new InputStreamReader(is)));
    }

    /**
     * Creates an HTMLWordParser for the specified Reader. The reader is
     * handed to the tokenizer as-is, so callers who want buffering or a
     * particular charset should wrap their source accordingly.
     *
     * @param reader the character stream to read from.
     */
    public HTMLWordParser(Reader reader) {
        st = new StreamTokenizer(reader);

        // Start from an empty parsing table.
        st.resetSyntax();

        // Mark everything from the space character upwards as word
        // constituents ...
        st.wordChars('\u0020', '\uFFFF');

        // ... then reclassify the control characters and the space itself
        // as white space. The order matters: this call overrides the space
        // set by the previous one.
        st.whitespaceChars('\u0000', '\u0020');

        // The HTML delimiters are returned as single-character tokens and
        // are never part of a word.
        st.ordinaryChar(startHTMLTag);
        st.ordinaryChar(endHTMLTag);
    }

    /**
     * Tests if the character is the start of an HTML tag.
     *
     * @param c the character to test.
     * @return true if the character is the start of an HTML tag, otherwise
     *         false.
     */
    protected boolean isStartHTMLTag(char c) {
        return c == startHTMLTag;
    }

    /**
     * Tests if the character is the end of an HTML tag.
     *
     * @param c the character to test.
     * @return true if the character is the end of an HTML tag, otherwise
     *         false.
     */
    protected boolean isEndHTMLTag(char c) {
        return c == endHTMLTag;
    }

    /**
     * Reads and discards tokens up to and including the next end-of-tag
     * character.
     *
     * @return false if the end of the input is reached before an end-of-tag
     *         character is seen, otherwise true.
     * @throws IOException if an I/O error occurs.
     */
    protected boolean skipHTMLTag() throws IOException {
        int ttype;
        while ((ttype = st.nextToken()) != StreamTokenizer.TT_EOF) {
            if (isEndHTMLTag((char) ttype)) {
                // The end of the HTML tag has been seen.
                return true;
            }
        }
        return false;
    }

    /**
     * Reads the next word from the input, ignoring HTML tags. An
     * end-of-tag character that is not preceded by a start-of-tag character
     * is returned as the one-character word {@code ">"}.
     *
     * @return the next word from the input (excluding HTML tags), or null if
     *         the end of the input is reached, including when the input ends
     *         inside an unterminated tag.
     * @throws IOException if an I/O error occurs.
     */
    public String nextWord() throws IOException {
        int ttype;
        while ((ttype = st.nextToken()) != StreamTokenizer.TT_EOF) {
            if (ttype == StreamTokenizer.TT_WORD) {
                // A word has been seen.
                return st.sval;
            }
            if (isStartHTMLTag((char) ttype)) {
                // The start of an HTML tag has been seen.
                if (!skipHTMLTag()) {
                    // The input ended before the tag was closed.
                    return null;
                }
                // Otherwise keep scanning the input.
            } else if (isEndHTMLTag((char) ttype)) {
                // A rogue '>' has been seen.
                return ">";
            }
        }
        // The end of the input has been seen.
        return null;
    }
}