import java.io.*;
/**
 * Splits an HTML byte stream into whitespace-separated words, discarding
 * everything between a {@code '<'} and the next {@code '>'}.
 *
 * <p>Tokenizing rules, as configured on the underlying
 * {@link StreamTokenizer}:
 * <ul>
 * <li>characters up to and including the space (U+0020) separate words;</li>
 * <li>{@code '<'} and {@code '>'} are single-character tokens;</li>
 * <li>every other character is part of a word, so punctuation stays
 *     attached to (or forms) words and numbers are returned as text.</li>
 * </ul>
 *
 * <p>Known limitations: a tag is considered finished at the first
 * {@code '>'} after its {@code '<'}, so a {@code '>'} inside a quoted
 * attribute value or inside a comment ends the tag early. Character
 * references such as {@code &amp;} are returned verbatim.
 */
public class HTMLWordParser {
    /**
     * Start HTML tag character constant.
     */
    protected static final char startHTMLTag = '<';

    /**
     * End HTML tag character constant.
     */
    protected static final char endHTMLTag = '>';

    /**
     * The stream tokenizer constructed from the input stream.
     */
    protected StreamTokenizer st;

    /**
     * Creates an HTMLWordParser for the specified InputStream, decoding its
     * bytes with the platform default charset.
     *
     * <p>Because the default charset differs between machines, prefer
     * {@link #HTMLWordParser(InputStream, java.nio.charset.Charset)} when the
     * encoding of the document is known.
     *
     * @param is the input stream to read from.
     */
    public HTMLWordParser(InputStream is) {
        st = newTokenizer(new InputStreamReader(is));
    }

    /**
     * Creates an HTMLWordParser for the specified InputStream, decoding its
     * bytes with the given charset.
     *
     * @param is      the input stream to read from.
     * @param charset the charset used to decode the bytes of {@code is}.
     */
    public HTMLWordParser(InputStream is, java.nio.charset.Charset charset) {
        st = newTokenizer(new InputStreamReader(is, charset));
    }

    /**
     * Builds a tokenizer over {@code reader} with the syntax table used by
     * this parser.
     *
     * <p>Kept private and static so that no overridable method runs while
     * the object is still being constructed.
     *
     * @param reader the character source to tokenize.
     * @return the configured tokenizer.
     */
    private static StreamTokenizer newTokenizer(Reader reader) {
        StreamTokenizer tokenizer = new StreamTokenizer(new BufferedReader(reader));
        // Start from an empty table: the defaults would parse numbers,
        // quotes and comments, none of which is wanted here.
        tokenizer.resetSyntax();
        // Everything from the space upwards is a word character ...
        tokenizer.wordChars('\u0020', '\uFFFF');
        // ... except the space itself, which together with the control
        // characters separates words (this call overrides the one above
        // for U+0020).
        tokenizer.whitespaceChars('\u0000', '\u0020');
        // The HTML delimiters are returned as single-character tokens.
        tokenizer.ordinaryChar(startHTMLTag);
        tokenizer.ordinaryChar(endHTMLTag);
        return tokenizer;
    }

    /**
     * Tests if the character is the start of an HTML tag
     *
     * @param c the character to test.
     * @return true if the character is the start of an HTML tag, otherwise
     *         false
     */
    protected boolean isStartHTMLTag(char c) {
        return c == startHTMLTag;
    }

    /**
     * Tests if the character is the end of an HTML tag.
     *
     * @param c the character to test.
     * @return true if the character is the end of an HTML tag, otherwise
     *         false
     */
    protected boolean isEndHTMLTag(char c) {
        return c == endHTMLTag;
    }

    /**
     * Reads and discards tokens up to and including the next end-of-tag
     * character. Call this after the start-of-tag character has been
     * consumed.
     *
     * @return false if the end of the file is reached before the tag is
     *         closed, otherwise true.
     * @throws java.io.IOException an I/O error occurred.
     */
    protected boolean skipHTMLTag() throws IOException {
        int ttype;
        while ((ttype = st.nextToken()) != StreamTokenizer.TT_EOF) {
            // Negative token types are tokenizer sentinels (e.g. TT_WORD),
            // not characters; narrowing them to char would hand a bogus
            // character to isEndHTMLTag.
            if (ttype >= 0 && isEndHTMLTag((char) ttype)) {
                return true; // The end of the HTML tag has been seen.
            }
        }
        return false;
    }

    /**
     * Reads the next word from the input stream, ignoring HTML tags.
     *
     * <p>An end-of-tag character that does not close a tag is returned as
     * the one-character word {@code ">"}.
     *
     * @return the next word from the input stream (excluding
     *         HTML tags) or null if the end of the stream is
     *         reached, including when the stream ends inside an
     *         unclosed tag.
     * @exception IOException if an I/O error occurs.
     */
    public String nextWord() throws IOException {
        int ttype;
        while ((ttype = st.nextToken()) != StreamTokenizer.TT_EOF) {
            if (ttype == StreamTokenizer.TT_WORD) {
                // A word has been seen.
                return st.sval;
            }
            if (ttype < 0) {
                // Some other tokenizer sentinel: not a character, so it can
                // be neither tag delimiter. Keep scanning.
                continue;
            }
            char c = (char) ttype;
            if (isStartHTMLTag(c)) {
                // The start of an HTML tag has been seen.
                if (!skipHTMLTag()) {
                    // The end of the input has been seen without seeing
                    // the end of an HTML tag.
                    return null;
                }
                // Else keep scanning the input.
            } else if (isEndHTMLTag(c)) {
                // A rogue '>' has been seen.
                return ">";
            }
        }
        // The end of the input has been seen.
        return null;
    }
}