websphinx
Class HTMLParser
java.lang.Object
|
+--websphinx.HTMLParser
- public class HTMLParser
- extends java.lang.Object
HTML parser. Parses an input stream or String and
converts it to a sequence of Tags and a tree of Elements.
HTMLParser is used by Page to parse pages.
Field Summary |
private static int |
AFTEREQ
|
private static int |
ATTR
|
(package private) java.lang.StringBuffer |
attrName
|
private static int |
ATTRNAME
|
(package private) java.util.Vector |
attrs
|
(package private) java.lang.StringBuffer |
attrVal
|
private static int |
ATTRVAL
|
private static int |
ATTRVAL_DQ
|
private static int |
ATTRVAL_SQ
|
private static int |
BANG
|
private static int |
BANG_DASH
|
(package private) static java.util.Hashtable |
blocktag
|
(package private) char[] |
buf
|
(package private) static int |
BUFFER_SIZE
|
private static int |
CMT
|
private static int |
CMT_DASH
|
private static int |
CMT_DASHDASH
|
(package private) java.lang.StringBuffer |
contentBuf
|
(package private) static java.util.Hashtable |
context
|
(package private) Form |
currentForm
|
private static int |
DIRECTIVE
|
private static int |
DONE
|
(package private) java.util.Stack |
elems
|
private static java.util.Hashtable |
empty
|
private static java.util.Hashtable |
entities
|
(package private) java.lang.StringBuffer |
entity
|
private static int |
ENTITY
|
private static int |
ENTNUM
|
private static int |
ENTREF
|
private static int |
EQ
|
private static int |
ETAG
|
private static java.util.Hashtable |
forcesClosed
|
(package private) static java.util.Hashtable |
headtag
|
private static int |
INWORD
|
(package private) static java.util.Hashtable |
linktag
|
(package private) static java.util.Hashtable |
literal
|
private static int |
LT
|
(package private) int |
maxBytes
|
(package private) int[] |
openElems
|
(package private) int |
openPtr
|
(package private) static java.util.Hashtable |
savetext
|
private static int |
STAG
|
private static int |
START
|
(package private) java.lang.StringBuffer |
tagName
|
(package private) java.lang.StringBuffer |
text
|
(package private) java.util.Vector |
vElements
|
(package private) java.util.Vector |
vLinks
|
(package private) java.lang.StringBuffer |
wordBuf
|
Method Summary |
(package private) static void |
<clinit>()
|
(package private) static void |
<clinit>()
|
private void |
buildParseTree(Page page)
|
private void |
close(Element elem,
int end)
|
private void |
close(Element elem,
Tag tag)
|
private void |
closeAll(int end)
|
void |
dontParse(Page page,
java.io.InputStream stream)
Download an input stream without parsing it. |
void |
dontParse(Page page,
java.io.Reader stream)
Download an input stream without parsing it. |
private Element |
findOpenElement(java.util.Hashtable tags)
|
private Element |
findOpenElement(java.lang.String tagname)
|
private static java.lang.String |
indentation(int indent)
|
private static boolean |
isWhitespace(char c)
|
private static java.lang.Character |
lookupEntityRef(java.lang.String name)
|
static void |
main(java.lang.String[] args)
|
private Element |
makeElement(java.net.URL base,
Tag tag)
|
private void |
open(Element e)
|
void |
parse(Page page,
java.io.InputStream stream)
Parse an input stream. |
void |
parse(Page page,
java.io.Reader stream)
Parse an input stream. |
void |
parse(Page page,
java.lang.String content)
Parse a string. |
private static void |
printout(Element element,
int indent)
|
private static void |
printout(Link[] elements,
int indent)
|
private void |
tokenize(Page page,
java.io.Reader stream,
boolean saveContent)
|
private static void |
union(java.util.Hashtable map,
java.lang.Object tagname,
java.util.Hashtable tagset)
|
Methods inherited from class java.lang.Object |
clone, equals, finalize, getClass, hashCode, notify, notifyAll, registerNatives, toString, wait, wait, wait |
BUFFER_SIZE
static final int BUFFER_SIZE
maxBytes
int maxBytes
START
private static final int START
INWORD
private static final int INWORD
ENTITY
private static final int ENTITY
LT
private static final int LT
BANG
private static final int BANG
BANG_DASH
private static final int BANG_DASH
CMT
private static final int CMT
CMT_DASH
private static final int CMT_DASH
CMT_DASHDASH
private static final int CMT_DASHDASH
DIRECTIVE
private static final int DIRECTIVE
STAG
private static final int STAG
ETAG
private static final int ETAG
ATTR
private static final int ATTR
ATTRNAME
private static final int ATTRNAME
EQ
private static final int EQ
AFTEREQ
private static final int AFTEREQ
ATTRVAL
private static final int ATTRVAL
ATTRVAL_SQ
private static final int ATTRVAL_SQ
ATTRVAL_DQ
private static final int ATTRVAL_DQ
DONE
private static final int DONE
ENTNUM
private static final int ENTNUM
ENTREF
private static final int ENTREF
buf
char[] buf
contentBuf
java.lang.StringBuffer contentBuf
wordBuf
java.lang.StringBuffer wordBuf
tagName
java.lang.StringBuffer tagName
attrName
java.lang.StringBuffer attrName
attrVal
java.lang.StringBuffer attrVal
attrs
java.util.Vector attrs
entity
java.lang.StringBuffer entity
entities
private static java.util.Hashtable entities
vElements
java.util.Vector vElements
vLinks
java.util.Vector vLinks
text
java.lang.StringBuffer text
empty
private static java.util.Hashtable empty
blocktag
static java.util.Hashtable blocktag
forcesClosed
private static java.util.Hashtable forcesClosed
context
static java.util.Hashtable context
literal
static java.util.Hashtable literal
linktag
static java.util.Hashtable linktag
savetext
static java.util.Hashtable savetext
headtag
static java.util.Hashtable headtag
elems
java.util.Stack elems
openElems
int[] openElems
openPtr
int openPtr
currentForm
Form currentForm
HTMLParser
public HTMLParser()
- Make an HTMLParser.
HTMLParser
public HTMLParser(DownloadParameters dp)
- Make an HTMLParser which retrieves pages
using the specified download parameters. Pages
larger than dp.getMaxPageSize() are rejected by parse()
with an IOException.
- Parameters:
dp
- download parameters used during parsing
parse
public void parse(Page page,
java.io.InputStream stream)
throws java.io.IOException
- Parse an input stream.
- Parameters:
page
- Page to receive parsed HTML
input
- stream containing HTML
parse
public void parse(Page page,
java.io.Reader stream)
throws java.io.IOException
- Parse an input stream.
- Parameters:
page
- Page to receive parsed HTML
input
- stream containing HTML
parse
public void parse(Page page,
java.lang.String content)
throws java.io.IOException
- Parse a string.
- Parameters:
page
- Page to receive parsed HTML
content
- String containing HTML
dontParse
public void dontParse(Page page,
java.io.InputStream stream)
throws java.io.IOException
- Download an input stream without parsing it.
- Parameters:
page
- Page to receive the downloaded content
input
- stream containing content
dontParse
public void dontParse(Page page,
java.io.Reader stream)
throws java.io.IOException
- Download an input stream without parsing it.
- Parameters:
page
- Page to receive the downloaded content
r
- stream containing content
tokenize
private void tokenize(Page page,
java.io.Reader stream,
boolean saveContent)
throws java.io.IOException
isWhitespace
private static boolean isWhitespace(char c)
lookupEntityRef
private static java.lang.Character lookupEntityRef(java.lang.String name)
static void <clinit>()
static void <clinit>()
union
private static void union(java.util.Hashtable map,
java.lang.Object tagname,
java.util.Hashtable tagset)
buildParseTree
private void buildParseTree(Page page)
makeElement
private Element makeElement(java.net.URL base,
Tag tag)
open
private void open(Element e)
findOpenElement
private Element findOpenElement(java.lang.String tagname)
findOpenElement
private Element findOpenElement(java.util.Hashtable tags)
close
private void close(Element elem,
Tag tag)
close
private void close(Element elem,
int end)
closeAll
private void closeAll(int end)
main
public static void main(java.lang.String[] args)
throws java.lang.Exception
indentation
private static java.lang.String indentation(int indent)
printout
private static void printout(Element element,
int indent)
printout
private static void printout(Link[] elements,
int indent)