websphinx
Class HTMLParser

java.lang.Object
  |
  +--websphinx.HTMLParser

public class HTMLParser
extends java.lang.Object

HTML parser. Parses an input stream or String and converts it to a sequence of Tags and a tree of Elements. HTMLParser is used by Page to parse pages.


Field Summary
private static int AFTEREQ
           
private static int ATTR
           
(package private)  java.lang.StringBuffer attrName
           
private static int ATTRNAME
           
(package private)  java.util.Vector attrs
           
(package private)  java.lang.StringBuffer attrVal
           
private static int ATTRVAL
           
private static int ATTRVAL_DQ
           
private static int ATTRVAL_SQ
           
private static int BANG
           
private static int BANG_DASH
           
(package private) static java.util.Hashtable blocktag
           
(package private)  char[] buf
           
(package private) static int BUFFER_SIZE
           
private static int CMT
           
private static int CMT_DASH
           
private static int CMT_DASHDASH
           
(package private)  java.lang.StringBuffer contentBuf
           
(package private) static java.util.Hashtable context
           
(package private)  Form currentForm
           
private static int DIRECTIVE
           
private static int DONE
           
(package private)  java.util.Stack elems
           
private static java.util.Hashtable empty
           
private static java.util.Hashtable entities
           
(package private)  java.lang.StringBuffer entity
           
private static int ENTITY
           
private static int ENTNUM
           
private static int ENTREF
           
private static int EQ
           
private static int ETAG
           
private static java.util.Hashtable forcesClosed
           
(package private) static java.util.Hashtable headtag
           
private static int INWORD
           
(package private) static java.util.Hashtable linktag
           
(package private) static java.util.Hashtable literal
           
private static int LT
           
(package private)  int maxBytes
           
(package private)  int[] openElems
           
(package private)  int openPtr
           
(package private) static java.util.Hashtable savetext
           
private static int STAG
           
private static int START
           
(package private)  java.lang.StringBuffer tagName
           
(package private)  java.lang.StringBuffer text
           
(package private)  java.util.Vector vElements
           
(package private)  java.util.Vector vLinks
           
(package private)  java.lang.StringBuffer wordBuf
           
 
Constructor Summary
HTMLParser()
          Make an HTMLParser.
HTMLParser(DownloadParameters dp)
          Make an HTMLParser which retrieves pages using the specified download parameters.
 
Method Summary
(package private) static void <clinit>()
           
(package private) static void <clinit>()
           
private  void buildParseTree(Page page)
           
private  void close(Element elem, int end)
           
private  void close(Element elem, Tag tag)
           
private  void closeAll(int end)
           
 void dontParse(Page page, java.io.InputStream stream)
          Download an input stream without parsing it.
 void dontParse(Page page, java.io.Reader stream)
          Download an input stream without parsing it.
private  Element findOpenElement(java.util.Hashtable tags)
           
private  Element findOpenElement(java.lang.String tagname)
           
private static java.lang.String indentation(int indent)
           
private static boolean isWhitespace(char c)
           
private static java.lang.Character lookupEntityRef(java.lang.String name)
           
static void main(java.lang.String[] args)
           
private  Element makeElement(java.net.URL base, Tag tag)
           
private  void open(Element e)
           
 void parse(Page page, java.io.InputStream stream)
          Parse an input stream.
 void parse(Page page, java.io.Reader stream)
          Parse an input stream.
 void parse(Page page, java.lang.String content)
          Parse a string.
private static void printout(Element element, int indent)
           
private static void printout(Link[] elements, int indent)
           
private  void tokenize(Page page, java.io.Reader stream, boolean saveContent)
           
private static void union(java.util.Hashtable map, java.lang.Object tagname, java.util.Hashtable tagset)
           
 
Methods inherited from class java.lang.Object
clone, equals, finalize, getClass, hashCode, notify, notifyAll, registerNatives, toString, wait, wait, wait
 

Field Detail

BUFFER_SIZE

static final int BUFFER_SIZE

maxBytes

int maxBytes

START

private static final int START

INWORD

private static final int INWORD

ENTITY

private static final int ENTITY

LT

private static final int LT

BANG

private static final int BANG

BANG_DASH

private static final int BANG_DASH

CMT

private static final int CMT

CMT_DASH

private static final int CMT_DASH

CMT_DASHDASH

private static final int CMT_DASHDASH

DIRECTIVE

private static final int DIRECTIVE

STAG

private static final int STAG

ETAG

private static final int ETAG

ATTR

private static final int ATTR

ATTRNAME

private static final int ATTRNAME

EQ

private static final int EQ

AFTEREQ

private static final int AFTEREQ

ATTRVAL

private static final int ATTRVAL

ATTRVAL_SQ

private static final int ATTRVAL_SQ

ATTRVAL_DQ

private static final int ATTRVAL_DQ

DONE

private static final int DONE

ENTNUM

private static final int ENTNUM

ENTREF

private static final int ENTREF

buf

char[] buf

contentBuf

java.lang.StringBuffer contentBuf

wordBuf

java.lang.StringBuffer wordBuf

tagName

java.lang.StringBuffer tagName

attrName

java.lang.StringBuffer attrName

attrVal

java.lang.StringBuffer attrVal

attrs

java.util.Vector attrs

entity

java.lang.StringBuffer entity

entities

private static java.util.Hashtable entities

vElements

java.util.Vector vElements

vLinks

java.util.Vector vLinks

text

java.lang.StringBuffer text

empty

private static java.util.Hashtable empty

blocktag

static java.util.Hashtable blocktag

forcesClosed

private static java.util.Hashtable forcesClosed

context

static java.util.Hashtable context

literal

static java.util.Hashtable literal

linktag

static java.util.Hashtable linktag

savetext

static java.util.Hashtable savetext

headtag

static java.util.Hashtable headtag

elems

java.util.Stack elems

openElems

int[] openElems

openPtr

int openPtr

currentForm

Form currentForm
Constructor Detail

HTMLParser

public HTMLParser()
Make an HTMLParser.

HTMLParser

public HTMLParser(DownloadParameters dp)
Make an HTMLParser which retrieves pages using the specified download parameters. Pages larger than dp.getMaxPageSize() are rejected by parse() with an IOException.
Parameters:
dp - download parameters used during parsing
Method Detail

parse

public void parse(Page page,
                  java.io.InputStream stream)
           throws java.io.IOException
Parse an input stream.
Parameters:
page - Page to receive parsed HTML
stream - input stream containing HTML

parse

public void parse(Page page,
                  java.io.Reader stream)
           throws java.io.IOException
Parse an input stream.
Parameters:
page - Page to receive parsed HTML
stream - character stream (Reader) containing HTML

parse

public void parse(Page page,
                  java.lang.String content)
           throws java.io.IOException
Parse a string.
Parameters:
page - Page to receive parsed HTML
content - String containing HTML

dontParse

public void dontParse(Page page,
                      java.io.InputStream stream)
               throws java.io.IOException
Download an input stream without parsing it.
Parameters:
page - Page to receive the downloaded content
stream - input stream containing content

dontParse

public void dontParse(Page page,
                      java.io.Reader stream)
               throws java.io.IOException
Download an input stream without parsing it.
Parameters:
page - Page to receive the downloaded content
stream - character stream (Reader) containing content

tokenize

private void tokenize(Page page,
                      java.io.Reader stream,
                      boolean saveContent)
               throws java.io.IOException

isWhitespace

private static boolean isWhitespace(char c)

lookupEntityRef

private static java.lang.Character lookupEntityRef(java.lang.String name)

static void <clinit>()

static void <clinit>()

union

private static void union(java.util.Hashtable map,
                          java.lang.Object tagname,
                          java.util.Hashtable tagset)

buildParseTree

private void buildParseTree(Page page)

makeElement

private Element makeElement(java.net.URL base,
                            Tag tag)

open

private void open(Element e)

findOpenElement

private Element findOpenElement(java.lang.String tagname)

findOpenElement

private Element findOpenElement(java.util.Hashtable tags)

close

private void close(Element elem,
                   Tag tag)

close

private void close(Element elem,
                   int end)

closeAll

private void closeAll(int end)

main

public static void main(java.lang.String[] args)
                 throws java.lang.Exception

indentation

private static java.lang.String indentation(int indent)

printout

private static void printout(Element element,
                             int indent)

printout

private static void printout(Link[] elements,
                             int indent)