|
|||||||||
PREV CLASS NEXT CLASS | FRAMES NO FRAMES | ||||||||
SUMMARY: INNER | FIELD | CONSTR | METHOD | DETAIL: FIELD | CONSTR | METHOD |
java.lang.Object | +--websphinx.Crawler
Web crawler.
To write a crawler, extend this class and override shouldVisit () and visit() to create your own crawler.
To use a crawler:
Field Summary | |
private Action |
action
|
static java.lang.String[] |
ALL_LINKS
Specify ALL_LINKS as the link type to allow the crawler to visit any kind of link. |
private java.util.Vector |
classifiers
|
private Link[] |
crawledRoots
|
private java.util.Vector |
crawlListeners
|
private PriorityQueue |
crawlQueue
|
private boolean |
depthFirst
|
private java.lang.String[] |
domain
|
private DownloadParameters |
dp
|
private PriorityQueue |
fetchQueue
|
static java.lang.String[] |
HYPERLINKS
Specify HYPERLINKS as the link type to allow the crawler to visit only hyperlinks (A, AREA, and FRAME tags which point to http:, ftp:, file:, or gopher: URLs). |
static java.lang.String[] |
HYPERLINKS_AND_IMAGES
Specify HYPERLINKS_AND_IMAGES as the link type to allow the crawler to visit only hyperlinks and inline images. |
private boolean |
ignoreVisitedLinks
|
private java.util.Vector |
linkListeners
|
private LinkPredicate |
linkPredicate
|
private int |
maxDepth
|
private java.lang.String |
name
|
private int |
numLinksTested
|
private int |
numPagesLeft
|
private int |
numPagesVisited
|
private PagePredicate |
pagePredicate
|
private RobotExclusion |
robotExclusion
|
private java.lang.String[] |
rootHrefs
|
private Link[] |
roots
|
private static long |
serialVersionUID
|
static java.lang.String[] |
SERVER
Specify SERVER as the crawl domain to limit the crawler to visit only pages on the same Web server (hostname and port number) as the root link from which it started. |
private int |
state
|
static java.lang.String[] |
SUBTREE
Specify SUBTREE as the crawl domain to limit the crawler to visit only pages which are descendants of the root link from which it started. |
private boolean |
synchronous
|
private java.lang.String[] |
type
|
private java.util.Hashtable |
visitedPages
|
static java.lang.String[] |
WEB
Specify WEB as the crawl domain to allow the crawler to visit any page on the World Wide Web. |
private Worm[] |
worms
|
Constructor Summary | |
Crawler()
Make a new Crawler. |
Method Summary | |
void |
addClassifier(Classifier c)
Adds a classifier to this crawler. |
void |
addCrawlListener(CrawlListener listen)
Adds a listener to the set of CrawlListeners for this crawler. |
void |
addLinkListener(LinkListener listen)
Adds a listener to the set of LinkListeners for this crawler. |
void |
addRoot(Link link)
Add a root to the existing set of roots. |
void |
clear()
Initialize the crawler for a fresh crawl. |
protected void |
clearVisited()
Clear the set of visited links. |
java.util.Enumeration |
enumerateClassifiers()
Enumerates the set of classifiers. |
java.util.Enumeration |
enumerateQueue()
Enumerate crawling queue. |
void |
expand(Page page)
Expand the crawl from a page. |
(package private) void |
fetch(Worm w)
|
(package private) void |
fetchTimedOut(Worm w,
int interval)
|
Action |
getAction()
Get action. |
int |
getActiveThreads()
Get number of threads currently working. |
Classifier[] |
getClassifiers()
Get the set of classifiers. |
Link[] |
getCrawledRoots()
Get roots of last crawl. |
boolean |
getDepthFirst()
Get depth-first search flag. |
java.lang.String[] |
getDomain()
Get crawl domain. |
DownloadParameters |
getDownloadParameters()
Get download parameters (such as number of threads, timeouts, maximum page size, etc.) |
boolean |
getIgnoreVisitedLinks()
Get ignore-visited-links flag. |
LinkPredicate |
getLinkPredicate()
Get link predicate. |
int |
getLinksTested()
Get number of links tested. |
java.lang.String[] |
getLinkType()
Get legal link types to crawl. |
int |
getMaxDepth()
Get maximum depth. |
java.lang.String |
getName()
Get human-readable name of crawler. |
PagePredicate |
getPagePredicate()
Get page predicate. |
int |
getPagesLeft()
Get number of pages left to be visited. |
int |
getPagesVisited()
Get number of pages visited. |
java.lang.String |
getRootHrefs()
Get starting points of crawl as a String of newline-delimited URLs. |
Link[] |
getRoots()
Get starting points of crawl as an array of Link objects. |
int |
getState()
Get state of crawler. |
boolean |
getSynchronous()
Get synchronous flag. |
private void |
init()
|
static void |
main(java.lang.String[] args)
|
protected void |
markVisited(Link link)
Register that a link has been visited. |
void |
pause()
Pause the crawl in progress. |
(package private) void |
process(Link link)
|
private void |
readObject(java.io.ObjectInputStream in)
|
void |
removeAllClassifiers()
Clears the set of classifiers. |
void |
removeClassifier(Classifier c)
Removes a classifier from the set of classifiers. |
void |
removeCrawlListener(CrawlListener listen)
Removes a listener from the set of CrawlListeners. |
void |
removeLinkListener(LinkListener listen)
Removes a listener from the set of LinkListeners. |
void |
run()
Start crawling. |
protected void |
sendCrawlEvent(int id)
Send a CrawlEvent to all CrawlListeners registered with this crawler. |
protected void |
sendLinkEvent(Link l,
int id)
Send a LinkEvent to all LinkListeners registered with this crawler. |
protected void |
sendLinkEvent(Link l,
int id,
java.lang.Throwable exception)
Send an exceptional LinkEvent to all LinkListeners registered with this crawler. |
void |
setAction(Action act)
Set the action. |
void |
setDepthFirst(boolean useDFS)
Set depth-first search flag. |
void |
setDomain(java.lang.String[] domain)
Set crawl domain. |
void |
setDownloadParameters(DownloadParameters dp)
Set download parameters (such as number of threads, timeouts, maximum page size, etc.) |
void |
setIgnoreVisitedLinks(boolean f)
Set ignore-visited-links flag. |
void |
setLinkPredicate(LinkPredicate pred)
Set link predicate. |
void |
setLinkType(java.lang.String[] type)
Set legal link types to crawl. |
void |
setMaxDepth(int maxDepth)
Set maximum depth. |
void |
setName(java.lang.String name)
Set human-readable name of crawler. |
void |
setPagePredicate(PagePredicate pred)
Set page predicate. |
void |
setRoot(Link link)
Set starting point of crawl as a single Link. |
void |
setRootHrefs(java.lang.String hrefs)
Set starting points of crawl as a string of whitespace-delimited URLs. |
void |
setRoots(Link[] links)
Set starting points of crawl as an array of Links. |
void |
setSynchronous(boolean f)
Set synchronous flag. |
boolean |
shouldVisit(Link l)
Callback for testing whether a link should be traversed. |
void |
stop()
Stop the crawl in progress. |
void |
submit(Link link)
Puts a link into the crawling queue. |
void |
submit(Link[] links)
Submit an array of Links for crawling. |
(package private) void |
timedOut()
|
java.lang.String |
toString()
Convert the crawler to a String. |
private static java.lang.String[] |
useStandard(java.lang.String[] standard,
java.lang.String[] s)
|
void |
visit(Page page)
Callback for visiting a page. |
boolean |
visited(Link link)
Test whether the page corresponding to a link has been visited (or queued for visiting). |
private void |
writeObject(java.io.ObjectOutputStream out)
|
Methods inherited from class java.lang.Object |
|
Field Detail |
private static final long serialVersionUID
public static final java.lang.String[] WEB
public static final java.lang.String[] SERVER
public static final java.lang.String[] SUBTREE
public static final java.lang.String[] HYPERLINKS
public static final java.lang.String[] HYPERLINKS_AND_IMAGES
public static final java.lang.String[] ALL_LINKS
private java.lang.String name
private transient Link[] roots
private java.lang.String[] rootHrefs
private java.lang.String[] domain
private boolean synchronous
private boolean depthFirst
private java.lang.String[] type
private boolean ignoreVisitedLinks
private int maxDepth
private DownloadParameters dp
private java.util.Vector classifiers
private LinkPredicate linkPredicate
private PagePredicate pagePredicate
private Action action
private transient Link[] crawledRoots
private transient int state
private transient Worm[] worms
private transient PriorityQueue fetchQueue
private transient PriorityQueue crawlQueue
private transient int numLinksTested
private transient int numPagesVisited
private transient int numPagesLeft
private transient java.util.Vector crawlListeners
private transient java.util.Vector linkListeners
private transient java.util.Hashtable visitedPages
private transient RobotExclusion robotExclusion
Constructor Detail |
public Crawler()
Method Detail |
private void init()
private void writeObject(java.io.ObjectOutputStream out) throws java.io.IOException
private void readObject(java.io.ObjectInputStream in) throws java.io.IOException, java.lang.ClassNotFoundException
private static java.lang.String[] useStandard(java.lang.String[] standard, java.lang.String[] s)
public void run()
run
in interface java.lang.Runnable
public void clear()
public void pause()
public void stop()
void timedOut()
public int getState()
public void visit(Page page)
page
- Page retrieved by the crawlerpublic boolean shouldVisit(Link l)
l
- Link encountered by the crawlerpublic void expand(Page page)
page
- Page to expandpublic int getPagesVisited()
public int getLinksTested()
public int getPagesLeft()
public int getActiveThreads()
public java.lang.String getName()
public void setName(java.lang.String name)
name
- new name for crawlerpublic java.lang.String toString()
toString
in class java.lang.Object
public Link[] getRoots()
public Link[] getCrawledRoots()
public java.lang.String getRootHrefs()
public void setRootHrefs(java.lang.String hrefs) throws java.net.MalformedURLException
hrefs
- URLs of starting point, separated by space, \t, or \njava.net.MalformedURLException
- if any of the URLs is invalid,
leaving starting points unchangedpublic void setRoot(Link link)
link
- starting pointpublic void setRoots(Link[] links)
links
- starting pointspublic void addRoot(Link link)
link
- starting point to addpublic java.lang.String[] getDomain()
public void setDomain(java.lang.String[] domain)
domain
- one of WEB, SERVER, or SUBTREE.public java.lang.String[] getLinkType()
public void setLinkType(java.lang.String[] type)
type
- one of HYPERLINKS, HYPERLINKS_AND_IMAGES, or ALL_LINKS.public boolean getDepthFirst()
public void setDepthFirst(boolean useDFS)
useDFS
- true if search should be depth-first, false if search should be breadth-first.public boolean getSynchronous()
public void setSynchronous(boolean f)
f
- true if crawler must visit the pages in priority order; false if crawler can visit
pages in any order.public boolean getIgnoreVisitedLinks()
public void setIgnoreVisitedLinks(boolean f)
f
- true if search skips links whose URLs have already been visited
(or queued for visiting).public int getMaxDepth()
public void setMaxDepth(int maxDepth)
maxDepth
- maximum depth of crawl, in hops from starting pointpublic DownloadParameters getDownloadParameters()
public void setDownloadParameters(DownloadParameters dp)
dp
- Download parameterspublic void setLinkPredicate(LinkPredicate pred)
pred
- Link predicatepublic LinkPredicate getLinkPredicate()
public void setPagePredicate(PagePredicate pred)
pred
- Page predicatepublic PagePredicate getPagePredicate()
public void setAction(Action act)
act
- Actionpublic Action getAction()
public void submit(Link link)
link
- Link to put in queuepublic void submit(Link[] links)
links
- Links to put in queuepublic java.util.Enumeration enumerateQueue()
public void addClassifier(Classifier c)
c
- a classifierpublic void removeClassifier(Classifier c)
c
- a classifierpublic void removeAllClassifiers()
public java.util.Enumeration enumerateClassifiers()
public Classifier[] getClassifiers()
public void addCrawlListener(CrawlListener listen)
listen
- a listenerpublic void removeCrawlListener(CrawlListener listen)
listen
- a listenerpublic void addLinkListener(LinkListener listen)
listen
- a listenerpublic void removeLinkListener(LinkListener listen)
listen
- a listenerprotected void sendCrawlEvent(int id)
id
- Event idprotected void sendLinkEvent(Link l, int id)
l
- Link related to eventid
- Event idprotected void sendLinkEvent(Link l, int id, java.lang.Throwable exception)
l
- Link related to eventid
- Event idexception
- Exception associated with eventpublic boolean visited(Link link)
link
- Link to testprotected void markVisited(Link link)
link
- Link that has been visitedprotected void clearVisited()
void fetch(Worm w)
void process(Link link)
void fetchTimedOut(Worm w, int interval)
public static void main(java.lang.String[] args) throws java.lang.Exception
|
|||||||||
PREV CLASS NEXT CLASS | FRAMES NO FRAMES | ||||||||
SUMMARY: INNER | FIELD | CONSTR | METHOD | DETAIL: FIELD | CONSTR | METHOD |