public class JerichoExtractorHTML extends ExtractorHTML
Modifier and Type | Field and Description |
---|---|
protected AtomicLong | numberOfFormsProcessed |
A_FORM_OFFSETS, A_META_ROBOTS, extractorJS, metadata
DEFAULT_PARAMETERS, extractorParameters, loggerModule, numberOfLinksExtracted
Constructor and Description |
---|
JerichoExtractorHTML() |
Modifier and Type | Method and Description |
---|---|
protected void | extract(CrawlURI curi, CharSequence cs) – Run extractor. |
protected void | processForm(CrawlURI curi, au.id.jericho.lib.html.Element element) |
protected void | processGeneralTag(CrawlURI curi, au.id.jericho.lib.html.Element element, au.id.jericho.lib.html.Attributes attributes) |
protected boolean | processMeta(CrawlURI curi, au.id.jericho.lib.html.Element element) |
protected void | processScript(CrawlURI curi, au.id.jericho.lib.html.Element element) |
protected void | processStyle(CrawlURI curi, au.id.jericho.lib.html.Element element) |
String | report() |
addLinkFromString, afterPropertiesSet, considerIfLikelyUri, considerQueryStringValues, elementContext, getContentDeclaredCharset, getExtractJavascript, getExtractOnlyFormGets, getExtractorJS, getExtractValueAttributes, getIgnoreFormActionUrls, getIgnoreUnexpectedHtml, getMaxAttributeNameLength, getMaxAttributeValLength, getMaxElementLength, getMetadata, getTreatFramesAsEmbedLinks, innerExtract, isHtmlExpectedHere, processEmbed, processEmbed, processGeneralTag, processLink, processMeta, processScript, processScriptCode, processStyle, setExtractJavascript, setExtractOnlyFormGets, setExtractorJS, setExtractValueAttributes, setIgnoreFormActionUrls, setIgnoreUnexpectedHtml, setMaxAttributeNameLength, setMaxAttributeValLength, setMaxElementLength, setMetadata, setTreatFramesAsEmbedLinks, shouldExtract
extract, shouldProcess
add, addOutlink, addOutlink, addRelativeToBase, addRelativeToVia, fromCheckpointJson, getExtractorParameters, getLoggerModule, innerProcess, logUriError, setExtractorParameters, setLoggerModule, toCheckpointJson
doCheckpoint, finishCheckpoint, flattenVia, getBeanName, getEnabled, getKeyedProperties, getRecordedSize, getShouldProcessRule, getURICount, hasHttpAuthenticationCredential, innerProcessResult, innerRejectProcess, isRunning, isSuccess, process, setBeanName, setEnabled, setRecoveryCheckpoint, setShouldProcessRule, start, startCheckpoint, stop
protected AtomicLong numberOfFormsProcessed
protected void processGeneralTag(CrawlURI curi, au.id.jericho.lib.html.Element element, au.id.jericho.lib.html.Attributes attributes)
protected boolean processMeta(CrawlURI curi, au.id.jericho.lib.html.Element element)
protected void processScript(CrawlURI curi, au.id.jericho.lib.html.Element element)
protected void processStyle(CrawlURI curi, au.id.jericho.lib.html.Element element)
protected void processForm(CrawlURI curi, au.id.jericho.lib.html.Element element)
protected void extract(CrawlURI curi, CharSequence cs)
Overrides: extract in class ExtractorHTML
Parameters:
curi - CrawlURI we're processing.
cs - Sequence from underlying ReplayCharSequence.

Copyright © 2003–2019 Internet Archive. All rights reserved.