public class FetchHTTP extends Processor implements org.springframework.context.Lifecycle
Modifier and Type | Field and Description |
---|---|
protected static org.apache.http.config.Lookup<org.apache.http.auth.AuthSchemeProvider> |
AUTH_SCHEME_REGISTRY |
protected AbstractCookieStore |
cookieStore |
protected String |
digestAlgorithm |
static String |
HTTP_BIND_ADDRESS |
static String |
HTTP_SCHEME |
static String |
HTTPS_SCHEME |
protected ServerCache |
serverCache |
protected SSLContext |
sslContext |
protected org.archive.httpclient.ConfigurableX509TrustManager.TrustLevel |
sslTrustLevel |
Constructor and Description |
---|
FetchHTTP() |
Modifier and Type | Method and Description |
---|---|
protected void |
addResponseContent(org.apache.http.HttpResponse response,
CrawlURI curi)
This method populates
curi with response status and
content type. |
protected boolean |
checkMidfetchAbort(CrawlURI curi) |
protected org.apache.http.auth.AuthScheme |
chooseAuthScheme(Map<String,String> challenges,
String challengeHeaderKey) |
protected void |
cleanup(CrawlURI curi,
Exception exception,
String message,
int status)
Cleanup after a failed method execute.
|
protected void |
doAbort(CrawlURI curi,
org.apache.http.client.methods.AbstractExecutionAwareRequest request,
String annotation) |
protected Map<String,String> |
extractChallenges(org.apache.http.HttpResponse response,
CrawlURI curi,
org.apache.http.client.AuthenticationStrategy authStrategy) |
protected void |
failedExecuteCleanup(CrawlURI curi,
Exception exception)
Cleanup after a failed method execute.
|
boolean |
getAcceptCompression() |
List<String> |
getAcceptHeaders() |
protected Object |
getAttributeEither(CrawlURI curi,
String key)
Get a value either from inside the CrawlURI instance, or from
settings (module attributes).
|
protected org.apache.http.ProtocolVersion |
getConfiguredHttpVersion() |
AbstractCookieStore |
getCookieStore() |
protected Set<Credential> |
getCredentials(CrawlURI curi,
Class<?> type) |
CredentialStore |
getCredentialStore() |
Charset |
getDefaultCharset() |
String |
getDefaultEncoding() |
String |
getDigestAlgorithm() |
boolean |
getDigestContent() |
String |
getHttpBindAddress() |
String |
getHttpProxyHost() |
String |
getHttpProxyPassword() |
Integer |
getHttpProxyPort() |
String |
getHttpProxyUser() |
boolean |
getIgnoreCookies() |
int |
getMaxFetchKBSec() |
long |
getMaxLengthBytes() |
boolean |
getSendConnectionClose() |
boolean |
getSendIfModifiedSince() |
boolean |
getSendIfNoneMatch() |
boolean |
getSendRange() |
boolean |
getSendReferer() |
ServerCache |
getServerCache() |
protected static String |
getServerKey(CrawlURI uri) |
DecideRule |
getShouldFetchBodyRule() |
int |
getSoTimeoutMs() |
org.archive.httpclient.ConfigurableX509TrustManager.TrustLevel |
getSslTrustLevel() |
int |
getTimeoutSeconds() |
boolean |
getUseHTTP11() |
UserAgentProvider |
getUserAgentProvider() |
protected void |
handle401(org.apache.http.HttpResponse response,
CrawlURI curi)
Server is looking for basic/digest auth credentials (RFC2617).
|
protected void |
innerProcess(CrawlURI curi)
Actually performs the process.
|
protected boolean |
maybeMidfetchAbort(CrawlURI curi,
org.apache.http.client.methods.AbstractExecutionAwareRequest request) |
protected void |
promoteCredentials(CrawlURI curi)
Promote successful credential to the server.
|
void |
setAcceptCompression(boolean acceptCompression)
Set headers to accept compressed responses.
|
void |
setAcceptHeaders(List<String> headers)
Accept Headers to include in each request.
|
protected void |
setCharacterEncoding(CrawlURI curi,
Recorder rec,
org.apache.http.HttpResponse response)
Set the character encoding based on the result headers or default.
|
void |
setCookieStore(AbstractCookieStore cookieStore) |
void |
setCredentialStore(CredentialStore credentials)
Used to store credentials.
|
void |
setDefaultEncoding(String encoding)
The character encoding to use for files that do not have one specified in
the HTTP response headers.
|
void |
setDigestAlgorithm(String digestAlgorithm)
Which algorithm (for example MD5 or SHA-1) to use to perform an
on-the-fly digest hash of retrieved content-bodies.
|
void |
setDigestContent(boolean digest)
Whether or not to perform an on-the-fly digest hash of retrieved
content-bodies.
|
void |
setHttpBindAddress(String address)
Local IP address or hostname to use when making connections (binding
sockets).
|
void |
setHttpProxyHost(String host)
Proxy host IP (set only if needed).
|
void |
setHttpProxyPassword(String password)
Proxy password (set only if needed).
|
void |
setHttpProxyPort(Integer port)
Proxy port (set only if needed).
|
void |
setHttpProxyUser(String user)
Proxy user (set only if needed).
|
void |
setIgnoreCookies(boolean ignoreCookies)
Disable cookie handling.
|
void |
setMaxFetchKBSec(int rate)
The maximum KB/sec to use when fetching data from a server.
|
void |
setMaxLengthBytes(long timeout)
Maximum length in bytes to fetch.
|
protected void |
setOtherCodings(CrawlURI uri,
Recorder rec,
org.apache.http.HttpResponse response)
Set the transfer, content encodings based on headers (if necessary).
|
void |
setSendConnectionClose(boolean sendClose)
Send 'Connection: close' header with every request.
|
void |
setSendIfModifiedSince(boolean sendIfModifiedSince)
Send 'If-Modified-Since' header, if previous 'Last-Modified' fetch
history information is available in URI history.
|
void |
setSendIfNoneMatch(boolean sendIfNoneMatch)
Send 'If-None-Match' header, if previous 'Etag' fetch history information
is available in URI history.
|
void |
setSendRange(boolean sendRange) |
void |
setSendReferer(boolean sendReferer)
Send 'Referer' header with every request.
|
void |
setServerCache(ServerCache serverCache)
Used to do DNS lookups.
|
void |
setShouldFetchBodyRule(DecideRule rule)
DecideRules applied after receipt of HTTP response headers but before we
start to download the body.
|
protected void |
setSizes(CrawlURI curi,
Recorder rec)
Update CrawlURI internal sizes based on current transaction (and
in the case of 304s, history)
|
void |
setSoTimeoutMs(int timeout)
If the socket is unresponsive for this number of milliseconds, give up.
|
void |
setSslTrustLevel(org.archive.httpclient.ConfigurableX509TrustManager.TrustLevel sslTrustLevel)
SSL certificate trust level.
|
void |
setTimeoutSeconds(int timeout)
If the fetch is not completed in this number of seconds, give up (and
retry later).
|
void |
setUseHTTP11(boolean useHTTP11)
Use HTTP/1.1.
|
void |
setUserAgentProvider(UserAgentProvider provider) |
protected boolean |
shouldProcess(CrawlURI curi)
Can this processor fetch the given CrawlURI.
|
protected SSLContext |
sslContext() |
void |
start() |
void |
stop() |
doCheckpoint, finishCheckpoint, flattenVia, fromCheckpointJson, getBeanName, getEnabled, getKeyedProperties, getRecordedSize, getShouldProcessRule, getURICount, hasHttpAuthenticationCredential, innerProcessResult, innerRejectProcess, isRunning, isSuccess, process, report, setBeanName, setEnabled, setRecoveryCheckpoint, setShouldProcessRule, startCheckpoint, toCheckpointJson
public static final String HTTP_SCHEME
public static final String HTTPS_SCHEME
protected static final org.apache.http.config.Lookup<org.apache.http.auth.AuthSchemeProvider> AUTH_SCHEME_REGISTRY
protected ServerCache serverCache
protected String digestAlgorithm
protected AbstractCookieStore cookieStore
public static final String HTTP_BIND_ADDRESS
protected org.archive.httpclient.ConfigurableX509TrustManager.TrustLevel sslTrustLevel
protected transient SSLContext sslContext
public ServerCache getServerCache()
public void setServerCache(ServerCache serverCache)
public boolean getDigestContent()
public void setDigestContent(boolean digest)
public String getDigestAlgorithm()
public void setDigestAlgorithm(String digestAlgorithm)
public UserAgentProvider getUserAgentProvider()
public void setUserAgentProvider(UserAgentProvider provider)
public boolean getSendConnectionClose()
public void setSendConnectionClose(boolean sendClose)
public String getDefaultEncoding()
public void setDefaultEncoding(String encoding)
public Charset getDefaultCharset()
public boolean getUseHTTP11()
public void setUseHTTP11(boolean useHTTP11)
protected org.apache.http.ProtocolVersion getConfiguredHttpVersion()
public boolean getIgnoreCookies()
public void setIgnoreCookies(boolean ignoreCookies)
public boolean getSendReferer()
public void setSendReferer(boolean sendReferer)
The 'Referer' header contains the location the crawler came from: the page in which the current URI was discovered. The 'Referer' is usually logged on the remote server and can be of assistance to webmasters trying to figure out how a crawler got to a particular area of a site.
public boolean getAcceptCompression()
public void setAcceptCompression(boolean acceptCompression)
public void setAcceptHeaders(List<String> headers)
public void setCookieStore(AbstractCookieStore cookieStore)
public AbstractCookieStore getCookieStore()
public CredentialStore getCredentialStore()
public void setCredentialStore(CredentialStore credentials)
public String getHttpBindAddress()
public void setHttpBindAddress(String address)
public String getHttpProxyHost()
public void setHttpProxyHost(String host)
public Integer getHttpProxyPort()
public void setHttpProxyPort(Integer port)
public String getHttpProxyUser()
public void setHttpProxyUser(String user)
public String getHttpProxyPassword()
public void setHttpProxyPassword(String password)
public int getMaxFetchKBSec()
public void setMaxFetchKBSec(int rate)
public int getTimeoutSeconds()
public void setTimeoutSeconds(int timeout)
public int getSoTimeoutMs()
public void setSoTimeoutMs(int timeout)
Make sure this value is less than getTimeoutSeconds() for optimal configuration: this ensures at least one retry read.
public long getMaxLengthBytes()
public void setMaxLengthBytes(long timeout)
public boolean getSendRange()
public void setSendRange(boolean sendRange)
public boolean getSendIfModifiedSince()
public void setSendIfModifiedSince(boolean sendIfModifiedSince)
public boolean getSendIfNoneMatch()
public void setSendIfNoneMatch(boolean sendIfNoneMatch)
public DecideRule getShouldFetchBodyRule()
public void setShouldFetchBodyRule(DecideRule rule)
public org.archive.httpclient.ConfigurableX509TrustManager.TrustLevel getSslTrustLevel()
public void setSslTrustLevel(org.archive.httpclient.ConfigurableX509TrustManager.TrustLevel sslTrustLevel)
protected SSLContext sslContext()
protected boolean shouldProcess(CrawlURI curi)
shouldProcess
in class Processor
curi -
protected void setOtherCodings(CrawlURI uri, Recorder rec, org.apache.http.HttpResponse response)
rec - Recorder for this request.
response - Method used for the request.
protected void setCharacterEncoding(CrawlURI curi, Recorder rec, org.apache.http.HttpResponse response)
rec - Recorder for this request.
response - Method used for the request.
protected boolean checkMidfetchAbort(CrawlURI curi)
protected void doAbort(CrawlURI curi, org.apache.http.client.methods.AbstractExecutionAwareRequest request, String annotation)
protected boolean maybeMidfetchAbort(CrawlURI curi, org.apache.http.client.methods.AbstractExecutionAwareRequest request)
protected void innerProcess(CrawlURI curi) throws InterruptedException
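Description copied from class: Processor
Actually performs the process. By the time this method is invoked, the given URI is known to pass the Processor.getEnabled(), the Processor.getShouldProcessRule() and the Processor.shouldProcess(CrawlURI) tests.
Overrides: innerProcess in class Processor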
curi - the URI to process
InterruptedException - if the thread is interrupted
protected void promoteCredentials(CrawlURI curi)
curi - CrawlURI whose credentials we are to promote.
protected void handle401(org.apache.http.HttpResponse response, CrawlURI curi)
response - 401 HTTP response
curi - CrawlURI that got a 401.
protected Map<String,String> extractChallenges(org.apache.http.HttpResponse response, CrawlURI curi, org.apache.http.client.AuthenticationStrategy authStrategy)
response -
curi - CrawlURI that got a 401 or 407.
authStrategy - Either ProxyAuthenticationStrategy or TargetAuthenticationStrategy. Determines whether the Proxy-Authenticate or WWW-Authenticate header is consulted.
protected org.apache.http.auth.AuthScheme chooseAuthScheme(Map<String,String> challenges, String challengeHeaderKey)
protected Set<Credential> getCredentials(CrawlURI curi, Class<?> type)
curi - CrawlURI that got a 401.
type - Class of credential to get from curi.
protected Object getAttributeEither(CrawlURI curi, String key)
curi - CrawlURI to consult
key - key to look up
protected void setSizes(CrawlURI curi, Recorder rec)
curi - CrawlURI
rec - HttpRecorder
protected void addResponseContent(org.apache.http.HttpResponse response, CrawlURI curi)
This method populates curi with response status and content type.
curi - CrawlURI to populate.
response - Method to get response status and headers from.
protected void failedExecuteCleanup(CrawlURI curi, Exception exception)
curi - CrawlURI we failed on.
exception - Exception we failed with.
protected void cleanup(CrawlURI curi, Exception exception, String message, int status)
curi - CrawlURI we failed on.
exception - Exception we failed with.
message - Message to log with failure. FIXME: Seems ignored
status - Status to set on the fetch.
public void start()
public void stop()
Copyright © 2003–2019 Internet Archive. All rights reserved.