public abstract class WriterPoolProcessor extends Processor implements org.springframework.context.Lifecycle, Checkpointable, WriterPoolSettings
WriterPoolMember
instance.
Modifier and Type | Field and Description |
---|---|
protected static String |
ANNOTATION_UNWRITTEN
CrawlURI annotation indicating no record was written.
|
protected boolean |
compress
Whether to gzip-compress files when writing to disk;
by default true, meaning do-compress.
|
protected ConfigPath |
directory |
protected boolean |
frequentFlushes
Whether to flush to underlying file frequently (at least after each
record), or not.
|
protected long |
maxFileSizeBytes
Max size of each file.
|
protected long |
maxTotalBytesToWrite
Total file bytes to write to disk.
|
protected int |
maxWaitForIdleMs
Maximum time to wait on idle writer before (possibly) creating an
additional instance.
|
protected int |
poolMaxActive
Maximum active files in pool.
|
protected String |
prefix
File prefix.
|
protected ServerCache |
serverCache |
protected boolean |
skipIdenticalDigests
Whether to skip the writing of a record when URI history information is
available and indicates the prior fetch had an identical content digest.
|
protected boolean |
startNewFilesOnCheckpoint |
protected List<ConfigPath> |
storePaths
Where to save files.
|
protected String |
template
Template from which a filename is interpolated.
|
protected int |
writeBufferSize
Size of buffer in front of disk-writing.
|
Constructor and Description |
---|
WriterPoolProcessor() |
Modifier and Type | Method and Description |
---|---|
List<File> |
calcOutputDirs() |
protected ProcessResult |
checkBytesWritten() |
protected void |
copyForwardWriteTagIfDupe(CrawlURI curi)
If this fetch is identical to the last written (archived) fetch, then
copy forward the writeTag.
|
void |
doCheckpoint(Checkpoint checkpointInProgress) |
protected void |
fromCheckpointJson(org.json.JSONObject json)
Restore internal state from JSONObject stored at earlier
checkpoint-time.
|
boolean |
getCompress() |
protected abstract long |
getDefaultMaxFileSize() |
protected abstract List<ConfigPath> |
getDefaultStorePaths() |
ConfigPath |
getDirectory() |
boolean |
getFrequentFlushes() |
protected String |
getHostAddress(CrawlURI curi)
Return IP address of given URI suitable for recording (as in a
classic ARC 5-field header line).
|
long |
getMaxFileSizeBytes() |
long |
getMaxTotalBytesToWrite() |
int |
getMaxWaitForIdleMs() |
abstract List<String> |
getMetadata() |
CrawlMetadata |
getMetadataProvider() |
protected WriterPool |
getPool() |
int |
getPoolMaxActive() |
String |
getPrefix() |
protected AtomicInteger |
getSerialNo() |
ServerCache |
getServerCache() |
boolean |
getSkipIdenticalDigests() |
boolean |
getStartNewFilesOnCheckpoint() |
List<ConfigPath> |
getStorePaths() |
String |
getTemplate() |
protected long |
getTotalBytesWritten() |
int |
getWriteBufferSize() |
protected void |
innerProcess(CrawlURI puri)
Actually performs the process.
|
protected abstract ProcessResult |
innerProcessResult(CrawlURI uri) |
protected void |
innerRejectProcess(CrawlURI curi)
Invoked after a URI has been rejected.
|
void |
setCompress(boolean compress) |
void |
setDirectory(ConfigPath directory) |
void |
setFrequentFlushes(boolean frequentFlushes) |
void |
setMaxFileSizeBytes(long maxFileSizeBytes) |
void |
setMaxTotalBytesToWrite(long maxTotalBytesToWrite) |
void |
setMaxWaitForIdleMs(int maxWaitForIdle) |
void |
setMetadataProvider(CrawlMetadata provider) |
protected void |
setPool(WriterPool pool) |
void |
setPoolMaxActive(int poolMaxActive) |
void |
setPrefix(String prefix) |
void |
setServerCache(ServerCache serverCache) |
void |
setSkipIdenticalDigests(boolean skipIdenticalDigests) |
void |
setStartNewFilesOnCheckpoint(boolean startNewFilesOnCheckpoint)
Whether to close output files and start new ones on checkpoint.
|
void |
setStorePaths(List<ConfigPath> paths) |
void |
setTemplate(String template) |
protected void |
setTotalBytesWritten(long totalBytesWritten) |
protected abstract void |
setupPool(AtomicInteger serial)
Set up pool of files.
|
void |
setWriteBufferSize(int writeBufferSize) |
protected boolean |
shouldProcess(CrawlURI curi)
Determines whether the given uri should be processed by this
processor.
|
protected boolean |
shouldWrite(CrawlURI curi)
Whether the given CrawlURI should be written to archive files.
|
void |
start() |
void |
stop() |
protected org.json.JSONObject |
toCheckpointJson()
Return a JSONObject of current state that can be consulted
on recovery to restore necessary values.
|
finishCheckpoint, flattenVia, getBeanName, getEnabled, getKeyedProperties, getRecordedSize, getShouldProcessRule, getURICount, hasHttpAuthenticationCredential, isRunning, isSuccess, process, report, setBeanName, setEnabled, setRecoveryCheckpoint, setShouldProcessRule, startCheckpoint
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
finishCheckpoint, setRecoveryCheckpoint, startCheckpoint
protected boolean compress
protected String prefix
protected String template
protected long maxFileSizeBytes
protected int poolMaxActive
protected int maxWaitForIdleMs
protected boolean skipIdenticalDigests
protected static final String ANNOTATION_UNWRITTEN
protected long maxTotalBytesToWrite
protected boolean frequentFlushes
protected int writeBufferSize
protected transient ServerCache serverCache
protected ConfigPath directory
protected boolean startNewFilesOnCheckpoint
protected List<ConfigPath> storePaths
public boolean getCompress()
getCompress
in interface WriterPoolSettings
public void setCompress(boolean compress)
public String getPrefix()
getPrefix
in interface WriterPoolSettings
public void setPrefix(String prefix)
public String getTemplate()
getTemplate
in interface WriterPoolSettings
public void setTemplate(String template)
protected abstract long getDefaultMaxFileSize()
public long getMaxFileSizeBytes()
getMaxFileSizeBytes
in interface WriterPoolSettings
public void setMaxFileSizeBytes(long maxFileSizeBytes)
public int getPoolMaxActive()
public void setPoolMaxActive(int poolMaxActive)
public int getMaxWaitForIdleMs()
public void setMaxWaitForIdleMs(int maxWaitForIdle)
public boolean getSkipIdenticalDigests()
public void setSkipIdenticalDigests(boolean skipIdenticalDigests)
public long getMaxTotalBytesToWrite()
public void setMaxTotalBytesToWrite(long maxTotalBytesToWrite)
public boolean getFrequentFlushes()
getFrequentFlushes
in interface WriterPoolSettings
public void setFrequentFlushes(boolean frequentFlushes)
public int getWriteBufferSize()
getWriteBufferSize
in interface WriterPoolSettings
public void setWriteBufferSize(int writeBufferSize)
public CrawlMetadata getMetadataProvider()
public void setMetadataProvider(CrawlMetadata provider)
public ServerCache getServerCache()
public void setServerCache(ServerCache serverCache)
public ConfigPath getDirectory()
public void setDirectory(ConfigPath directory)
public boolean getStartNewFilesOnCheckpoint()
public void setStartNewFilesOnCheckpoint(boolean startNewFilesOnCheckpoint)
protected abstract List<ConfigPath> getDefaultStorePaths()
public List<ConfigPath> getStorePaths()
public void setStorePaths(List<ConfigPath> paths)
public void start()
public void stop()
protected AtomicInteger getSerialNo()
protected abstract void setupPool(AtomicInteger serial)
protected ProcessResult checkBytesWritten()
protected boolean shouldWrite(CrawlURI curi)
curi
- CrawlURI
protected String getHostAddress(CrawlURI curi)
curi
- CrawlURI
public void doCheckpoint(Checkpoint checkpointInProgress) throws IOException
doCheckpoint
in interface Checkpointable
doCheckpoint
in class Processor
IOException
protected org.json.JSONObject toCheckpointJson() throws org.json.JSONException
Processor
toCheckpointJson
in class Processor
org.json.JSONException
protected void fromCheckpointJson(org.json.JSONObject json) throws org.json.JSONException
Processor
fromCheckpointJson
in class Processor
json
- JSONObject
org.json.JSONException
protected WriterPool getPool()
protected void setPool(WriterPool pool)
protected long getTotalBytesWritten()
protected void setTotalBytesWritten(long totalBytesWritten)
public abstract List<String> getMetadata()
getMetadata
in interface WriterPoolSettings
public List<File> calcOutputDirs()
calcOutputDirs
in interface WriterPoolSettings
protected void innerProcess(CrawlURI puri)
Processor
Processor.getEnabled(), the Processor.getShouldProcessRule() and the Processor.shouldProcess(CrawlURI) tests.
innerProcess
in class Processor
puri
- the URI to process
protected abstract ProcessResult innerProcessResult(CrawlURI uri)
innerProcessResult
in class Processor
protected boolean shouldProcess(CrawlURI curi)
Processor
shouldProcess
in class Processor
curi
- the URI to test
protected void copyForwardWriteTagIfDupe(CrawlURI curi)
protected void innerRejectProcess(CrawlURI curi) throws InterruptedException
Processor
innerRejectProcess
in class Processor
curi
- the URI that was rejected
InterruptedException
- if the thread is interrupted
Copyright © 2003–2019 Internet Archive. All rights reserved.