public class WARCWriterProcessor extends org.archive.crawler.framework.WriterPoolProcessor implements org.archive.crawler.datamodel.CoreAttributeConstants, org.archive.crawler.event.CrawlStatusListener, org.archive.io.WriterPoolSettings, org.archive.crawler.datamodel.FetchStatusCodes, org.archive.io.warc.WARCConstants
Based on the WARCWriterProcessor in package org.archive.crawler.writer, with modifications to the WARC-info record.
Modifier and Type | Field and Description |
---|---|
static String |
ATTR_METADATA_ITEMS
Key for metadata-items to include in the warcinfo.
|
static String |
ATTR_WRITE_METADATA
Key for whether to write 'metadata' type records where possible
|
static String |
ATTR_WRITE_REQUESTS
Key for whether to write 'request' type records where possible
|
static String |
ATTR_WRITE_REVISIT_FOR_IDENTICAL_DIGESTS
Key for whether to write 'revisit' type records when consecutive fetches yield identical digests
|
static String |
ATTR_WRITE_REVISIT_FOR_NOT_MODIFIED
Key for whether to write 'revisit' type records for server "304 not modified" responses
|
ANNOTATION_UNWRITTEN, ATTR_COMPRESS, ATTR_MAX_BYTES_WRITTEN, ATTR_MAX_SIZE_BYTES, ATTR_PATH, ATTR_POOL_MAX_ACTIVE, ATTR_POOL_MAX_WAIT, ATTR_PREFIX, ATTR_SKIP_IDENTICAL_DIGESTS, ATTR_SUFFIX, DEFAULT_COMPRESS
ATTR_DECIDE_RULES, ATTR_ENABLED, attrDecideRules
A_ANNOTATIONS, A_CONTENT_DIGEST, A_CONTENT_TYPE, A_CREDENTIAL_AVATARS_KEY, A_DELAY_FACTOR, A_DISTANCE_FROM_SEED, A_DNS_FETCH_TIME, A_DNS_SERVER_IP_LABEL, A_ETAG_HEADER, A_FETCH_BEGAN_TIME, A_FETCH_COMPLETED_TIME, A_FETCH_HISTORY, A_FORCE_RETIRE, A_FTP_CONTROL_CONVERSATION, A_FTP_FETCH_STATUS, A_HERITABLE_KEYS, A_HTML_BASE, A_HTTP_BIND_ADDRESS, A_HTTP_PROXY_HOST, A_HTTP_PROXY_PORT, A_HTTP_TRANSACTION, A_LAST_MODIFIED_HEADER, A_LOCALIZED_ERRORS, A_META_ROBOTS, A_MINIMUM_DELAY, A_MIRROR_PATH, A_PREREQUISITE_URI, A_REFERENCE_LENGTH, A_RETRY_DELAY, A_RRECORD_SET_LABEL, A_RUNTIME_EXCEPTION, A_SOURCE_TAG, A_STATUS, HEADER_TRUNC, LENGTH_TRUNC, TIMER_TRUNC, TRUNC_SUFFIX
S_BLOCKED_BY_CUSTOM_PROCESSOR, S_BLOCKED_BY_QUOTA, S_BLOCKED_BY_RUNTIME_LIMIT, S_BLOCKED_BY_USER, S_CONNECT_FAILED, S_CONNECT_LOST, S_DEEMED_CHAFF, S_DEEMED_NOT_FOUND, S_DEFERRED, S_DELETED_BY_USER, S_DNS_SUCCESS, S_DOMAIN_PREREQUISITE_FAILURE, S_DOMAIN_UNRESOLVABLE, S_GETBYNAME_SUCCESS, S_OTHER_PREREQUISITE_FAILURE, S_OUT_OF_SCOPE, S_PREREQUISITE_UNSCHEDULABLE_FAILURE, S_PROCESSING_THREAD_KILLED, S_ROBOTS_PRECLUDED, S_ROBOTS_PREREQUISITE_FAILURE, S_RUNTIME_EXCEPTION, S_SERIOUS_ERROR, S_TIMEOUT, S_TOO_MANY_EMBED_HOPS, S_TOO_MANY_LINK_HOPS, S_TOO_MANY_RETRIES, S_UNATTEMPTED, S_UNFETCHABLE_URI, S_UNQUEUEABLE
COLON_SPACE, COMPRESSED_WARC_FILE_EXTENSION, CONTENT_DESCRIPTION, CONTENT_LENGTH, CONTENT_TYPE, CONTINUATION, CONTINUATION_INDEX, CONVERSION, CONVERSION_INDEX, DEFAULT_ENCODING, DEFAULT_MAX_WARC_FILE_SIZE, DOT_COMPRESSED_FILE_EXTENSION, DOT_COMPRESSED_WARC_FILE_EXTENSION, DOT_WARC_FILE_EXTENSION, FTP_CONTROL_CONVERSATION_MIMETYPE, HEADER_FIELD_KEYS, HEADER_FIELD_SEPARATOR, HEADER_KEY_BLOCK_DIGEST, HEADER_KEY_CONCURRENT_TO, HEADER_KEY_DATE, HEADER_KEY_ETAG, HEADER_KEY_FILENAME, HEADER_KEY_ID, HEADER_KEY_IP, HEADER_KEY_LAST_MODIFIED, HEADER_KEY_PAYLOAD_DIGEST, HEADER_KEY_PROFILE, HEADER_KEY_TRUNCATED, HEADER_KEY_TYPE, HEADER_KEY_URI, HEADER_LINE_ENCODING, HTTP_REQUEST_MIMETYPE, HTTP_RESPONSE_MIMETYPE, MAX_LINE_LENGTH, MAX_WARC_HEADER_LINE_LENGTH, METADATA, METADATA_INDEX, NAMED_FIELD_CHECKSUM_LABEL, NAMED_FIELD_DESCRIPTION, NAMED_FIELD_FILEDESC, NAMED_FIELD_IP_LABEL, NAMED_FIELD_RELATED_LABEL, NAMED_FIELD_TRUNCATED, NAMED_FIELD_TRUNCATED_VALUE_HEAD, NAMED_FIELD_TRUNCATED_VALUE_LENGTH, NAMED_FIELD_TRUNCATED_VALUE_TIME, NAMED_FIELD_TRUNCATED_VALUE_UNSPECIFIED, NAMED_FIELD_WARCFILENAME, PLACEHOLDER_RECORD_LENGTH_STRING, PROFILE_REVISIT_IDENTICAL_DIGEST, PROFILE_REVISIT_NOT_MODIFIED, REQUEST, REQUEST_INDEX, RESOURCE, RESOURCE_INDEX, RESPONSE, RESPONSE_INDEX, REVISIT, REVISIT_INDEX, TRUNCATED_VALUE_UNSPECIFIED, TYPE, TYPES, TYPES_LIST, WARC_010_ID, WARC_010_MAGIC, WARC_FILE_EXTENSION, WARC_HEADER_ENCODING, WARC_ID, WARC_MAGIC, WARC_VERSION, WARCINFO, WARCINFO_INDEX, WSP
ABSOLUTE_OFFSET_KEY, CDX, CDX_FILE, CDX_LINE_BUFFER_SIZE, COMPRESSED_FILE_EXTENSION, CRLF, DATE_FIELD_KEY, DEFAULT_DIGEST_METHOD, DUMP, GZIP_DUMP, HEADER, INVALID_SUFFIX, LENGTH_FIELD_KEY, MIMETYPE_FIELD_KEY, NOHEAD, OCCUPIED_SUFFIX, READER_IDENTIFIER_FIELD_KEY, RECORD_IDENTIFIER_FIELD_KEY, SINGLE_SPACE, TYPE_FIELD_KEY, URL_FIELD_KEY, VERSION_FIELD_KEY
Constructor and Description |
---|
WARCWriterProcessor(String name) |
Modifier and Type | Method and Description |
---|---|
protected void |
addIfNotBlank(org.archive.util.anvl.ANVLRecord record,
String label,
String value) |
long |
getDefaultMaxFileSize() |
protected String[] |
getDefaultPath() |
protected String |
getFirstrecordBody(File orderFile)
Return relevant values as header-like fields (here ANVLRecord, but spec-defined "application/warc-fields" type
when written).
|
protected String |
getFirstrecordStylesheet() |
protected URI |
getRecordID() |
protected void |
innerProcess(org.archive.crawler.datamodel.CrawlURI curi) |
protected URI |
qualifyRecordID(URI base,
String key,
String value) |
protected void |
saveHeader(String origName,
org.apache.commons.httpclient.HttpMethodBase method,
org.archive.util.anvl.ANVLRecord headers,
String newName)
Save a header from the given HTTP operation into the provided headers under a new name.
|
protected void |
setupPool(AtomicInteger serialNo) |
protected void |
write(String lowerCaseScheme,
org.archive.crawler.datamodel.CrawlURI curi) |
protected URI |
writeFtpControlConversation(org.archive.io.warc.WARCWriter w,
String timestamp,
URI baseid,
org.archive.crawler.datamodel.CrawlURI curi,
org.archive.util.anvl.ANVLRecord headers,
String controlConversation) |
protected URI |
writeMetadata(org.archive.io.warc.WARCWriter w,
String timestamp,
URI baseid,
org.archive.crawler.datamodel.CrawlURI curi,
org.archive.util.anvl.ANVLRecord namedFields) |
protected URI |
writeRequest(org.archive.io.warc.WARCWriter w,
String timestamp,
String mimetype,
URI baseid,
org.archive.crawler.datamodel.CrawlURI curi,
org.archive.util.anvl.ANVLRecord namedFields) |
protected URI |
writeResource(org.archive.io.warc.WARCWriter w,
String timestamp,
String mimetype,
URI baseid,
org.archive.crawler.datamodel.CrawlURI curi,
org.archive.util.anvl.ANVLRecord namedFields) |
protected URI |
writeResponse(org.archive.io.warc.WARCWriter w,
String timestamp,
String mimetype,
URI baseid,
org.archive.crawler.datamodel.CrawlURI curi,
org.archive.util.anvl.ANVLRecord namedFields) |
protected URI |
writeRevisitDigest(org.archive.io.warc.WARCWriter w,
String timestamp,
String mimetype,
URI baseid,
org.archive.crawler.datamodel.CrawlURI curi,
org.archive.util.anvl.ANVLRecord namedFields) |
protected URI |
writeRevisitNotModified(org.archive.io.warc.WARCWriter w,
String timestamp,
URI baseid,
org.archive.crawler.datamodel.CrawlURI curi,
org.archive.util.anvl.ANVLRecord namedFields) |
cacheMetadata, checkBytesWritten, checkpointRecover, crawlCheckpoint, crawlEnded, crawlEnding, crawlPaused, crawlPausing, crawlResuming, crawlStarted, getAttributeUnchecked, getCheckpointStateFile, getHostAddress, getMaxSize, getMaxToWrite, getMetadata, getOutputDirs, getPool, getPoolMaximumActive, getPoolMaximumWait, getPrefix, getSerialNo, getSuffix, getTotalBytesWritten, initialTasks, isCompressed, loadCheckpointSerialNumber, saveCheckpointSerialNumber, setPool, setTotalBytesWritten, shouldWrite
checkForInterrupt, finalTasks, getController, getDecideRule, getDefaultNextProcessor, innerRejectProcess, isContentToProcess, isEnabled, isExpectedMimeType, isHttpTransactionContentToProcess, kickUpdate, process, report, rulesAccept, rulesAccept, setDefaultNextProcessor, spawn
addElementToDefinition, checkValue, earlyInitialize, getAbsoluteName, getAttribute, getAttribute, getAttribute, getAttributeInfo, getAttributeInfo, getAttributeInfoIterator, getAttributes, getDataContainerRecursive, getDataContainerRecursive, getDefaultValue, getDescription, getElementFromDefinition, getLegalValues, getLocalAttribute, getMBeanInfo, getMBeanInfo, getParent, getPreservedFields, getSettingsHandler, getUncheckedAttribute, getValue, globalSettings, invoke, isInitialized, isOverridden, iterator, removeElementFromDefinition, setAsOrder, setAttribute, setAttribute, setAttributes, setDescription, setPreservedFields, toString, unsetAttribute
addConstraint, equals, getConstraints, getLegalValueType, isExpertSetting, isOverrideable, isTransient, setExpertSetting, setLegalValueType, setOverrideable, setTransient
clone, finalize, getClass, notify, notifyAll, wait, wait, wait
public static final String ATTR_WRITE_REQUESTS
public static final String ATTR_WRITE_METADATA
public static final String ATTR_WRITE_REVISIT_FOR_IDENTICAL_DIGESTS
public static final String ATTR_WRITE_REVISIT_FOR_NOT_MODIFIED
public static final String ATTR_METADATA_ITEMS
public WARCWriterProcessor(String name)
name
- Name of this writer.
public long getDefaultMaxFileSize()
getDefaultMaxFileSize
in class org.archive.crawler.framework.WriterPoolProcessor
protected String[] getDefaultPath()
getDefaultPath
in class org.archive.crawler.framework.WriterPoolProcessor
protected void setupPool(AtomicInteger serialNo)
setupPool
in class org.archive.crawler.framework.WriterPoolProcessor
protected void innerProcess(org.archive.crawler.datamodel.CrawlURI curi)
innerProcess
in class org.archive.crawler.framework.WriterPoolProcessor
AttributeNotFoundException
ReflectionException
MBeanException
protected void write(String lowerCaseScheme, org.archive.crawler.datamodel.CrawlURI curi) throws IOException
IOException
protected URI writeFtpControlConversation(org.archive.io.warc.WARCWriter w, String timestamp, URI baseid, org.archive.crawler.datamodel.CrawlURI curi, org.archive.util.anvl.ANVLRecord headers, String controlConversation) throws IOException
IOException
protected URI writeRequest(org.archive.io.warc.WARCWriter w, String timestamp, String mimetype, URI baseid, org.archive.crawler.datamodel.CrawlURI curi, org.archive.util.anvl.ANVLRecord namedFields) throws IOException
IOException
protected URI writeResponse(org.archive.io.warc.WARCWriter w, String timestamp, String mimetype, URI baseid, org.archive.crawler.datamodel.CrawlURI curi, org.archive.util.anvl.ANVLRecord namedFields) throws IOException
IOException
protected URI writeResource(org.archive.io.warc.WARCWriter w, String timestamp, String mimetype, URI baseid, org.archive.crawler.datamodel.CrawlURI curi, org.archive.util.anvl.ANVLRecord namedFields) throws IOException
IOException
protected URI writeRevisitDigest(org.archive.io.warc.WARCWriter w, String timestamp, String mimetype, URI baseid, org.archive.crawler.datamodel.CrawlURI curi, org.archive.util.anvl.ANVLRecord namedFields) throws IOException
IOException
protected URI writeRevisitNotModified(org.archive.io.warc.WARCWriter w, String timestamp, URI baseid, org.archive.crawler.datamodel.CrawlURI curi, org.archive.util.anvl.ANVLRecord namedFields) throws IOException
IOException
protected void saveHeader(String origName, org.apache.commons.httpclient.HttpMethodBase method, org.archive.util.anvl.ANVLRecord headers, String newName)
origName
- header name to get if present
method
- HTTP operation containing headers
protected URI writeMetadata(org.archive.io.warc.WARCWriter w, String timestamp, URI baseid, org.archive.crawler.datamodel.CrawlURI curi, org.archive.util.anvl.ANVLRecord namedFields) throws IOException
IOException
protected URI getRecordID() throws IOException
IOException
protected URI qualifyRecordID(URI base, String key, String value) throws IOException
IOException
protected String getFirstrecordStylesheet()
getFirstrecordStylesheet
in class org.archive.crawler.framework.WriterPoolProcessor
protected String getFirstrecordBody(File orderFile)
getFirstrecordBody
in class org.archive.crawler.framework.WriterPoolProcessor
WriterPoolProcessor.getFirstrecordBody(java.io.File)
protected void addIfNotBlank(org.archive.util.anvl.ANVLRecord record, String label, String value)
Copyright © 2005–2015 The Royal Danish Library, the Danish State and University Library, the National Library of France and the Austrian National Library. All rights reserved.