public class DeDupFetchHTTP extends org.archive.crawler.fetcher.FetchHTTP implements org.archive.crawler.frontier.AdaptiveRevisitAttributeConstants
FetchHTTP
processor for downloading HTTP documents.
This extension adds a check after the content header has been downloaded that compares the 'last-modified' and/or
'last-etag' values from the header against information stored in an appropriate index.
DigestIndexer
,
FetchHTTP
,
Serialized Form
Modifier and Type | Field and Description |
---|---|
static String |
ATTR_DECISION_SCHEME |
static String |
ATTR_FILTER_MODE
Is the mime filter a blacklist (do not apply processor to what matches) or whitelist (apply processor only to
what matches).
|
static String |
ATTR_INDEX_LOCATION |
static String |
ATTR_MIME_FILTER
The filter on mime types.
|
static String |
ATTR_USE_SPARSE_RANGE_FILTER
Should we use sparse queries (uses less memory at a cost to performance)?
|
static String[] |
AVAILABLE_DECISION_SCHEMES |
static String[] |
AVAILABLE_FILTER_MODES |
protected boolean |
blacklist |
static String |
DEFAULT_DECISION_SCHEME |
static String |
DEFAULT_FILTER_MODE |
static String |
DEFAULT_INDEX_LOCATION |
static String |
DEFAULT_MIME_FILTER |
static Boolean |
DEFAULT_USE_SPARSE_RANGE_FILTER |
protected org.apache.lucene.search.IndexSearcher |
index |
protected org.apache.lucene.index.IndexReader |
indexReader |
protected String |
mimefilter |
protected long |
processedURLs |
static String |
SCHEME_ETAG |
static String |
SCHEME_TIMESTAMP |
static String |
SCHEME_TIMESTAMP_AND_ETAG |
static String |
SCHEME_TIMESTAMP_OR_ETAG |
protected long |
unchangedURLs |
protected boolean |
useSparseRangeFilter |
ATTR_ACCEPT_HEADERS, ATTR_BDB_COOKIES, ATTR_DEFAULT_ENCODING, ATTR_DIGEST_ALGORITHM, ATTR_DIGEST_CONTENT, ATTR_FETCH_BANDWIDTH_MAX, ATTR_HTTP_BIND_ADDRESS, ATTR_HTTP_PROXY_HOST, ATTR_HTTP_PROXY_PORT, ATTR_IGNORE_COOKIES, ATTR_LOAD_COOKIES, ATTR_MAX_LENGTH_BYTES, ATTR_MIDFETCH_DECIDE_RULES, ATTR_SAVE_COOKIES, ATTR_SEND_CONNECTION_CLOSE, ATTR_SEND_IF_MODIFIED_SINCE, ATTR_SEND_IF_NONE_MATCH, ATTR_SEND_RANGE, ATTR_SEND_REFERER, ATTR_SOTIMEOUT_MS, ATTR_TIMEOUT_SECONDS, ATTR_TRUST, cookieDb, COOKIEDB_NAME, DEFAULT_DIGEST_ALGORITHM, DESC_DIGEST_ALGORITHM, DESC_DIGEST_CONTENT, DIGEST_ALGORITHMS, HTTP_SCHEME, HTTPS_SCHEME, MD5, RANGE, RANGE_PREFIX, REFERER, SHA1
ATTR_DECIDE_RULES, ATTR_ENABLED, attrDecideRules
A_CONTENT_STATE_KEY, A_DISCARD_REVISIT, A_FETCH_OVERDUE, A_LAST_CONTENT_DIGEST, A_LAST_DATESTAMP, A_LAST_ETAG, A_NUMBER_OF_VERSIONS, A_NUMBER_OF_VISITS, A_TIME_OF_NEXT_PROCESSING, A_WAIT_INTERVAL, A_WAIT_REEVALUATED, CONTENT_CHANGED, CONTENT_UNCHANGED, CONTENT_UNKNOWN
A_ANNOTATIONS, A_CONTENT_DIGEST, A_CONTENT_TYPE, A_CREDENTIAL_AVATARS_KEY, A_DELAY_FACTOR, A_DISTANCE_FROM_SEED, A_DNS_FETCH_TIME, A_DNS_SERVER_IP_LABEL, A_ETAG_HEADER, A_FETCH_BEGAN_TIME, A_FETCH_COMPLETED_TIME, A_FETCH_HISTORY, A_FORCE_RETIRE, A_FTP_CONTROL_CONVERSATION, A_FTP_FETCH_STATUS, A_HERITABLE_KEYS, A_HTML_BASE, A_HTTP_BIND_ADDRESS, A_HTTP_PROXY_HOST, A_HTTP_PROXY_PORT, A_HTTP_TRANSACTION, A_LAST_MODIFIED_HEADER, A_LOCALIZED_ERRORS, A_META_ROBOTS, A_MINIMUM_DELAY, A_MIRROR_PATH, A_PREREQUISITE_URI, A_REFERENCE_LENGTH, A_RETRY_DELAY, A_RRECORD_SET_LABEL, A_RUNTIME_EXCEPTION, A_SOURCE_TAG, A_STATUS, HEADER_TRUNC, LENGTH_TRUNC, TIMER_TRUNC, TRUNC_SUFFIX
S_BLOCKED_BY_CUSTOM_PROCESSOR, S_BLOCKED_BY_QUOTA, S_BLOCKED_BY_RUNTIME_LIMIT, S_BLOCKED_BY_USER, S_CONNECT_FAILED, S_CONNECT_LOST, S_DEEMED_CHAFF, S_DEEMED_NOT_FOUND, S_DEFERRED, S_DELETED_BY_USER, S_DNS_SUCCESS, S_DOMAIN_PREREQUISITE_FAILURE, S_DOMAIN_UNRESOLVABLE, S_GETBYNAME_SUCCESS, S_OTHER_PREREQUISITE_FAILURE, S_OUT_OF_SCOPE, S_PREREQUISITE_UNSCHEDULABLE_FAILURE, S_PROCESSING_THREAD_KILLED, S_ROBOTS_PRECLUDED, S_ROBOTS_PREREQUISITE_FAILURE, S_RUNTIME_EXCEPTION, S_SERIOUS_ERROR, S_TIMEOUT, S_TOO_MANY_EMBED_HOPS, S_TOO_MANY_LINK_HOPS, S_TOO_MANY_RETRIES, S_UNATTEMPTED, S_UNFETCHABLE_URI, S_UNQUEUEABLE
Constructor and Description |
---|
DeDupFetchHTTP(String name) |
Modifier and Type | Method and Description |
---|---|
protected boolean |
checkMidfetchAbort(org.archive.crawler.datamodel.CrawlURI curi,
org.archive.httpclient.HttpRecorderMethod method,
org.apache.commons.httpclient.HttpConnection conn) |
protected boolean |
datestampIndicatesNonChange(org.apache.commons.httpclient.HttpMethod method,
org.apache.lucene.document.Document doc)
Checks the 'last-modified' value in the HTTP header and compares it against the timestamp in the supplied Lucene
document.
|
protected boolean |
etagIndicatesNonChange(org.apache.commons.httpclient.HttpMethod method,
org.apache.lucene.document.Document doc)
Checks the 'etag' value in the HTTP header and compares it against the etag in the supplied Lucene document.
|
void |
finalTasks() |
void |
initialTasks() |
protected boolean |
isDuplicate(org.archive.crawler.datamodel.CrawlURI curi)
Compare the header information for 'last-modified' and/or 'etag' against data in the index.
|
protected org.apache.lucene.document.Document |
lookup(org.archive.crawler.datamodel.CrawlURI curi)
Searches the index for the URL of the given CrawlURI.
|
String |
report() |
addResponseContent, cleanupHttp, configureHttp, configureMethod, crawlCheckpoint, crawlEnded, crawlEnding, crawlPaused, crawlPausing, crawlResuming, crawlStarted, doAbort, getAttributeEither, getAuthScheme, getHttp, getMidfetchRule, handle401, innerProcess, listUsedFiles, loadCookies, loadCookies, saveCookies, saveCookies, setConditionalGetHeader, setSizes
checkForInterrupt, getController, getDecideRule, getDefaultNextProcessor, innerRejectProcess, isContentToProcess, isEnabled, isExpectedMimeType, isHttpTransactionContentToProcess, kickUpdate, process, rulesAccept, rulesAccept, setDefaultNextProcessor, spawn
addElementToDefinition, checkValue, earlyInitialize, getAbsoluteName, getAttribute, getAttribute, getAttribute, getAttributeInfo, getAttributeInfo, getAttributeInfoIterator, getAttributes, getDataContainerRecursive, getDataContainerRecursive, getDefaultValue, getDescription, getElementFromDefinition, getLegalValues, getLocalAttribute, getMBeanInfo, getMBeanInfo, getParent, getPreservedFields, getSettingsHandler, getUncheckedAttribute, getValue, globalSettings, invoke, isInitialized, isOverridden, iterator, removeElementFromDefinition, setAsOrder, setAttribute, setAttribute, setAttributes, setDescription, setPreservedFields, toString, unsetAttribute
protected org.apache.lucene.search.IndexSearcher index
protected org.apache.lucene.index.IndexReader indexReader
protected String mimefilter
protected boolean blacklist
protected long processedURLs
protected long unchangedURLs
protected boolean useSparseRangeFilter
public static final String ATTR_DECISION_SCHEME
public static final String SCHEME_TIMESTAMP
public static final String SCHEME_ETAG
public static final String SCHEME_TIMESTAMP_AND_ETAG
public static final String SCHEME_TIMESTAMP_OR_ETAG
public static final String[] AVAILABLE_DECISION_SCHEMES
public static final String DEFAULT_DECISION_SCHEME
public static final String ATTR_INDEX_LOCATION
public static final String DEFAULT_INDEX_LOCATION
public static final String ATTR_MIME_FILTER
public static final String DEFAULT_MIME_FILTER
public static final String ATTR_FILTER_MODE
public static final String[] AVAILABLE_FILTER_MODES
public static final String DEFAULT_FILTER_MODE
public static final String ATTR_USE_SPARSE_RANGE_FILTER
public static final Boolean DEFAULT_USE_SPARSE_RANGE_FILTER
public DeDupFetchHTTP(String name)
protected boolean checkMidfetchAbort(org.archive.crawler.datamodel.CrawlURI curi, org.archive.httpclient.HttpRecorderMethod method, org.apache.commons.httpclient.HttpConnection conn)
checkMidfetchAbort
in class org.archive.crawler.fetcher.FetchHTTP
protected boolean isDuplicate(org.archive.crawler.datamodel.CrawlURI curi)
curi
- The Crawl URI being processed.
protected boolean datestampIndicatesNonChange(org.apache.commons.httpclient.HttpMethod method, org.apache.lucene.document.Document doc)
method
- HTTPMethod that allows access to the relevant HTTP header
doc
- The Lucene document to compare against
protected boolean etagIndicatesNonChange(org.apache.commons.httpclient.HttpMethod method, org.apache.lucene.document.Document doc)
method
- HTTPMethod that allows access to the relevant HTTP header
doc
- The Lucene document to compare against
protected org.apache.lucene.document.Document lookup(org.archive.crawler.datamodel.CrawlURI curi)
curi
- The CrawlURI to search for
public void finalTasks()
finalTasks
in class org.archive.crawler.fetcher.FetchHTTP
public void initialTasks()
initialTasks
in class org.archive.crawler.fetcher.FetchHTTP
Copyright © 2005–2016 The Royal Danish Library, the Danish State and University Library, the National Library of France and the Austrian National Library. All rights reserved.