public class DeDuplicator extends org.archive.crawler.framework.Processor implements org.archive.crawler.frontier.AdaptiveRevisitAttributeConstants
Will abort the processing (skip to post processor chain) of CrawlURIs that are deemed duplicates.
Duplicate detection can only be performed after the fetch processors have run.
Modifier and Type | Field and Description |
---|---|
static String |
ATTR_ANALYSIS_MODE
Set analysis mode.
|
static String |
ATTR_CHANGE_CONTENT_SIZE
Should the content size information be set to zero when a duplicate is found?
|
static String |
ATTR_EQUIVALENT
If an exact match is not made, should the processor try to find an equivalent match?
|
static String |
ATTR_FILTER_MODE
Is the mime filter a blacklist (do not apply processor to what matches) or whitelist (apply processor only to
what matches)?
|
static String |
ATTR_INDEX_LOCATION
Location of Lucene Index to use for lookups
|
static String |
ATTR_LOG_LEVEL
What to write to a log file
|
static String |
ATTR_MATCHING_METHOD
The matching method in use (by url or content digest)
|
static String |
ATTR_MIME_FILTER
The filter on mime types.
|
static String |
ATTR_ORIGIN
Origin of duplicate URLs
|
static String |
ATTR_ORIGIN_HANDLING
How should 'origin' be handled?
|
static String |
ATTR_SKIP_WRITE
Should the writer processor chain be skipped?
|
static String |
ATTR_STATS_PER_HOST
Should statistics be tracked per host?
|
static String |
ATTR_USE_SPARSE_RANGE_FILTER
Should we use sparse queries (uses less memory at a cost to performance)?
|
static String[] |
AVAILABLE_ANALYSIS_MODES |
static String[] |
AVAILABLE_FILTER_MODES |
static String[] |
AVAILABLE_LOG_LEVELS |
static String[] |
AVAILABLE_MATCHING_METHODS |
static String[] |
AVAILABLE_ORIGIN_HANDLING |
protected boolean |
blacklist |
protected boolean |
changeContentSize |
static String |
DEFAULT_ANALYSIS_MODE |
static Boolean |
DEFAULT_CHANGE_CONTENT_SIZE |
static Boolean |
DEFAULT_EQUIVALENT |
static String |
DEFAULT_FILTER_MODE |
static String |
DEFAULT_INDEX_LOCATION |
static String |
DEFAULT_LOG_LEVEL |
static String |
DEFAULT_MATCHING_METHOD |
static String |
DEFAULT_MIME_FILTER |
static String |
DEFAULT_ORIGIN |
static String |
DEFAULT_ORIGIN_HANDLING |
static Boolean |
DEFAULT_SKIP_WRITE |
static Boolean |
DEFAULT_STATS_PER_HOST |
static Boolean |
DEFAULT_USE_SPARSE_RANGE_FILTER |
protected boolean |
doETagAnalysis |
protected boolean |
doTimestampAnalysis |
protected boolean |
equivalent |
protected org.apache.lucene.search.IndexSearcher |
index |
protected org.apache.lucene.index.IndexReader |
indexReader |
protected boolean |
lookupByURL |
protected String |
mimefilter |
static String |
ORIGIN_HANDLING_INDEX |
static String |
ORIGIN_HANDLING_NONE |
static String |
ORIGIN_HANDLING_PROCESSOR |
protected HashMap<String,is.hi.bok.deduplicator.Statistics> |
perHostStats |
protected boolean |
skipWriting |
protected is.hi.bok.deduplicator.Statistics |
stats |
protected boolean |
statsPerHost |
protected boolean |
useOrigin |
protected boolean |
useOriginFromIndex |
protected boolean |
useSparseRangeFilter |
ATTR_DECIDE_RULES, ATTR_ENABLED, attrDecideRules
A_CONTENT_STATE_KEY, A_DISCARD_REVISIT, A_FETCH_OVERDUE, A_LAST_CONTENT_DIGEST, A_LAST_DATESTAMP, A_LAST_ETAG, A_NUMBER_OF_VERSIONS, A_NUMBER_OF_VISITS, A_TIME_OF_NEXT_PROCESSING, A_WAIT_INTERVAL, A_WAIT_REEVALUATED, CONTENT_CHANGED, CONTENT_UNCHANGED, CONTENT_UNKNOWN
A_ANNOTATIONS, A_CONTENT_DIGEST, A_CONTENT_TYPE, A_CREDENTIAL_AVATARS_KEY, A_DELAY_FACTOR, A_DISTANCE_FROM_SEED, A_DNS_FETCH_TIME, A_DNS_SERVER_IP_LABEL, A_ETAG_HEADER, A_FETCH_BEGAN_TIME, A_FETCH_COMPLETED_TIME, A_FETCH_HISTORY, A_FORCE_RETIRE, A_FTP_CONTROL_CONVERSATION, A_FTP_FETCH_STATUS, A_HERITABLE_KEYS, A_HTML_BASE, A_HTTP_BIND_ADDRESS, A_HTTP_PROXY_HOST, A_HTTP_PROXY_PORT, A_HTTP_TRANSACTION, A_LAST_MODIFIED_HEADER, A_LOCALIZED_ERRORS, A_META_ROBOTS, A_MINIMUM_DELAY, A_MIRROR_PATH, A_PREREQUISITE_URI, A_REFERENCE_LENGTH, A_RETRY_DELAY, A_RRECORD_SET_LABEL, A_RUNTIME_EXCEPTION, A_SOURCE_TAG, A_STATUS, HEADER_TRUNC, LENGTH_TRUNC, TIMER_TRUNC, TRUNC_SUFFIX
Constructor and Description |
---|
DeDuplicator(String name) |
Modifier and Type | Method and Description |
---|---|
protected void |
doAnalysis(org.archive.crawler.datamodel.CrawlURI curi,
is.hi.bok.deduplicator.Statistics currHostStats,
boolean isDuplicate) |
protected void |
doTimestampAnalysis(org.archive.crawler.datamodel.CrawlURI curi,
org.apache.lucene.document.Document urlHit,
is.hi.bok.deduplicator.Statistics currHostStats,
boolean isDuplicate) |
protected void |
finalTasks() |
protected static String |
getPercentage(double portion,
double total) |
protected void |
initialTasks() |
protected void |
innerProcess(org.archive.crawler.datamodel.CrawlURI curi) |
protected org.apache.lucene.document.Document |
lookupByDigest(org.archive.crawler.datamodel.CrawlURI curi,
is.hi.bok.deduplicator.Statistics currHostStats)
Process a CrawlURI looking up in the index by content digest
|
protected org.apache.lucene.document.Document |
lookupByURL(org.archive.crawler.datamodel.CrawlURI curi,
is.hi.bok.deduplicator.Statistics currHostStats)
Process a CrawlURI looking up in the index by URL
|
protected org.apache.lucene.search.Query |
queryField(String fieldName,
String value)
Run a simple Lucene query for a single term in a single field.
|
protected Object |
readAttribute(String name,
Object defaultValue)
A utility method for reading attributes.
|
String |
report() |
checkForInterrupt, getController, getDecideRule, getDefaultNextProcessor, innerRejectProcess, isContentToProcess, isEnabled, isExpectedMimeType, isHttpTransactionContentToProcess, kickUpdate, process, rulesAccept, rulesAccept, setDefaultNextProcessor, spawn
addElementToDefinition, checkValue, earlyInitialize, getAbsoluteName, getAttribute, getAttribute, getAttribute, getAttributeInfo, getAttributeInfo, getAttributeInfoIterator, getAttributes, getDataContainerRecursive, getDataContainerRecursive, getDefaultValue, getDescription, getElementFromDefinition, getLegalValues, getLocalAttribute, getMBeanInfo, getMBeanInfo, getParent, getPreservedFields, getSettingsHandler, getUncheckedAttribute, getValue, globalSettings, invoke, isInitialized, isOverridden, iterator, removeElementFromDefinition, setAsOrder, setAttribute, setAttribute, setAttributes, setDescription, setPreservedFields, toString, unsetAttribute
protected org.apache.lucene.search.IndexSearcher index
protected org.apache.lucene.index.IndexReader indexReader
protected boolean lookupByURL
protected boolean equivalent
protected String mimefilter
protected boolean blacklist
protected boolean doTimestampAnalysis
protected boolean doETagAnalysis
protected boolean statsPerHost
protected boolean changeContentSize
protected boolean useOrigin
protected boolean useOriginFromIndex
protected boolean useSparseRangeFilter
protected is.hi.bok.deduplicator.Statistics stats
protected HashMap<String,is.hi.bok.deduplicator.Statistics> perHostStats
protected boolean skipWriting
public static final String ATTR_INDEX_LOCATION
public static final String DEFAULT_INDEX_LOCATION
public static final String ATTR_MATCHING_METHOD
public static final String[] AVAILABLE_MATCHING_METHODS
public static final String DEFAULT_MATCHING_METHOD
public static final String ATTR_EQUIVALENT
public static final Boolean DEFAULT_EQUIVALENT
public static final String ATTR_MIME_FILTER
public static final String DEFAULT_MIME_FILTER
public static final String ATTR_FILTER_MODE
public static final String[] AVAILABLE_FILTER_MODES
public static final String DEFAULT_FILTER_MODE
public static final String ATTR_ANALYSIS_MODE
public static final String[] AVAILABLE_ANALYSIS_MODES
public static final String DEFAULT_ANALYSIS_MODE
public static final String ATTR_CHANGE_CONTENT_SIZE
public static final Boolean DEFAULT_CHANGE_CONTENT_SIZE
public static final String ATTR_LOG_LEVEL
public static final String[] AVAILABLE_LOG_LEVELS
public static final String DEFAULT_LOG_LEVEL
public static final String ATTR_STATS_PER_HOST
public static final Boolean DEFAULT_STATS_PER_HOST
public static final String ATTR_ORIGIN_HANDLING
public static final String ORIGIN_HANDLING_NONE
public static final String ORIGIN_HANDLING_PROCESSOR
public static final String ORIGIN_HANDLING_INDEX
public static final String[] AVAILABLE_ORIGIN_HANDLING
public static final String DEFAULT_ORIGIN_HANDLING
public static final String ATTR_ORIGIN
public static final String DEFAULT_ORIGIN
public static final String ATTR_SKIP_WRITE
public static final Boolean DEFAULT_SKIP_WRITE
public static final String ATTR_USE_SPARSE_RANGE_FILTER
public static final Boolean DEFAULT_USE_SPARSE_RANGE_FILTER
public DeDuplicator(String name)
protected void initialTasks()
initialTasks
in class org.archive.crawler.framework.Processor
protected Object readAttribute(String name, Object defaultValue)
name
- The name of the attribute
defaultValue
- A default value to return if an error occurs
protected void innerProcess(org.archive.crawler.datamodel.CrawlURI curi) throws InterruptedException
innerProcess
in class org.archive.crawler.framework.Processor
InterruptedException
protected org.apache.lucene.document.Document lookupByURL(org.archive.crawler.datamodel.CrawlURI curi, is.hi.bok.deduplicator.Statistics currHostStats)
curi
- The CrawlURI to process
currHostStats
- A statistics object for the current host. If per-host statistics tracking is enabled, this
must be non-null and the method will increment appropriate counters on it.
protected org.apache.lucene.document.Document lookupByDigest(org.archive.crawler.datamodel.CrawlURI curi, is.hi.bok.deduplicator.Statistics currHostStats)
curi
- The CrawlURI to process
currHostStats
- A statistics object for the current host. If per-host statistics tracking is enabled, this
must be non-null and the method will increment appropriate counters on it.
protected static String getPercentage(double portion, double total)
protected void doAnalysis(org.archive.crawler.datamodel.CrawlURI curi, is.hi.bok.deduplicator.Statistics currHostStats, boolean isDuplicate)
protected void doTimestampAnalysis(org.archive.crawler.datamodel.CrawlURI curi, org.apache.lucene.document.Document urlHit, is.hi.bok.deduplicator.Statistics currHostStats, boolean isDuplicate)
protected org.apache.lucene.search.Query queryField(String fieldName, String value)
fieldName
- The name of the field to look in.
value
- The value to query for
protected void finalTasks()
finalTasks
in class org.archive.crawler.framework.Processor
Copyright © 2005–2015 The Royal Danish Library, the Danish State and University Library, the National Library of France and the Austrian National Library. All rights reserved.