public class DeDuplicator extends org.archive.modules.Processor implements org.springframework.beans.factory.InitializingBean
Determines whether CrawlURIs are duplicates.
Duplicate detection can only be performed after the fetch processors have run. Modified by SVC to use Lucene 4.X.
Modifier and Type | Class and Description |
---|---|
static class |
DeDuplicator.AnalysisMode |
static class |
DeDuplicator.FilterMode |
static class |
DeDuplicator.MatchingMethod |
static class |
DeDuplicator.OriginHandling |
Modifier and Type | Field and Description |
---|---|
static String |
ATTR_ANALYZE_MODE |
static String |
ATTR_CHANGE_CONTENT_SIZE |
static String |
ATTR_EQUIVALENT |
static String |
ATTR_FILTER_MODE |
static String |
ATTR_JUMP_TO |
static String |
ATTR_MIME_FILTER |
static String |
ATTR_ORIGIN |
static String |
ATTR_ORIGIN_HANDLING |
static String |
ATTR_REVISIT_IN_WARCS |
static String |
ATTR_STATS_PER_HOST |
static String |
DEFAULT_MIME_FILTER |
static DeDuplicator.OriginHandling |
DEFAULT_ORIGIN_HANDLING |
protected org.apache.lucene.index.IndexReader |
indexReader |
protected org.apache.lucene.search.IndexSearcher |
indexSearcher |
protected boolean |
lookupByURL |
protected HashMap<String,is.hi.bok.deduplicator.Statistics> |
perHostStats |
protected org.archive.modules.net.ServerCache |
serverCache |
protected is.hi.bok.deduplicator.Statistics |
stats |
protected boolean |
statsPerHost |
protected boolean |
useOrigin |
protected boolean |
useOriginFromIndex |
Constructor and Description |
---|
DeDuplicator() |
Modifier and Type | Method and Description |
---|---|
void |
afterPropertiesSet() |
protected void |
doAnalysis(org.archive.modules.CrawlURI curi,
is.hi.bok.deduplicator.Statistics currHostStats,
boolean isDuplicate) |
protected void |
doTimestampAnalysis(org.archive.modules.CrawlURI curi,
org.apache.lucene.document.Document urlHit,
is.hi.bok.deduplicator.Statistics currHostStats,
boolean isDuplicate) |
DeDuplicator.AnalysisMode |
getAnalysisMode() |
boolean |
getAnalyzeTimestamp() |
Boolean |
getBlacklist() |
Boolean |
getChangeContentSize() |
boolean |
getEnabled() |
DeDuplicator.FilterMode |
getFilterMode() |
String |
getIndexLocation() |
String |
getJumpTo() |
DeDuplicator.MatchingMethod |
getMatchingMethod() |
String |
getMimeFilter() |
String |
getOrigin() |
DeDuplicator.OriginHandling |
getOriginHandling() |
protected static String |
getPercentage(double portion,
double total) |
Boolean |
getRevisitInWarcs() |
org.archive.modules.net.ServerCache |
getServerCache() |
Boolean |
getStatsPerHost() |
Boolean |
getTryEquivalent() |
protected void |
innerProcess(org.archive.modules.CrawlURI puri) |
protected org.archive.modules.ProcessResult |
innerProcessResult(org.archive.modules.CrawlURI curi) |
protected org.apache.lucene.document.Document |
lookupByDigest(org.archive.modules.CrawlURI curi,
is.hi.bok.deduplicator.Statistics currHostStats)
Processes a CrawlURI by looking it up in the index by content digest.
|
protected org.apache.lucene.document.Document |
lookupByURL(org.archive.modules.CrawlURI curi,
is.hi.bok.deduplicator.Statistics currHostStats)
Processes a CrawlURI by looking it up in the index by URL.
|
protected org.apache.lucene.search.Query |
queryField(String fieldName,
String value)
Run a simple Lucene query for a single term in a single field.
|
String |
report() |
void |
setAnalysisMode(DeDuplicator.AnalysisMode analyzeMode) |
void |
setChangeContentSize(Boolean changeContentSize)
SPRING SETTER
|
void |
setEnabled(boolean enabled) |
void |
setfilterMode(DeDuplicator.FilterMode filterMode)
SPRING SETTER method
|
void |
setIndexLocation(String indexLocation)
SETTER used by Spring
|
void |
setJumpTo(String jumpTo)
SPRING SETTER.
|
void |
setMatchingMethod(DeDuplicator.MatchingMethod method)
SETTER used by Spring
|
void |
setMimeFilter(String mimeFilter) |
void |
setOrigin(String origin)
SPRING SETTER
|
void |
setOriginHandling(DeDuplicator.OriginHandling originHandling) |
void |
setRevisitInWarcs(Boolean revisitOn) |
void |
setServerCache(org.archive.modules.net.ServerCache serverCache) |
void |
setStatsPerHost(Boolean statsPerHost) |
void |
setTryEquivalent(Boolean tryEquivalent)
SPRING SETTER
|
protected boolean |
shouldProcess(org.archive.modules.CrawlURI curi) |
doCheckpoint, finishCheckpoint, flattenVia, fromCheckpointJson, getBeanName, getKeyedProperties, getRecordedSize, getShouldProcessRule, getURICount, hasHttpAuthenticationCredential, innerRejectProcess, isRunning, isSuccess, process, setBeanName, setRecoveryCheckpoint, setShouldProcessRule, start, startCheckpoint, stop, toCheckpointJson
public static final String ATTR_JUMP_TO
public static final String ATTR_ORIGIN
public static final String ATTR_EQUIVALENT
public static final String ATTR_MIME_FILTER
public static final String DEFAULT_MIME_FILTER
public static final String ATTR_FILTER_MODE
public static final String ATTR_ANALYZE_MODE
public static final String ATTR_CHANGE_CONTENT_SIZE
public static final String ATTR_STATS_PER_HOST
public static final String ATTR_ORIGIN_HANDLING
public static final DeDuplicator.OriginHandling DEFAULT_ORIGIN_HANDLING
public static final String ATTR_REVISIT_IN_WARCS
protected org.archive.modules.net.ServerCache serverCache
protected org.apache.lucene.search.IndexSearcher indexSearcher
protected org.apache.lucene.index.IndexReader indexReader
protected boolean lookupByURL
protected boolean statsPerHost
protected boolean useOrigin
protected boolean useOriginFromIndex
protected is.hi.bok.deduplicator.Statistics stats
protected HashMap<String,is.hi.bok.deduplicator.Statistics> perHostStats
public DeDuplicator()
public boolean getEnabled()
getEnabled
in class org.archive.modules.Processor
public void setEnabled(boolean enabled)
setEnabled
in class org.archive.modules.Processor
public String getIndexLocation()
public void setIndexLocation(String indexLocation)
public DeDuplicator.MatchingMethod getMatchingMethod()
public void setMatchingMethod(DeDuplicator.MatchingMethod method)
public void setJumpTo(String jumpTo)
public Boolean getTryEquivalent()
public void setTryEquivalent(Boolean tryEquivalent)
public String getMimeFilter()
public void setMimeFilter(String mimeFilter)
public DeDuplicator.FilterMode getFilterMode()
public Boolean getBlacklist()
public void setfilterMode(DeDuplicator.FilterMode filterMode)
public boolean getAnalyzeTimestamp()
public void setAnalysisMode(DeDuplicator.AnalysisMode analyzeMode)
public DeDuplicator.AnalysisMode getAnalysisMode()
public Boolean getChangeContentSize()
public void setChangeContentSize(Boolean changeContentSize)
public Boolean getStatsPerHost()
public void setStatsPerHost(Boolean statsPerHost)
public DeDuplicator.OriginHandling getOriginHandling()
public void setOriginHandling(DeDuplicator.OriginHandling originHandling)
public void setRevisitInWarcs(Boolean revisitOn)
public Boolean getRevisitInWarcs()
public org.archive.modules.net.ServerCache getServerCache()
public void setServerCache(org.archive.modules.net.ServerCache serverCache)
public void afterPropertiesSet() throws Exception
afterPropertiesSet
in interface org.springframework.beans.factory.InitializingBean
Exception
protected boolean shouldProcess(org.archive.modules.CrawlURI curi)
shouldProcess
in class org.archive.modules.Processor
protected void innerProcess(org.archive.modules.CrawlURI puri)
innerProcess
in class org.archive.modules.Processor
protected org.archive.modules.ProcessResult innerProcessResult(org.archive.modules.CrawlURI curi) throws InterruptedException
innerProcessResult
in class org.archive.modules.Processor
InterruptedException
protected org.apache.lucene.document.Document lookupByURL(org.archive.modules.CrawlURI curi, is.hi.bok.deduplicator.Statistics currHostStats)
curi
- The CrawlURI to process
currHostStats
- A statistics object for the current host. If per-host statistics tracking is enabled this
must be non-null and the method will increment appropriate counters on it.
protected org.apache.lucene.document.Document lookupByDigest(org.archive.modules.CrawlURI curi, is.hi.bok.deduplicator.Statistics currHostStats)
curi
- The CrawlURI to process
currHostStats
- A statistics object for the current host. If per-host statistics tracking is enabled this
must be non-null and the method will increment appropriate counters on it.
protected static String getPercentage(double portion, double total)
protected void doAnalysis(org.archive.modules.CrawlURI curi, is.hi.bok.deduplicator.Statistics currHostStats, boolean isDuplicate)
protected void doTimestampAnalysis(org.archive.modules.CrawlURI curi, org.apache.lucene.document.Document urlHit, is.hi.bok.deduplicator.Statistics currHostStats, boolean isDuplicate)
protected org.apache.lucene.search.Query queryField(String fieldName, String value)
fieldName
- The name of the field to look in.
value
- The value to query for.
Copyright © 2005–2018 The Royal Danish Library, the National Library of France and the Austrian National Library. All rights reserved.