public class H3HeritrixTemplate extends HeritrixTemplate implements Serializable
Heritrix3 has a new model based on Spring, so XPath is no good for processing. Instead we use placeholders, marked by %{..} rather than ${..}, since the latter is already used by Heritrix3. The template is an H3 template if it contains the string: "xmlns="http://www.springframework.org/...."
Modifier and Type | Field and Description |
---|---|
static String |
ARCHIVE_FILE_PREFIX_PLACEHOLDER |
static String |
CRAWLERTRAPS_PLACEHOLDER |
static String |
DEDUPLICATION_BEAN_PATTERN |
static String |
DEDUPLICATION_BEAN_REFERENCE_PATTERN |
static String |
DEDUPLICATION_INDEX_LOCATION_PLACEHOLDER |
static String |
FRONTIER_QUEUE_TOTAL_BUDGET_PLACEHOLDER |
static String |
MAX_TIME_SECONDS_PLACEHOLDER |
static String |
METADATA_ITEMS_PLACEHOLDER |
static String |
QUOTA_ENFORCER_GROUP_MAX_FETCH_SUCCES_PLACEHOLDER |
static String |
QUOTA_ENFORCER_MAX_BYTES_PLACEHOLDER |
HARVESTINFO_AUDIENCE, HARVESTINFO_CHANNEL, HARVESTINFO_HARVESTFILENAMEPREFIX, HARVESTINFO_HARVESTNUM, HARVESTINFO_JOBID, HARVESTINFO_JOBSUBMITDATE, HARVESTINFO_MAXBYTESPERDOMAIN, HARVESTINFO_MAXOBJECTSPERDOMAIN, HARVESTINFO_ORDERXMLNAME, HARVESTINFO_ORIGHARVESTDEFINITIONID, HARVESTINFO_ORIGHARVESTDEFINITIONNAME, HARVESTINFO_PERFORMER, HARVESTINFO_SCHEDULENAME, HARVESTINFO_VERSION, HARVESTINFO_VERSION_NUMBER, template_id
Constructor and Description |
---|
H3HeritrixTemplate(long template_id,
String template)
Constructor for HeritrixTemplate class.
|
Modifier and Type | Method and Description |
---|---|
void |
configureQuotaEnforcer(boolean maxObjectsIsSetByQuotaEnforcer,
long forceMaxBytesPerDomain,
long forceMaxObjectsPerDomain)
Configuring the quota-enforcer, depending on budget definition.
|
Long |
getMaxBytesPerDomain() |
Long |
getMaxObjectsPerDomain() |
HeritrixTemplate |
getTemplate()
return the template.
|
String |
getXML()
Return HeritrixTemplate as XML.
|
boolean |
hasContent() |
void |
insertAttributes(List<EAV.AttributeAndType> attributesAndTypes)
Try to insert the given list of attributes into the template.
|
void |
insertCrawlerTraps(String elementName,
List<String> crawlertraps)
Method to add a list of crawler traps with a given element name.
|
void |
insertWarcInfoMetadata(Job ajob,
String origHarvestdefinitionName,
String scheduleName,
String performer)
Method to add settings to the WARCWriterProcessor, so that it can generate a proper WARCINFO record.
|
boolean |
IsDeduplicationEnabled() |
boolean |
isValid() |
boolean |
isVerified()
Has Template been verified?
|
void |
removeDeduplicatorIfPresent()
Try to remove the deduplicator, if present in the template.
|
void |
removePlaceholders()
Hack to remove any placeholders that are still present after template
manipulation is completed.
|
void |
setArchiveFilePrefix(String archiveFilePrefix) |
void |
setArchiveFormat(String archiveFormat)
Make sure that Heritrix will archive its data in the chosen archiveFormat.
|
void |
setDeduplicationIndexLocation(String absolutePath) |
void |
setDiskPath(String absolutePath) |
void |
setMaxBytesPerDomain(Long maxbytesL) |
void |
setMaxJobRunningTime(Long maxJobRunningTimeSecondsL)
Update the maxTimeSeconds property in the heritrix3 template, if possible.
|
void |
setMaxObjectsPerDomain(Long maxobjectsL) |
void |
setRecoverlogNode(File recoverlogGzFile) |
void |
setSeedsFilePath(String absolutePath) |
void |
writeTemplate(javax.servlet.jsp.JspWriter out) |
void |
writeTemplate(OutputStream os) |
void |
writeToFile(File orderXmlFile) |
editOrderXMLAddPerDomainCrawlerTraps, getTemplateFromString, isActive, read, read, setIsActive
public static final String METADATA_ITEMS_PLACEHOLDER
public static final String MAX_TIME_SECONDS_PLACEHOLDER
public static final String CRAWLERTRAPS_PLACEHOLDER
public static final String DEDUPLICATION_BEAN_REFERENCE_PATTERN
public static final String DEDUPLICATION_BEAN_PATTERN
public static final String DEDUPLICATION_INDEX_LOCATION_PLACEHOLDER
public static final String ARCHIVE_FILE_PREFIX_PLACEHOLDER
public static final String FRONTIER_QUEUE_TOTAL_BUDGET_PLACEHOLDER
public static final String QUOTA_ENFORCER_GROUP_MAX_FETCH_SUCCES_PLACEHOLDER
public static final String QUOTA_ENFORCER_MAX_BYTES_PLACEHOLDER
public H3HeritrixTemplate(long template_id, String template)
template_id
- The persistent id of the template in the database
template
- The template as String object
ArgumentNotValid
- if template is null.
public HeritrixTemplate getTemplate()
public boolean isVerified()
public String getXML()
getXML
in class HeritrixTemplate
public void setMaxJobRunningTime(Long maxJobRunningTimeSecondsL)
setMaxJobRunningTime
in class HeritrixTemplate
maxJobRunningTimeSecondsL
- Force the harvestJob to end after this number of seconds
Property of the org.archive.crawler.framework.CrawlLimitEnforcer
public void setMaxBytesPerDomain(Long maxbytesL)
setMaxBytesPerDomain
in class HeritrixTemplate
public Long getMaxBytesPerDomain()
getMaxBytesPerDomain
in class HeritrixTemplate
public void setMaxObjectsPerDomain(Long maxobjectsL)
setMaxObjectsPerDomain
in class HeritrixTemplate
public Long getMaxObjectsPerDomain()
getMaxObjectsPerDomain
in class HeritrixTemplate
public boolean isValid()
isValid
in class HeritrixTemplate
public boolean IsDeduplicationEnabled()
IsDeduplicationEnabled
in class HeritrixTemplate
public void configureQuotaEnforcer(boolean maxObjectsIsSetByQuotaEnforcer, long forceMaxBytesPerDomain, long forceMaxObjectsPerDomain)
configureQuotaEnforcer
in class HeritrixTemplate
maxObjectsIsSetByQuotaEnforcer
- Decides whether max objects is set by the quota enforcer or not.
forceMaxBytesPerDomain
- The number of max bytes per domain enforced (can be no limit)
forceMaxObjectsPerDomain
- The number of max objects per domain enforced (can be no limit)
public void setArchiveFormat(String archiveFormat)
setArchiveFormat
in class HeritrixTemplate
archiveFormat
- the chosen archiveformat ('arc' or 'warc' supported)
ArgumentNotValid
- If the chosen archiveFormat is not supported.
public void insertCrawlerTraps(String elementName, List<String> crawlertraps)
HeritrixTemplate
insertCrawlerTraps
in class HeritrixTemplate
elementName
- The name of the added element.
crawlertraps
- A list of crawler trap regular expressions to add to this job.
public void writeTemplate(OutputStream os) throws IOFailure
writeTemplate
in class HeritrixTemplate
IOFailure
public boolean hasContent()
hasContent
in class HeritrixTemplate
public void writeToFile(File orderXmlFile)
writeToFile
in class HeritrixTemplate
public void setRecoverlogNode(File recoverlogGzFile)
setRecoverlogNode
in class HeritrixTemplate
public void setDeduplicationIndexLocation(String absolutePath)
setDeduplicationIndexLocation
in class HeritrixTemplate
public void setSeedsFilePath(String absolutePath)
setSeedsFilePath
in class HeritrixTemplate
public void setArchiveFilePrefix(String archiveFilePrefix)
setArchiveFilePrefix
in class HeritrixTemplate
public void setDiskPath(String absolutePath)
setDiskPath
in class HeritrixTemplate
public void removeDeduplicatorIfPresent()
HeritrixTemplate
removeDeduplicatorIfPresent
in class HeritrixTemplate
public void insertWarcInfoMetadata(Job ajob, String origHarvestdefinitionName, String scheduleName, String performer)
HeritrixTemplate
insertWarcInfoMetadata
in class HeritrixTemplate
ajob
- a HarvestJob
origHarvestdefinitionName
- The name of the harvestdefinition behind this job
scheduleName
- The name of the schedule used. (Will be null, if the job is not a selectiveHarvest).
performer
- The name of organisation/person doing this harvest
public void insertAttributes(List<EAV.AttributeAndType> attributesAndTypes)
HeritrixTemplate
insertAttributes
in class HeritrixTemplate
public void writeTemplate(javax.servlet.jsp.JspWriter out) throws IOFailure
writeTemplate
in class HeritrixTemplate
IOFailure
public void removePlaceholders()
Copyright © 2005–2016 The Royal Danish Library, the Danish State and University Library, the National Library of France and the Austrian National Library. All rights reserved.