public abstract class HeritrixTemplate extends Object implements Serializable
Modifier and Type | Field and Description |
---|---|
protected static String |
HARVESTINFO_AUDIENCE |
protected static String |
HARVESTINFO_CHANNEL |
protected static String |
HARVESTINFO_HARVESTFILENAMEPREFIX |
protected static String |
HARVESTINFO_HARVESTNUM |
protected static String |
HARVESTINFO_JOBID |
protected static String |
HARVESTINFO_JOBSUBMITDATE |
protected static String |
HARVESTINFO_MAXBYTESPERDOMAIN |
protected static String |
HARVESTINFO_MAXOBJECTSPERDOMAIN |
protected static String |
HARVESTINFO_ORDERXMLNAME |
protected static String |
HARVESTINFO_ORIGHARVESTDEFINITIONID |
protected static String |
HARVESTINFO_ORIGHARVESTDEFINITIONNAME |
protected static String |
HARVESTINFO_PERFORMER |
protected static String |
HARVESTINFO_SCHEDULENAME |
protected static String |
HARVESTINFO_VERSION |
protected static String |
HARVESTINFO_VERSION_NUMBER |
Constructor and Description |
---|
HeritrixTemplate() |
Modifier and Type | Method and Description |
---|---|
abstract void |
configureQuotaEnforcer(boolean maxObjectsIsSetByQuotaEnforcer,
long forceMaxBytesPerDomain,
long forceMaxObjectsPerDomain)
Activates or deactivates the quota-enforcer, depending on budget definition.
|
void |
editOrderXMLAddPerDomainCrawlerTraps(DomainConfiguration cfg)
Updates the order.xml to include a MatchesListRegExpDecideRule for each crawler-trap associated with the given
DomainConfiguration.
|
abstract Long |
getMaxBytesPerDomain() |
abstract Long |
getMaxObjectsPerDomain() |
static HeritrixTemplate |
getTemplateFromString(String templateAsString) |
abstract String |
getXML() |
abstract boolean |
hasContent() |
abstract void |
insertCrawlerTraps(String elementName,
List<String> crawlertraps)
Method to add a list of crawler traps with a given element name.
|
abstract void |
insertWarcInfoMetadata(Job ajob,
String origHarvestdefinitionName,
String scheduleName,
String performer)
Method to add settings to the WARCWriterProcessor, so that it can generate a proper WARCINFO record.
|
abstract boolean |
IsDeduplicationEnabled() |
abstract boolean |
isValid() |
static HeritrixTemplate |
read(File orderXmlFile)
Read the given template from file.
|
static HeritrixTemplate |
read(Reader orderTemplateReader)
Read the template using the given Reader
|
abstract void |
removeDeduplicatorIfPresent()
Try to remove the deduplicator, if present in the template.
|
abstract void |
setArchiveFilePrefix(String archiveFilePrefix) |
abstract void |
setArchiveFormat(String archiveFormat)
Make sure that Heritrix will archive its data in the chosen archiveFormat.
|
abstract void |
setDeduplicationIndexLocation(String absolutePath) |
abstract void |
setDiskPath(String absolutePath) |
abstract void |
setMaxBytesPerDomain(Long maxbytesL) |
abstract void |
setMaxJobRunningTime(Long maxJobRunningTimeSecondsL)
Set the maximum running time for the harvest.
|
abstract void |
setMaxObjectsPerDomain(Long maxobjectsL) |
abstract void |
setRecoverlogNode(File recoverlogGzFile) |
abstract void |
setSeedsFilePath(String absolutePath) |
abstract void |
writeTemplate(javax.servlet.jsp.JspWriter out) |
abstract void |
writeTemplate(OutputStream os) |
abstract void |
writeToFile(File orderXmlFile) |
protected static final String HARVESTINFO_VERSION_NUMBER
protected static final String HARVESTINFO_VERSION
protected static final String HARVESTINFO_JOBID
protected static final String HARVESTINFO_CHANNEL
protected static final String HARVESTINFO_HARVESTNUM
protected static final String HARVESTINFO_ORIGHARVESTDEFINITIONID
protected static final String HARVESTINFO_MAXBYTESPERDOMAIN
protected static final String HARVESTINFO_MAXOBJECTSPERDOMAIN
protected static final String HARVESTINFO_ORDERXMLNAME
protected static final String HARVESTINFO_ORIGHARVESTDEFINITIONNAME
protected static final String HARVESTINFO_SCHEDULENAME
protected static final String HARVESTINFO_HARVESTFILENAMEPREFIX
protected static final String HARVESTINFO_JOBSUBMITDATE
protected static final String HARVESTINFO_PERFORMER
protected static final String HARVESTINFO_AUDIENCE
public HeritrixTemplate()
public abstract void configureQuotaEnforcer(boolean maxObjectsIsSetByQuotaEnforcer, long forceMaxBytesPerDomain, long forceMaxObjectsPerDomain)
maxObjectsIsSetByQuotaEnforcer
- Decides whether the maxObjects is set by the quota enforcer or not.
forceMaxBytesPerDomain
- The number of max bytes per domain enforced (can be no limit)
forceMaxObjectsPerDomain
- The number of max objects per domain enforced (can be no limit)
public abstract void setMaxBytesPerDomain(Long maxbytesL)
public abstract Long getMaxBytesPerDomain()
public abstract void setMaxObjectsPerDomain(Long maxobjectsL)
public abstract Long getMaxObjectsPerDomain()
public abstract boolean IsDeduplicationEnabled()
public abstract boolean isValid()
public abstract void insertCrawlerTraps(String elementName, List<String> crawlertraps)
elementName
- The name of the added element.
crawlerTraps
- A list of crawler trap regular expressions to add to this job.
public abstract void setArchiveFormat(String archiveFormat)
archiveFormat
- the chosen archive format ('arc' or 'warc' supported).
Throws ArgumentNotValid if the chosen archiveFormat is not supported.
public abstract void setMaxJobRunningTime(Long maxJobRunningTimeSecondsL)
maxJobRunningTimeSecondsL
- Limit the harvest to this number of seconds
public void editOrderXMLAddPerDomainCrawlerTraps(DomainConfiguration cfg)
The added nodes have the form
cfg
- The DomainConfiguration for which to generate crawler trap deciderules
IllegalState
- If unable to update order.xml due to wrong order.xml format
public abstract void setDeduplicationIndexLocation(String absolutePath)
public abstract void setSeedsFilePath(String absolutePath)
public abstract void setArchiveFilePrefix(String archiveFilePrefix)
public abstract void setDiskPath(String absolutePath)
public abstract void writeTemplate(OutputStream os) throws IOException, ArgumentNotValid
IOException
ArgumentNotValid
public abstract void writeTemplate(javax.servlet.jsp.JspWriter out)
public abstract boolean hasContent()
public abstract void writeToFile(File orderXmlFile)
public abstract void setRecoverlogNode(File recoverlogGzFile)
public static HeritrixTemplate getTemplateFromString(String templateAsString)
public static HeritrixTemplate read(File orderXmlFile)
orderXmlFile
- a given HeritrixTemplate (H1 or H3) as a File
public static HeritrixTemplate read(Reader orderTemplateReader)
reader
- A given Reader
public abstract void removeDeduplicatorIfPresent()
public abstract void insertWarcInfoMetadata(Job ajob, String origHarvestdefinitionName, String scheduleName, String performer)
ajob
- a HarvestJob
origHarvestdefinitionName
- The name of the harvestdefinition behind this job
scheduleName
- The name of the schedule used. (Will be null if the job is not a selective harvest.)
performer
- The name of the organisation/person doing this harvest
Copyright © 2005–2015 The Royal Danish Library, the Danish State and University Library, the National Library of France and the Austrian National Library. All rights reserved.