001package dk.netarkivet.harvester.datamodel; 002 003import java.io.BufferedReader; 004import java.io.File; 005import java.io.FileNotFoundException; 006import java.io.FileReader; 007import java.io.IOException; 008import java.io.OutputStream; 009import java.io.Reader; 010import java.io.Serializable; 011import java.util.List; 012 013import javax.servlet.jsp.JspWriter; 014 015import org.dom4j.DocumentException; 016import org.slf4j.Logger; 017import org.slf4j.LoggerFactory; 018 019import dk.netarkivet.common.exceptions.ArgumentNotValid; 020import dk.netarkivet.common.exceptions.IOFailure; 021import dk.netarkivet.common.exceptions.IllegalState; 022 023/** 024 * Abstract class for manipulating Heritrix Templates. 025 * 026 */ 027public abstract class HeritrixTemplate implements Serializable { 028 029 private static final Logger log = LoggerFactory.getLogger(HeritrixTemplate.class); 030 031 private static final CharSequence H1_SIGNATURE = "<crawl-order xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance"; 032 //private static final CharSequence H3_SIGNATURE = "xmlns=\"http://www.springframework.org/"; 033 private static final CharSequence H3_SIGNATURE = "http://www.springframework.org/"; 034 035 // Constants for the metadata added to the warcinfo record when using WARC 036 037 protected static final String HARVESTINFO_VERSION_NUMBER = "0.5"; 038 protected static final String HARVESTINFO_VERSION = "harvestInfo.version"; 039 protected static final String HARVESTINFO_JOBID = "harvestInfo.jobId"; 040 protected static final String HARVESTINFO_CHANNEL = "harvestInfo.channel"; 041 protected static final String HARVESTINFO_HARVESTNUM = "harvestInfo.harvestNum"; 042 protected static final String HARVESTINFO_ORIGHARVESTDEFINITIONID = "harvestInfo.origHarvestDefinitionID"; 043 protected static final String HARVESTINFO_MAXBYTESPERDOMAIN = "harvestInfo.maxBytesPerDomain"; 044 protected static final String HARVESTINFO_MAXOBJECTSPERDOMAIN = "harvestInfo.maxObjectsPerDomain"; 045 protected static final String HARVESTINFO_ORDERXMLNAME = "harvestInfo.orderXMLName"; 046 protected static final String HARVESTINFO_ORIGHARVESTDEFINITIONNAME = "harvestInfo.origHarvestDefinitionName"; 047 protected static final String HARVESTINFO_SCHEDULENAME = "harvestInfo.scheduleName"; 048 protected static final String HARVESTINFO_HARVESTFILENAMEPREFIX = "harvestInfo.harvestFilenamePrefix"; 049 protected static final String HARVESTINFO_JOBSUBMITDATE = "harvestInfo.jobSubmitDate"; 050 protected static final String HARVESTINFO_PERFORMER = "harvestInfo.performer"; 051 protected static final String HARVESTINFO_AUDIENCE = "harvestInfo.audience"; 052 053 054 /** insertion-methods 055 * 056 * Two methods for adding domain quotas to the quotaEnforcer bean. 057 * maxBytesPerDomain() 058 * maxObjectsPerDomain() 059 * 060 * One or two methods for inserting crawlertraps 061 * insertGlobalCrawlerTraps 062 * insertDomainSpecificCrawlerTraps 063 */ 064 065 /** 066 * Activates or deactivate the quota-enforcer, depending on budget definition. Object limit can be defined either by 067 * using the queue-total-budget property or the quota enforcer. Which is chosen is set by the argument 068 * maxObjectsIsSetByQuotaEnforcer}'s value. So quota enforcer is set as follows: 069 * <ul> 070 * <li>Object limit is not set by quota enforcer, disabled only if there is no byte limit.</li> 071 * <li>Object limit is set by quota enforcer, so it should be enabled whether a byte or object limit is set.</li> 072 * </ul> 073 * 074 * @param maxObjectsIsSetByQuotaEnforcer Decides whether the maxObjectsIsSetByQuotaEnforcer or not. 075 * @param forceMaxBytesPerDomain The number of max bytes per domain enforced (can be no limit) 076 * @param forceMaxObjectsPerDomain The number of max objects per domain enforced (can be no limit) 077 */ 078 public abstract void configureQuotaEnforcer( 079 boolean maxObjectsIsSetByQuotaEnforcer, long forceMaxBytesPerDomain, long forceMaxObjectsPerDomain); 080 081 082 // Getter/Setter for MaxBytesPerDomain value 083 public abstract void setMaxBytesPerDomain(Long maxbytesL); 084 public abstract Long getMaxBytesPerDomain(); // TODO Is necessary? 085 086 // Getter/Setter for MaxObjectsPerDomain value 087 public abstract void setMaxObjectsPerDomain(Long maxobjectsL); 088 public abstract Long getMaxObjectsPerDomain(); // TODO Is necessary? 089 090 /** 091 * 092 * @return true, if deduplication is enabled in the template (used for determine whether or not to request a deduplication index from the indexserver) 093 */ 094 public abstract boolean IsDeduplicationEnabled(); 095 096 /** 097 * @return true, if the template is valid, otherwise false 098 */ 099 public abstract boolean isValid(); 100 101 /** 102 * @return the XML behind this template 103 */ 104 public abstract String getXML(); 105 106 /** 107 * Method to add a list of crawler traps with a given element name. It is used both to add per-domain traps and 108 * global traps. 109 * 110 * @param elementName The name of the added element. 111 * @param crawlerTraps A list of crawler trap regular expressions to add to this job. 112 */ 113 114 public abstract void insertCrawlerTraps(String elementName, List<String> crawlertraps); 115 116 /** 117 * Make sure that Heritrix will archive its data in the chosen archiveFormat. 118 * 119 * @param archiveFormat the chosen archiveformat ('arc' or 'warc' supported) Throws ArgumentNotValid If the chosen 120 * archiveFormat is not supported. 121 */ 122 public abstract void setArchiveFormat(String archiveFormat); 123 124 125 /** 126 * Set the maxRunning time for the harvest 127 * @param maxJobRunningTimeSecondsL Limit the harvest to this number of seconds 128 */ 129 public abstract void setMaxJobRunningTime(Long maxJobRunningTimeSecondsL); 130 131 /** 132 * Updates the order.xml to include a MatchesListRegExpDecideRule for each crawler-trap associated with for the given 133 * DomainConfiguration. 134 * <p> 135 * The added nodes have the form 136 * <p> 137 * <newObject name="domain.dk" class="org.archive.crawler.deciderules.MatchesListRegExpDecideRule"> <string 138 * name="decision">REJECT</string> <string name="list-logic">OR</string> <stringList name="regexp-list"> 139 * <string>theFirstRegexp</string> <string>theSecondRegexp</string> </stringList> </newObject> 140 * 141 * @param cfg The DomainConfiguration for which to generate crawler trap deciderules 142 * @throws IllegalState If unable to update order.xml due to wrong order.xml format 143 */ 144 public void editOrderXMLAddPerDomainCrawlerTraps(DomainConfiguration cfg) { 145 List<String> crawlerTraps = cfg.getCrawlertraps(); 146 String elementName = cfg.getDomainName(); 147 if (!crawlerTraps.isEmpty()) { 148 log.info("Inserting {} crawlertraps for domain '{}' into the template", crawlerTraps.size(), elementName); 149 insertCrawlerTraps(elementName, crawlerTraps); 150 } 151 } 152 153 154 public abstract void setDeduplicationIndexLocation(String absolutePath); 155 public abstract void setSeedsFilePath(String absolutePath); 156 157 public abstract void setArchiveFilePrefix(String archiveFilePrefix); 158 public abstract void setDiskPath(String absolutePath); 159 160 161 public abstract void writeTemplate(OutputStream os) throws IOException, ArgumentNotValid; 162 public abstract void writeTemplate(JspWriter out); 163 public abstract boolean hasContent(); 164 165 public abstract void writeToFile(File orderXmlFile); 166 public abstract void setRecoverlogNode(File recoverlogGzFile); 167 168 public static HeritrixTemplate getTemplateFromString(String templateAsString){ 169 if (templateAsString.contains(H1_SIGNATURE)) { 170 try { 171 return new H1HeritrixTemplate(templateAsString); 172 } catch (DocumentException e) { 173 throw new IOFailure("Unable to recognize as a valid dom4j Document the following string: " 174 + templateAsString, e); 175 } 176 } else if (templateAsString.contains(H3_SIGNATURE)) { 177 return new H3HeritrixTemplate(templateAsString); 178 } else { 179 throw new ArgumentNotValid("The given template is neither H1 or H3: " + templateAsString); 180 } 181 } 182 183 /** 184 * Read the given template from file. 185 * @param orderXmlFile a given HeritrixTemplate (H1 or H3) as a File 186 * @return the given HeritrixTemplate (H1 or H3) as a HeritrixTemplate object 187 */ 188 public static HeritrixTemplate read(File orderXmlFile){ 189 try { 190 return read(new FileReader(orderXmlFile)); 191 } catch (FileNotFoundException e) { 192 throw new IOFailure("The file '" + orderXmlFile.getAbsolutePath() + "' was not found", e); 193 } 194 } 195 196 /** 197 * Read the template using the given Reader 198 * @param reader A given Reader 199 * @return a HeritrixTemplate object 200 */ 201 public static HeritrixTemplate read(Reader orderTemplateReader) { 202 StringBuilder sb = new StringBuilder(); 203 BufferedReader in = new BufferedReader(orderTemplateReader); 204 String line; 205 try { 206 while ((line = in.readLine()) != null) { 207 sb.append(line); 208 sb.append('\n'); 209 } 210 } catch (IOException e) { 211 throw new IOFailure("IOException thrown", e); 212 } 213 return getTemplateFromString((sb.toString())); 214 } 215 216 217 /** 218 * Try to remove the deduplicator, if present in the template. 219 */ 220 public abstract void removeDeduplicatorIfPresent(); 221 222 /** 223 * Method to add settings to the WARCWriterProcesser, so that it can generate a proper WARCINFO record. 224 * @param ajob a HarvestJob 225 * @param origHarvestdefinitionName The name of the harvestdefinition behind this job 226 * @param scheduleName The name of the schedule used. (Will be null, if the job is not a selectiveHarvest). 227 * @param performer The name of organisation/person doing this harvest 228 */ 229 public abstract void insertWarcInfoMetadata(Job ajob, 230 String origHarvestdefinitionName, String scheduleName, 231 String performer); 232 233}