001package dk.netarkivet.harvester.datamodel; 002 003import java.io.BufferedReader; 004import java.io.File; 005import java.io.FileNotFoundException; 006import java.io.FileReader; 007import java.io.IOException; 008import java.io.OutputStream; 009import java.io.Reader; 010import java.io.Serializable; 011import java.util.List; 012 013import javax.servlet.jsp.JspWriter; 014 015import org.dom4j.DocumentException; 016import org.slf4j.Logger; 017import org.slf4j.LoggerFactory; 018 019import dk.netarkivet.common.exceptions.ArgumentNotValid; 020import dk.netarkivet.common.exceptions.IOFailure; 021import dk.netarkivet.common.exceptions.IllegalState; 022import dk.netarkivet.harvester.datamodel.eav.EAV.AttributeAndType; 023 024/** 025 * Abstract class for manipulating Heritrix Templates. 026 * 027 */ 028public abstract class HeritrixTemplate implements Serializable { 029 030 private static final Logger log = LoggerFactory.getLogger(HeritrixTemplate.class); 031 032 private static final CharSequence H1_SIGNATURE = "<crawl-order xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance"; 033 //private static final CharSequence H3_SIGNATURE = "xmlns=\"http://www.springframework.org/"; 034 private static final CharSequence H3_SIGNATURE = "http://www.springframework.org/"; 035 036 /** 037 * Templates for which isActive is false will be hidden in the web-gui by default. 038 */ 039 private boolean isActive = true; 040 041 // Constants for the metadata added to the warcinfo record when using WARC 042 043 protected static final String HARVESTINFO_VERSION_NUMBER = "0.6"; 044 protected static final String HARVESTINFO_VERSION = "harvestInfo.version"; 045 protected static final String HARVESTINFO_JOBID = "harvestInfo.jobId"; 046 protected static final String HARVESTINFO_CHANNEL = "harvestInfo.channel"; 047 protected static final String HARVESTINFO_HARVESTNUM = "harvestInfo.harvestNum"; 048 protected static final String HARVESTINFO_ORIGHARVESTDEFINITIONID = "harvestInfo.origHarvestDefinitionID"; 049 protected static final String HARVESTINFO_MAXBYTESPERDOMAIN = "harvestInfo.maxBytesPerDomain"; 050 protected static final String HARVESTINFO_MAXOBJECTSPERDOMAIN = "harvestInfo.maxObjectsPerDomain"; 051 protected static final String HARVESTINFO_ORDERXMLNAME = "harvestInfo.templateName"; 052 protected static final String HARVESTINFO_ORDERXMLUPDATEDATE = "harvestInfo.templateLastUpdateDate"; 053 protected static final String HARVESTINFO_ORDERXMLDESCRIPTION = "harvestInfo.templateDescription"; 054 protected static final String HARVESTINFO_ORIGHARVESTDEFINITIONNAME = "harvestInfo.origHarvestDefinitionName"; 055 protected static final String HARVESTINFO_ORIGHARVESTDEFINITIONCOMMENTS = "harvestInfo.origHarvestDefinitionComments"; 056 protected static final String HARVESTINFO_SCHEDULENAME = "harvestInfo.scheduleName"; 057 protected static final String HARVESTINFO_HARVESTFILENAMEPREFIX = "harvestInfo.harvestFilenamePrefix"; 058 protected static final String HARVESTINFO_JOBSUBMITDATE = "harvestInfo.jobSubmitDate"; 059 protected static final String HARVESTINFO_PERFORMER = "harvestInfo.performer"; 060 protected static final String HARVESTINFO_OPERATOR = "harvestInfo.operator"; 061 protected static final String HARVESTINFO_AUDIENCE = "harvestInfo.audience"; 062 063 064 /** insertion-methods 065 * 066 * Two methods for adding domain quotas to the quotaEnforcer bean. 067 * maxBytesPerDomain() 068 * maxObjectsPerDomain() 069 * 070 * One or two methods for inserting crawlertraps 071 * insertGlobalCrawlerTraps 072 * insertDomainSpecificCrawlerTraps 073 */ 074 075 /** 076 * Activates or deactivate the quota-enforcer, depending on budget definition. Object limit can be defined either by 077 * using the queue-total-budget property or the quota enforcer. Which is chosen is set by the argument 078 * maxObjectsIsSetByQuotaEnforcer}'s value. So quota enforcer is set as follows: 079 * <ul> 080 * <li>Object limit is not set by quota enforcer, disabled only if there is no byte limit.</li> 081 * <li>Object limit is set by quota enforcer, so it should be enabled whether a byte or object limit is set.</li> 082 * </ul> 083 * 084 * @param maxObjectsIsSetByQuotaEnforcer Decides whether the maxObjectsIsSetByQuotaEnforcer or not. 085 * @param forceMaxBytesPerDomain The number of max bytes per domain enforced (can be no limit) 086 * @param forceMaxObjectsPerDomain The number of max objects per domain enforced (can be no limit) 087 */ 088 public abstract void configureQuotaEnforcer( 089 boolean maxObjectsIsSetByQuotaEnforcer, long forceMaxBytesPerDomain, long forceMaxObjectsPerDomain); 090 091 public boolean isActive() { 092 return isActive; 093 } 094 095 public void setIsActive(boolean isActive) { 096 this.isActive = isActive; 097 } 098 099 // Getter/Setter for MaxBytesPerDomain value 100 public abstract void setMaxBytesPerDomain(Long maxbytesL); 101 public abstract Long getMaxBytesPerDomain(); // TODO Is necessary? 102 103 // Getter/Setter for MaxObjectsPerDomain value 104 public abstract void setMaxObjectsPerDomain(Long maxobjectsL); 105 public abstract Long getMaxObjectsPerDomain(); // TODO Is necessary? 106 107 /** We need the persistent template id if we want to attach any attributes to it. */ 108 public long template_id; 109 110 /** 111 * 112 * @return true, if deduplication is enabled in the template (used for determine whether or not to request a deduplication index from the indexserver) 113 */ 114 public abstract boolean IsDeduplicationEnabled(); 115 116 /** 117 * @return true, if the template is valid, otherwise false 118 */ 119 public abstract boolean isValid(); 120 121 /** 122 * @return the XML behind this template 123 */ 124 public abstract String getXML(); 125 126 /** 127 * Method to add a list of crawler traps with a given element name. It is used both to add per-domain traps and 128 * global traps. 129 * 130 * @param elementName The name of the added element. 131 * @param crawlertraps A list of crawler trap regular expressions to add to this job. 132 */ 133 public abstract void insertCrawlerTraps(String elementName, List<String> crawlertraps); 134 135 /** 136 * Make sure that Heritrix will archive its data in the chosen archiveFormat. 137 * 138 * @param archiveFormat the chosen archiveformat ('arc' or 'warc' supported) Throws ArgumentNotValid If the chosen 139 * archiveFormat is not supported. 140 */ 141 public abstract void setArchiveFormat(String archiveFormat); 142 143 144 /** 145 * Set the maxRunning time for the harvest 146 * @param maxJobRunningTimeSecondsL Limit the harvest to this number of seconds 147 */ 148 public abstract void setMaxJobRunningTime(Long maxJobRunningTimeSecondsL); 149 150 /** 151 * Try to insert the given list of attributes into the template. 152 * @param attributesAndTypes 153 */ 154 public abstract void insertAttributes(List<AttributeAndType> attributesAndTypes); 155 156 /** 157 * Updates the order.xml to include a MatchesListRegExpDecideRule for each crawler-trap associated with for the given 158 * DomainConfiguration. 159 * <p> 160 * The added nodes have the form 161 * <p> 162 * <newObject name="domain.dk" class="org.archive.crawler.deciderules.MatchesListRegExpDecideRule"> <string 163 * name="decision">REJECT</string> <string name="list-logic">OR</string> <stringList name="regexp-list"> 164 * <string>theFirstRegexp</string> <string>theSecondRegexp</string> </stringList> </newObject> 165 * 166 * @param cfg The DomainConfiguration for which to generate crawler trap deciderules 167 * @throws IllegalState If unable to update order.xml due to wrong order.xml format 168 */ 169 public void editOrderXMLAddPerDomainCrawlerTraps(DomainConfiguration cfg) { 170 List<String> crawlerTraps = cfg.getCrawlertraps(); 171 String elementName = cfg.getDomainName(); 172 int trapCount=crawlerTraps.size(); 173 for (String trap: crawlerTraps){ 174 if (trap.isEmpty()) { // Ignore empty traps in the trapcount (NAS-2480) 175 log.warn("Found empty trap for domain {}", cfg.getDomainName()); 176 trapCount--; 177 } 178 } 179 if (trapCount > 0) { 180 log.info("Inserting {} crawlertraps for domain '{}' into the template", crawlerTraps.size(), elementName); 181 insertCrawlerTraps(elementName, crawlerTraps); 182 } 183 } 184 185 public abstract void setDeduplicationIndexLocation(String absolutePath); 186 public abstract void setSeedsFilePath(String absolutePath); 187 188 public abstract void setArchiveFilePrefix(String archiveFilePrefix); 189 public abstract void setDiskPath(String absolutePath); 190 191 192 public abstract void writeTemplate(OutputStream os) throws IOException, ArgumentNotValid; 193 public abstract void writeTemplate(JspWriter out); 194 public abstract boolean hasContent(); 195 196 public abstract void writeToFile(File orderXmlFile); 197 public abstract void setRecoverlogNode(File recoverlogGzFile); 198 199 /** 200 * Construct a H1HeritrixTemplate or H3HeritrixTemplate based on the signature of the given string. 201 * @param template_id The id of the template 202 * @param templateAsString The template as a String object 203 * @return a HeritrixTemplate based on the signature of the given string. 204 */ 205 public static HeritrixTemplate getTemplateFromString(long template_id, String templateAsString){ 206 if (templateAsString.contains(H1_SIGNATURE)) { 207 try { 208 return new H1HeritrixTemplate(template_id, templateAsString); 209 } catch (DocumentException e) { 210 throw new IOFailure("Unable to recognize as a valid dom4j Document the following string: " 211 + templateAsString, e); 212 } 213 } else if (templateAsString.contains(H3_SIGNATURE)) { 214 return new H3HeritrixTemplate(template_id, templateAsString); 215 } else { 216 throw new ArgumentNotValid("The given template is neither H1 or H3: " + templateAsString); 217 } 218 } 219 220 /** 221 * Read the given template from file. 222 * @param orderXmlFile a given HeritrixTemplate (H1 or H3) as a File 223 * @return the given HeritrixTemplate (H1 or H3) as a HeritrixTemplate object 224 */ 225 public static HeritrixTemplate read(File orderXmlFile){ 226 try { 227 return read(-1, new FileReader(orderXmlFile)); 228 } catch (FileNotFoundException e) { 229 throw new IOFailure("The file '" + orderXmlFile.getAbsolutePath() + "' was not found", e); 230 } 231 } 232 233 /** 234 * Read the template using the given Reader. 235 * 236 * @param template_id The id of the template 237 * @param orderTemplateReader A given Reader to read a template 238 * @return a HeritrixTemplate object 239 */ 240 public static HeritrixTemplate read(long template_id, Reader orderTemplateReader) { 241 StringBuilder sb = new StringBuilder(); 242 BufferedReader in = new BufferedReader(orderTemplateReader); 243 String line; 244 try { 245 while ((line = in.readLine()) != null) { 246 sb.append(line); 247 sb.append('\n'); 248 } 249 } catch (IOException e) { 250 throw new IOFailure("IOException thrown", e); 251 } 252 return getTemplateFromString(template_id, sb.toString()); 253 } 254 255 256 /** 257 * Try to remove the deduplicator, if present in the template. 258 */ 259 public abstract void removeDeduplicatorIfPresent(); 260 261 /** 262 * 263 */ 264 public abstract void enableOrDisableDeduplication(boolean enabled); 265 266 /** 267 * Method to add settings to the WARCWriterProcesser, so that it can generate a proper WARCINFO record. 268 * @param ajob a HarvestJob 269 * @param origHarvestdefinitionName The name of the harvestdefinition behind this job 270 * @param scheduleName The name of the schedule used. (Will be null, if the job is not a selectiveHarvest). 271 * @param performer The name of organisation/person doing this harvest 272 */ 273 public abstract void insertWarcInfoMetadata(Job ajob, 274 String origHarvestdefinitionName, String origHarvestdefinitionComments, 275 String scheduleName, String performer); 276 277}