001/* 002 * #%L 003 * Netarchivesuite - harvester 004 * %% 005 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library, 006 * the National Library of France and the Austrian National Library. 007 * %% 008 * This program is free software: you can redistribute it and/or modify 009 * it under the terms of the GNU Lesser General Public License as 010 * published by the Free Software Foundation, either version 2.1 of the 011 * License, or (at your option) any later version. 012 * 013 * This program is distributed in the hope that it will be useful, 014 * but WITHOUT ANY WARRANTY; without even the implied warranty of 015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 016 * GNU General Lesser Public License for more details. 017 * 018 * You should have received a copy of the GNU General Lesser Public 019 * License along with this program. If not, see 020 * <http://www.gnu.org/licenses/lgpl-2.1.html>. 021 * #L% 022 */ 023package dk.netarkivet.harvester.datamodel; 024 025import java.io.File; 026import java.io.IOException; 027import java.io.OutputStream; 028import java.io.Serializable; 029import java.io.UnsupportedEncodingException; 030import java.util.HashMap; 031import java.util.List; 032import java.util.Map; 033import java.util.regex.Matcher; 034import java.util.regex.Pattern; 035 036import javax.servlet.jsp.JspWriter; 037 038import org.dom4j.Document; 039import org.dom4j.DocumentException; 040import org.dom4j.Element; 041import org.dom4j.Node; 042import org.dom4j.io.XMLWriter; 043import org.slf4j.Logger; 044import org.slf4j.LoggerFactory; 045 046import dk.netarkivet.common.exceptions.ArgumentNotValid; 047import dk.netarkivet.common.exceptions.IOFailure; 048import dk.netarkivet.common.exceptions.IllegalState; 049import dk.netarkivet.common.exceptions.PermissionDenied; 050import dk.netarkivet.common.exceptions.UnknownID; 051import dk.netarkivet.common.utils.Settings; 052import dk.netarkivet.common.utils.XmlUtils; 053import dk.netarkivet.harvester.HarvesterSettings; 054import dk.netarkivet.harvester.datamodel.eav.EAV.AttributeAndType; 055import dk.netarkivet.harvester.harvesting.report.Heritrix1Constants; 056 057/** 058 * Class encapsulating the Heritrix order.xml. Enables verification that dom4j Document obey the constraints required by 059 * our software, specifically the Job class. 060 * <p> 061 * The class assumes the type of order.xml used in configuring Heritrix version 1.10+. Information about the Heritrix 062 * crawler, and its processes and modules can be found in the Heritrix developer and user manuals found on <a 063 * href="http://crawler.archive.org">http://crawler.archive.org<a/> 064 * 065 */ 066public class H1HeritrixTemplate extends HeritrixTemplate implements Serializable { 067 068 private static final Logger log = LoggerFactory.getLogger(H1HeritrixTemplate.class); 069 070 /** the dom4j Document hiding behind this instance of HeritrixTemplate. */ 071 private Document template; 072 073 /** has this HeritrixTemplate been verified. */ 074 private boolean verified; 075 076 /** Xpath needed by Job.editOrderXML_maxBytesPerDomain(). */ 077 public static final String QUOTA_ENFORCER_ENABLED_XPATH = "/crawl-order/controller/map[@name='pre-fetch-processors']" 078 + "/newObject[@name='QuotaEnforcer']" + "/boolean[@name='enabled']";; 079 /** Xpath needed by Job.editOrderXML_maxBytesPerDomain(). */ 080 public static final String GROUP_MAX_ALL_KB_XPATH = "/crawl-order/controller/map[@name='pre-fetch-processors']" 081 + "/newObject[@name='QuotaEnforcer']" + "/long[@name='group-max-all-kb']"; 082 /** Xpath needed by Job.editOrderXML_maxObjectsPerDomain(). */ 083 public static final String GROUP_MAX_FETCH_SUCCESS_XPATH = "/crawl-order/controller/map[@name='pre-fetch-processors']" 084 + "/newObject[@name='QuotaEnforcer']" + "/long[@name='group-max-fetch-successes']"; 085 /** Xpath needed by Job.editOrderXML_maxObjectsPerDomain(). */ 086 public static final String QUEUE_TOTAL_BUDGET_XPATH = "/crawl-order/controller/newObject[@name='frontier']" 087 + "/long[@name='queue-total-budget']"; 088 /** Xpath needed by Job.editOrderXML_crawlerTraps(). */ 089 public static final String DECIDERULES_MAP_XPATH = "/crawl-order/controller/newObject" 090 + "/newObject[@name='decide-rules']" + "/map[@name='rules']"; 091 /** Xpath needed by Job.editOrderXML_crawlerTraps(). */ 092 public static final String DECIDERULES_ACCEPT_IF_PREREQUISITE_XPATH = "/crawl-order/controller/newObject" 093 + "/newObject[@name='decide-rules']" + "/map[@name='rules']/newObject[@class=" 094 + "'org.archive.crawler.deciderules.PrerequisiteAcceptDecideRule']"; 095 096 /** Xpath checked by Heritrix for correct user-agent field in requests. */ 097 public static final String HERITRIX_USER_AGENT_XPATH = "/crawl-order/controller/map[@name='http-headers']" 098 + "/string[@name='user-agent']"; 099 /** Xpath checked by Heritrix for correct mail address. */ 100 public static final String HERITRIX_FROM_XPATH = "/crawl-order/controller/map[@name='http-headers']/" 101 + "string[@name='from']"; 102 /** Xpath to check, that all templates use the DecidingScope. */ 103 public static final String DECIDINGSCOPE_XPATH = "/crawl-order/controller/newObject[@name='scope']" + "[@class='" 104 + Heritrix1Constants.DECIDINGSCOPE_CLASSNAME + "']"; 105 /** 106 * Xpath for the deduplicator node in order.xml documents. 107 */ 108 public static final String DEDUPLICATOR_XPATH = "/crawl-order/controller/map[@name='write-processors']" 109 + "/newObject[@name='DeDuplicator']"; 110 111 /** 112 * Xpath to check, that all templates use the same ARC archiver path, 113 * {@link dk.netarkivet.common.Constants#ARCDIRECTORY_NAME}. The archive path tells Heritrix to which directory it 114 * shall write its arc files. 115 */ 116 public static final String ARC_ARCHIVER_PATH_XPATH = "/crawl-order/controller/map[@name='write-processors']/" 117 + "newObject[@name='Archiver']/stringList[@name='path']/string"; 118 119 /** 120 * Xpath to check, that all templates use the same WARC archiver path, 121 * {@link dk.netarkivet.common.Constants#WARCDIRECTORY_NAME}. The archive path tells Heritrix to which directory it 122 * shall write its arc files. 123 */ 124 public static final String WARC_ARCHIVER_PATH_XPATH = "/crawl-order/controller/map[@name='write-processors']/" 125 + "newObject[@name='WARCArchiver']/stringList[@name='path']/string"; 126 127 /** 128 * Xpath for the deduplicator index directory node in order.xml documents. 129 */ 130 public static final String DEDUPLICATOR_INDEX_LOCATION_XPATH = DEDUPLICATOR_XPATH 131 + "/string[@name='index-location']"; 132 133 /** 134 * Xpath for the boolean telling if the deduplicator is enabled in order.xml documents. 135 */ 136 public static final String DEDUPLICATOR_ENABLED = DEDUPLICATOR_XPATH + "/boolean[@name='enabled']"; 137 138 /** Xpath for the 'disk-path' in the order.xml . */ 139 public static final String DISK_PATH_XPATH = "//crawl-order/controller" + "/string[@name='disk-path']"; 140 /** Xpath for the arcfile 'prefix' in the order.xml . */ 141 public static final String ARCHIVEFILE_PREFIX_XPATH = "//crawl-order/controller" + "/map[@name='write-processors']" 142 + "/newObject/string[@name='prefix']"; 143 /** Xpath for the ARCs dir in the order.xml. */ 144 public static final String ARCSDIR_XPATH = "//crawl-order/controller" + "/map[@name='write-processors']" 145 + "/newObject[@name='Archiver']/stringList[@name='path']/string"; 146 147 public static final String WARCWRITERPROCESSOR_XPATH = "//crawl-order/controller" 148 + "/map[@name='write-processors']" + "/newObject[@name='WARCArchiver']"; 149 150 public static final String ARCWRITERPROCESSOR_XPATH = "//crawl-order/controller" 151 + "/map[@name='write-processors']" + "/newObject[@name='Archiver']"; 152 153 /** Xpath for the WARCs dir in the order.xml. */ 154 public static final String WARCSDIR_XPATH = WARCWRITERPROCESSOR_XPATH + "/stringList[@name='path']/string"; 155 156 /** Xpath for the 'seedsfile' in the order.xml. */ 157 public static final String SEEDS_FILE_XPATH = "//crawl-order/controller" + "/newObject[@name='scope']" 158 + "/string[@name='seedsfile']"; 159 160 public static final String ARCS_ENABLED_XPATH = ARCWRITERPROCESSOR_XPATH + "/boolean[@name='enabled']"; 161 162 /** Xpath for the WARCs dir in the order.xml. */ 163 public static final String WARCS_ENABLED_XPATH = WARCWRITERPROCESSOR_XPATH + "/boolean[@name='enabled']"; 164 165 public static final String WARCS_WRITE_REQUESTS_XPATH = WARCWRITERPROCESSOR_XPATH 166 + "/boolean[@name='write-requests']"; 167 public static final String WARCS_WRITE_METADATA_XPATH = WARCWRITERPROCESSOR_XPATH 168 + "/boolean[@name='write-metadata']"; 169 public static final String WARCS_WRITE_METADATA_OUTLINKS_XPATH = WARCWRITERPROCESSOR_XPATH 170 + "/boolean[@name='write-metadata-outlinks']"; 171 public static final String WARCS_SKIP_IDENTICAL_DIGESTS_XPATH = WARCWRITERPROCESSOR_XPATH 172 + "/boolean[@name='skip-identical-digests']"; 173 public static final String WARCS_WRITE_REVISIT_FOR_IDENTICAL_DIGESTS_XPATH = WARCWRITERPROCESSOR_XPATH 174 + "/boolean[@name='write-revisit-for-identical-digests']"; 175 public static final String WARCS_WRITE_REVISIT_FOR_NOT_MODIFIED_XPATH = WARCWRITERPROCESSOR_XPATH 176 + "/boolean[@name='write-revisit-for-not-modified']"; 177 178 /** Xpath for the WARC metadata in the order.xml. */ 179 public static final String METADATA_ITEMS_XPATH = WARCWRITERPROCESSOR_XPATH + "/map[@name='metadata-items']"; 180 181 /** 182 * Map from required xpaths to a regular expression describing legal content for the path text. 183 */ 184 private static final Map<String, Pattern> requiredXpaths = new HashMap<String, Pattern>(); 185 186 /** 187 * A regular expression that matches a whole number, possibly negative, and with optional whitespace around it. 188 */ 189 private static final String WHOLE_NUMBER_REGEXP = "\\s*-?[0-9]+\\s*"; 190 /** 191 * A regular expression that matches everything. Except newlines, unless DOTALL is given to Pattern.compile(). 192 */ 193 private static final String EVERYTHING_REGEXP = ".*"; 194 195 // These two regexps are copied from 196 // org.archive.crawler.datamodel.CrawlOrder because they're private there. 197 198 /** 199 * A regular expression that matches Heritrix' specs for the user-agent field in order.xml. It should be used with 200 * DOTALL. An example match is "Org (ourCrawler, see +http://org.org/aPage for details) harvest". 201 */ 202 private static final String USER_AGENT_REGEXP = "\\S+.*\\(.*\\+http(s)?://\\S+\\.\\S+.*\\).*"; 203 /** 204 * A regular expression that matches Heritrix' specs for the from field. This should be a valid email address. 205 */ 206 private static final String FROM_REGEXP = "\\S+@\\S+\\.\\S+"; 207 208 /** 209 * Xpath to check, that all templates have the max-time-sec attribute. 210 */ 211 public static final String MAXTIMESEC_PATH_XPATH = "/crawl-order/controller/long[@name='max-time-sec']"; 212 213 214 static { 215 requiredXpaths.put(GROUP_MAX_FETCH_SUCCESS_XPATH, Pattern.compile(WHOLE_NUMBER_REGEXP)); 216 requiredXpaths.put(QUEUE_TOTAL_BUDGET_XPATH, Pattern.compile(WHOLE_NUMBER_REGEXP)); 217 requiredXpaths.put(GROUP_MAX_ALL_KB_XPATH, Pattern.compile(WHOLE_NUMBER_REGEXP)); 218 219 // Required that we use DecidingScope 220 // requiredXpaths.put(DECIDINGSCOPE_XPATH, 221 // Pattern.compile(EVERYTHING_REGEXP)); 222 223 // Required that we have a rules map used to add crawlertraps 224 requiredXpaths.put(DECIDERULES_MAP_XPATH, Pattern.compile(EVERYTHING_REGEXP, Pattern.DOTALL)); 225 226 requiredXpaths.put(HERITRIX_USER_AGENT_XPATH, Pattern.compile(USER_AGENT_REGEXP, Pattern.DOTALL)); 227 requiredXpaths.put(HERITRIX_FROM_XPATH, Pattern.compile(FROM_REGEXP)); 228 229 // max-time-sec attribute needed, so we can't override it set 230 // a timelimit on broad crawls. 231 requiredXpaths.put(MAXTIMESEC_PATH_XPATH, Pattern.compile(WHOLE_NUMBER_REGEXP)); 232 } 233 234 /** 235 * Constructor for HeritrixTemplate class. 236 * 237 * @param doc the order.xml 238 * @param verify If true, verifies if the given dom4j Document contains the elements required by our software. 239 * @throws ArgumentNotValid if doc is null, or verify is true and doc does not obey the constraints required by our 240 * software. 241 */ 242 public H1HeritrixTemplate(Document doc, boolean verify) { 243 ArgumentNotValid.checkNotNull(doc, "Document doc"); 244 String xpath; 245 Node node; 246 Pattern pattern; 247 Matcher matcher; 248 if (verify) { 249 for (Map.Entry<String, Pattern> required : requiredXpaths.entrySet()) { 250 xpath = required.getKey(); 251 node = doc.selectSingleNode(xpath); 252 ArgumentNotValid.checkTrue(node != null, "Template error: Missing node: " + xpath 253 + ". The template looks like this: " + doc.asXML()); 254 255 pattern = required.getValue(); 256 matcher = pattern.matcher(node.getText().trim()); 257 258 ArgumentNotValid.checkTrue(matcher.matches(), "Template error: Value '" + node.getText() 259 + "' of node '" + xpath + "' does not match required regexp '" + pattern 260 + "'. The template looks like this: " + doc.asXML()); 261 } 262 verified = true; 263 // Required that Heritrix write its ARC/WARC files to the correct dir 264 // relative to the crawldir. This dir is defined by the constant: 265 // dk.netarkivet.common.Constants.ARCDIRECTORY_NAME. 266 // dk.netarkivet.common.Constants.WARCDIRECTORY_NAME. 267 int validArchivePaths = 0; 268 node = doc.selectSingleNode(ARC_ARCHIVER_PATH_XPATH); 269 if (node != null) { 270 pattern = Pattern.compile(dk.netarkivet.common.Constants.ARCDIRECTORY_NAME); 271 matcher = pattern.matcher(node.getText().trim()); 272 ArgumentNotValid.checkTrue(matcher.matches(), "Template error: Value '" + node.getText() 273 + "' of node '" + ARC_ARCHIVER_PATH_XPATH + "' does not match required regexp '" + pattern 274 + "'"); 275 ++validArchivePaths; 276 } 277 node = doc.selectSingleNode(WARC_ARCHIVER_PATH_XPATH); 278 if (node != null) { 279 pattern = Pattern.compile(dk.netarkivet.common.Constants.WARCDIRECTORY_NAME); 280 matcher = pattern.matcher(node.getText().trim()); 281 ArgumentNotValid.checkTrue(matcher.matches(), "Template error: Value '" + node.getText() 282 + "' of node '" + WARC_ARCHIVER_PATH_XPATH + "' does not match required regexp '" + pattern 283 + "'"); 284 ++validArchivePaths; 285 } 286 ArgumentNotValid.checkTrue(validArchivePaths > 0, "Template error: " 287 + "An ARC or WARC writer processor seems to be missing"); 288 } 289 this.template = (Document) doc.clone(); 290 } 291 292 /** 293 * Alternate constructor, which always verifies the given document. 294 * 295 * @param doc 296 */ 297 public H1HeritrixTemplate(Document doc) { 298 this(doc, true); 299 } 300 301 public H1HeritrixTemplate(long template_id, String templateAsString) throws DocumentException { 302 ArgumentNotValid.checkNotNull(templateAsString, "String template"); 303 this.template_id = template_id; 304 this.template = XmlUtils.documentFromString(templateAsString); 305 } 306 307 /** 308 * return the template. 309 * 310 * @return the template 311 */ 312 public Document getTemplate() { 313 return (Document) template.clone(); 314 } 315 316 /** 317 * Has Template been verified? 318 * 319 * @return true, if verified on construction, otherwise false 320 */ 321 public boolean isVerified() { 322 return verified; 323 } 324 325 /** 326 * Return HeritrixTemplate as XML. 327 * 328 * @return HeritrixTemplate as XML 329 */ 330 public String getXML() { 331 return template.asXML(); 332 } 333 334 /** 335 * Method to add a list of crawler traps with a given element name. It is used both to add per-domain traps and 336 * global traps. 337 * 338 * @param elementName The name of the added element. 339 * @param crawlerTraps A list of crawler trap regular expressions to add to this job. 340 */ 341 @SuppressWarnings("unchecked") 342 public static void editOrderXMLAddCrawlerTraps(Document orderXMLdoc, String elementName, List<String> crawlerTraps) { 343 if (crawlerTraps.size() == 0) { 344 return; 345 } 346 347 // Get the node to update 348 // If there is an acceptIfPrerequisite decideRule in the template, crawler traps should be 349 // placed before (cf. issue NAS-2205) 350 // If no such rule exists then we append the crawler traps as to the existing decideRuleds. 351 352 Node rulesMapNode = orderXMLdoc.selectSingleNode(DECIDERULES_MAP_XPATH); 353 if (rulesMapNode == null || !(rulesMapNode instanceof Element)) { 354 throw new IllegalState("Unable to update order.xml document. It does not have the right form to add" 355 + "crawler trap deciderules."); 356 } 357 358 Element rulesMap = (Element) rulesMapNode; 359 360 // Create the root node and append it top existing rules 361 Element decideRule = rulesMap.addElement("newObject"); 362 363 // If an acceptiIfPrerequisite node exists, detach and insert before it 364 Node acceptIfPrerequisiteNode = orderXMLdoc.selectSingleNode(DECIDERULES_ACCEPT_IF_PREREQUISITE_XPATH); 365 if (acceptIfPrerequisiteNode != null) { 366 List<Node> elements = rulesMap.elements(); 367 int insertPosition = elements.indexOf(acceptIfPrerequisiteNode); 368 decideRule.detach(); 369 elements.add(insertPosition, decideRule); 370 } else { 371 rulesMap.elements().size(); 372 } 373 374 // Add all regexps in the list to a single MatchesListRegExpDecideRule 375 decideRule.addAttribute("name", elementName); 376 decideRule.addAttribute("class", Heritrix1Constants.MATCHESLISTREGEXPDECIDERULE_CLASSNAME); 377 378 Element decision = decideRule.addElement("string"); 379 decision.addAttribute("name", "decision"); 380 decision.addText("REJECT"); 381 382 Element listlogic = decideRule.addElement("string"); 383 listlogic.addAttribute("name", "list-logic"); 384 listlogic.addText("OR"); 385 386 Element regexpList = decideRule.addElement("stringList"); 387 regexpList.addAttribute("name", "regexp-list"); 388 for (String trap : crawlerTraps) { 389 regexpList.addElement("string").addText(trap); 390 } 391 } 392 393 /** 394 * Updates the order.xml to include a MatchesListRegExpDecideRule for each crawlertrap associated with for the given 395 * DomainConfiguration. 396 * <p> 397 * The added nodes have the form 398 * <p> 399 * <newObject name="domain.dk" class="org.archive.crawler.deciderules.MatchesListRegExpDecideRule"> <string 400 * name="decision">REJECT</string> <string name="list-logic">OR</string> <stringList name="regexp-list"> 401 * <string>theFirstRegexp</string> <string>theSecondRegexp</string> </stringList> </newObject> 402 * 403 * @param cfg The DomainConfiguration for which to generate crawler trap deciderules 404 * @throws IllegalState If unable to update order.xml due to wrong order.xml format 405 */ 406 // FIXME REMOVE IF NOT USED 407 /* 408 public static void editOrderXMLAddPerDomainCrawlerTraps(Document orderXmlDoc, DomainConfiguration cfg) { 409 // Get the regexps to exclude 410 List<String> crawlerTraps = cfg.getCrawlertraps(); 411 String elementName = cfg.getDomainName(); 412 H1HeritrixTemplate.editOrderXMLAddCrawlerTraps(orderXmlDoc, elementName, crawlerTraps); 413 } 414 */ 415 416 private static void setIfFound(Document doc, String Xpath, String param, String value) { 417 if (doc.selectSingleNode(Xpath) != null) { 418 XmlUtils.setNode(doc, Xpath, value); 419 } else { 420 log.warn("Could not replace setting value of '" + param + "' in template. Xpath not found: " + Xpath); 421 } 422 } 423 424 /** 425 * Auxiliary method to modify the orderXMLdoc Document with respect to setting the maximum number of objects to be 426 * retrieved per domain. This method updates 'group-max-fetch-success' element of the QuotaEnforcer pre-fetch 427 * processor node (org.archive.crawler.frontier.BdbFrontier) with the value of the argument forceMaxObjectsPerDomain 428 * 429 * @param orderXMLdoc 430 * @param forceMaxObjectsPerDomain The maximum number of objects to retrieve per domain, or 0 for no limit. 431 * @throws PermissionDenied If unable to replace the frontier node of the orderXMLdoc Document 432 * @throws IOFailure If the group-max-fetch-success element is not found in the orderXml. TODO The 433 * group-max-fetch-success check should also be performed in TemplateDAO.create, TemplateDAO.update 434 */ 435 public static void editOrderXML_maxObjectsPerDomain(Document orderXMLdoc, long forceMaxObjectsPerDomain, 436 boolean maxObjectsIsSetByQuotaEnforcer) { 437 438 String xpath = (maxObjectsIsSetByQuotaEnforcer ? GROUP_MAX_FETCH_SUCCESS_XPATH : QUEUE_TOTAL_BUDGET_XPATH); 439 440 Node orderXmlNode = orderXMLdoc.selectSingleNode(xpath); 441 if (orderXmlNode != null) { 442 orderXmlNode.setText(String.valueOf(forceMaxObjectsPerDomain)); 443 } else { 444 throw new IOFailure("Unable to locate " + xpath + " element in order.xml: " + orderXMLdoc.asXML()); 445 } 446 } 447 448 /** 449 * Activates or deactivate the quota-enforcer, depending on budget definition. Object limit can be defined either by 450 * using the queue-total-budget property or the quota enforcer. Which is chosen is set by the argument 451 * maxObjectsIsSetByQuotaEnforcer}'s value. So quota enforcer is set as follows: 452 * <ul> 453 * <li>Object limit is not set by quota enforcer, disabled only if there is no byte limit.</li> 454 * <li>Object limit is set by quota enforcer, so it should be enabled whether a byte or object limit is set.</li> 455 * </ul> 456 * 457 * @param orderXMLdoc the template to modify 458 * @param maxObjectsIsSetByQuotaEnforcer Decides whether the maxObjectsIsSetByQuotaEnforcer or not. 459 * @param forceMaxBytesPerDomain The number of max bytes per domain enforced (can be no limit) 460 * @param forceMaxObjectsPerDomain The number of max objects per domain enforced (can be no limit) 461 */ 462 public static void editOrderXML_configureQuotaEnforcer(Document orderXMLdoc, 463 boolean maxObjectsIsSetByQuotaEnforcer, long forceMaxBytesPerDomain, long forceMaxObjectsPerDomain) { 464 465 boolean quotaEnabled = true; 466 467 if (!maxObjectsIsSetByQuotaEnforcer) { 468 // Object limit is not set by quota enforcer, so it should be disabled only 469 // if there is no byte limit. 470 quotaEnabled = forceMaxBytesPerDomain != Constants.HERITRIX_MAXBYTES_INFINITY; 471 472 } else { 473 // Object limit is set by quota enforcer, so it should be enabled whether 474 // a byte or object limit is set. 475 quotaEnabled = forceMaxObjectsPerDomain != Constants.HERITRIX_MAXOBJECTS_INFINITY 476 || forceMaxBytesPerDomain != Constants.HERITRIX_MAXBYTES_INFINITY; 477 } 478 479 String xpath = QUOTA_ENFORCER_ENABLED_XPATH; 480 Node qeNode = orderXMLdoc.selectSingleNode(xpath); 481 if (qeNode != null) { 482 qeNode.setText(Boolean.toString(quotaEnabled)); 483 } else { 484 throw new IOFailure("Unable to locate " + xpath + " element in order.xml: " + orderXMLdoc.asXML()); 485 } 486 } 487 488 489 490 @Override 491 // Always return true 492 public boolean isValid() { 493 return true; 494 } 495 496 @Override 497 public void configureQuotaEnforcer(boolean maxObjectsIsSetByQuotaEnforcer, 498 long forceMaxBytesPerDomain, long forceMaxObjectsPerDomain) { 499 Document orderXMLdoc = this.template; 500 boolean quotaEnabled = true; 501 502 if (!maxObjectsIsSetByQuotaEnforcer) { 503 // Object limit is not set by quota enforcer, so it should be disabled only 504 // if there is no byte limit. 505 quotaEnabled = forceMaxBytesPerDomain != Constants.HERITRIX_MAXBYTES_INFINITY; 506 507 } else { 508 // Object limit is set by quota enforcer, so it should be enabled whether 509 // a byte or object limit is set. 510 quotaEnabled = forceMaxObjectsPerDomain != Constants.HERITRIX_MAXOBJECTS_INFINITY 511 || forceMaxBytesPerDomain != Constants.HERITRIX_MAXBYTES_INFINITY; 512 } 513 514 String xpath = QUOTA_ENFORCER_ENABLED_XPATH; 515 Node qeNode = orderXMLdoc.selectSingleNode(xpath); 516 if (qeNode != null) { 517 qeNode.setText(Boolean.toString(quotaEnabled)); 518 } else { 519 throw new IOFailure("Unable to locate " + xpath + " element in order.xml: " + orderXMLdoc.asXML()); 520 } 521 } 522 523 /** 524 * Auxiliary method to modify the orderXMLdoc Document with respect to setting the maximum number of bytes to 525 * retrieve per domain. This method updates 'group-max-all-kb' element of the 'QuotaEnforcer' node, which again is a 526 * subelement of 'pre-fetch-processors' node. with the value of the argument forceMaxBytesPerDomain 527 * 528 * @param forceMaxBytesPerDomain The maximum number of byte to retrieve per domain, or -1 for no limit. Note that 529 * the number is divided by 1024 before being inserted into the orderXml, as Heritrix expects KB. 530 * @throws PermissionDenied If unable to replace the QuotaEnforcer node of the orderXMLdoc Document 531 * @throws IOFailure If the group-max-all-kb element cannot be found. TODO This group-max-all-kb check also be 532 * performed in TemplateDAO.create, TemplateDAO.update 533 */ 534 @Override 535 public void setMaxBytesPerDomain(Long forceMaxBytesPerDomain) { 536 // get and set the group-max-all-kb Node of the orderXMLdoc: 537 String xpath = GROUP_MAX_ALL_KB_XPATH; 538 Node groupMaxSuccessKbNode = template.selectSingleNode(xpath); 539 if (groupMaxSuccessKbNode != null) { 540 if (forceMaxBytesPerDomain == 0) { 541 groupMaxSuccessKbNode.setText("0"); 542 } else if (forceMaxBytesPerDomain != Constants.HERITRIX_MAXBYTES_INFINITY) { 543 // Divide by 1024 since Heritrix uses KB rather than bytes, 544 // and add 1 to avoid to low limit due to rounding. 545 groupMaxSuccessKbNode.setText(Long 546 .toString((forceMaxBytesPerDomain / Constants.BYTES_PER_HERITRIX_BYTELIMIT_UNIT) + 1)); 547 } else { 548 groupMaxSuccessKbNode.setText(String.valueOf(Constants.HERITRIX_MAXBYTES_INFINITY)); 549 } 550 } else { 551 throw new IOFailure("Unable to locate QuotaEnforcer object in order.xml: " + template.asXML()); 552 } 553 } 554 555 @Override 556 public Long getMaxBytesPerDomain() { 557 // FIXME IMPLEMENT ME 558 return null; 559 } 560 561 @Override 562 public void setMaxObjectsPerDomain(Long maxobjectsL) { 563 // FIXME IMPLEMENT ME 564 565 } 566 567 @Override 568 public Long getMaxObjectsPerDomain() { 569 // FIXME IMPLEMENT ME OR DELETE 570 return null; 571 } 572 573 /** 574 * Return true if the templatefile has deduplication enabled. 575 * @return True if Deduplicator is enabled. 576 */ 577 @Override 578 public boolean IsDeduplicationEnabled() { 579 Node xpathNode = template.selectSingleNode(DEDUPLICATOR_ENABLED); 580 return xpathNode != null && xpathNode.getText().trim().equals("true"); 581 } 582 583 @Override 584 public void setArchiveFormat(String archiveFormat) { 585 Document orderXML = this.template; 586 boolean arcMode = false; 587 boolean warcMode = false; 588 589 //System.out.println("Document: " + template.asXML()); 590 591 if ("arc".equalsIgnoreCase(archiveFormat)) { 592 arcMode = true; 593 log.debug("ARC format selected to be used by Heritrix"); 594 } else if ("warc".equalsIgnoreCase(archiveFormat)) { 595 warcMode = true; 596 log.debug("WARC format selected to be used by Heritrix"); 597 } else { 598 throw new ArgumentNotValid("Configuration of '" + HarvesterSettings.HERITRIX_ARCHIVE_FORMAT 599 + "' is invalid! Unrecognized format '" + archiveFormat + "'."); 600 } 601 602 if (arcMode) { 603 // enable ARC writing in Heritrix and disable WARC writing if needed. 604 if (orderXML.selectSingleNode(ARCSDIR_XPATH) != null 605 && orderXML.selectSingleNode(ARCS_ENABLED_XPATH) != null) { 606 XmlUtils.setNode(orderXML, ARCSDIR_XPATH, 607 dk.netarkivet.common.Constants.ARCDIRECTORY_NAME); 608 XmlUtils.setNode(orderXML, ARCS_ENABLED_XPATH, "true"); 609 if (orderXML.selectSingleNode(WARCS_ENABLED_XPATH) != null) { 610 XmlUtils.setNode(orderXML, WARCS_ENABLED_XPATH, "false"); 611 } 612 } else { 613 throw new IllegalState("Unable to choose ARC as Heritrix archive format because " 614 + " one of the following xpaths are invalid in the given order.xml: " 615 + ARCSDIR_XPATH + "," + ARCS_ENABLED_XPATH); 616 } 617 } else if (warcMode) { // WARCmode 618 // enable ARC writing in Heritrix and disable WARC writing if needed. 619 if (orderXML.selectSingleNode(WARCSDIR_XPATH) != null 620 && orderXML.selectSingleNode(WARCS_ENABLED_XPATH) != null) { 621 XmlUtils.setNode(orderXML, WARCSDIR_XPATH, 622 dk.netarkivet.common.Constants.WARCDIRECTORY_NAME); 623 XmlUtils.setNode(orderXML, WARCS_ENABLED_XPATH, "true"); 624 if (orderXML.selectSingleNode(ARCS_ENABLED_XPATH) != null) { 625 XmlUtils.setNode(orderXML, ARCS_ENABLED_XPATH, "false"); 626 } 627 628 String warcParametersOverrideStr = null; 629 try { 630 warcParametersOverrideStr = Settings.get(HarvesterSettings.HERITRIX_WARC_PARAMETERS_OVERRIDE); 631 } catch (UnknownID e) { 632 //nothing 633 } 634 //if the parameter is not found or if it exists and equals to true 635 if (warcParametersOverrideStr == null || (warcParametersOverrideStr != null 636 && "true".equals(warcParametersOverrideStr))) { 637 638 // Update the WARCWriterProcessorSettings with settings values 639 setIfFound(orderXML, WARCS_SKIP_IDENTICAL_DIGESTS_XPATH, 640 HarvesterSettings.HERITRIX_WARC_SKIP_IDENTICAL_DIGESTS, 641 Settings.get(HarvesterSettings.HERITRIX_WARC_SKIP_IDENTICAL_DIGESTS)); 642 643 setIfFound(orderXML, WARCS_WRITE_METADATA_XPATH, 644 HarvesterSettings.HERITRIX_WARC_WRITE_METADATA, 645 Settings.get(HarvesterSettings.HERITRIX_WARC_WRITE_METADATA)); 646 setIfFound(orderXML, WARCS_WRITE_METADATA_OUTLINKS_XPATH, 647 HarvesterSettings.HERITRIX_WARC_WRITE_METADATA_OUTLINKS, 648 Settings.get(HarvesterSettings.HERITRIX_WARC_WRITE_METADATA_OUTLINKS)); 649 setIfFound(orderXML, WARCS_WRITE_REQUESTS_XPATH, 650 HarvesterSettings.HERITRIX_WARC_WRITE_REQUESTS, 651 Settings.get(HarvesterSettings.HERITRIX_WARC_WRITE_REQUESTS)); 652 653 setIfFound(orderXML, WARCS_WRITE_REVISIT_FOR_IDENTICAL_DIGESTS_XPATH, 654 HarvesterSettings.HERITRIX_WARC_WRITE_REVISIT_FOR_IDENTICAL_DIGESTS, 655 Settings.get(HarvesterSettings.HERITRIX_WARC_WRITE_REVISIT_FOR_IDENTICAL_DIGESTS)); 656 setIfFound(orderXML, WARCS_WRITE_REVISIT_FOR_NOT_MODIFIED_XPATH, 657 HarvesterSettings.HERITRIX_WARC_WRITE_REVISIT_FOR_NOT_MODIFIED, 658 Settings.get(HarvesterSettings.HERITRIX_WARC_WRITE_REVISIT_FOR_NOT_MODIFIED)); 659 } 660 } else { 661 throw new IllegalState("Unable to choose WARC as Heritrix archive format because " 662 + " one of the following xpaths are invalid in the given order.xml: " 663 + WARCSDIR_XPATH + "," + WARCS_ENABLED_XPATH 664 + ". order.xml: " + orderXML.asXML()); 665 } 666 667 } else { 668 throw new IllegalState("Unknown state: " 669 + "Should have selected either ARC or WARC as heritrix archive format"); 670 } 671 } 672 673 @Override 674 public void setMaxJobRunningTime(Long maxJobRunningTimeSecondsL) { 675 // get and set the "max-time-sec" node of the orderXMLdoc 676 String xpath = MAXTIMESEC_PATH_XPATH; 677 Node groupMaxTimeSecNode = template.selectSingleNode(xpath); 678 if (groupMaxTimeSecNode != null) { 679 String currentMaxTimeSec = groupMaxTimeSecNode.getText(); 680 groupMaxTimeSecNode.setText(Long.toString(maxJobRunningTimeSecondsL)); 681 log.trace("Value of groupMaxTimeSecNode changed from " + currentMaxTimeSec + " to " + maxJobRunningTimeSecondsL); 682 } else { 683 throw new IOFailure("Unable to locate xpath '" + xpath + "' in the order.xml: " + template.asXML()); 684 } 685 } 686 687 688 @Override 689 public void writeTemplate(OutputStream os) throws IOException, ArgumentNotValid{ 690 XMLWriter writer; 691 try { 692 writer = new XMLWriter(os); 693 writer.write(this.template); 694 } catch (UnsupportedEncodingException e) { 695 String errMsg = "The encoding of this template is unsupported by this environment"; 696 log.error(errMsg, e); 697 throw new ArgumentNotValid(errMsg, e); 698 } 699 } 700 701 /** 702 * Only available for H1 templates. 703 * @return the template as a String. 704 */ 705 public String getText() { 706 return this.template.getText(); 707 } 708 709 @Override 710 public void insertCrawlerTraps(String elementName, List<String> crawlerTraps) { 711 if (crawlerTraps.size() == 0) { 712 return; 713 } 714 715 //System.out.println("Calling insertCrawlerTraps(String elementName, List<String> crawlerTraps) "); 716 // Get the node to update 717 // If there is an acceptIfPrerequisite decideRule in the template, crawler traps should be 718 // placed before (cf. issue NAS-2205) 719 // If no such rule exists then we append the crawler traps as to the existing decideRuleds. 720 721 Node rulesMapNode = template.selectSingleNode(DECIDERULES_MAP_XPATH); 722 if (rulesMapNode == null || !(rulesMapNode instanceof Element)) { 723 throw new IllegalState("Unable to update order.xml document. It does not have the right form to add" 724 + "crawler trap deciderules."); 725 } 726 727 Element rulesMap = (Element) rulesMapNode; 728 729 // Create the root node and append it top existing rules 730 Element decideRule = rulesMap.addElement("newObject"); 731 732 // If an acceptiIfPrerequisite node exists, detach and insert before it 733 Node acceptIfPrerequisiteNode = template 734 .selectSingleNode(DECIDERULES_ACCEPT_IF_PREREQUISITE_XPATH); 735 if (acceptIfPrerequisiteNode != null) { 736 List<Node> elements = rulesMap.elements(); 737 int insertPosition = elements.indexOf(acceptIfPrerequisiteNode); 738 decideRule.detach(); 739 elements.add(insertPosition, decideRule); 740 } else { 741 rulesMap.elements().size(); 742 } 743 744 // Add all regexps in the list to a single MatchesListRegExpDecideRule 745 decideRule.addAttribute("name", elementName); 746 decideRule.addAttribute("class", Heritrix1Constants.MATCHESLISTREGEXPDECIDERULE_CLASSNAME); 747 748 Element decision = decideRule.addElement("string"); 749 decision.addAttribute("name", "decision"); 750 decision.addText("REJECT"); 751 752 Element listlogic = decideRule.addElement("string"); 753 listlogic.addAttribute("name", "list-logic"); 754 listlogic.addText("OR"); 755 756 Element regexpList = decideRule.addElement("stringList"); 757 regexpList.addAttribute("name", "regexp-list"); 758 for (String trap : crawlerTraps) { 759 regexpList.addElement("string").addText(trap); 760 } 761 762 } 763 764 @Override 765 public boolean hasContent() { 766 return this.template.hasContent(); 767 } 768 769 @Override 770 public void writeToFile(File orderXmlFile) { 771 XmlUtils.writeXmlToFile(this.template, orderXmlFile); 772 } 773 774 @Override 775 public void setRecoverlogNode(File recoverlogGzFile) { 776 final String RECOVERLOG_PATH_XPATH = "/crawl-order/controller/string[@name='recover-path']"; 777 Node orderXmlNode = template.selectSingleNode(RECOVERLOG_PATH_XPATH); 778 if (orderXmlNode != null) { 779 orderXmlNode.setText(recoverlogGzFile.getAbsolutePath()); 780 log.debug("The Heritrix recover path now refers to '{}'.", recoverlogGzFile.getAbsolutePath()); 781 } else { 782 throw new IOFailure("Unable to locate the '" + RECOVERLOG_PATH_XPATH + "' element in order.xml: " 783 + template.asXML()); 784 } 785 } 786 787 @Override 788 public void setDeduplicationIndexLocation(String absolutePath) { 789 XmlUtils.setNode(template, DEDUPLICATOR_INDEX_LOCATION_XPATH, absolutePath); 790 } 791 792 @Override 793 public void setSeedsFilePath(String absolutePath) { 794 XmlUtils.setNode(template, SEEDS_FILE_XPATH, absolutePath); 795 } 796 797 @Override 798 public void setArchiveFilePrefix(String archiveFilePrefix) { 799 XmlUtils.setNodes(template, ARCHIVEFILE_PREFIX_XPATH, archiveFilePrefix); 800 } 801 802 @Override 803 public void setDiskPath(String absolutePath) { 804 XmlUtils.setNode(template, DISK_PATH_XPATH, absolutePath); 805 } 806 807 @Override 808 public void removeDeduplicatorIfPresent() { 809 Node xpathNode = template.selectSingleNode(DEDUPLICATOR_XPATH); 810 if (xpathNode != null) { 811 xpathNode.detach(); 812 } 813 } 814 815 @Override 816 public void insertWarcInfoMetadata(Job ajob, String origHarvestdefinitionName, 817 String scheduleName, String performer) { 818 819 Node WARCWRITERNODE = template.selectSingleNode(WARCWRITERPROCESSOR_XPATH); 820 if (WARCWRITERNODE == null) { 821 throw new IOFailure("Unable to locate the '" + WARCWRITERPROCESSOR_XPATH + "' element in order.xml: " 822 + template.asXML()); 823 } 824 825 Element warcwriterElement = (Element) WARCWRITERNODE; 826 Element metadataMap = warcwriterElement.addElement("map"); 827 metadataMap.addAttribute("name", "metadata-items"); 828 829 Element metadataItem = null; 830 831 metadataItem = metadataMap.addElement("string"); 832 metadataItem.addAttribute("name", HARVESTINFO_VERSION); 833 metadataItem.addText(HARVESTINFO_VERSION_NUMBER); 834 835 metadataItem = metadataMap.addElement("string"); 836 metadataItem.addAttribute("name", HARVESTINFO_JOBID); 837 metadataItem.addText("" + ajob.getJobID()); 838 839 metadataItem = metadataMap.addElement("string"); 840 metadataItem.addAttribute("name", HARVESTINFO_CHANNEL); 841 metadataItem.addText(ajob.getChannel()); 842 843 metadataItem = metadataMap.addElement("string"); 844 metadataItem.addAttribute("name", HARVESTINFO_HARVESTNUM); 845 metadataItem.addText("" + ajob.getHarvestNum()); 846 847 metadataItem = metadataMap.addElement("string"); 848 metadataItem.addAttribute("name", HARVESTINFO_ORIGHARVESTDEFINITIONID); 849 metadataItem.addText("" + ajob.getOrigHarvestDefinitionID()); 850 851 metadataItem = metadataMap.addElement("string"); 852 metadataItem.addAttribute("name", HARVESTINFO_MAXBYTESPERDOMAIN); 853 metadataItem.addText("" + ajob.getMaxBytesPerDomain()); 854 855 metadataItem = metadataMap.addElement("string"); 856 metadataItem.addAttribute("name", HARVESTINFO_MAXOBJECTSPERDOMAIN); 857 metadataItem.addText("" + ajob.getMaxObjectsPerDomain()); 858 859 metadataItem = metadataMap.addElement("string"); 860 metadataItem.addAttribute("name", HARVESTINFO_ORDERXMLNAME); 861 metadataItem.addText(ajob.getOrderXMLName()); 862 863 metadataItem = metadataMap.addElement("string"); 864 metadataItem.addAttribute("name", HARVESTINFO_ORIGHARVESTDEFINITIONNAME); 865 metadataItem.addText(origHarvestdefinitionName); 866 867 /* optional schedule-name, only for selective harvests. */ 868 if (scheduleName != null) { 869 metadataItem = metadataMap.addElement("string"); 870 metadataItem.addAttribute("name", HARVESTINFO_SCHEDULENAME); 871 metadataItem.addText(scheduleName); 872 } 873 874 metadataItem = metadataMap.addElement("string"); 875 metadataItem.addAttribute("name", HARVESTINFO_HARVESTFILENAMEPREFIX); 876 metadataItem.addText(ajob.getHarvestFilenamePrefix()); 877 878 metadataItem = metadataMap.addElement("string"); 879 metadataItem.addAttribute("name", HARVESTINFO_JOBSUBMITDATE); 880 metadataItem.addText("" + ajob.getSubmittedDate()); 881 882 /* optional HARVESTINFO_PERFORMER */ 883 if (performer != null) { 884 metadataItem = metadataMap.addElement("string"); 885 metadataItem.addAttribute("name", HARVESTINFO_PERFORMER); 886 metadataItem.addText(performer); 887 } 888 889 /* optional HARVESTINFO_AUDIENCE */ 890 if (ajob.getHarvestAudience() != null) { 891 metadataItem = metadataMap.addElement("string"); 892 metadataItem.addAttribute("name", HARVESTINFO_AUDIENCE); 893 metadataItem.addText(ajob.getHarvestAudience()); 894 } 895 } 896 897 @Override 898 public void insertAttributes(List<AttributeAndType> attributesAndTypes) { 899 // Unsupported for Heritrix 1 templates at this point. 900 log.warn("No attribute insertion is done for H1 templates"); 901 } 902 903 @Override 904 public void writeTemplate(JspWriter out) throws IOFailure { 905 try { 906 out.write(template.asXML()); 907 } catch (IOException e) { 908 throw new IOFailure("Unable to write to JspWriter", e); 909 } 910 911 } 912 913}