001/* 002 * #%L 003 * Netarchivesuite - harvester 004 * %% 005 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library, 006 * the National Library of France and the Austrian National Library. 007 * %% 008 * This program is free software: you can redistribute it and/or modify 009 * it under the terms of the GNU Lesser General Public License as 010 * published by the Free Software Foundation, either version 2.1 of the 011 * License, or (at your option) any later version. 012 * 013 * This program is distributed in the hope that it will be useful, 014 * but WITHOUT ANY WARRANTY; without even the implied warranty of 015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 016 * GNU General Lesser Public License for more details. 017 * 018 * You should have received a copy of the GNU General Lesser Public 019 * License along with this program. If not, see 020 * <http://www.gnu.org/licenses/lgpl-2.1.html>. 021 * #L% 022 */ 023package dk.netarkivet.harvester.datamodel; 024 025import java.io.BufferedWriter; 026import java.io.File; 027import java.io.FileWriter; 028import java.io.IOException; 029import java.io.OutputStream; 030import java.io.Serializable; 031import java.nio.charset.Charset; 032import java.util.List; 033 034import javax.servlet.jsp.JspWriter; 035 036import org.apache.commons.io.IOUtils; 037import org.slf4j.Logger; 038import org.slf4j.LoggerFactory; 039 040import dk.netarkivet.common.exceptions.ArgumentNotValid; 041import dk.netarkivet.common.exceptions.IOFailure; 042import dk.netarkivet.common.exceptions.IllegalState; 043import dk.netarkivet.common.exceptions.NotImplementedException; 044import dk.netarkivet.common.utils.Settings; 045import dk.netarkivet.harvester.HarvesterSettings; 046 047/** 048 * Class encapsulating the Heritrix crawler-beans.cxml file 049 * <p> 050 * 051 * Heritrix3 has a new model based on spring, So the XPATH is no good for processing. 052 * Instead we use placeholders instead, marked by %{..} instead of ${..}, which is used by 053 * Heritrix3 already. 054 * 055 * The template is a H3 template if it contains the string: 056 * 057 * "xmlns="http://www.springframework.org/...." 058 * 059 */ 060public class H3HeritrixTemplate extends HeritrixTemplate implements Serializable { 061 062 private static final Logger log = LoggerFactory.getLogger(H3HeritrixTemplate.class); 063 064 private String template; 065 066 /** QuotaEnforcer states for this template. TODO necessary?? */ 067 private Long forceMaxbytesPerDomain; 068 private Long forceMaxobjectsPerDomain; 069 070 /** Has this HeritrixTemplate been verified. */ 071 private boolean verified; 072 073 public final static String METADATA_ITEMS_PLACEHOLDER = "%{METADATA_ITEMS_PLACEHOLDER}"; 074 public static final String MAX_TIME_SECONDS_PLACEHOLDER = "%{MAX_TIME_SECONDS_PLACEHOLDER}"; 075 public static final String CRAWLERTRAPS_PLACEHOLDER = "%{CRAWLERTRAPS_PLACEHOLDER}"; 076 077 public static final String DEDUPLICATION_BEAN_REFERENCE_PATTERN = "<ref bean=\"DeDuplicator\"/>"; 078 public static final String DEDUPLICATION_BEAN_PATTERN = "<bean id=\"DeDuplicator\""; 079 public static final String DEDUPLICATION_INDEX_LOCATION_PLACEHOLDER 080 = "%{DEDUPLICATION_INDEX_LOCATION_PLACEHOLDER}"; 081 082 public static final String ARCHIVE_FILE_PREFIX_PLACEHOLDER = "%{ARCHIVE_FILE_PREFIX_PLACEHOLDER}"; 083 084 public static final String FRONTIER_QUEUE_TOTAL_BUDGET_PLACEHOLDER 085 = "%{FRONTIER_QUEUE_TOTAL_BUDGET_PLACEHOLDER}"; 086 087 public static final String QUOTA_ENFORCER_GROUP_MAX_FETCH_SUCCES_PLACEHOLDER = 088 "%{QUOTA_ENFORCER_GROUP_MAX_FETCH_SUCCES_PLACEHOLDER}"; 089 090 public static final String QUOTA_ENFORCER_MAX_BYTES_PLACEHOLDER 091 = "%{QUOTA_ENFORCER_MAX_BYTES_PLACEHOLDER}"; 092 093 094 // PLACEHOLDERS for archiver beans (Maybe not necessary) 095 final String ARCHIVER_BEAN_REFERENCE_PLACEHOLDER = "%{ARCHIVER_BEAN_REFERENCE_PLACEHOLDER}"; 096 final String ARCHIVER_PROCESSOR_BEAN_PLACEHOLDER = "%{ARCHIVER_PROCESSOR_BEAN_PLACEHOLDER}"; 097 098 /** 099 * Constructor for HeritrixTemplate class. 100 * 101 * @param doc the order.xml 102 * @param verify If true, verifies if the given dom4j Document contains the elements required by our software. 103 * @throws ArgumentNotValid if doc is null, or verify is true and doc does not obey the constraints required by our 104 * software. 105 */ 106 public H3HeritrixTemplate(String template) { 107 ArgumentNotValid.checkNotNull(template, "String template"); 108 this.template = template; 109 } 110 111 /** 112 * return the template. 113 * 114 * @return the template 115 */ 116 public HeritrixTemplate getTemplate() { 117 return this; 118 } 119 120 /** 121 * Has Template been verified? 122 * 123 * @return true, if verified on construction, otherwise false 124 */ 125 public boolean isVerified() { 126 return verified; 127 } 128 129 /** 130 * Return HeritrixTemplate as XML. 131 * @return HeritrixTemplate as XML 132 */ 133 @Override 134 public String getXML() { 135 return template; 136 } 137 138 /** 139 * Update the maxTimeSeconds property in the heritrix3 template, if possible. 140 * @param maxJobRunningTimeSecondsL Force the harvestJob to end after this number of seconds 141 * Property of the org.archive.crawler.framework.CrawlLimitEnforcer 142 * <!-- <property name="maxTimeSeconds" value="0" /> --> 143 */ 144 @Override 145 public void setMaxJobRunningTime(Long maxJobRunningTimeSecondsL) { 146 if (template.contains(MAX_TIME_SECONDS_PLACEHOLDER)) { 147 this.template = template.replace(MAX_TIME_SECONDS_PLACEHOLDER, 148 Long.toString(maxJobRunningTimeSecondsL)); 149 } else { 150 log.warn("The placeholder '" + MAX_TIME_SECONDS_PLACEHOLDER 151 + "' was not found in the template. Therefore maxRunningTime not set"); 152 } 153 } 154 155 156 @Override 157 public void setMaxBytesPerDomain(Long maxbytesL) { 158 this.forceMaxbytesPerDomain = maxbytesL; 159 } 160 161 162 @Override 163 public Long getMaxBytesPerDomain() { 164 return this.forceMaxbytesPerDomain; 165 } 166 167 @Override 168 public void setMaxObjectsPerDomain(Long maxobjectsL) { 169 this.forceMaxobjectsPerDomain = maxobjectsL; 170 } 171 172 @Override 173 public Long getMaxObjectsPerDomain() { 174 return this.forceMaxobjectsPerDomain; 175 } 176 177 @Override 178 public boolean isValid() { 179 /* 180 StringBuilder errors = new StringBuilder(); 181 // check for Deduplication index-location placeholder and DEDUPLICATION_BEAN_PATTERN 182 if (template.contains(DEDUPLICATION_BEAN_PATTERN)) { 183 if (!template.contains(DEDUPLICATION_INDEX_LOCATION_PLACEHOLDER)) { 184 errors.append("Has DefdMissing placeholder '" + DEDUPLICATION_INDEX_LOCATION_PLACEHOLDER + "'" 185 } 186 } 187 template.contains(DEDUPLICATION_INDEX_LOCATION_PLACEHOLDER) 188 && template.contains(deduplicationBeanPattern) 189 */ 190 return true; 191 } 192 193 @Override 194 // This method is used to decide, whether to request a deduplication index or not. 195 // Done by checking, if both 196 // - a DeDuplicator bean is present in the template 197 // and 198 // - a DEDUPLICATION_INDEX_LOCATION_PLACEHOLDER is also present. 199 // and 200 // - a DeDuplicator reference bean is present in the template 201 public boolean IsDeduplicationEnabled() { 202 return (template.contains(DEDUPLICATION_INDEX_LOCATION_PLACEHOLDER) 203 && template.contains(DEDUPLICATION_BEAN_PATTERN) 204 && template.contains(DEDUPLICATION_BEAN_REFERENCE_PATTERN)); 205 } 206 207 /** 208 * Configuring the quota-enforcer, depending on budget definition. Object limit can be defined either by 209 * using the queue-total-budget property or the quota enforcer. Which is chosen is set by the argument 210 * maxObjectsIsSetByQuotaEnforcer}'s value. So quota enforcer is set as follows: 211 * If all values in the quotaEnforcer is infinity, it is in effect disabled 212 * <ul> 213 * <li>Object limit is not set by quota enforcer, disabled only if there is no byte limit.</li> 214 * <li>Object limit is set by quota enforcer, so it should be enabled if a byte or object limit is set.</li> 215 * </ul> 216 * 217 * @param maxObjectsIsSetByQuotaEnforcer Decides whether the maxObjectsIsSetByQuotaEnforcer or not. 218 * @param forceMaxBytesPerDomain The number of max bytes per domain enforced (can be no limit) 219 * @param forceMaxObjectsPerDomain The number of max objects per domain enforced (can be no limit) 220 */ 221 public void configureQuotaEnforcer(boolean maxObjectsIsSetByQuotaEnforcer, 222 long forceMaxBytesPerDomain, long forceMaxObjectsPerDomain) { 223 this.forceMaxobjectsPerDomain = forceMaxObjectsPerDomain; 224 this.forceMaxbytesPerDomain = forceMaxBytesPerDomain; 225 String tmp = template; 226 if (!maxObjectsIsSetByQuotaEnforcer) { 227 // SetMaxObjects in the global budget to forceMaxObjectsPerDomain?? 228 String tmp1 = tmp.replace( 229 FRONTIER_QUEUE_TOTAL_BUDGET_PLACEHOLDER, Long.toString( forceMaxObjectsPerDomain )); 230 // SetMaxObjects to infinity in the quotaEnforcer 231 tmp = tmp1.replace(QUOTA_ENFORCER_GROUP_MAX_FETCH_SUCCES_PLACEHOLDER, 232 Long.toString(Constants.HERITRIX_MAXOBJECTS_INFINITY)); 233 } else { 234 // SetMaxObjects in the global budget to Infinity 235 String tmp1 = tmp.replace( 236 FRONTIER_QUEUE_TOTAL_BUDGET_PLACEHOLDER, Long.toString( Constants.HERITRIX_MAXOBJECTS_INFINITY )); 237 // SetMaxObjects to forceMaxObjectsPerDomain in the quotaEnforcer 238 tmp = tmp1.replace(QUOTA_ENFORCER_GROUP_MAX_FETCH_SUCCES_PLACEHOLDER, 239 Long.toString(forceMaxObjectsPerDomain)); 240 } 241 242 // SetMaxbytes in the QuotaEnforcer to forceMaxBytesPerDomain 243 // Divide by 1024 since Heritrix uses KB rather than bytes, 244 // and add 1 to avoid to low limit due to rounding. 245 String maxBytesStringValue = "-1"; 246 if (forceMaxBytesPerDomain != Constants.HERITRIX_MAXBYTES_INFINITY) { 247 maxBytesStringValue = Long.toString(( forceMaxBytesPerDomain 248 / Constants.BYTES_PER_HERITRIX_BYTELIMIT_UNIT) + 1); 249 log.debug("MaxbytesPerDomain set to {} Kbytes per domain", maxBytesStringValue); 250 } else { 251 log.debug("MaxbytesPerDomain set to infinite number of Kbytes per domain"); 252 } 253 254 this.template = tmp.replace(QUOTA_ENFORCER_MAX_BYTES_PLACEHOLDER, maxBytesStringValue); 255 256 } 257 258 /** 259 * Make sure that Heritrix will archive its data in the chosen archiveFormat. 260 * 261 * @param archiveFormat the chosen archiveformat ('arc' or 'warc' supported) 262 * @throw ArgumentNotValid If the chosen archiveFormat is not supported. 263 */ 264 @Override 265 public void setArchiveFormat(String archiveFormat) { 266 if (!template.contains(ARCHIVER_BEAN_REFERENCE_PLACEHOLDER)){ 267 throw new IllegalState("The placeholder '" + ARCHIVER_BEAN_REFERENCE_PLACEHOLDER 268 + "' is missing. Unable to insert proper archive writer"); 269 } 270 if (!template.contains(ARCHIVER_PROCESSOR_BEAN_PLACEHOLDER)) { 271 throw new IllegalState("The placeholder '" + ARCHIVER_PROCESSOR_BEAN_PLACEHOLDER 272 + "' is missing. Unable to insert proper archive writer"); 273 } 274 if ("arc".equalsIgnoreCase(archiveFormat)) { 275 log.debug("ARC format selected to be used by Heritrix3"); 276 setArcArchiveformat(); 277 } else if ("warc".equalsIgnoreCase(archiveFormat)) { 278 log.debug("WARC format selected to be used by Heritrix3"); 279 setWarcArchiveformat(); 280 } else { 281 throw new ArgumentNotValid("Configuration of '" + HarvesterSettings.HERITRIX_ARCHIVE_FORMAT 282 + "' is invalid! Unrecognized format '" + archiveFormat + "'."); 283 } 284 } 285 286 /** 287 * Set the archive-format as ARC. This means enabling the ARCWriterProcessor in the template 288 */ 289 private void setArcArchiveformat(){ 290 String arcWriterbeanReference = "<ref bean=\"arcWriter\"/>"; 291 String templateNew = template.replace(ARCHIVER_BEAN_REFERENCE_PLACEHOLDER, arcWriterbeanReference); 292 template = templateNew.replace(ARCHIVER_PROCESSOR_BEAN_PLACEHOLDER, getArcWriterProcessor()); 293 } 294 295 private String getArcWriterProcessor() { 296 297// <bean id="arcWriter" class="org.archive.modules.writer.ARCWriterProcessor"> 298// <!-- <property name="compress" value="true" /> --> 299// <!-- <property name="prefix" value="IAH" /> --> 300// <!-- <property name="suffix" value="${HOSTNAME}" /> --> 301// <!-- <property name="maxFileSizeBytes" value="100000000" /> --> 302// <!-- <property name="poolMaxActive" value="1" /> --> 303// <!-- <property name="poolMaxWaitMs" value="300000" /> --> 304// <!-- <property name="skipIdenticalDigests" value="false" /> --> 305// <!-- <property name="maxTotalBytesToWrite" value="0" /> --> 306// <!-- <property name="directory" value="." /> --> 307// <!-- <property name="storePaths"> 308// <list> 309// <value>arcs</value> 310// </list> 311// </property> --> 312// </bean> 313// 314 String arcWriterBean 315 = "<bean id=\"arcWriter\" class=\"org.archive.modules.writer.ARCWriterProcessor\">"; 316 // TODO Read compress value from heritrix3Settings 317 arcWriterBean += "\n<property name=\"compress\" value=\"false\"/>" 318 + "\n<property name=\"prefix\" value=\"" + ARCHIVE_FILE_PREFIX_PLACEHOLDER 319 + "\"/></bean>"; 320 return arcWriterBean; 321 } 322 323 324 /** 325 * Insert WARC-archiver beans and remove placeholder for ARC-Archiver-beans 326 * It is an error, if the WARC place-holders doesnt't exist. 327 * It is not an error, if the property placeholder does not exist. 328 */ 329 private void setWarcArchiveformat() { 330 String warcWriterbeanReference = "<ref bean=\"warcWriter\"/>"; 331 String warcWriterProcessorBean = "<bean id=\"warcWriter\" class=\"dk.netarkivet.harvester.harvesting.NasWARCProcessor\">"; 332 String propertyName="\n<property name=\""; 333 String valuePrefix = "\" value=\""; 334 String valueSuffix = "\""; 335 String propertyEnd="/>"; 336 if (!template.contains(ARCHIVER_BEAN_REFERENCE_PLACEHOLDER)) { 337 throw new IllegalState("The placeholder '" + ARCHIVER_BEAN_REFERENCE_PLACEHOLDER 338 + "' is missing"); 339 } 340 if (!template.contains(ARCHIVER_PROCESSOR_BEAN_PLACEHOLDER)) { 341 throw new IllegalState("The placeholder '" + ARCHIVER_PROCESSOR_BEAN_PLACEHOLDER 342 + "' is missing"); 343 } 344 StringBuilder propertyBuilder = new StringBuilder(); 345 // TODO Read template from Heritrix3Settings 346 propertyBuilder.append(propertyName + "template" + valuePrefix 347 + "${prefix}-${timestamp17}-${serialno}-${heritrix.hostname}" 348 // Default value: ${prefix}-${timestamp17}-${serialno}-${heritrix.pid}~${heritrix.hostname}~${heritrix.port} 349 + valueSuffix + propertyEnd); 350 propertyBuilder.append(propertyName + "compress" + valuePrefix + "false" // TODO Replace false by Heritrix3Settingsvalue 351 + valueSuffix + propertyEnd); 352 propertyBuilder.append(propertyName + "prefix" + valuePrefix 353 + ARCHIVE_FILE_PREFIX_PLACEHOLDER 354 + valueSuffix + propertyEnd); 355 propertyBuilder.append(propertyName + "writeRequests" + valuePrefix 356 + Settings.get(HarvesterSettings.HERITRIX_WARC_WRITE_REQUESTS) 357 + valueSuffix + propertyEnd); 358 propertyBuilder.append(propertyName + "writeMetadata" + valuePrefix 359 + Settings.get(HarvesterSettings.HERITRIX_WARC_WRITE_METADATA) 360 + valueSuffix + propertyEnd); 361 /* 362 propertyBuilder.append(propertyName + "writeRevisitForIdenticalDigests" + valuePrefix 363 + Settings.get(HarvesterSettings.HERITRIX_WARC_WRITE_REVISIT_FOR_IDENTICAL_DIGESTS) 364 + valueSuffix + propertyEnd); 365 propertyBuilder.append(propertyName + "writeRevisitForNotModified" + valuePrefix 366 + Settings.get(HarvesterSettings.HERITRIX_WARC_WRITE_REVISIT_FOR_NOT_MODIFIED) 367 + valueSuffix + propertyEnd); 368 */ 369 propertyBuilder.append(propertyName + "skipIdenticalDigests" + valuePrefix 370 + Settings.get(HarvesterSettings.HERITRIX_WARC_SKIP_IDENTICAL_DIGESTS) 371 + valueSuffix + propertyEnd); 372 propertyBuilder.append( 373 propertyName + "startNewFilesOnCheckpoint" + valuePrefix 374 + Settings.get(HarvesterSettings.HERITRIX_WARC_START_NEW_FILES_ON_CHECKPOINT) 375 + valueSuffix + propertyEnd); 376 377 warcWriterProcessorBean += propertyBuilder.toString(); 378 warcWriterProcessorBean += "\n\n%{METADATA_ITEMS_PLACEHOLDER}\n</bean>"; 379 String templateNew = template.replace( 380 ARCHIVER_BEAN_REFERENCE_PLACEHOLDER, warcWriterbeanReference); 381 this.template = templateNew.replace(ARCHIVER_PROCESSOR_BEAN_PLACEHOLDER, 382 warcWriterProcessorBean); 383 } 384 385 @Override 386 /** 387 * With H3 template, we insert the crawlertraps into the template at once. 388 * They are inserted to be part of a org.archive.modules.deciderules.MatchesListRegexDecideRule 389 * bean. 390 * 391 * @param elementName The elementName is currently not used with H3 392 * @param crawlertraps A list of crawlertraps to be inserted 393 */ 394 public void insertCrawlerTraps(String elementName, List<String> crawlertraps) { 395// <bean class="org.archive.modules.deciderules.MatchesListRegexDecideRule"> 396// <!-- <property name="listLogicalOr" value="true" /> --> 397// <!-- <property name="regexList"> 398// <list> 399// CRAWLERTRAPS_PLACEHOLDER 400// </list> 401// </property> --> 402// </bean> 403 if (crawlertraps.isEmpty()) { 404 log.debug("No crawlertraps yet. No insertion is done"); 405 return; 406 } else if (!template.contains(CRAWLERTRAPS_PLACEHOLDER)) { 407 log.warn("The placeholder '" + CRAWLERTRAPS_PLACEHOLDER 408 + "' is absent from the template. No insertion is done at all. {} traps were ignored", 409 crawlertraps); 410 return; 411 } else { 412 log.info("Inserting {} crawlertraps into the template", crawlertraps.size()); 413 StringBuilder sb = new StringBuilder(); 414 for (String trap: crawlertraps) { 415 sb.append("<value>" + trap + "</value>\n"); 416 } 417 // Adding the placeholder again to be able to insert crawlertraps multiple times. 418 sb.append(CRAWLERTRAPS_PLACEHOLDER + "\n"); 419 String templateNew = template.replace(CRAWLERTRAPS_PLACEHOLDER, sb.toString()); 420 this.template = templateNew; 421 } 422 } 423 424 @Override 425 public void writeTemplate(OutputStream os) throws IOFailure { 426 try { 427 os.write(template.getBytes(Charset.forName("UTF-8"))); 428 } catch (IOException e) { 429 throw new IOFailure("Unable to write template to outputstream", e); 430 } 431 432 } 433 434 @Override 435 public boolean hasContent() { 436 throw new NotImplementedException("The hasContent method hasn't been implemented yet"); 437 } 438 439 @Override 440 public void writeToFile(File orderXmlFile) { 441 BufferedWriter writer = null; 442 try { 443 writer = new BufferedWriter( new FileWriter(orderXmlFile)); 444 writer.write(template); 445 } catch(IOException e) { 446 throw new IOFailure("Unable to write template to file '" + orderXmlFile.getAbsolutePath() + "'.", e); 447 } finally { 448 IOUtils.closeQuietly(writer); 449 } 450 } 451 452 @Override 453 public void setRecoverlogNode(File recoverlogGzFile) { 454 throw new NotImplementedException("This method has not yet been implemented"); 455 456 } 457 458 @Override 459 public void setDeduplicationIndexLocation(String absolutePath) { 460 if (!template.contains(DEDUPLICATION_INDEX_LOCATION_PLACEHOLDER)) { 461 throw new IllegalState("The placeholder for the deduplication index location property '" + DEDUPLICATION_INDEX_LOCATION_PLACEHOLDER 462 + "' was not found. Maybe the placeholder has already been replaced with the correct value: " 463 + template); 464 } 465 String templateNew = template.replace(DEDUPLICATION_INDEX_LOCATION_PLACEHOLDER, absolutePath); 466 this.template = templateNew; 467 } 468 469 @Override 470 public void setSeedsFilePath(String absolutePath) { 471 log.debug("Note: SeedsFilePath is not set in h3"); 472 } 473 474 @Override 475 public void setArchiveFilePrefix(String archiveFilePrefix) { 476 if (!template.contains(ARCHIVE_FILE_PREFIX_PLACEHOLDER)) { 477 throw new IllegalState("The placeholder for the archive file prefix property '" 478 + ARCHIVE_FILE_PREFIX_PLACEHOLDER 479 + "' was not found. Maybe the placeholder has already been replaced with the correct value. The template looks like this: " 480 + template); 481 } 482 String templateNew = template.replace(ARCHIVE_FILE_PREFIX_PLACEHOLDER, archiveFilePrefix); 483 this.template = templateNew; 484 485 } 486 487 @Override 488 public void setDiskPath(String absolutePath) { 489 // NOP 490 log.warn("The DiskPath is not settable in the H3 template"); 491 } 492 493 @Override 494 public void removeDeduplicatorIfPresent() { 495 //NOP 496 log.warn("Removing the Deduplicator is not possible with the H3 templates and should not be required with the H3 template."); 497 } 498 499//<property name="metadataItems"> 500// <map> 501// <entry key="harvestInfo.version" value="1.03"/> <!-- TODO maybe not add this one --> 502// <entry key="harvestInfo.jobId" value="1"/> 503// <entry key="harvestInfo.channel" value="HIGH"/> 504// <entry key="harvestInfo.harvestNum" value="1"/> 505// <entry key="harvestInfo.origHarvestDefinitionID" value="1"/> 506// <entry key="harvestInfo.maxBytesPerDomain" value="100000"/> 507// <entry key="harvestInfo.maxObjectsPerDomain" value="-1"/> 508// <entry key="harvestInfo.orderXMLName" value="defaultOrderXml"/> 509// <entry key="harvestInfo.origHarvestDefinitionName" value="ddddddddd"/> 510// <entry key="harvestInfo.scheduleName" value="EveryHour"/> <!-- Optional. only relevant for Selective Harvests --> 511// <entry key="harvestInfo.harvestFilenamePrefix" value="netarkivet-1-1"/> 512// <entry key="harvestInfo.jobSubmitDate" value="22. 10. 2014"/> 513// <entry key="harvestInfo.performer" value="performer"/> <!-- Optional. --> 514// <entry key="harvestInfo.audience" value="audience"/> <!-- Optional. --> 515// </map> 516// </property> 517 518 public void insertWarcInfoMetadata(Job ajob, String origHarvestdefinitionName, 519 String scheduleName, String performer) { 520 if (!template.contains(METADATA_ITEMS_PLACEHOLDER)) { 521 throw new IllegalState("The placeholder for the property '" + METADATA_ITEMS_PLACEHOLDER 522 + "' was not found. Maybe the placeholder has already been replaced with the correct value. The template looks like this: " 523 + template); 524 } 525 String startMetadataEntry = "\n<entry key=\""; 526 String endMetadataEntry = "\"/>"; 527 String valuePart = "\" value=\""; 528 StringBuilder sb = new StringBuilder(); 529 sb.append("<property name=\"metadataItems\">\n<map>\n"); 530 531 // <entry key="harvestInfo.version" value="1.03"/> 532 533 sb.append(startMetadataEntry); 534 sb.append(HARVESTINFO_VERSION + valuePart + HARVESTINFO_VERSION_NUMBER + endMetadataEntry); 535 sb.append(startMetadataEntry); 536 sb.append(HARVESTINFO_JOBID + valuePart + ajob.getJobID() + endMetadataEntry); 537 538 sb.append(startMetadataEntry); 539 sb.append(HARVESTINFO_CHANNEL + valuePart + ajob.getChannel() + endMetadataEntry); 540 sb.append(startMetadataEntry); 541 sb.append(HARVESTINFO_HARVESTNUM + valuePart + ajob.getHarvestNum() + endMetadataEntry); 542 sb.append(startMetadataEntry); 543 sb.append(HARVESTINFO_ORIGHARVESTDEFINITIONID + valuePart + ajob.getOrigHarvestDefinitionID() + endMetadataEntry); 544 sb.append(startMetadataEntry); 545 sb.append(HARVESTINFO_MAXBYTESPERDOMAIN + valuePart + ajob.getMaxBytesPerDomain() + endMetadataEntry); 546 sb.append(startMetadataEntry); 547 sb.append(HARVESTINFO_MAXOBJECTSPERDOMAIN + valuePart + ajob.getMaxObjectsPerDomain() + endMetadataEntry); 548 sb.append(startMetadataEntry); 549 sb.append(HARVESTINFO_ORDERXMLNAME + valuePart + ajob.getOrderXMLName() + endMetadataEntry); 550 sb.append(startMetadataEntry); 551 sb.append(HARVESTINFO_ORIGHARVESTDEFINITIONNAME + valuePart + 552 origHarvestdefinitionName + endMetadataEntry); 553 554 /* optional schedule-name. */ 555 if (scheduleName != null) { 556 sb.append(startMetadataEntry); 557 sb.append(HARVESTINFO_SCHEDULENAME + valuePart + scheduleName + endMetadataEntry); 558 } 559 sb.append(startMetadataEntry); 560 sb.append(HARVESTINFO_HARVESTFILENAMEPREFIX + valuePart + ajob.getHarvestFilenamePrefix() + endMetadataEntry); 561 sb.append(startMetadataEntry); 562 sb.append(HARVESTINFO_JOBSUBMITDATE + valuePart + ajob.getSubmittedDate() + endMetadataEntry); 563 564 /* optional HARVESTINFO_PERFORMER */ 565 if (performer != null){ 566 sb.append(startMetadataEntry); 567 sb.append(HARVESTINFO_PERFORMER + valuePart + performer + endMetadataEntry); 568 } 569 570 /* optional HARVESTINFO_PERFORMER */ 571 if (ajob.getHarvestAudience() != null) { 572 sb.append(startMetadataEntry); 573 sb.append(HARVESTINFO_AUDIENCE + valuePart + ajob.getHarvestAudience() + endMetadataEntry); 574 } 575 sb.append("\n</map>\n</property>\n"); 576 577 // Replace command 578 String templateNew = template.replace(METADATA_ITEMS_PLACEHOLDER, sb.toString()); 579 this.template = templateNew; 580 } 581 582 @Override 583 public void writeTemplate(JspWriter out) throws IOFailure { 584 try { 585 out.write(template); 586 } catch (IOException e) { 587 throw new IOFailure("Unable to write to JspWriter", e); 588 } 589 } 590 591 /** 592 * Hack to remove existing placeholders, that is still present after template 593 * manipulation is completed. 594 */ 595 public void removePlaceholders() { 596 template = template.replace(METADATA_ITEMS_PLACEHOLDER, ""); 597 template = template.replace(CRAWLERTRAPS_PLACEHOLDER, ""); 598 599 if (template.contains(METADATA_ITEMS_PLACEHOLDER)) { 600 throw new IllegalState("The placeholder for the property '" + METADATA_ITEMS_PLACEHOLDER 601 + "' should have been deleted now."); 602 } 603 if (template.contains(CRAWLERTRAPS_PLACEHOLDER)) { 604 throw new IllegalState("The placeholder for the property '" + CRAWLERTRAPS_PLACEHOLDER 605 + "' should have been deleted now."); 606 } 607 } 608}