001/* 002 * #%L 003 * Netarchivesuite - harvester 004 * %% 005 * Copyright (C) 2005 - 2018 The Royal Danish Library, 006 * the National Library of France and the Austrian National Library. 007 * %% 008 * This program is free software: you can redistribute it and/or modify 009 * it under the terms of the GNU Lesser General Public License as 010 * published by the Free Software Foundation, either version 2.1 of the 011 * License, or (at your option) any later version. 012 * 013 * This program is distributed in the hope that it will be useful, 014 * but WITHOUT ANY WARRANTY; without even the implied warranty of 015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 016 * GNU General Lesser Public License for more details. 017 * 018 * You should have received a copy of the GNU General Lesser Public 019 * License along with this program. If not, see 020 * <http://www.gnu.org/licenses/lgpl-2.1.html>. 021 * #L% 022 */ 023package dk.netarkivet.harvester.datamodel; 024 025import java.io.BufferedWriter; 026import java.io.File; 027import java.io.FileWriter; 028import java.io.IOException; 029import java.io.OutputStream; 030import java.io.Serializable; 031import java.nio.charset.Charset; 032import java.util.HashMap; 033import java.util.List; 034import java.util.Map; 035import java.util.regex.Matcher; 036import java.util.regex.Pattern; 037 038import javax.servlet.jsp.JspWriter; 039 040import org.apache.commons.io.IOUtils; 041import org.apache.commons.lang.StringUtils; 042import org.slf4j.Logger; 043import org.slf4j.LoggerFactory; 044 045import com.antiaction.raptor.dao.AttributeBase; 046import com.antiaction.raptor.dao.AttributeTypeBase; 047 048import dk.netarkivet.common.exceptions.ArgumentNotValid; 049import dk.netarkivet.common.exceptions.IOFailure; 050import dk.netarkivet.common.exceptions.IllegalState; 051import dk.netarkivet.common.exceptions.NotImplementedException; 052import dk.netarkivet.common.utils.Settings; 053import dk.netarkivet.common.utils.archive.ArchiveDateConverter; 054import dk.netarkivet.harvester.HarvesterSettings; 055import dk.netarkivet.harvester.datamodel.eav.EAV.AttributeAndType; 056 057/** 058 * Class encapsulating the Heritrix crawler-beans.cxml file 059 * <p> 060 * 061 * Heritrix3 has a new model based on spring, So the XPATH is no good for processing. 062 * Instead we use placeholders instead, marked by %{..} instead of ${..}, which is used by 063 * Heritrix3 already. 064 * 065 * The template is a H3 template if it contains the string: 066 * 067 * "xmlns="http://www.springframework.org/...." 068 * 069 */ 070public class H3HeritrixTemplate extends HeritrixTemplate implements Serializable { 071 072 private static final Logger log = LoggerFactory.getLogger(H3HeritrixTemplate.class); 073 074 private String template; 075 076 /** QuotaEnforcer states for this template. TODO necessary?? */ 077 private Long forceMaxbytesPerDomain; 078 private Long forceMaxobjectsPerDomain; 079 080 /** Has this HeritrixTemplate been verified. */ 081 private boolean verified; 082 083 public final static String METADATA_ITEMS_PLACEHOLDER = "%{METADATA_ITEMS_PLACEHOLDER}"; 084 public static final String MAX_TIME_SECONDS_PLACEHOLDER = "%{MAX_TIME_SECONDS_PLACEHOLDER}"; 085 public static final String CRAWLERTRAPS_PLACEHOLDER = "%{CRAWLERTRAPS_PLACEHOLDER}"; 086 087 public static final Pattern DEDUPLICATION_BEAN_REFERENCE_PATTERN = Pattern.compile(".*ref.*bean.*DeDuplicator.*", Pattern.DOTALL); 088 089 public static final Pattern DEDUPLICATION_BEAN_PATTERN = Pattern.compile(".*bean.*id.*DeDuplicator.*", Pattern.DOTALL); 090 public static final String DEDUPLICATION_INDEX_LOCATION_PLACEHOLDER 091 = "%{DEDUPLICATION_INDEX_LOCATION_PLACEHOLDER}"; 092 093 public static final String ARCHIVE_FILE_PREFIX_PLACEHOLDER = "%{ARCHIVE_FILE_PREFIX_PLACEHOLDER}"; 094 095 public static final String FRONTIER_QUEUE_TOTAL_BUDGET_PLACEHOLDER 096 = "%{FRONTIER_QUEUE_TOTAL_BUDGET_PLACEHOLDER}"; 097 098 public static final String QUOTA_ENFORCER_GROUP_MAX_FETCH_SUCCES_PLACEHOLDER = 099 "%{QUOTA_ENFORCER_GROUP_MAX_FETCH_SUCCES_PLACEHOLDER}"; 100 101 public static final String QUOTA_ENFORCER_MAX_BYTES_PLACEHOLDER 102 = "%{QUOTA_ENFORCER_MAX_BYTES_PLACEHOLDER}"; 103 104 public static final String DEDUPLICATION_ENABLED_PLACEHOLDER = "%{DEDUPLICATION_ENABLED_PLACEHOLDER}"; 105 106 107 // PLACEHOLDERS for archiver beans (Maybe not necessary) 108 final String ARCHIVER_BEAN_REFERENCE_PLACEHOLDER = "%{ARCHIVER_BEAN_REFERENCE_PLACEHOLDER}"; 109 final String ARCHIVER_PROCESSOR_BEAN_PLACEHOLDER = "%{ARCHIVER_PROCESSOR_BEAN_PLACEHOLDER}"; 110 111 //match theses properties in crawler-beans.cxml to add them into harvestInfo.xml 112 //for preservation purpose 113 public enum MetadataInfo { 114 TEMPLATE_DESCRIPTION("metadata\\.description=.+[\\r\\n]"), 115 TEMPLATE_UPDATE_DATE("metadata\\.date=.+[\\r\\n]"), 116 OPERATOR("metadata\\.operator=.+[\\r\\n]"); 117 118 private final String regex; 119 120 private MetadataInfo(String regex) { 121 this.regex = regex; 122 } 123 124 public String toString() { 125 return this.regex; 126 } 127 }; 128 129 public Map<MetadataInfo, String> metadataInfoMap; 130 131 /** 132 * Constructor for HeritrixTemplate class. 133 * 134 * @param template_id The persistent id of the template in the database 135 * @param template The template as String object 136 * @throws ArgumentNotValid if template is null. 137 */ 138 public H3HeritrixTemplate(long template_id, String template) { 139 ArgumentNotValid.checkNotNull(template, "String template"); 140 this.template_id = template_id; 141 this.template = template; 142 143 metadataInfoMap = new HashMap<MetadataInfo, String> (); 144 for(MetadataInfo metadataInfo : MetadataInfo.values()) { 145 Pattern p = Pattern.compile(metadataInfo.regex); 146 Matcher m = p.matcher(this.template); 147 if(m.find()) { 148 String operator = this.template.substring(m.start(), m.end()).trim(); 149 //return the value of the property after the = 150 metadataInfoMap.put(metadataInfo, operator.split("=")[1]); 151 } 152 } 153 } 154 155 /** 156 * return the template. 157 * 158 * @return the template 159 */ 160 public HeritrixTemplate getTemplate() { 161 return this; 162 } 163 164 /** 165 * Has Template been verified? 166 * 167 * @return true, if verified on construction, otherwise false 168 */ 169 public boolean isVerified() { 170 return verified; 171 } 172 173 /** 174 * Return HeritrixTemplate as XML. 175 * @return HeritrixTemplate as XML 176 */ 177 @Override 178 public String getXML() { 179 return template; 180 } 181 182 /** 183 * Update the maxTimeSeconds property in the heritrix3 template, if possible. 184 * @param maxJobRunningTimeSecondsL Force the harvestJob to end after this number of seconds 185 * Property of the org.archive.crawler.framework.CrawlLimitEnforcer 186 * <!-- <property name="maxTimeSeconds" value="0" /> --> 187 */ 188 @Override 189 public void setMaxJobRunningTime(Long maxJobRunningTimeSecondsL) { 190 if (template.contains(MAX_TIME_SECONDS_PLACEHOLDER)) { 191 this.template = template.replace(MAX_TIME_SECONDS_PLACEHOLDER, 192 Long.toString(maxJobRunningTimeSecondsL)); 193 } else { 194 log.warn("The placeholder '" + MAX_TIME_SECONDS_PLACEHOLDER 195 + "' was not found in the template. Therefore maxRunningTime not set"); 196 } 197 } 198 199 200 @Override 201 public void setMaxBytesPerDomain(Long maxbytesL) { 202 this.forceMaxbytesPerDomain = maxbytesL; 203 } 204 205 206 @Override 207 public Long getMaxBytesPerDomain() { 208 return this.forceMaxbytesPerDomain; 209 } 210 211 @Override 212 public void setMaxObjectsPerDomain(Long maxobjectsL) { 213 this.forceMaxobjectsPerDomain = maxobjectsL; 214 } 215 216 @Override 217 public Long getMaxObjectsPerDomain() { 218 return this.forceMaxobjectsPerDomain; 219 } 220 221 @Override 222 public boolean isValid() { 223 /* 224 StringBuilder errors = new StringBuilder(); 225 // check for Deduplication index-location placeholder and DEDUPLICATION_BEAN_PATTERN 226 if (template.contains(DEDUPLICATION_BEAN_PATTERN)) { 227 if (!template.contains(DEDUPLICATION_INDEX_LOCATION_PLACEHOLDER)) { 228 errors.append("Has DefdMissing placeholder '" + DEDUPLICATION_INDEX_LOCATION_PLACEHOLDER + "'" 229 } 230 } 231 template.contains(DEDUPLICATION_INDEX_LOCATION_PLACEHOLDER) 232 && template.contains(deduplicationBeanPattern) 233 */ 234 return true; 235 } 236 237 @Override 238 // This method is used to decide, whether to request a deduplication index or not. 239 // Done by checking, if both 240 // - a DeDuplicator bean is present in the template 241 // and 242 // - a DEDUPLICATION_INDEX_LOCATION_PLACEHOLDER is also present. 243 // and 244 // - a DeDuplicator reference bean is present in the template 245 public boolean IsDeduplicationEnabled() { 246 return (template.contains(DEDUPLICATION_INDEX_LOCATION_PLACEHOLDER) 247 && DEDUPLICATION_BEAN_PATTERN.matcher(template).matches() 248 && DEDUPLICATION_BEAN_REFERENCE_PATTERN.matcher(template).matches()); 249 } 250 251 /** 252 * Configuring the quota-enforcer, depending on budget definition. Object limit can be defined either by 253 * using the queue-total-budget property or the quota enforcer. Which is chosen is set by the argument 254 * maxObjectsIsSetByQuotaEnforcer}'s value. So quota enforcer is set as follows: 255 * If all values in the quotaEnforcer is infinity, it is in effect disabled 256 * <ul> 257 * <li>Object limit is not set by quota enforcer, disabled only if there is no byte limit.</li> 258 * <li>Object limit is set by quota enforcer, so it should be enabled if a byte or object limit is set.</li> 259 * </ul> 260 * 261 * @param maxObjectsIsSetByQuotaEnforcer Decides whether the maxObjectsIsSetByQuotaEnforcer or not. 262 * @param forceMaxBytesPerDomain The number of max bytes per domain enforced (can be no limit) 263 * @param forceMaxObjectsPerDomain The number of max objects per domain enforced (can be no limit) 264 */ 265 public void configureQuotaEnforcer(boolean maxObjectsIsSetByQuotaEnforcer, 266 long forceMaxBytesPerDomain, long forceMaxObjectsPerDomain) { 267 this.forceMaxobjectsPerDomain = forceMaxObjectsPerDomain; 268 this.forceMaxbytesPerDomain = forceMaxBytesPerDomain; 269 String tmp = template; 270 if (!maxObjectsIsSetByQuotaEnforcer) { 271 // SetMaxObjects in the global budget to forceMaxObjectsPerDomain?? 272 String tmp1 = tmp.replace( 273 FRONTIER_QUEUE_TOTAL_BUDGET_PLACEHOLDER, Long.toString( forceMaxObjectsPerDomain )); 274 // SetMaxObjects to infinity in the quotaEnforcer 275 tmp = tmp1.replace(QUOTA_ENFORCER_GROUP_MAX_FETCH_SUCCES_PLACEHOLDER, 276 Long.toString(Constants.HERITRIX_MAXOBJECTS_INFINITY)); 277 } else { 278 // SetMaxObjects in the global budget to Infinity 279 String tmp1 = tmp.replace( 280 FRONTIER_QUEUE_TOTAL_BUDGET_PLACEHOLDER, Long.toString( Constants.HERITRIX_MAXOBJECTS_INFINITY )); 281 // SetMaxObjects to forceMaxObjectsPerDomain in the quotaEnforcer 282 tmp = tmp1.replace(QUOTA_ENFORCER_GROUP_MAX_FETCH_SUCCES_PLACEHOLDER, 283 Long.toString(forceMaxObjectsPerDomain)); 284 } 285 286 // SetMaxbytes in the QuotaEnforcer to forceMaxBytesPerDomain 287 // Divide by 1024 since Heritrix uses KB rather than bytes, 288 // and add 1 to avoid to low limit due to rounding. 289 String maxBytesStringValue = "-1"; 290 if (forceMaxBytesPerDomain != Constants.HERITRIX_MAXBYTES_INFINITY) { 291 maxBytesStringValue = Long.toString(( forceMaxBytesPerDomain 292 / Constants.BYTES_PER_HERITRIX_BYTELIMIT_UNIT) + 1); 293 log.debug("MaxbytesPerDomain set to {} Kbytes per domain", maxBytesStringValue); 294 } else { 295 log.debug("MaxbytesPerDomain set to infinite number of Kbytes per domain"); 296 } 297 298 this.template = tmp.replace(QUOTA_ENFORCER_MAX_BYTES_PLACEHOLDER, maxBytesStringValue); 299 300 } 301 302 /** 303 * Make sure that Heritrix will archive its data in the chosen archiveFormat. 304 * 305 * @param archiveFormat the chosen archiveformat ('arc' or 'warc' supported) 306 * @throws ArgumentNotValid If the chosen archiveFormat is not supported. 307 */ 308 @Override 309 public void setArchiveFormat(String archiveFormat) { 310 if (!template.contains(ARCHIVER_BEAN_REFERENCE_PLACEHOLDER)){ 311 throw new IllegalState("The placeholder '" + ARCHIVER_BEAN_REFERENCE_PLACEHOLDER 312 + "' is missing. Unable to insert proper archive writer"); 313 } 314 if (!template.contains(ARCHIVER_PROCESSOR_BEAN_PLACEHOLDER)) { 315 throw new IllegalState("The placeholder '" + ARCHIVER_PROCESSOR_BEAN_PLACEHOLDER 316 + "' is missing. Unable to insert proper archive writer"); 317 } 318 if ("arc".equalsIgnoreCase(archiveFormat)) { 319 log.debug("ARC format selected to be used by Heritrix3"); 320 setArcArchiveformat(); 321 } else if ("warc".equalsIgnoreCase(archiveFormat)) { 322 log.debug("WARC format selected to be used by Heritrix3"); 323 setWarcArchiveformat(); 324 } else { 325 throw new ArgumentNotValid("Configuration of '" + HarvesterSettings.HERITRIX_ARCHIVE_FORMAT 326 + "' is invalid! Unrecognized format '" + archiveFormat + "'."); 327 } 328 } 329 330 /** 331 * Set the archive-format as ARC. This means enabling the ARCWriterProcessor in the template 332 */ 333 private void setArcArchiveformat(){ 334 String arcWriterbeanReference = "<ref bean=\"arcWriter\"/>"; 335 String templateNew = template.replace(ARCHIVER_BEAN_REFERENCE_PLACEHOLDER, arcWriterbeanReference); 336 template = templateNew.replace(ARCHIVER_PROCESSOR_BEAN_PLACEHOLDER, getArcWriterProcessor()); 337 } 338 339 private String getArcWriterProcessor() { 340 341 // <bean id="arcWriter" class="org.archive.modules.writer.ARCWriterProcessor"> 342 // <!-- <property name="compress" value="true" /> --> 343 // <!-- <property name="prefix" value="IAH" /> --> 344 // <!-- <property name="suffix" value="${HOSTNAME}" /> --> 345 // <!-- <property name="maxFileSizeBytes" value="100000000" /> --> 346 // <!-- <property name="poolMaxActive" value="1" /> --> 347 // <!-- <property name="poolMaxWaitMs" value="300000" /> --> 348 // <!-- <property name="skipIdenticalDigests" value="false" /> --> 349 // <!-- <property name="maxTotalBytesToWrite" value="0" /> --> 350 // <!-- <property name="directory" value="." /> --> 351 // <!-- <property name="storePaths"> 352 // <list> 353 // <value>arcs</value> 354 // </list> 355 // </property> --> 356 // </bean> 357 // "<bean id=\"arcWriter\" class=\"org.archive.modules.writer.ARCWriterProcessor\">"; 358 String propertyName="\n<property name=\""; 359 String valuePrefix = "\" value=\""; 360 String valueSuffix = "\""; 361 String propertyEnd="/>"; 362 363 StringBuilder arcWriterBeanBuilder = new StringBuilder(); 364 arcWriterBeanBuilder.append("<bean id=\"arcWriter\" class=\"org.archive.modules.writer.ARCWriterProcessor\">\n"); 365 arcWriterBeanBuilder.append(propertyName + "compress" + valuePrefix 366 + Settings.get(HarvesterSettings.HERITRIX3_ARC_COMPRESSION) 367 + valueSuffix + propertyEnd); 368 arcWriterBeanBuilder.append(propertyName + "prefix" + valuePrefix 369 + ARCHIVE_FILE_PREFIX_PLACEHOLDER 370 + valueSuffix + propertyEnd); 371// arcWriterBeanBuilder.append(propertyName + "suffix" + valuePrefix 372// + Settings.get(HarvesterSettings.HERITRIX3_ARC_SUFFIX) 373// + valueSuffix + propertyEnd); 374 arcWriterBeanBuilder.append(propertyName + "maxFileSizeBytes" + valuePrefix 375 + Settings.get(HarvesterSettings.HERITRIX3_ARC_MAXSIZE) 376 + valueSuffix + propertyEnd); 377 arcWriterBeanBuilder.append(propertyName + "poolMaxActive" + valuePrefix 378 + Settings.get(HarvesterSettings.HERITRIX3_ARC_POOL_MAXACTIVE) 379 + valueSuffix + propertyEnd); 380 arcWriterBeanBuilder.append(propertyName + "skipIdenticalDigests" + valuePrefix 381 + Settings.get(HarvesterSettings.HERITRIX3_ARC_SKIP_IDENTICAL_DIGESTS) 382 + valueSuffix + propertyEnd); 383 384 arcWriterBeanBuilder.append("</bean>"); 385 386 return arcWriterBeanBuilder.toString(); 387 } 388 389 390 /** 391 * Insert WARC-archiver beans and remove placeholder for ARC-Archiver-beans 392 * It is an error, if the WARC place-holders doesnt't exist. 393 * It is not an error, if the property placeholder does not exist. 394 */ 395 private void setWarcArchiveformat() { 396 String warcWriterbeanReference = "<ref bean=\"warcWriter\"/>"; 397 String warcWriterProcessorBean = "<bean id=\"warcWriter\" class=\"dk.netarkivet.harvester.harvesting.NasWARCProcessor\">"; 398 String propertyName="\n<property name=\""; 399 String valuePrefix = "\" value=\""; 400 String valueSuffix = "\""; 401 String propertyEnd="/>"; 402 if (!template.contains(ARCHIVER_BEAN_REFERENCE_PLACEHOLDER)) { 403 throw new IllegalState("The placeholder '" + ARCHIVER_BEAN_REFERENCE_PLACEHOLDER 404 + "' is missing"); 405 } 406 if (!template.contains(ARCHIVER_PROCESSOR_BEAN_PLACEHOLDER)) { 407 throw new IllegalState("The placeholder '" + ARCHIVER_PROCESSOR_BEAN_PLACEHOLDER 408 + "' is missing"); 409 } 410 StringBuilder propertyBuilder = new StringBuilder(); 411 propertyBuilder.append(propertyName + "template" + valuePrefix 412 + Settings.get(HarvesterSettings.HERITRIX3_WARC_TEMPLATE) 413 + valueSuffix + propertyEnd); 414 propertyBuilder.append(propertyName + "compress" + valuePrefix 415 + Settings.get(HarvesterSettings.HERITRIX3_WARC_COMPRESSION) 416 + valueSuffix + propertyEnd); 417 // Note: The prefix value will be replaced later by the setArchiveFilePrefix() method 418 propertyBuilder.append(propertyName + "prefix" + valuePrefix 419 + ARCHIVE_FILE_PREFIX_PLACEHOLDER 420 + valueSuffix + propertyEnd); 421 propertyBuilder.append(propertyName + "maxFileSizeBytes" + valuePrefix 422 + Settings.get(HarvesterSettings.HERITRIX3_WARC_MAXSIZE) 423 + valueSuffix + propertyEnd); 424 propertyBuilder.append(propertyName + "poolMaxActive" + valuePrefix 425 + Settings.get(HarvesterSettings.HERITRIX3_WARC_POOL_MAXACTIVE) 426 + valueSuffix + propertyEnd); 427 428 propertyBuilder.append(propertyName + "writeRequests" + valuePrefix 429 + Settings.get(HarvesterSettings.HERITRIX3_WARC_WRITE_REQUESTS) 430 + valueSuffix + propertyEnd); 431 propertyBuilder.append(propertyName + "writeMetadata" + valuePrefix 432 + Settings.get(HarvesterSettings.HERITRIX3_WARC_WRITE_METADATA) 433 + valueSuffix + propertyEnd); 434 propertyBuilder.append(propertyName + "writeMetadataOutlinks" + valuePrefix 435 + Settings.get(HarvesterSettings.HERITRIX3_WARC_WRITE_METADATA_OUTLINKS) 436 + valueSuffix + propertyEnd); 437 propertyBuilder.append(propertyName + "skipIdenticalDigests" + valuePrefix 438 + Settings.get(HarvesterSettings.HERITRIX3_WARC_SKIP_IDENTICAL_DIGESTS) 439 + valueSuffix + propertyEnd); 440 propertyBuilder.append(propertyName + "startNewFilesOnCheckpoint" + valuePrefix 441 + Settings.get(HarvesterSettings.HERITRIX3_WARC_START_NEW_FILES_ON_CHECKPOINT) 442 + valueSuffix + propertyEnd); 443 444 warcWriterProcessorBean += propertyBuilder.toString(); 445 warcWriterProcessorBean += "\n\n%{METADATA_ITEMS_PLACEHOLDER}\n</bean>"; 446 String templateNew = template.replace( 447 ARCHIVER_BEAN_REFERENCE_PLACEHOLDER, warcWriterbeanReference); 448 this.template = templateNew.replace(ARCHIVER_PROCESSOR_BEAN_PLACEHOLDER, 449 warcWriterProcessorBean); 450 } 451 452 @Override 453 /** 454 * With H3 template, we insert the crawlertraps into the template at once. 455 * They are inserted to be part of a org.archive.modules.deciderules.MatchesListRegexDecideRule 456 * bean. 457 * 458 * @param elementName The elementName is currently not used with H3 459 * @param crawlertraps A list of crawlertraps to be inserted 460 */ 461 public void insertCrawlerTraps(String elementName, List<String> crawlertraps) { 462// <bean class="org.archive.modules.deciderules.MatchesListRegexDecideRule"> 463// <!-- <property name="listLogicalOr" value="true" /> --> 464// <!-- <property name="regexList"> 465// <list> 466// CRAWLERTRAPS_PLACEHOLDER 467// </list> 468// </property> --> 469// </bean> 470 if (crawlertraps.isEmpty()) { 471 log.debug("No crawlertraps yet. No insertion is done"); 472 return; 473 } else if (!template.contains(CRAWLERTRAPS_PLACEHOLDER)) { 474 log.warn("The placeholder '" + CRAWLERTRAPS_PLACEHOLDER 475 + "' is absent from the template. No insertion is done at all. {} traps were ignored", 476 crawlertraps); 477 return; 478 } else { 479 log.info("Inserting {} crawlertraps into the template", crawlertraps.size()); 480 StringBuilder sb = new StringBuilder(); 481 for (String trap: crawlertraps) { 482 sb.append("<value>" + trap + "</value>\n"); 483 } 484 // Adding the placeholder again to be able to insert crawlertraps multiple times. 485 sb.append(CRAWLERTRAPS_PLACEHOLDER + "\n"); 486 String templateNew = template.replace(CRAWLERTRAPS_PLACEHOLDER, sb.toString()); 487 this.template = templateNew; 488 } 489 } 490 491 public String getMetadataInfo(MetadataInfo info) { 492 String infoStr = null; 493 if(metadataInfoMap.containsKey(info)) { 494 infoStr = metadataInfoMap.get(info); 495 } 496 return infoStr; 497 } 498 499 @Override 500 public void writeTemplate(OutputStream os) throws IOFailure { 501 try { 502 os.write(template.getBytes(Charset.forName("UTF-8"))); 503 } catch (IOException e) { 504 throw new IOFailure("Unable to write template to outputstream", e); 505 } 506 507 } 508 509 @Override 510 public boolean hasContent() { 511 throw new NotImplementedException("The hasContent method hasn't been implemented yet"); 512 } 513 514 @Override 515 public void writeToFile(File orderXmlFile) { 516 BufferedWriter writer = null; 517 try { 518 writer = new BufferedWriter( new FileWriter(orderXmlFile)); 519 writer.write(template); 520 } catch(IOException e) { 521 throw new IOFailure("Unable to write template to file '" + orderXmlFile.getAbsolutePath() + "'.", e); 522 } finally { 523 IOUtils.closeQuietly(writer); 524 } 525 } 526 527 @Override 528 public void setRecoverlogNode(File recoverlogGzFile) { 529 throw new NotImplementedException("This method has not yet been implemented"); 530 531 } 532 533 @Override 534 public void setDeduplicationIndexLocation(String absolutePath) { 535 if (!template.contains(DEDUPLICATION_INDEX_LOCATION_PLACEHOLDER)) { 536 throw new IllegalState("The placeholder for the deduplication index location property '" + DEDUPLICATION_INDEX_LOCATION_PLACEHOLDER 537 + "' was not found. Maybe the placeholder has already been replaced with the correct value: " 538 + template); 539 } 540 String templateNew = template.replace(DEDUPLICATION_INDEX_LOCATION_PLACEHOLDER, absolutePath); 541 this.template = templateNew; 542 } 543 544 @Override 545 public void setSeedsFilePath(String absolutePath) { 546 log.debug("Note: SeedsFilePath is not set in h3"); 547 } 548 549 @Override 550 public void setArchiveFilePrefix(String archiveFilePrefix) { 551 if (!template.contains(ARCHIVE_FILE_PREFIX_PLACEHOLDER)) { 552 throw new IllegalState("The placeholder for the archive file prefix property '" 553 + ARCHIVE_FILE_PREFIX_PLACEHOLDER 554 + "' was not found. Maybe the placeholder has already been replaced with the correct value. The template looks like this: " 555 + template); 556 } 557 String templateNew = template.replace(ARCHIVE_FILE_PREFIX_PLACEHOLDER, archiveFilePrefix); 558 this.template = templateNew; 559 560 } 561 562 @Override 563 public void setDiskPath(String absolutePath) { 564 // NOP 565 log.warn("The DiskPath is not settable in the H3 template"); 566 } 567 568 @Override 569 public void removeDeduplicatorIfPresent() { 570 //NOP 571 log.debug("In H3 we don't remove the deduplicator, but just disable it."); 572 } 573 574 @Override public void enableOrDisableDeduplication(boolean enabled) { 575 final String replacement = Boolean.toString(enabled).toLowerCase(); 576 log.debug("Replacing deduplication enabled placeholder {} with {}.", DEDUPLICATION_ENABLED_PLACEHOLDER, replacement); 577 this.template = template.replace(DEDUPLICATION_ENABLED_PLACEHOLDER, replacement); 578 } 579 580 //<property name="metadataItems"> 581// <map> 582// <entry key="harvestInfo.version" value="1.03"/> <!-- TODO maybe not add this one --> 583// <entry key="harvestInfo.jobId" value="1"/> 584// <entry key="harvestInfo.channel" value="HIGH"/> 585// <entry key="harvestInfo.harvestNum" value="1"/> 586// <entry key="harvestInfo.origHarvestDefinitionID" value="1"/> 587// <entry key="harvestInfo.maxBytesPerDomain" value="100000"/> 588// <entry key="harvestInfo.maxObjectsPerDomain" value="-1"/> 589// <entry key="harvestInfo.orderXMLName" value="defaultOrderXml"/> 590// <entry key="harvestInfo.origHarvestDefinitionName" value="ddddddddd"/> 591// <entry key="harvestInfo.scheduleName" value="EveryHour"/> <!-- Optional. only relevant for Selective Harvests -- only inserted if not null and not-empty.-> 592// <entry key="harvestInfo.harvestFilenamePrefix" value="netarkivet-1-1"/> 593// <entry key="harvestInfo.jobSubmitDate" value="22. 10. 2014"/> 594// <entry key="harvestInfo.performer" value="performer"/> <!-- Optional - only inserted if not null and not-empty. --> 595// <entry key="harvestInfo.audience" value="audience"/> <!-- Optional - only inserted if not null and not-empty. --> 596// </map> 597// </property> 598 599 public void insertWarcInfoMetadata(Job ajob, String origHarvestdefinitionName, 600 String origHarvestdefinitionComments, String scheduleName, String performer) { 601 if (!template.contains(METADATA_ITEMS_PLACEHOLDER)) { 602 throw new IllegalState("The placeholder for the property '" + METADATA_ITEMS_PLACEHOLDER 603 + "' was not found. Maybe the placeholder has already been replaced with the correct value. The template looks like this: " 604 + template); 605 } 606 log.debug("Now in " + getClass().getName()); 607 String startMetadataEntry = "\n<entry key=\""; 608 String endMetadataEntry = "\"/>"; 609 String valuePart = "\" value=\""; 610 StringBuilder sb = new StringBuilder(); 611 sb.append("<property name=\"metadataItems\">\n<map>\n"); 612 613 // <entry key="harvestInfo.version" value="1.03"/> 614 615 sb.append(startMetadataEntry); 616 sb.append(HARVESTINFO_VERSION + valuePart + HARVESTINFO_VERSION_NUMBER + endMetadataEntry); 617 sb.append(startMetadataEntry); 618 sb.append(HARVESTINFO_JOBID + valuePart + ajob.getJobID() + endMetadataEntry); 619 620 sb.append(startMetadataEntry); 621 sb.append(HARVESTINFO_CHANNEL + valuePart + ajob.getChannel() + endMetadataEntry); 622 sb.append(startMetadataEntry); 623 sb.append(HARVESTINFO_HARVESTNUM + valuePart + ajob.getHarvestNum() + endMetadataEntry); 624 sb.append(startMetadataEntry); 625 sb.append(HARVESTINFO_ORIGHARVESTDEFINITIONID + valuePart + ajob.getOrigHarvestDefinitionID() + endMetadataEntry); 626 sb.append(startMetadataEntry); 627 sb.append(HARVESTINFO_MAXBYTESPERDOMAIN + valuePart + ajob.getMaxBytesPerDomain() + endMetadataEntry); 628 sb.append(startMetadataEntry); 629 sb.append(HARVESTINFO_MAXOBJECTSPERDOMAIN + valuePart + ajob.getMaxObjectsPerDomain() + endMetadataEntry); 630 sb.append(startMetadataEntry); 631 sb.append(HARVESTINFO_ORDERXMLNAME + valuePart + ajob.getOrderXMLName() + endMetadataEntry); 632 633 /* orderxml update date - only inserted if not null and not-empty. */ 634 /* take info from crawler-beans.cxml */ 635 String tmp = getMetadataInfo(MetadataInfo.TEMPLATE_UPDATE_DATE); 636 if (tmp != null && !tmp.isEmpty()){ 637 sb.append(startMetadataEntry); 638 sb.append(HARVESTINFO_ORDERXMLUPDATEDATE + valuePart + tmp + endMetadataEntry); 639 } 640 /* orderxml description - only inserted if not null and not-empty. */ 641 /* take info from crawler-beans.cxml */ 642 tmp = getMetadataInfo(MetadataInfo.TEMPLATE_DESCRIPTION); 643 if (tmp != null && !tmp.isEmpty()){ 644 sb.append(startMetadataEntry); 645 sb.append(HARVESTINFO_ORDERXMLDESCRIPTION + valuePart + tmp + endMetadataEntry); 646 } 647 648 sb.append(startMetadataEntry); 649 sb.append(HARVESTINFO_ORIGHARVESTDEFINITIONNAME + valuePart + 650 origHarvestdefinitionName + endMetadataEntry); 651 652 if(StringUtils.isNotEmpty(origHarvestdefinitionComments)) { 653 sb.append(startMetadataEntry); 654 sb.append(HARVESTINFO_ORIGHARVESTDEFINITIONCOMMENTS + valuePart + 655 origHarvestdefinitionComments + endMetadataEntry); 656 } 657 658 /* optional schedule-name - only inserted if not null and not-empty. */ 659 if (scheduleName != null && !scheduleName.isEmpty()) { 660 sb.append(startMetadataEntry); 661 sb.append(HARVESTINFO_SCHEDULENAME + valuePart + scheduleName + endMetadataEntry); 662 } 663 sb.append(startMetadataEntry); 664 sb.append(HARVESTINFO_HARVESTFILENAMEPREFIX + valuePart + ajob.getHarvestFilenamePrefix() + endMetadataEntry); 665 sb.append(startMetadataEntry); 666 sb.append(HARVESTINFO_JOBSUBMITDATE + valuePart + ArchiveDateConverter.getWarcDateFormat().format(ajob.getSubmittedDate()) + endMetadataEntry); 667 668 /* optional HARVESTINFO_PERFORMER - only inserted if not null and not-empty. */ 669 if (performer != null && !performer.isEmpty()){ 670 sb.append(startMetadataEntry); 671 sb.append(HARVESTINFO_PERFORMER + valuePart + performer + endMetadataEntry); 672 } 673 674 /* optional OPERATOR - only inserted if not null and not-empty. */ 675 /* take info from crawler-beans.cxml */ 676 String operator = getMetadataInfo(MetadataInfo.OPERATOR); 677 if (operator != null && !operator.isEmpty()){ 678 sb.append(startMetadataEntry); 679 sb.append(HARVESTINFO_OPERATOR + valuePart + operator + endMetadataEntry); 680 } 681 682 /* optional HARVESTINFO_AUDIENCE - only inserted if not null and not-empty. */ 683 if (ajob.getHarvestAudience() != null && !ajob.getHarvestAudience().isEmpty()) { 684 sb.append(startMetadataEntry); 685 sb.append(HARVESTINFO_AUDIENCE + valuePart + ajob.getHarvestAudience() + endMetadataEntry); 686 } 687 sb.append("\n</map>\n</property>\n"); 688 689 // Replace command 690 log.info("Adding WarcInfoMetadata " + sb.toString()); 691 String templateNew = template.replace(METADATA_ITEMS_PLACEHOLDER, sb.toString()); 692 this.template = templateNew; 693 } 694 695 @Override 696 public void insertAttributes(List<AttributeAndType> attributesAndTypes) { 697 ArgumentNotValid.checkNotNull(attributesAndTypes, "List<AttributeAndType> attributesAndTypes"); 698 for (AttributeAndType attributeAndType: attributesAndTypes) { 699 // initialize temp variables 700 Integer intVal = null; 701 String val = null; 702 AttributeTypeBase attributeType = attributeAndType.attributeType; 703 AttributeBase attribute = attributeAndType.attribute; 704 705 log.debug("Trying to insert the attribute {} into the template", attributeType.name); 706 switch (attributeType.viewtype) { 707 case 1: 708 if (attribute != null) { 709 intVal = attribute.getInteger(); 710 log.debug("Read explicitly value for attribute '{}'", attributeType.name, intVal); 711 } 712 if (intVal == null && attributeType.def_int != null) { 713 intVal = attributeType.def_int; 714 log.debug("Viewtype 1 attribute '{}' not set explicitly. Using default value '{}'", attributeType.name, intVal); 715 } 716 if (intVal != null) { 717 val = intVal.toString(); 718 } else { 719 val = ""; 720 } 721 log.info("Value selected for attribute {}: {}", attributeType.name, val); 722 break; 723 case 5: 724 if (attribute != null) { 725 intVal = attribute.getInteger(); 726 log.debug("Read explicitly value for attribute '{}'", attributeType.name, intVal); 727 } 728 if (intVal == null && attributeType.def_int != null) { 729 intVal = attributeType.def_int; 730 log.debug("Viewtype 5 attribute '{}' not set explicitly. Using default value '{}'", attributeType.name, intVal); 731 } 732 if (intVal != null && intVal > 0) { 733 val = "true"; 734 } else { 735 val = "false"; 736 } 737 log.info("Value selected for attribute '{}': '{}'", attributeType.name, val); 738 break; 739 case 6: 740 if (attribute != null) { 741 intVal = attribute.getInteger(); 742 log.debug("Read explicitly value for attribute '{}'", attributeType.name, intVal); 743 } 744 if (intVal == null && attributeType.def_int != null) { 745 intVal = attributeType.def_int; 746 log.debug("Viewtype 6 attribute '{}' not set explicitly. Using default value '{}'", attributeType.name, intVal); 747 } 748 if (intVal != null && intVal > 0) { 749 val = "obey"; 750 } else { 751 val = "ignore"; 752 } 753 log.info("Value selected for attribute '{}': '{}'", attributeType.name, val); 754 break; 755 } 756 String placeholder = "%{" + attributeType.name.toUpperCase() + "}"; 757 if (template.contains(placeholder)) { 758 String templateNew = template.replace("%{" + attributeType.name.toUpperCase() + "}", val); 759 this.template = templateNew; 760 } else { 761 log.warn("Placeholder '{}' not found in template. Therefore not substituted by '{}' in this template", 762 placeholder, val); 763 } 764 } 765 } 766 767 @Override 768 public void writeTemplate(JspWriter out) throws IOFailure { 769 try { 770 out.write(template); 771 } catch (IOException e) { 772 throw new IOFailure("Unable to write to JspWriter", e); 773 } 774 } 775 776 /** 777 * Hack to remove existing placeholders, that is still present after template 778 * manipulation is completed. 779 */ 780 public void removePlaceholders() { 781 template = template.replace(METADATA_ITEMS_PLACEHOLDER, ""); 782 template = template.replace(CRAWLERTRAPS_PLACEHOLDER, ""); 783 784 if (template.contains(METADATA_ITEMS_PLACEHOLDER)) { 785 throw new IllegalState("The placeholder for the property '" + METADATA_ITEMS_PLACEHOLDER 786 + "' should have been deleted now."); 787 } 788 if (template.contains(CRAWLERTRAPS_PLACEHOLDER)) { 789 throw new IllegalState("The placeholder for the property '" + CRAWLERTRAPS_PLACEHOLDER 790 + "' should have been deleted now."); 791 } 792 } 793}