/*
 * #%L
 * Netarchivesuite - harvester
 * %%
 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
 *             the National Library of France and the Austrian National Library.
 * %%
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation, either version 2.1 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Lesser Public License for more details.
 *
 * You should have received a copy of the GNU General Lesser Public
 * License along with this program.  If not, see
 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
 * #L%
 */
package dk.netarkivet.harvester.datamodel;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.Serializable;
import java.io.Writer;
import java.nio.charset.Charset;
import java.util.List;
import java.util.Locale;

import javax.servlet.jsp.JspWriter;

import org.apache.commons.io.IOUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.antiaction.raptor.dao.AttributeBase;
import com.antiaction.raptor.dao.AttributeTypeBase;

import dk.netarkivet.common.exceptions.ArgumentNotValid;
import dk.netarkivet.common.exceptions.IOFailure;
import dk.netarkivet.common.exceptions.IllegalState;
import dk.netarkivet.common.exceptions.NotImplementedException;
import dk.netarkivet.common.utils.Settings;
import dk.netarkivet.common.utils.archive.ArchiveDateConverter;
import dk.netarkivet.harvester.HarvesterSettings;
import dk.netarkivet.harvester.datamodel.eav.EAV.AttributeAndType;

/**
 * Class encapsulating the Heritrix3 crawler-beans.cxml file.
 * <p>
 * Heritrix3 uses a Spring-based configuration model, so the XPath-based processing is not
 * applicable. Instead the template is manipulated as plain text through placeholders of the form
 * <code>%{..}</code> (not <code>${..}</code>, which Heritrix3 already uses itself).
 * <p>
 * A template is considered a H3 template if it contains the string
 * <code>xmlns="http://www.springframework.org/...."</code>.
 */
public class H3HeritrixTemplate extends HeritrixTemplate implements Serializable {

    private static final Logger log = LoggerFactory.getLogger(H3HeritrixTemplate.class);

    /** Name of the character encoding used whenever the template is written as bytes. */
    private static final String TEMPLATE_ENCODING = "UTF-8";

    /** The current text of the template; every setter replaces it with an updated copy. */
    private String template;

    /** QuotaEnforcer states for this template. TODO necessary?? */
    private Long forceMaxbytesPerDomain;
    private Long forceMaxobjectsPerDomain;

    /** Has this HeritrixTemplate been verified. Never assigned in this class, so always false. */
    private boolean verified;

    public static final String METADATA_ITEMS_PLACEHOLDER = "%{METADATA_ITEMS_PLACEHOLDER}";
    public static final String MAX_TIME_SECONDS_PLACEHOLDER = "%{MAX_TIME_SECONDS_PLACEHOLDER}";
    public static final String CRAWLERTRAPS_PLACEHOLDER = "%{CRAWLERTRAPS_PLACEHOLDER}";

    public static final String DEDUPLICATION_BEAN_REFERENCE_PATTERN = "<ref bean=\"DeDuplicator\"/>";
    public static final String DEDUPLICATION_BEAN_PATTERN = "<bean id=\"DeDuplicator\"";
    public static final String DEDUPLICATION_INDEX_LOCATION_PLACEHOLDER
            = "%{DEDUPLICATION_INDEX_LOCATION_PLACEHOLDER}";

    public static final String ARCHIVE_FILE_PREFIX_PLACEHOLDER = "%{ARCHIVE_FILE_PREFIX_PLACEHOLDER}";

    public static final String FRONTIER_QUEUE_TOTAL_BUDGET_PLACEHOLDER
            = "%{FRONTIER_QUEUE_TOTAL_BUDGET_PLACEHOLDER}";

    public static final String QUOTA_ENFORCER_GROUP_MAX_FETCH_SUCCES_PLACEHOLDER =
            "%{QUOTA_ENFORCER_GROUP_MAX_FETCH_SUCCES_PLACEHOLDER}";

    public static final String QUOTA_ENFORCER_MAX_BYTES_PLACEHOLDER
            = "%{QUOTA_ENFORCER_MAX_BYTES_PLACEHOLDER}";

    // PLACEHOLDERS for archiver beans (Maybe not necessary).
    // NOTE(review): these are instance fields rather than static constants. They are left that way
    // on purpose, because turning them into static fields would alter the serialized form of this
    // Serializable class.
    final String ARCHIVER_BEAN_REFERENCE_PLACEHOLDER = "%{ARCHIVER_BEAN_REFERENCE_PLACEHOLDER}";
    final String ARCHIVER_PROCESSOR_BEAN_PLACEHOLDER = "%{ARCHIVER_PROCESSOR_BEAN_PLACEHOLDER}";

    /**
     * Constructor for HeritrixTemplate class.
     *
     * @param template_id The persistent id of the template in the database
     * @param template The template as String object
     * @throws ArgumentNotValid if template is null.
     */
    public H3HeritrixTemplate(long template_id, String template) {
        ArgumentNotValid.checkNotNull(template, "String template");
        this.template_id = template_id;
        this.template = template;
    }

    /**
     * Return the template.
     *
     * @return this object
     */
    public HeritrixTemplate getTemplate() {
        return this;
    }

    /**
     * Has the template been verified?
     *
     * @return the value of the verified flag; this class never sets it, so currently always false
     */
    public boolean isVerified() {
        return verified;
    }

    /**
     * Return HeritrixTemplate as XML.
     *
     * @return the current template text
     */
    @Override
    public String getXML() {
        return template;
    }

    /**
     * Update the maxTimeSeconds property in the heritrix3 template, if possible.
     * The value replaces the {@link #MAX_TIME_SECONDS_PLACEHOLDER}, which is expected to be the
     * value of the <code>maxTimeSeconds</code> property of the
     * org.archive.crawler.framework.CrawlLimitEnforcer bean. If the placeholder is absent,
     * a warning is logged and the template is left untouched.
     *
     * @param maxJobRunningTimeSecondsL Force the harvestJob to end after this number of seconds
     */
    @Override
    public void setMaxJobRunningTime(Long maxJobRunningTimeSecondsL) {
        if (template.contains(MAX_TIME_SECONDS_PLACEHOLDER)) {
            this.template = template.replace(MAX_TIME_SECONDS_PLACEHOLDER,
                    Long.toString(maxJobRunningTimeSecondsL));
        } else {
            log.warn("The placeholder '" + MAX_TIME_SECONDS_PLACEHOLDER
                    + "' was not found in the template. Therefore maxRunningTime not set");
        }
    }

    /** Remembers the byte limit only; the template text itself is not changed here. */
    @Override
    public void setMaxBytesPerDomain(Long maxbytesL) {
        this.forceMaxbytesPerDomain = maxbytesL;
    }

    @Override
    public Long getMaxBytesPerDomain() {
        return this.forceMaxbytesPerDomain;
    }

    /** Remembers the object limit only; the template text itself is not changed here. */
    @Override
    public void setMaxObjectsPerDomain(Long maxobjectsL) {
        this.forceMaxobjectsPerDomain = maxobjectsL;
    }

    @Override
    public Long getMaxObjectsPerDomain() {
        return this.forceMaxobjectsPerDomain;
    }

    /**
     * Validity check of the template. No validation is implemented yet.
     *
     * @return always true
     */
    @Override
    public boolean isValid() {
        // TODO: validate the template, e.g. report an error when a DeDuplicator bean
        // (DEDUPLICATION_BEAN_PATTERN) is present without DEDUPLICATION_INDEX_LOCATION_PLACEHOLDER.
        return true;
    }

    /**
     * Decide whether a deduplication index should be requested for this template.
     * This is the case only if all of the following are present in the template:
     * <ul>
     * <li>a DeDuplicator bean,</li>
     * <li>the {@link #DEDUPLICATION_INDEX_LOCATION_PLACEHOLDER},</li>
     * <li>a reference to the DeDuplicator bean.</li>
     * </ul>
     *
     * @return true, if deduplication is enabled in the template, otherwise false
     */
    @Override
    public boolean IsDeduplicationEnabled() {
        return (template.contains(DEDUPLICATION_INDEX_LOCATION_PLACEHOLDER)
                && template.contains(DEDUPLICATION_BEAN_PATTERN)
                && template.contains(DEDUPLICATION_BEAN_REFERENCE_PATTERN));
    }

    /**
     * Configuring the quota-enforcer, depending on budget definition. Object limit can be defined either by
     * using the queue-total-budget property or the quota enforcer. Which is chosen is set by the argument
     * maxObjectsIsSetByQuotaEnforcer's value. So quota enforcer is set as follows:
     * If all values in the quotaEnforcer are infinity, it is in effect disabled.
     * <ul>
     * <li>Object limit is not set by quota enforcer, disabled only if there is no byte limit.</li>
     * <li>Object limit is set by quota enforcer, so it should be enabled if a byte or object limit is set.</li>
     * </ul>
     *
     * @param maxObjectsIsSetByQuotaEnforcer Decides whether the object limit is set by the quota enforcer
     *            (true) or by the frontier queue total budget (false).
     * @param forceMaxBytesPerDomain The number of max bytes per domain enforced (can be no limit)
     * @param forceMaxObjectsPerDomain The number of max objects per domain enforced (can be no limit)
     */
    public void configureQuotaEnforcer(boolean maxObjectsIsSetByQuotaEnforcer,
            long forceMaxBytesPerDomain, long forceMaxObjectsPerDomain) {
        this.forceMaxobjectsPerDomain = forceMaxObjectsPerDomain;
        this.forceMaxbytesPerDomain = forceMaxBytesPerDomain;

        // Exactly one of the two object limits gets the real value; the other is set to infinity.
        final String totalBudgetValue;
        final String groupMaxFetchSuccessValue;
        if (maxObjectsIsSetByQuotaEnforcer) {
            totalBudgetValue = Long.toString(Constants.HERITRIX_MAXOBJECTS_INFINITY);
            groupMaxFetchSuccessValue = Long.toString(forceMaxObjectsPerDomain);
        } else {
            totalBudgetValue = Long.toString(forceMaxObjectsPerDomain);
            groupMaxFetchSuccessValue = Long.toString(Constants.HERITRIX_MAXOBJECTS_INFINITY);
        }
        String tmp = template
                .replace(FRONTIER_QUEUE_TOTAL_BUDGET_PLACEHOLDER, totalBudgetValue)
                .replace(QUOTA_ENFORCER_GROUP_MAX_FETCH_SUCCES_PLACEHOLDER, groupMaxFetchSuccessValue);

        // Set max bytes in the QuotaEnforcer to forceMaxBytesPerDomain.
        // Divide by 1024 since Heritrix uses KB rather than bytes,
        // and add 1 to avoid a too low limit due to rounding.
        String maxBytesStringValue = "-1";
        if (forceMaxBytesPerDomain != Constants.HERITRIX_MAXBYTES_INFINITY) {
            maxBytesStringValue = Long.toString((forceMaxBytesPerDomain
                    / Constants.BYTES_PER_HERITRIX_BYTELIMIT_UNIT) + 1);
            log.debug("MaxbytesPerDomain set to {} Kbytes per domain", maxBytesStringValue);
        } else {
            log.debug("MaxbytesPerDomain set to infinite number of Kbytes per domain");
        }

        this.template = tmp.replace(QUOTA_ENFORCER_MAX_BYTES_PLACEHOLDER, maxBytesStringValue);
    }

    /**
     * Make sure that Heritrix will archive its data in the chosen archiveFormat.
     *
     * @param archiveFormat the chosen archiveformat ('arc' or 'warc' supported, case-insensitive)
     * @throws IllegalState If one of the archiver placeholders is missing from the template.
     * @throws ArgumentNotValid If the chosen archiveFormat is not supported.
     */
    @Override
    public void setArchiveFormat(String archiveFormat) {
        if (!template.contains(ARCHIVER_BEAN_REFERENCE_PLACEHOLDER)) {
            throw new IllegalState("The placeholder '" + ARCHIVER_BEAN_REFERENCE_PLACEHOLDER
                    + "' is missing. Unable to insert proper archive writer");
        }
        if (!template.contains(ARCHIVER_PROCESSOR_BEAN_PLACEHOLDER)) {
            throw new IllegalState("The placeholder '" + ARCHIVER_PROCESSOR_BEAN_PLACEHOLDER
                    + "' is missing. Unable to insert proper archive writer");
        }
        if ("arc".equalsIgnoreCase(archiveFormat)) {
            log.debug("ARC format selected to be used by Heritrix3");
            setArcArchiveformat();
        } else if ("warc".equalsIgnoreCase(archiveFormat)) {
            log.debug("WARC format selected to be used by Heritrix3");
            setWarcArchiveformat();
        } else {
            throw new ArgumentNotValid("Configuration of '" + HarvesterSettings.HERITRIX_ARCHIVE_FORMAT
                    + "' is invalid! Unrecognized format '" + archiveFormat + "'.");
        }
    }

    /**
     * Set the archive-format as ARC. This means enabling the ARCWriterProcessor in the template.
     * Only called from {@link #setArchiveFormat(String)}, which has verified both placeholders.
     */
    private void setArcArchiveformat() {
        String arcWriterbeanReference = "<ref bean=\"arcWriter\"/>";
        this.template = template
                .replace(ARCHIVER_BEAN_REFERENCE_PLACEHOLDER, arcWriterbeanReference)
                .replace(ARCHIVER_PROCESSOR_BEAN_PLACEHOLDER, getArcWriterProcessor());
    }

    /**
     * Build the arcWriter bean definition from the HERITRIX3_ARC_* settings.
     * The "prefix" property is given the {@link #ARCHIVE_FILE_PREFIX_PLACEHOLDER}, which is
     * replaced later by {@link #setArchiveFilePrefix(String)}. No "suffix" property is written.
     *
     * @return the XML text of the arcWriter bean
     */
    private String getArcWriterProcessor() {
        StringBuilder arcWriterBeanBuilder = new StringBuilder();
        arcWriterBeanBuilder.append(
                "<bean id=\"arcWriter\" class=\"org.archive.modules.writer.ARCWriterProcessor\">\n");
        arcWriterBeanBuilder.append(
                beanProperty("compress", Settings.get(HarvesterSettings.HERITRIX3_ARC_COMPRESSION)));
        arcWriterBeanBuilder.append(beanProperty("prefix", ARCHIVE_FILE_PREFIX_PLACEHOLDER));
        arcWriterBeanBuilder.append(
                beanProperty("maxFileSizeBytes", Settings.get(HarvesterSettings.HERITRIX3_ARC_MAXSIZE)));
        arcWriterBeanBuilder.append(
                beanProperty("poolMaxActive", Settings.get(HarvesterSettings.HERITRIX3_ARC_POOL_MAXACTIVE)));
        arcWriterBeanBuilder.append(beanProperty("skipIdenticalDigests",
                Settings.get(HarvesterSettings.HERITRIX3_ARC_SKIP_IDENTICAL_DIGESTS)));
        arcWriterBeanBuilder.append("</bean>");
        return arcWriterBeanBuilder.toString();
    }

    /**
     * Insert the WARC-archiver bean and its reference in place of the archiver placeholders.
     * Only called from {@link #setArchiveFormat(String)}, which has verified that both
     * placeholders exist. The inserted bean ends with the {@link #METADATA_ITEMS_PLACEHOLDER},
     * so the warcinfo metadata can be inserted afterwards.
     */
    private void setWarcArchiveformat() {
        String warcWriterbeanReference = "<ref bean=\"warcWriter\"/>";
        StringBuilder warcWriterProcessorBean = new StringBuilder(
                "<bean id=\"warcWriter\" class=\"dk.netarkivet.harvester.harvesting.NasWARCProcessor\">");
        warcWriterProcessorBean.append(
                beanProperty("template", Settings.get(HarvesterSettings.HERITRIX3_WARC_TEMPLATE)));
        warcWriterProcessorBean.append(
                beanProperty("compress", Settings.get(HarvesterSettings.HERITRIX3_WARC_COMPRESSION)));
        // Note: The prefix value will be replaced later by the setArchiveFilePrefix() method
        warcWriterProcessorBean.append(beanProperty("prefix", ARCHIVE_FILE_PREFIX_PLACEHOLDER));
        warcWriterProcessorBean.append(
                beanProperty("maxFileSizeBytes", Settings.get(HarvesterSettings.HERITRIX3_WARC_MAXSIZE)));
        warcWriterProcessorBean.append(
                beanProperty("poolMaxActive", Settings.get(HarvesterSettings.HERITRIX3_WARC_POOL_MAXACTIVE)));
        warcWriterProcessorBean.append(
                beanProperty("writeRequests", Settings.get(HarvesterSettings.HERITRIX3_WARC_WRITE_REQUESTS)));
        warcWriterProcessorBean.append(
                beanProperty("writeMetadata", Settings.get(HarvesterSettings.HERITRIX3_WARC_WRITE_METADATA)));
        warcWriterProcessorBean.append(beanProperty("writeMetadataOutlinks",
                Settings.get(HarvesterSettings.HERITRIX3_WARC_WRITE_METADATA_OUTLINKS)));
        warcWriterProcessorBean.append(beanProperty("skipIdenticalDigests",
                Settings.get(HarvesterSettings.HERITRIX3_WARC_SKIP_IDENTICAL_DIGESTS)));
        warcWriterProcessorBean.append(beanProperty("startNewFilesOnCheckpoint",
                Settings.get(HarvesterSettings.HERITRIX3_WARC_START_NEW_FILES_ON_CHECKPOINT)));
        warcWriterProcessorBean.append("\n\n").append(METADATA_ITEMS_PLACEHOLDER).append("\n</bean>");

        this.template = template
                .replace(ARCHIVER_BEAN_REFERENCE_PLACEHOLDER, warcWriterbeanReference)
                .replace(ARCHIVER_PROCESSOR_BEAN_PLACEHOLDER, warcWriterProcessorBean.toString());
    }

    /**
     * Build one Spring property element, preceded by a newline.
     *
     * @param name the property name
     * @param value the property value, inserted verbatim
     * @return the text {@code \n<property name="NAME" value="VALUE"/>}
     */
    private static String beanProperty(String name, String value) {
        return "\n<property name=\"" + name + "\" value=\"" + value + "\"/>";
    }

    /**
     * Escape the five characters that have a predefined entity in XML, so that the given value
     * can be inserted as element content or as a double-quoted attribute value.
     *
     * @param value the value to escape; rendered with {@link String#valueOf(Object)}, so null
     *            becomes the text "null"
     * @return the escaped text
     */
    private static String escapeXml(Object value) {
        String text = String.valueOf(value);
        StringBuilder escaped = new StringBuilder(text.length() + 16);
        for (int i = 0; i < text.length(); i++) {
            char c = text.charAt(i);
            switch (c) {
            case '&':
                escaped.append("&amp;");
                break;
            case '<':
                escaped.append("&lt;");
                break;
            case '>':
                escaped.append("&gt;");
                break;
            case '"':
                escaped.append("&quot;");
                break;
            case '\'':
                escaped.append("&apos;");
                break;
            default:
                escaped.append(c);
                break;
            }
        }
        return escaped.toString();
    }

    /**
     * With H3 template, we insert the crawlertraps into the template at once.
     * They are inserted at the {@link #CRAWLERTRAPS_PLACEHOLDER}, which is expected to be located
     * in the regexList of a org.archive.modules.deciderules.MatchesListRegexDecideRule bean:
     * <pre>
     * &lt;bean class="org.archive.modules.deciderules.MatchesListRegexDecideRule"&gt;
     *   &lt;property name="regexList"&gt;
     *     &lt;list&gt;
     *       CRAWLERTRAPS_PLACEHOLDER
     *     &lt;/list&gt;
     *   &lt;/property&gt;
     * &lt;/bean&gt;
     * </pre>
     * Each trap is XML-escaped and wrapped in a <code>value</code> element. The placeholder is
     * re-appended after the inserted traps, so this method can be called several times.
     * Nothing is inserted if the list is empty or the placeholder is absent.
     *
     * @param elementName The elementName is currently not used with H3
     * @param crawlertraps A list of crawlertraps to be inserted
     * @throws ArgumentNotValid if crawlertraps is null
     */
    @Override
    public void insertCrawlerTraps(String elementName, List<String> crawlertraps) {
        ArgumentNotValid.checkNotNull(crawlertraps, "List<String> crawlertraps");
        if (crawlertraps.isEmpty()) {
            log.debug("No crawlertraps yet. No insertion is done");
            return;
        }
        if (!template.contains(CRAWLERTRAPS_PLACEHOLDER)) {
            log.warn("The placeholder '{}' is absent from the template. No insertion is done at all. "
                    + "{} traps were ignored", CRAWLERTRAPS_PLACEHOLDER, crawlertraps.size());
            return;
        }
        log.info("Inserting {} crawlertraps into the template", crawlertraps.size());
        StringBuilder sb = new StringBuilder();
        for (String trap : crawlertraps) {
            // Traps are regular expressions and may contain '&' or '<', so they must be escaped
            // to keep the template well-formed XML.
            sb.append("<value>").append(escapeXml(trap)).append("</value>\n");
        }
        // Adding the placeholder again to be able to insert crawlertraps multiple times.
        sb.append(CRAWLERTRAPS_PLACEHOLDER).append("\n");
        this.template = template.replace(CRAWLERTRAPS_PLACEHOLDER, sb.toString());
    }

    /**
     * Write the template to the given stream, encoded as UTF-8. The stream is not closed.
     *
     * @param os the stream to write to
     * @throws IOFailure if writing fails
     */
    @Override
    public void writeTemplate(OutputStream os) throws IOFailure {
        try {
            os.write(template.getBytes(Charset.forName(TEMPLATE_ENCODING)));
        } catch (IOException e) {
            throw new IOFailure("Unable to write template to outputstream", e);
        }
    }

    /**
     * Not implemented for H3 templates.
     *
     * @throws NotImplementedException always
     */
    @Override
    public boolean hasContent() {
        throw new NotImplementedException("The hasContent method hasn't been implemented yet");
    }

    /**
     * Write the template to the given file, encoded as UTF-8 (the same encoding as
     * {@link #writeTemplate(OutputStream)}).
     *
     * @param orderXmlFile the file to write to
     * @throws IOFailure if the file cannot be written completely
     */
    @Override
    public void writeToFile(File orderXmlFile) {
        Writer writer = null;
        try {
            writer = new BufferedWriter(new OutputStreamWriter(
                    new FileOutputStream(orderXmlFile), Charset.forName(TEMPLATE_ENCODING)));
            writer.write(template);
            // Close explicitly inside the try-block: closing flushes the buffer, and a failure to
            // flush must be reported instead of being swallowed by closeQuietly below.
            writer.close();
        } catch (IOException e) {
            throw new IOFailure("Unable to write template to file '" + orderXmlFile.getAbsolutePath() + "'.", e);
        } finally {
            IOUtils.closeQuietly(writer);
        }
    }

    /**
     * Not implemented for H3 templates.
     *
     * @throws NotImplementedException always
     */
    @Override
    public void setRecoverlogNode(File recoverlogGzFile) {
        throw new NotImplementedException("This method has not yet been implemented");
    }

    /**
     * Replace the {@link #DEDUPLICATION_INDEX_LOCATION_PLACEHOLDER} with the given path.
     *
     * @param absolutePath the location of the deduplication index, inserted verbatim
     * @throws IllegalState if the placeholder is not present in the template
     */
    @Override
    public void setDeduplicationIndexLocation(String absolutePath) {
        if (!template.contains(DEDUPLICATION_INDEX_LOCATION_PLACEHOLDER)) {
            throw new IllegalState("The placeholder for the deduplication index location property '"
                    + DEDUPLICATION_INDEX_LOCATION_PLACEHOLDER
                    + "' was not found. Maybe the placeholder has already been replaced with the correct value: "
                    + template);
        }
        this.template = template.replace(DEDUPLICATION_INDEX_LOCATION_PLACEHOLDER, absolutePath);
    }

    /** Does nothing but log: the seeds file path is not set in a H3 template. */
    @Override
    public void setSeedsFilePath(String absolutePath) {
        log.debug("Note: SeedsFilePath is not set in h3");
    }

    /**
     * Replace the {@link #ARCHIVE_FILE_PREFIX_PLACEHOLDER} with the given prefix.
     *
     * @param archiveFilePrefix the prefix of the archive files, inserted verbatim
     * @throws IllegalState if the placeholder is not present in the template
     */
    @Override
    public void setArchiveFilePrefix(String archiveFilePrefix) {
        if (!template.contains(ARCHIVE_FILE_PREFIX_PLACEHOLDER)) {
            throw new IllegalState("The placeholder for the archive file prefix property '"
                    + ARCHIVE_FILE_PREFIX_PLACEHOLDER
                    + "' was not found. Maybe the placeholder has already been replaced with the correct value. The template looks like this: "
                    + template);
        }
        this.template = template.replace(ARCHIVE_FILE_PREFIX_PLACEHOLDER, archiveFilePrefix);
    }

    /** Does nothing but log a warning: the disk path is not settable in a H3 template. */
    @Override
    public void setDiskPath(String absolutePath) {
        // NOP
        log.warn("The DiskPath is not settable in the H3 template");
    }

    /** Does nothing but log a warning: the deduplicator cannot be removed from a H3 template. */
    @Override
    public void removeDeduplicatorIfPresent() {
        // NOP
        log.warn("Removing the Deduplicator is not possible with the H3 templates and should not be required with the H3 template.");
    }

    /**
     * Insert the warcinfo metadata in place of the {@link #METADATA_ITEMS_PLACEHOLDER}.
     * The inserted property has this shape (all values are XML-escaped):
     * <pre>
     * &lt;property name="metadataItems"&gt;
     *   &lt;map&gt;
     *     &lt;entry key="harvestInfo.version" value="1.03"/&gt;
     *     &lt;entry key="harvestInfo.jobId" value="1"/&gt;
     *     &lt;entry key="harvestInfo.channel" value="HIGH"/&gt;
     *     &lt;entry key="harvestInfo.harvestNum" value="1"/&gt;
     *     &lt;entry key="harvestInfo.origHarvestDefinitionID" value="1"/&gt;
     *     &lt;entry key="harvestInfo.maxBytesPerDomain" value="100000"/&gt;
     *     &lt;entry key="harvestInfo.maxObjectsPerDomain" value="-1"/&gt;
     *     &lt;entry key="harvestInfo.orderXMLName" value="defaultOrderXml"/&gt;
     *     &lt;entry key="harvestInfo.origHarvestDefinitionName" value="ddddddddd"/&gt;
     *     &lt;entry key="harvestInfo.scheduleName" value="EveryHour"/&gt;   (optional)
     *     &lt;entry key="harvestInfo.harvestFilenamePrefix" value="netarkivet-1-1"/&gt;
     *     &lt;entry key="harvestInfo.jobSubmitDate" value="22. 10. 2014"/&gt;
     *     &lt;entry key="harvestInfo.performer" value="performer"/&gt;      (optional)
     *     &lt;entry key="harvestInfo.audience" value="audience"/&gt;        (optional)
     *   &lt;/map&gt;
     * &lt;/property&gt;
     * </pre>
     * The optional entries are only inserted if their value is neither null nor empty.
     *
     * @param ajob the job providing most of the metadata values
     * @param origHarvestdefinitionName the name of the original harvest definition
     * @param scheduleName the name of the schedule (optional; only relevant for selective harvests)
     * @param performer the performer of the harvest (optional)
     * @throws IllegalState if the placeholder is not present in the template
     */
    public void insertWarcInfoMetadata(Job ajob, String origHarvestdefinitionName,
            String scheduleName, String performer) {
        if (!template.contains(METADATA_ITEMS_PLACEHOLDER)) {
            throw new IllegalState("The placeholder for the property '" + METADATA_ITEMS_PLACEHOLDER
                    + "' was not found. Maybe the placeholder has already been replaced with the correct value. The template looks like this: "
                    + template);
        }
        log.debug("Now in " + getClass().getName());
        StringBuilder sb = new StringBuilder();
        sb.append("<property name=\"metadataItems\">\n<map>\n");

        appendMetadataEntry(sb, HARVESTINFO_VERSION, HARVESTINFO_VERSION_NUMBER);
        appendMetadataEntry(sb, HARVESTINFO_JOBID, ajob.getJobID());
        appendMetadataEntry(sb, HARVESTINFO_CHANNEL, ajob.getChannel());
        appendMetadataEntry(sb, HARVESTINFO_HARVESTNUM, ajob.getHarvestNum());
        appendMetadataEntry(sb, HARVESTINFO_ORIGHARVESTDEFINITIONID, ajob.getOrigHarvestDefinitionID());
        appendMetadataEntry(sb, HARVESTINFO_MAXBYTESPERDOMAIN, ajob.getMaxBytesPerDomain());
        appendMetadataEntry(sb, HARVESTINFO_MAXOBJECTSPERDOMAIN, ajob.getMaxObjectsPerDomain());
        appendMetadataEntry(sb, HARVESTINFO_ORDERXMLNAME, ajob.getOrderXMLName());
        appendMetadataEntry(sb, HARVESTINFO_ORIGHARVESTDEFINITIONNAME, origHarvestdefinitionName);

        /* optional schedule-name - only inserted if not null and not-empty. */
        if (scheduleName != null && !scheduleName.isEmpty()) {
            appendMetadataEntry(sb, HARVESTINFO_SCHEDULENAME, scheduleName);
        }
        appendMetadataEntry(sb, HARVESTINFO_HARVESTFILENAMEPREFIX, ajob.getHarvestFilenamePrefix());
        appendMetadataEntry(sb, HARVESTINFO_JOBSUBMITDATE,
                ArchiveDateConverter.getWarcDateFormat().format(ajob.getSubmittedDate()));

        /* optional HARVESTINFO_PERFORMER - only inserted if not null and not-empty. */
        if (performer != null && !performer.isEmpty()) {
            appendMetadataEntry(sb, HARVESTINFO_PERFORMER, performer);
        }

        /* optional HARVESTINFO_AUDIENCE - only inserted if not null and not-empty. */
        String audience = ajob.getHarvestAudience();
        if (audience != null && !audience.isEmpty()) {
            appendMetadataEntry(sb, HARVESTINFO_AUDIENCE, audience);
        }
        sb.append("\n</map>\n</property>\n");

        String metadataItems = sb.toString();
        log.info("Adding WarcInfoMetadata {}", metadataItems);
        this.template = template.replace(METADATA_ITEMS_PLACEHOLDER, metadataItems);
    }

    /**
     * Append one map entry, preceded by a newline: {@code \n<entry key="KEY" value="VALUE"/>}.
     * The value is XML-escaped, since names entered by users may contain characters such as
     * '&amp;' or '"' that would otherwise make the template malformed.
     *
     * @param sb the builder to append to
     * @param key the entry key, inserted verbatim
     * @param value the entry value
     */
    private static void appendMetadataEntry(StringBuilder sb, String key, Object value) {
        sb.append("\n<entry key=\"").append(key).append("\" value=\"")
                .append(escapeXml(value)).append("\"/>");
    }

    /**
     * Insert the values of the given attributes into the template. For each attribute the
     * placeholder is <code>%{NAME}</code>, where NAME is the upper-cased name of the attribute type.
     * The inserted text depends on the viewtype of the attribute type:
     * <ul>
     * <li>1: the integer value as text, or the empty string if there is no value</li>
     * <li>5: "true" if the integer value is greater than 0, otherwise "false"</li>
     * <li>6: "obey" if the integer value is greater than 0, otherwise "ignore"</li>
     * </ul>
     * The integer value is the explicit value of the attribute, or else the default value of the
     * attribute type. Attributes with any other viewtype are skipped with a warning, and so are
     * attributes whose placeholder is not found in the template.
     *
     * @param attributesAndTypes the attributes to insert
     * @throws ArgumentNotValid if attributesAndTypes is null
     */
    @Override
    public void insertAttributes(List<AttributeAndType> attributesAndTypes) {
        ArgumentNotValid.checkNotNull(attributesAndTypes, "List<AttributeAndType> attributesAndTypes");
        for (AttributeAndType attributeAndType : attributesAndTypes) {
            AttributeTypeBase attributeType = attributeAndType.attributeType;
            AttributeBase attribute = attributeAndType.attribute;
            Integer intVal;
            String val;

            log.debug("Trying to insert the attribute {} into the template", attributeType.name);
            switch (attributeType.viewtype) {
            case 1:
                intVal = resolveIntegerValue(attributeType, attribute);
                val = (intVal != null) ? intVal.toString() : "";
                log.info("Value selected for attribute {}: {}", attributeType.name, val);
                break;
            case 5:
                intVal = resolveIntegerValue(attributeType, attribute);
                val = (intVal != null && intVal > 0) ? "true" : "false";
                log.info("Value selected for attribute '{}': '{}'", attributeType.name, val);
                break;
            case 6:
                intVal = resolveIntegerValue(attributeType, attribute);
                val = (intVal != null && intVal > 0) ? "obey" : "ignore";
                log.info("Value selected for attribute '{}': '{}'", attributeType.name, val);
                break;
            default:
                // Without a value there is nothing to substitute, and String.replace would
                // throw a NullPointerException on a null replacement.
                log.warn("Unsupported viewtype {} for attribute '{}'. The attribute is not inserted into the template",
                        attributeType.viewtype, attributeType.name);
                continue;
            }
            // Locale-independent upper-casing; the default locale could map 'i' to a dotted capital I.
            String placeholder = "%{" + attributeType.name.toUpperCase(Locale.ENGLISH) + "}";
            if (template.contains(placeholder)) {
                this.template = template.replace(placeholder, val);
            } else {
                log.warn("Placeholder '{}' not found in template. Therefore not substituted by '{}' in this template",
                        placeholder, val);
            }
        }
    }

    /**
     * Find the integer value to use for an attribute: the explicit value of the attribute if there
     * is one, otherwise the default value of the attribute type.
     *
     * @param attributeType the type of the attribute
     * @param attribute the attribute itself; may be null
     * @return the integer value, or null if there is neither an explicit nor a default value
     */
    private static Integer resolveIntegerValue(AttributeTypeBase attributeType, AttributeBase attribute) {
        Integer intVal = null;
        if (attribute != null) {
            intVal = attribute.getInteger();
            log.debug("Read explicitly value for attribute '{}': '{}'", attributeType.name, intVal);
        }
        if (intVal == null && attributeType.def_int != null) {
            intVal = attributeType.def_int;
            log.debug("Viewtype {} attribute '{}' not set explicitly. Using default value '{}'",
                    attributeType.viewtype, attributeType.name, intVal);
        }
        return intVal;
    }

    /**
     * Write the template to the given JspWriter.
     *
     * @param out the writer to write to
     * @throws IOFailure if writing fails
     */
    @Override
    public void writeTemplate(JspWriter out) throws IOFailure {
        try {
            out.write(template);
        } catch (IOException e) {
            throw new IOFailure("Unable to write to JspWriter", e);
        }
    }

    /**
     * Hack to remove existing placeholders, that are still present after template
     * manipulation is completed. Only the {@link #METADATA_ITEMS_PLACEHOLDER} and the
     * {@link #CRAWLERTRAPS_PLACEHOLDER} are removed.
     *
     * @throws IllegalState if one of the two placeholders is still present after the removal
     */
    public void removePlaceholders() {
        template = template.replace(METADATA_ITEMS_PLACEHOLDER, "");
        template = template.replace(CRAWLERTRAPS_PLACEHOLDER, "");

        if (template.contains(METADATA_ITEMS_PLACEHOLDER)) {
            throw new IllegalState("The placeholder for the property '" + METADATA_ITEMS_PLACEHOLDER
                    + "' should have been deleted now.");
        }
        if (template.contains(CRAWLERTRAPS_PLACEHOLDER)) {
            throw new IllegalState("The placeholder for the property '" + CRAWLERTRAPS_PLACEHOLDER
                    + "' should have been deleted now.");
        }
    }
}