001/* 002 * #%L 003 * Netarchivesuite - harvester 004 * %% 005 * Copyright (C) 2005 - 2018 The Royal Danish Library, 006 * the National Library of France and the Austrian National Library. 007 * %% 008 * This program is free software: you can redistribute it and/or modify 009 * it under the terms of the GNU Lesser General Public License as 010 * published by the Free Software Foundation, either version 2.1 of the 011 * License, or (at your option) any later version. 012 * 013 * This program is distributed in the hope that it will be useful, 014 * but WITHOUT ANY WARRANTY; without even the implied warranty of 015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 016 * GNU General Lesser Public License for more details. 017 * 018 * You should have received a copy of the GNU General Lesser Public 019 * License along with this program. If not, see 020 * <http://www.gnu.org/licenses/lgpl-2.1.html>. 021 * #L% 022 */ 023package dk.netarkivet.harvester.datamodel; 024 025import java.io.BufferedWriter; 026import java.io.File; 027import java.io.FileWriter; 028import java.io.IOException; 029import java.io.OutputStream; 030import java.io.Serializable; 031import java.nio.charset.Charset; 032import java.util.HashMap; 033import java.util.List; 034import java.util.Map; 035import java.util.regex.Matcher; 036import java.util.regex.Pattern; 037 038import javax.servlet.jsp.JspWriter; 039 040import org.apache.commons.io.IOUtils; 041import org.apache.commons.lang.StringEscapeUtils; 042import org.apache.commons.lang.StringUtils; 043import org.slf4j.Logger; 044import org.slf4j.LoggerFactory; 045 046import com.antiaction.raptor.dao.AttributeBase; 047import com.antiaction.raptor.dao.AttributeTypeBase; 048 049import dk.netarkivet.common.CommonSettings; 050import dk.netarkivet.common.exceptions.ArgumentNotValid; 051import dk.netarkivet.common.exceptions.IOFailure; 052import dk.netarkivet.common.exceptions.IllegalState; 053import dk.netarkivet.common.exceptions.NotImplementedException; 054import dk.netarkivet.common.utils.Settings; 055import dk.netarkivet.common.utils.archive.ArchiveDateConverter; 056import dk.netarkivet.harvester.HarvesterSettings; 057import dk.netarkivet.harvester.datamodel.eav.EAV.AttributeAndType; 058 059/** 060 * Class encapsulating the Heritrix crawler-beans.cxml file 061 * <p> 062 * 063 * Heritrix3 has a new model based on spring, So the XPATH is no good for processing. 064 * Instead we use placeholders instead, marked by %{..} instead of ${..}, which is used by 065 * Heritrix3 already. 066 * 067 * The template is a H3 template if it contains the string: 068 * 069 * "xmlns="http://www.springframework.org/...." 070 * 071 */ 072public class H3HeritrixTemplate extends HeritrixTemplate implements Serializable { 073 074 private static final Logger log = LoggerFactory.getLogger(H3HeritrixTemplate.class); 075 076 private String template; 077 078 /** QuotaEnforcer states for this template. TODO necessary?? */ 079 private Long forceMaxbytesPerDomain; 080 private Long forceMaxobjectsPerDomain; 081 082 /** Has this HeritrixTemplate been verified. */ 083 private boolean verified; 084 085 public final static String METADATA_ITEMS_PLACEHOLDER = "%{METADATA_ITEMS_PLACEHOLDER}"; 086 public static final String MAX_TIME_SECONDS_PLACEHOLDER = "%{MAX_TIME_SECONDS_PLACEHOLDER}"; 087 public static final String CRAWLERTRAPS_PLACEHOLDER = "%{CRAWLERTRAPS_PLACEHOLDER}"; 088 089 public static final Pattern DEDUPLICATION_BEAN_REFERENCE_PATTERN = Pattern.compile(".*ref.*bean.*DeDuplicator.*", Pattern.DOTALL); 090 091 public static final Pattern DEDUPLICATION_BEAN_PATTERN = Pattern.compile(".*bean.*id.*DeDuplicator.*", Pattern.DOTALL); 092 public static final String DEDUPLICATION_INDEX_LOCATION_PLACEHOLDER 093 = "%{DEDUPLICATION_INDEX_LOCATION_PLACEHOLDER}"; 094 095 public static final String ARCHIVE_FILE_PREFIX_PLACEHOLDER = "%{ARCHIVE_FILE_PREFIX_PLACEHOLDER}"; 096 097 public static final String FRONTIER_QUEUE_TOTAL_BUDGET_PLACEHOLDER 098 = "%{FRONTIER_QUEUE_TOTAL_BUDGET_PLACEHOLDER}"; 099 100 public static final String QUOTA_ENFORCER_GROUP_MAX_FETCH_SUCCES_PLACEHOLDER = 101 "%{QUOTA_ENFORCER_GROUP_MAX_FETCH_SUCCES_PLACEHOLDER}"; 102 103 public static final String QUOTA_ENFORCER_MAX_BYTES_PLACEHOLDER 104 = "%{QUOTA_ENFORCER_MAX_BYTES_PLACEHOLDER}"; 105 106 public static final String DEDUPLICATION_ENABLED_PLACEHOLDER = "%{DEDUPLICATION_ENABLED_PLACEHOLDER}"; 107 108 109 // PLACEHOLDERS for archiver beans (Maybe not necessary) 110 final String ARCHIVER_BEAN_REFERENCE_PLACEHOLDER = "%{ARCHIVER_BEAN_REFERENCE_PLACEHOLDER}"; 111 final String ARCHIVER_PROCESSOR_BEAN_PLACEHOLDER = "%{ARCHIVER_PROCESSOR_BEAN_PLACEHOLDER}"; 112 113 // Placeholders for Umbra integration 114 public static final String UMBRA_SIMPLEOVERRIDES_PLACEHOLDER = "%{UMBRA_SIMPLEOVERRIDES_PLACEHOLDER}"; 115 public static final String UMBRA_PUBLISH_BEAN_PLACEHOLDER = "%{UMBRA_PUBLISH_BEAN_PLACEHOLDER}"; 116 public static final String UMBRA_RECEIVE_BEAN_PLACEHOLDER = "%{UMBRA_RECEIVE_BEAN_PLACEHOLDER}"; 117 public static final String UMBRA_BEAN_REF_PLACEHOLDER ="%{UMBRA_BEAN_REF_PLACEHOLDER}"; 118 119 //match theses properties in crawler-beans.cxml to add them into harvestInfo.xml 120 //for preservation purpose 121 public enum MetadataInfo { 122 TEMPLATE_DESCRIPTION("metadata\\.description=.+[\\r\\n]"), 123 TEMPLATE_UPDATE_DATE("metadata\\.date=.+[\\r\\n]"), 124 OPERATOR("metadata\\.operator=.+[\\r\\n]"); 125 126 private final String regex; 127 128 private MetadataInfo(String regex) { 129 this.regex = regex; 130 } 131 132 public String toString() { 133 return this.regex; 134 } 135 }; 136 137 public Map<MetadataInfo, String> metadataInfoMap; 138 139 /** 140 * Constructor for HeritrixTemplate class. 141 * 142 * @param template_id The persistent id of the template in the database 143 * @param template The template as String object 144 * @throws ArgumentNotValid if template is null. 145 */ 146 public H3HeritrixTemplate(long template_id, String template) { 147 ArgumentNotValid.checkNotNull(template, "String template"); 148 this.template_id = template_id; 149 this.template = template; 150 151 metadataInfoMap = new HashMap<MetadataInfo, String> (); 152 for(MetadataInfo metadataInfo : MetadataInfo.values()) { 153 Pattern p = Pattern.compile(metadataInfo.regex); 154 Matcher m = p.matcher(this.template); 155 if(m.find()) { 156 String operator = this.template.substring(m.start(), m.end()).trim(); 157 //return the value of the property after the = 158 metadataInfoMap.put(metadataInfo, operator.split("=")[1]); 159 } 160 } 161 } 162 163 /** 164 * return the template. 165 * 166 * @return the template 167 */ 168 public HeritrixTemplate getTemplate() { 169 return this; 170 } 171 172 /** 173 * Has Template been verified? 174 * 175 * @return true, if verified on construction, otherwise false 176 */ 177 public boolean isVerified() { 178 return verified; 179 } 180 181 /** 182 * Return HeritrixTemplate as XML. 183 * @return HeritrixTemplate as XML 184 */ 185 @Override 186 public String getXML() { 187 return template; 188 } 189 190 /** 191 * Update the maxTimeSeconds property in the heritrix3 template, if possible. 192 * @param maxJobRunningTimeSecondsL Force the harvestJob to end after this number of seconds 193 * Property of the org.archive.crawler.framework.CrawlLimitEnforcer 194 * <!-- <property name="maxTimeSeconds" value="0" /> --> 195 */ 196 @Override 197 public void setMaxJobRunningTime(Long maxJobRunningTimeSecondsL) { 198 if (template.contains(MAX_TIME_SECONDS_PLACEHOLDER)) { 199 this.template = template.replace(MAX_TIME_SECONDS_PLACEHOLDER, 200 Long.toString(maxJobRunningTimeSecondsL)); 201 } else { 202 log.warn("The placeholder '" + MAX_TIME_SECONDS_PLACEHOLDER 203 + "' was not found in the template. Therefore maxRunningTime not set"); 204 } 205 } 206 207 @Override 208 public void setMaxBytesPerDomain(Long maxbytesL) { 209 this.forceMaxbytesPerDomain = maxbytesL; 210 } 211 212 213 @Override 214 public Long getMaxBytesPerDomain() { 215 return this.forceMaxbytesPerDomain; 216 } 217 218 @Override 219 public void setMaxObjectsPerDomain(Long maxobjectsL) { 220 this.forceMaxobjectsPerDomain = maxobjectsL; 221 } 222 223 @Override 224 public Long getMaxObjectsPerDomain() { 225 return this.forceMaxobjectsPerDomain; 226 } 227 228 @Override 229 public boolean isValid() { 230 /* 231 StringBuilder errors = new StringBuilder(); 232 // check for Deduplication index-location placeholder and DEDUPLICATION_BEAN_PATTERN 233 if (template.contains(DEDUPLICATION_BEAN_PATTERN)) { 234 if (!template.contains(DEDUPLICATION_INDEX_LOCATION_PLACEHOLDER)) { 235 errors.append("Has DefdMissing placeholder '" + DEDUPLICATION_INDEX_LOCATION_PLACEHOLDER + "'" 236 } 237 } 238 template.contains(DEDUPLICATION_INDEX_LOCATION_PLACEHOLDER) 239 && template.contains(deduplicationBeanPattern) 240 */ 241 return true; 242 } 243 244 /** 245 * Inserts all nevessary umbra-related beans in this template. 246 * @param jobName a String representing the job - must be unique for the this NAS environment for all time 247 * @param rabbitMQUrl the URL of the rabbitMQ socket connection (amqp://) to which umbra requests are to be sent 248 * @param limitSearchRegEx the regular expression used to limit the heritrix search-path of urls to be sent to Umbra. 249 */ 250 @Override 251 public void insertUmbrabean(String jobName, String rabbitMQUrl, String limitSearchRegEx) 252 { 253 this.template = this.template.replace(UMBRA_SIMPLEOVERRIDES_PLACEHOLDER, 254 getUmbraBeanInformationInSimpleoverridesBean(jobName, rabbitMQUrl, limitSearchRegEx)); 255 this.template = this.template.replace(UMBRA_PUBLISH_BEAN_PLACEHOLDER, getUmbrabeanPlaceholder()); 256 this.template = this.template.replace(UMBRA_RECEIVE_BEAN_PLACEHOLDER, getAmqpUrlreceiverPlaceholder()); 257 this.template = this.template.replace(UMBRA_BEAN_REF_PLACEHOLDER, getCallUmbrabean()); 258 } 259 260 261 /** 262 * Umbrabean text from the current harvest job that will replace the placeholder in the Simpleoverride bean 263 * @param jobName a String representing the job - must be unique for the this NAS environment for all time 264 * @param rabbitMQUrl the URL of the rabbitMQ socket connection (amqp://) to which umbra requests are to be sent 265 * @param limitSearchRegEx the regular expression used to limit the heritrix search-path of urls to be sent to Umbra. 266 */ 267 public String getUmbraBeanInformationInSimpleoverridesBean(String jobName, String rabbitMQUrl, String limitSearchRegEx) { 268 // umbraBean.clientId=MySpecialJobName 269 // umbraBean.amqpUri=amqp://guest:guest@activemq:5672/%2f 270 // ## The following rule restricts umbra to processing only on seeds or links, leaving embeds and redirects 271 // ## to be handled by the browser itself 272 // umbraBean.shouldProcessRule.rules[1].regex=^$|.*L 273 274 StringBuilder umbrabeanBuilder = new StringBuilder(); 275 umbrabeanBuilder.append("\n"); 276 umbrabeanBuilder.append("umbraBean.clientId=" + Settings.get(CommonSettings.ENVIRONMENT_NAME) + "_" + jobName); 277 umbrabeanBuilder.append("\n"); 278 umbrabeanBuilder.append("umbraBean.amqpUri="+rabbitMQUrl); 279 umbrabeanBuilder.append("\n"); 280 umbrabeanBuilder.append("## The following rule restricts umbra to processing only on seeds or links, leaving embeds and redirects"); 281 umbrabeanBuilder.append("## to be handled by the browser itself"); 282 umbrabeanBuilder.append("\n"); 283 umbrabeanBuilder.append("umbraBean.shouldProcessRule.rules[1].regex="+limitSearchRegEx); 284 umbrabeanBuilder.append("\n"); 285 return umbrabeanBuilder.toString(); 286 } 287 288 /** 289 * Umbrabean text that will replace UMBRA_BEAN_PLACEHOLDER in the template * 290 */ 291 public String getUmbrabeanPlaceholder() { 292 // <!-- 293 // Bean that sends messages (urls) to umbra. 294 // --> 295 // <bean id="umbraBean" class="org.archive.modules.AMQPPublishProcessor"> 296 // <property name="clientId" value="[see override]"/> 297 // <property name="amqpUri" value="[see override]"/> 298 // <property name="shouldProcessRule"> 299 // <bean class="org.archive.modules.deciderules.DecideRuleSequence"> 300 // <property name="rules"> 301 // <list> 302 // <bean class="org.archive.modules.deciderules.RejectDecideRule" /> 303 // <bean class="org.archive.modules.deciderules.HopsPathMatchesRegexDecideRule"> 304 // <property name="regex" value="[see override]"/> 305 // </bean> 306 // </list> 307 // </property> 308 // </bean> 309 // </property> 310 // </bean> 311 312 StringBuilder umbrabeanBuilder = new StringBuilder(); 313 umbrabeanBuilder.append("<!-- Bean that sends messages (urls) to umbra. -->"); 314 umbrabeanBuilder.append("<bean id=\"umbraBean\" class=\"org.archive.modules.AMQPPublishProcessor\">"); 315 umbrabeanBuilder.append("<property name=\"clientId\" value=\"[see override]\"/>"); 316 umbrabeanBuilder.append("<property name=\"amqpUri\" value=\"[see override]\"/>"); 317 umbrabeanBuilder.append(" <property name=\"shouldProcessRule\">"); 318 umbrabeanBuilder.append(" <bean class=\"org.archive.modules.deciderules.DecideRuleSequence\">"); 319 umbrabeanBuilder.append(" <property name=\"rules\">"); 320 umbrabeanBuilder.append(" <list>"); 321 umbrabeanBuilder.append(" <bean class=\"org.archive.modules.deciderules.RejectDecideRule\" />"); 322 umbrabeanBuilder.append(" <bean class=\"org.archive.modules.deciderules.HopsPathMatchesRegexDecideRule\">"); 323 umbrabeanBuilder.append(" <property name=\"regex\" value=\"[see override]\"/>"); 324 umbrabeanBuilder.append(" </bean>"); 325 umbrabeanBuilder.append(" </list>"); 326 umbrabeanBuilder.append(" </property>"); 327 umbrabeanBuilder.append(" </bean>"); 328 umbrabeanBuilder.append(" </property>"); 329 umbrabeanBuilder.append(" </bean>"); 330 331 return umbrabeanBuilder.toString(); 332 } 333 334 335 /** 336 * AMQP url receiver text that will replace AMQP_URLRECEIVER_PLACEHOLDER in the template * 337 */ 338 public String getAmqpUrlreceiverPlaceholder() { 339 // <!-- 340 // Bean that receives messages (urls) from umbra and places them in the Heritrix frontier. 341 // --> 342 // <bean class="org.archive.crawler.frontier.AMQPUrlReceiver"> 343 // <property name="amqpUri"> 344 // <bean class="org.springframework.beans.factory.config.PropertyPathFactoryBean"> 345 // <property name="targetObject" ref="umbraBean"/> 346 // <property name="propertyPath" value="amqpUri" /> 347 // </bean> 348 // </property> 349 // <property name="queueName"> 350 // <bean class="org.springframework.beans.factory.config.PropertyPathFactoryBean"> 351 // <property name="targetObject" ref="umbraBean"/> 352 // <property name="propertyPath" value="clientId" /> 353 // </bean> 354 // </property> 355 // </bean> 356 357 StringBuilder amqpUrlReceiverBeanBuilder = new StringBuilder(); 358 amqpUrlReceiverBeanBuilder.append("<!-- Bean that receives messages (urls) from umbra and places them in the Heritrix frontier -->"); 359 amqpUrlReceiverBeanBuilder.append("<bean class=\"org.archive.crawler.frontier.AMQPUrlReceiver\">"); 360 amqpUrlReceiverBeanBuilder.append(" <property name=\"amqpUri\">"); 361 amqpUrlReceiverBeanBuilder.append(" <bean class=\"org.springframework.beans.factory.config.PropertyPathFactoryBean\">"); 362 amqpUrlReceiverBeanBuilder.append(" <property name=\"targetObject\" ref=\"umbraBean\"/>"); 363 amqpUrlReceiverBeanBuilder.append(" <property name=\"propertyPath\" value=\"amqpUri\" />"); 364 amqpUrlReceiverBeanBuilder.append(" </bean>"); 365 amqpUrlReceiverBeanBuilder.append(" </property>"); 366 amqpUrlReceiverBeanBuilder.append(" <property name=\"queueName\">"); 367 amqpUrlReceiverBeanBuilder.append(" <bean class=\"org.springframework.beans.factory.config.PropertyPathFactoryBean\">"); 368 amqpUrlReceiverBeanBuilder.append(" <property name=\"targetObject\" ref=\"umbraBean\"/>"); 369 amqpUrlReceiverBeanBuilder.append(" <property name=\"propertyPath\" value=\"clientId\" />"); 370 amqpUrlReceiverBeanBuilder.append(" </bean>"); 371 amqpUrlReceiverBeanBuilder.append(" </property>"); 372 amqpUrlReceiverBeanBuilder.append("</bean>"); 373 374 return amqpUrlReceiverBeanBuilder.toString(); 375 } 376 377 378 /** 379 * Call of the Umbra bean text that will replace CALL_UMBRABEAN_PLACEHOLDER in the template * 380 */ 381 public String getCallUmbrabean() { 382 // <ref bean="umbraBean"/> 383 384 return " <ref bean=\"umbraBean\"/>"; 385 } 386 387 @Override 388 // This method is used to decide, whether to request a deduplication index or not. 389 // Done by checking, if both 390 // - a DeDuplicator bean is present in the template 391 // and 392 // - a DEDUPLICATION_INDEX_LOCATION_PLACEHOLDER is also present. 393 // and 394 // - a DeDuplicator reference bean is present in the template 395 public boolean IsDeduplicationEnabled() { 396 return (template.contains(DEDUPLICATION_INDEX_LOCATION_PLACEHOLDER) 397 && DEDUPLICATION_BEAN_PATTERN.matcher(template).matches() 398 && DEDUPLICATION_BEAN_REFERENCE_PATTERN.matcher(template).matches()); 399 } 400 401 /** 402 * Configuring the quota-enforcer, depending on budget definition. Object limit can be defined either by 403 * using the queue-total-budget property or the quota enforcer. Which is chosen is set by the argument 404 * maxObjectsIsSetByQuotaEnforcer}'s value. So quota enforcer is set as follows: 405 * If all values in the quotaEnforcer is infinity, it is in effect disabled 406 * <ul> 407 * <li>Object limit is not set by quota enforcer, disabled only if there is no byte limit.</li> 408 * <li>Object limit is set by quota enforcer, so it should be enabled if a byte or object limit is set.</li> 409 * </ul> 410 * 411 * @param maxObjectsIsSetByQuotaEnforcer Decides whether the maxObjectsIsSetByQuotaEnforcer or not. 412 * @param forceMaxBytesPerDomain The number of max bytes per domain enforced (can be no limit) 413 * @param forceMaxObjectsPerDomain The number of max objects per domain enforced (can be no limit) 414 */ 415 public void configureQuotaEnforcer(boolean maxObjectsIsSetByQuotaEnforcer, 416 long forceMaxBytesPerDomain, long forceMaxObjectsPerDomain) { 417 this.forceMaxobjectsPerDomain = forceMaxObjectsPerDomain; 418 this.forceMaxbytesPerDomain = forceMaxBytesPerDomain; 419 String tmp = template; 420 if (!maxObjectsIsSetByQuotaEnforcer) { 421 // SetMaxObjects in the global budget to forceMaxObjectsPerDomain?? 422 String tmp1 = tmp.replace( 423 FRONTIER_QUEUE_TOTAL_BUDGET_PLACEHOLDER, Long.toString( forceMaxObjectsPerDomain )); 424 // SetMaxObjects to infinity in the quotaEnforcer 425 tmp = tmp1.replace(QUOTA_ENFORCER_GROUP_MAX_FETCH_SUCCES_PLACEHOLDER, 426 Long.toString(Constants.HERITRIX_MAXOBJECTS_INFINITY)); 427 } else { 428 // SetMaxObjects in the global budget to Infinity 429 String tmp1 = tmp.replace( 430 FRONTIER_QUEUE_TOTAL_BUDGET_PLACEHOLDER, Long.toString( Constants.HERITRIX_MAXOBJECTS_INFINITY )); 431 // SetMaxObjects to forceMaxObjectsPerDomain in the quotaEnforcer 432 tmp = tmp1.replace(QUOTA_ENFORCER_GROUP_MAX_FETCH_SUCCES_PLACEHOLDER, 433 Long.toString(forceMaxObjectsPerDomain)); 434 } 435 436 // SetMaxbytes in the QuotaEnforcer to forceMaxBytesPerDomain 437 // Divide by 1024 since Heritrix uses KB rather than bytes, 438 // and add 1 to avoid to low limit due to rounding. 439 String maxBytesStringValue = "-1"; 440 if (forceMaxBytesPerDomain != Constants.HERITRIX_MAXBYTES_INFINITY) { 441 maxBytesStringValue = Long.toString(( forceMaxBytesPerDomain 442 / Constants.BYTES_PER_HERITRIX_BYTELIMIT_UNIT) + 1); 443 log.debug("MaxbytesPerDomain set to {} Kbytes per domain", maxBytesStringValue); 444 } else { 445 log.debug("MaxbytesPerDomain set to infinite number of Kbytes per domain"); 446 } 447 448 this.template = tmp.replace(QUOTA_ENFORCER_MAX_BYTES_PLACEHOLDER, maxBytesStringValue); 449 450 } 451 452 /** 453 * Make sure that Heritrix will archive its data in the chosen archiveFormat. 454 * 455 * @param archiveFormat the chosen archiveformat ('arc' or 'warc' supported) 456 * @throws ArgumentNotValid If the chosen archiveFormat is not supported. 457 */ 458 @Override 459 public void setArchiveFormat(String archiveFormat) { 460 if (!template.contains(ARCHIVER_BEAN_REFERENCE_PLACEHOLDER)){ 461 throw new IllegalState("The placeholder '" + ARCHIVER_BEAN_REFERENCE_PLACEHOLDER 462 + "' is missing. Unable to insert proper archive writer"); 463 } 464 if (!template.contains(ARCHIVER_PROCESSOR_BEAN_PLACEHOLDER)) { 465 throw new IllegalState("The placeholder '" + ARCHIVER_PROCESSOR_BEAN_PLACEHOLDER 466 + "' is missing. Unable to insert proper archive writer"); 467 } 468 if ("arc".equalsIgnoreCase(archiveFormat)) { 469 log.debug("ARC format selected to be used by Heritrix3"); 470 setArcArchiveformat(); 471 } else if ("warc".equalsIgnoreCase(archiveFormat)) { 472 log.debug("WARC format selected to be used by Heritrix3"); 473 setWarcArchiveformat(); 474 } else { 475 throw new ArgumentNotValid("Configuration of '" + HarvesterSettings.HERITRIX_ARCHIVE_FORMAT 476 + "' is invalid! Unrecognized format '" + archiveFormat + "'."); 477 } 478 } 479 480 /** 481 * Set the archive-format as ARC. This means enabling the ARCWriterProcessor in the template 482 */ 483 private void setArcArchiveformat(){ 484 String arcWriterbeanReference = "<ref bean=\"arcWriter\"/>"; 485 String templateNew = template.replace(ARCHIVER_BEAN_REFERENCE_PLACEHOLDER, arcWriterbeanReference); 486 template = templateNew.replace(ARCHIVER_PROCESSOR_BEAN_PLACEHOLDER, getArcWriterProcessor()); 487 } 488 489 private String getArcWriterProcessor() { 490 491 // <bean id="arcWriter" class="org.archive.modules.writer.ARCWriterProcessor"> 492 // <!-- <property name="compress" value="true" /> --> 493 // <!-- <property name="prefix" value="IAH" /> --> 494 // <!-- <property name="suffix" value="${HOSTNAME}" /> --> 495 // <!-- <property name="maxFileSizeBytes" value="100000000" /> --> 496 // <!-- <property name="poolMaxActive" value="1" /> --> 497 // <!-- <property name="poolMaxWaitMs" value="300000" /> --> 498 // <!-- <property name="skipIdenticalDigests" value="false" /> --> 499 // <!-- <property name="maxTotalBytesToWrite" value="0" /> --> 500 // <!-- <property name="directory" value="." /> --> 501 // <!-- <property name="storePaths"> 502 // <list> 503 // <value>arcs</value> 504 // </list> 505 // </property> --> 506 // </bean> 507 // "<bean id=\"arcWriter\" class=\"org.archive.modules.writer.ARCWriterProcessor\">"; 508 String propertyName="\n<property name=\""; 509 String valuePrefix = "\" value=\""; 510 String valueSuffix = "\""; 511 String propertyEnd="/>"; 512 513 StringBuilder arcWriterBeanBuilder = new StringBuilder(); 514 arcWriterBeanBuilder.append("<bean id=\"arcWriter\" class=\"org.archive.modules.writer.ARCWriterProcessor\">\n"); 515 arcWriterBeanBuilder.append(propertyName + "compress" + valuePrefix 516 + Settings.get(HarvesterSettings.HERITRIX3_ARC_COMPRESSION) 517 + valueSuffix + propertyEnd); 518 arcWriterBeanBuilder.append(propertyName + "prefix" + valuePrefix 519 + ARCHIVE_FILE_PREFIX_PLACEHOLDER 520 + valueSuffix + propertyEnd); 521// arcWriterBeanBuilder.append(propertyName + "suffix" + valuePrefix 522// + Settings.get(HarvesterSettings.HERITRIX3_ARC_SUFFIX) 523// + valueSuffix + propertyEnd); 524 arcWriterBeanBuilder.append(propertyName + "maxFileSizeBytes" + valuePrefix 525 + Settings.get(HarvesterSettings.HERITRIX3_ARC_MAXSIZE) 526 + valueSuffix + propertyEnd); 527 arcWriterBeanBuilder.append(propertyName + "poolMaxActive" + valuePrefix 528 + Settings.get(HarvesterSettings.HERITRIX3_ARC_POOL_MAXACTIVE) 529 + valueSuffix + propertyEnd); 530 arcWriterBeanBuilder.append(propertyName + "skipIdenticalDigests" + valuePrefix 531 + Settings.get(HarvesterSettings.HERITRIX3_ARC_SKIP_IDENTICAL_DIGESTS) 532 + valueSuffix + propertyEnd); 533 534 arcWriterBeanBuilder.append("</bean>"); 535 536 return arcWriterBeanBuilder.toString(); 537 } 538 539 540 /** 541 * Insert WARC-archiver beans and remove placeholder for ARC-Archiver-beans 542 * It is an error, if the WARC place-holders doesnt't exist. 543 * It is not an error, if the property placeholder does not exist. 544 */ 545 private void setWarcArchiveformat() { 546 String warcWriterbeanReference = "<ref bean=\"warcWriter\"/>"; 547 String warcWriterProcessorBean = "<bean id=\"warcWriter\" class=\"dk.netarkivet.harvester.harvesting.NasWARCProcessor\">"; 548 String propertyName="\n<property name=\""; 549 String valuePrefix = "\" value=\""; 550 String valueSuffix = "\""; 551 String propertyEnd="/>"; 552 if (!template.contains(ARCHIVER_BEAN_REFERENCE_PLACEHOLDER)) { 553 throw new IllegalState("The placeholder '" + ARCHIVER_BEAN_REFERENCE_PLACEHOLDER 554 + "' is missing"); 555 } 556 if (!template.contains(ARCHIVER_PROCESSOR_BEAN_PLACEHOLDER)) { 557 throw new IllegalState("The placeholder '" + ARCHIVER_PROCESSOR_BEAN_PLACEHOLDER 558 + "' is missing"); 559 } 560 StringBuilder propertyBuilder = new StringBuilder(); 561 propertyBuilder.append(propertyName + "template" + valuePrefix 562 + Settings.get(HarvesterSettings.HERITRIX3_WARC_TEMPLATE) 563 + valueSuffix + propertyEnd); 564 propertyBuilder.append(propertyName + "compress" + valuePrefix 565 + Settings.get(HarvesterSettings.HERITRIX3_WARC_COMPRESSION) 566 + valueSuffix + propertyEnd); 567 // Note: The prefix value will be replaced later by the setArchiveFilePrefix() method 568 propertyBuilder.append(propertyName + "prefix" + valuePrefix 569 + ARCHIVE_FILE_PREFIX_PLACEHOLDER 570 + valueSuffix + propertyEnd); 571 propertyBuilder.append(propertyName + "maxFileSizeBytes" + valuePrefix 572 + Settings.get(HarvesterSettings.HERITRIX3_WARC_MAXSIZE) 573 + valueSuffix + propertyEnd); 574 propertyBuilder.append(propertyName + "poolMaxActive" + valuePrefix 575 + Settings.get(HarvesterSettings.HERITRIX3_WARC_POOL_MAXACTIVE) 576 + valueSuffix + propertyEnd); 577 578 propertyBuilder.append(propertyName + "writeRequests" + valuePrefix 579 + Settings.get(HarvesterSettings.HERITRIX3_WARC_WRITE_REQUESTS) 580 + valueSuffix + propertyEnd); 581 propertyBuilder.append(propertyName + "writeMetadata" + valuePrefix 582 + Settings.get(HarvesterSettings.HERITRIX3_WARC_WRITE_METADATA) 583 + valueSuffix + propertyEnd); 584 propertyBuilder.append(propertyName + "writeMetadataOutlinks" + valuePrefix 585 + Settings.get(HarvesterSettings.HERITRIX3_WARC_WRITE_METADATA_OUTLINKS) 586 + valueSuffix + propertyEnd); 587 propertyBuilder.append(propertyName + "skipIdenticalDigests" + valuePrefix 588 + Settings.get(HarvesterSettings.HERITRIX3_WARC_SKIP_IDENTICAL_DIGESTS) 589 + valueSuffix + propertyEnd); 590 propertyBuilder.append(propertyName + "startNewFilesOnCheckpoint" + valuePrefix 591 + Settings.get(HarvesterSettings.HERITRIX3_WARC_START_NEW_FILES_ON_CHECKPOINT) 592 + valueSuffix + propertyEnd); 593 594 warcWriterProcessorBean += propertyBuilder.toString(); 595 warcWriterProcessorBean += "\n\n%{METADATA_ITEMS_PLACEHOLDER}\n</bean>"; 596 String templateNew = template.replace( 597 ARCHIVER_BEAN_REFERENCE_PLACEHOLDER, warcWriterbeanReference); 598 this.template = templateNew.replace(ARCHIVER_PROCESSOR_BEAN_PLACEHOLDER, 599 warcWriterProcessorBean); 600 } 601 602 @Override 603 /** 604 * With H3 template, we insert the crawlertraps into the template at once. 605 * They are inserted to be part of a org.archive.modules.deciderules.MatchesListRegexDecideRule 606 * bean. 607 * 608 * @param elementName The elementName is currently not used with H3 609 * @param crawlertraps A list of crawlertraps to be inserted 610 */ 611 public void insertCrawlerTraps(String elementName, List<String> crawlertraps) { 612// <bean class="org.archive.modules.deciderules.MatchesListRegexDecideRule"> 613// <!-- <property name="listLogicalOr" value="true" /> --> 614// <!-- <property name="regexList"> 615// <list> 616// CRAWLERTRAPS_PLACEHOLDER 617// </list> 618// </property> --> 619// </bean> 620 if (crawlertraps.isEmpty()) { 621 log.debug("No crawlertraps yet. No insertion is done"); 622 return; 623 } else if (!template.contains(CRAWLERTRAPS_PLACEHOLDER)) { 624 log.warn("The placeholder '" + CRAWLERTRAPS_PLACEHOLDER 625 + "' is absent from the template. No insertion is done at all. {} traps were ignored", 626 crawlertraps); 627 return; 628 } else { 629 log.info("Inserting {} crawlertraps into the template", crawlertraps.size()); 630 StringBuilder sb = new StringBuilder(); 631 sb.append("<!-- crawlertraps from " + elementName + " -->\n"); 632 for (String trap: crawlertraps) { 633 sb.append("<value>" + trap + "</value>\n"); 634 } 635 // Adding the placeholder again to be able to insert crawlertraps multiple times. 636 sb.append(CRAWLERTRAPS_PLACEHOLDER + "\n"); 637 String templateNew = template.replace(CRAWLERTRAPS_PLACEHOLDER, sb.toString()); 638 this.template = templateNew; 639 } 640 } 641 642 public String getMetadataInfo(MetadataInfo info) { 643 String infoStr = null; 644 if(metadataInfoMap.containsKey(info)) { 645 infoStr = metadataInfoMap.get(info); 646 } 647 return infoStr; 648 } 649 650 @Override 651 public void writeTemplate(OutputStream os) throws IOFailure { 652 try { 653 os.write(template.getBytes(Charset.forName("UTF-8"))); 654 } catch (IOException e) { 655 throw new IOFailure("Unable to write template to outputstream", e); 656 } 657 658 } 659 660 @Override 661 public boolean hasContent() { 662 throw new NotImplementedException("The hasContent method hasn't been implemented yet"); 663 } 664 665 @Override 666 public void writeToFile(File orderXmlFile) { 667 BufferedWriter writer = null; 668 try { 669 writer = new BufferedWriter( new FileWriter(orderXmlFile)); 670 writer.write(template); 671 } catch(IOException e) { 672 throw new IOFailure("Unable to write template to file '" + orderXmlFile.getAbsolutePath() + "'.", e); 673 } finally { 674 IOUtils.closeQuietly(writer); 675 } 676 } 677 678 @Override 679 public void setRecoverlogNode(File recoverlogGzFile) { 680 throw new NotImplementedException("This method has not yet been implemented"); 681 682 } 683 684 @Override 685 public void setDeduplicationIndexLocation(String absolutePath) { 686 if (!template.contains(DEDUPLICATION_INDEX_LOCATION_PLACEHOLDER)) { 687 throw new IllegalState("The placeholder for the deduplication index location property '" + DEDUPLICATION_INDEX_LOCATION_PLACEHOLDER 688 + "' was not found. Maybe the placeholder has already been replaced with the correct value: " 689 + template); 690 } 691 String templateNew = template.replace(DEDUPLICATION_INDEX_LOCATION_PLACEHOLDER, absolutePath); 692 this.template = templateNew; 693 } 694 695 @Override 696 public void setSeedsFilePath(String absolutePath) { 697 log.debug("Note: SeedsFilePath is not set in h3"); 698 } 699 700 @Override 701 public void setArchiveFilePrefix(String archiveFilePrefix) { 702 if (!template.contains(ARCHIVE_FILE_PREFIX_PLACEHOLDER)) { 703 throw new IllegalState("The placeholder for the archive file prefix property '" 704 + ARCHIVE_FILE_PREFIX_PLACEHOLDER 705 + "' was not found. Maybe the placeholder has already been replaced with the correct value. The template looks like this: " 706 + template); 707 } 708 String templateNew = template.replace(ARCHIVE_FILE_PREFIX_PLACEHOLDER, archiveFilePrefix); 709 this.template = templateNew; 710 711 } 712 713 @Override 714 public void setDiskPath(String absolutePath) { 715 // NOP 716 log.warn("The DiskPath is not settable in the H3 template"); 717 } 718 719 @Override 720 public void removeDeduplicatorIfPresent() { 721 //NOP 722 log.debug("In H3 we don't remove the deduplicator, but just disable it."); 723 } 724 725 @Override public void enableOrDisableDeduplication(boolean enabled) { 726 final String replacement = Boolean.toString(enabled).toLowerCase(); 727 log.debug("Replacing deduplication enabled placeholder {} with {}.", DEDUPLICATION_ENABLED_PLACEHOLDER, replacement); 728 this.template = template.replace(DEDUPLICATION_ENABLED_PLACEHOLDER, replacement); 729 } 730 731 //<property name="metadataItems"> 732// <map> 733// <entry key="harvestInfo.version" value="1.03"/> <!-- TODO maybe not add this one --> 734// <entry key="harvestInfo.jobId" value="1"/> 735// <entry key="harvestInfo.channel" value="HIGH"/> 736// <entry key="harvestInfo.harvestNum" value="1"/> 737// <entry key="harvestInfo.origHarvestDefinitionID" value="1"/> 738// <entry key="harvestInfo.maxBytesPerDomain" value="100000"/> 739// <entry key="harvestInfo.maxObjectsPerDomain" value="-1"/> 740// <entry key="harvestInfo.orderXMLName" value="defaultOrderXml"/> 741// <entry key="harvestInfo.origHarvestDefinitionName" value="ddddddddd"/> 742// <entry key="harvestInfo.scheduleName" value="EveryHour"/> <!-- Optional. only relevant for Selective Harvests -- only inserted if not null and not-empty.-> 743// <entry key="harvestInfo.harvestFilenamePrefix" value="netarkivet-1-1"/> 744// <entry key="harvestInfo.jobSubmitDate" value="22. 10. 2014"/> 745// <entry key="harvestInfo.performer" value="performer"/> <!-- Optional - only inserted if not null and not-empty. --> 746// <entry key="harvestInfo.audience" value="audience"/> <!-- Optional - only inserted if not null and not-empty. --> 747// </map> 748// </property> 749 750 public void insertWarcInfoMetadata(Job ajob, String origHarvestdefinitionName, 751 String origHarvestdefinitionComments, String scheduleName, String performer) { 752 if (!template.contains(METADATA_ITEMS_PLACEHOLDER)) { 753 throw new IllegalState("The placeholder for the property '" + METADATA_ITEMS_PLACEHOLDER 754 + "' was not found. Maybe the placeholder has already been replaced with the correct value. The template looks like this: " 755 + template); 756 } 757 log.debug("Now in " + getClass().getName()); 758 String startMetadataEntry = "\n<entry key=\""; 759 String endMetadataEntry = "\"/>"; 760 String valuePart = "\" value=\""; 761 StringBuilder sb = new StringBuilder(); 762 sb.append("<property name=\"metadataItems\">\n<map>\n"); 763 764 // <entry key="harvestInfo.version" value="1.03"/> 765 766 sb.append(startMetadataEntry); 767 sb.append(HARVESTINFO_VERSION + valuePart + HARVESTINFO_VERSION_NUMBER + endMetadataEntry); 768 sb.append(startMetadataEntry); 769 sb.append(HARVESTINFO_JOBID + valuePart + ajob.getJobID() + endMetadataEntry); 770 771 sb.append(startMetadataEntry); 772 sb.append(HARVESTINFO_CHANNEL + valuePart + ajob.getChannel() + endMetadataEntry); 773 sb.append(startMetadataEntry); 774 sb.append(HARVESTINFO_HARVESTNUM + valuePart + ajob.getHarvestNum() + endMetadataEntry); 775 sb.append(startMetadataEntry); 776 sb.append(HARVESTINFO_ORIGHARVESTDEFINITIONID + valuePart + ajob.getOrigHarvestDefinitionID() + endMetadataEntry); 777 sb.append(startMetadataEntry); 778 sb.append(HARVESTINFO_MAXBYTESPERDOMAIN + valuePart + ajob.getMaxBytesPerDomain() + endMetadataEntry); 779 sb.append(startMetadataEntry); 780 sb.append(HARVESTINFO_MAXOBJECTSPERDOMAIN + valuePart + ajob.getMaxObjectsPerDomain() + endMetadataEntry); 781 sb.append(startMetadataEntry); 782 sb.append(HARVESTINFO_ORDERXMLNAME + valuePart + ajob.getOrderXMLName() + endMetadataEntry); 783 784 /* orderxml update date - only inserted if not null and not-empty. */ 785 /* take info from crawler-beans.cxml */ 786 String tmp = getMetadataInfo(MetadataInfo.TEMPLATE_UPDATE_DATE); 787 if (tmp != null && !tmp.isEmpty()){ 788 sb.append(startMetadataEntry); 789 sb.append(HARVESTINFO_ORDERXMLUPDATEDATE + valuePart + tmp + endMetadataEntry); 790 } 791 /* orderxml description - only inserted if not null and not-empty. */ 792 /* take info from crawler-beans.cxml */ 793 tmp = getMetadataInfo(MetadataInfo.TEMPLATE_DESCRIPTION); 794 if (tmp != null && !tmp.isEmpty()){ 795 sb.append(startMetadataEntry); 796 sb.append(HARVESTINFO_ORDERXMLDESCRIPTION + valuePart + StringEscapeUtils.escapeXml(tmp) + endMetadataEntry); 797 } 798 799 sb.append(startMetadataEntry); 800 sb.append(HARVESTINFO_ORIGHARVESTDEFINITIONNAME + valuePart + 801 StringEscapeUtils.escapeXml(origHarvestdefinitionName) + endMetadataEntry); 802 803 if(StringUtils.isNotEmpty(origHarvestdefinitionComments)) { 804 sb.append(startMetadataEntry); 805 sb.append(HARVESTINFO_ORIGHARVESTDEFINITIONCOMMENTS + valuePart + 806 StringEscapeUtils.escapeXml(origHarvestdefinitionComments) + endMetadataEntry); 807 } 808 809 /* optional schedule-name - only inserted if not null and not-empty. */ 810 if (scheduleName != null && !scheduleName.isEmpty()) { 811 sb.append(startMetadataEntry); 812 sb.append(HARVESTINFO_SCHEDULENAME + valuePart + scheduleName + endMetadataEntry); 813 } 814 sb.append(startMetadataEntry); 815 sb.append(HARVESTINFO_HARVESTFILENAMEPREFIX + valuePart + ajob.getHarvestFilenamePrefix() + endMetadataEntry); 816 sb.append(startMetadataEntry); 817 sb.append(HARVESTINFO_JOBSUBMITDATE + valuePart + ArchiveDateConverter.getWarcDateFormat().format(ajob.getSubmittedDate()) + endMetadataEntry); 818 819 /* optional HARVESTINFO_PERFORMER - only inserted if not null and not-empty. */ 820 if (performer != null && !performer.isEmpty()){ 821 sb.append(startMetadataEntry); 822 sb.append(HARVESTINFO_PERFORMER + valuePart + StringEscapeUtils.escapeXml(performer) + endMetadataEntry); 823 } 824 825 /* optional OPERATOR - only inserted if not null and not-empty. */ 826 /* take info from crawler-beans.cxml */ 827 String operator = getMetadataInfo(MetadataInfo.OPERATOR); 828 if (operator != null && !operator.isEmpty()){ 829 sb.append(startMetadataEntry); 830 sb.append(HARVESTINFO_OPERATOR + valuePart + StringEscapeUtils.escapeXml(operator) + endMetadataEntry); 831 } 832 833 /* optional HARVESTINFO_AUDIENCE - only inserted if not null and not-empty. */ 834 if (ajob.getHarvestAudience() != null && !ajob.getHarvestAudience().isEmpty()) { 835 sb.append(startMetadataEntry); 836 sb.append(HARVESTINFO_AUDIENCE + valuePart + StringEscapeUtils.escapeXml(ajob.getHarvestAudience()) + endMetadataEntry); 837 } 838 sb.append("\n</map>\n</property>\n"); 839 840 // Replace command 841 log.info("Adding WarcInfoMetadata " + sb.toString()); 842 String templateNew = template.replace(METADATA_ITEMS_PLACEHOLDER, sb.toString()); 843 this.template = templateNew; 844 } 845 846 @Override 847 public void insertAttributes(List<AttributeAndType> attributesAndTypes) { 848 ArgumentNotValid.checkNotNull(attributesAndTypes, "List<AttributeAndType> attributesAndTypes"); 849 for (AttributeAndType attributeAndType: attributesAndTypes) { 850 // initialize temp variables 851 Integer intVal = null; 852 String val = null; 853 AttributeTypeBase attributeType = attributeAndType.attributeType; 854 AttributeBase attribute = attributeAndType.attribute; 855 856 log.debug("Trying to insert the attribute {} into the template", attributeType.name); 857 switch (attributeType.viewtype) { 858 case 1: 859 if (attribute != null) { 860 intVal = attribute.getInteger(); 861 log.debug("Read explicitly value for attribute '{}'", attributeType.name, intVal); 862 } 863 if (intVal == null && attributeType.def_int != null) { 864 intVal = attributeType.def_int; 865 log.debug("Viewtype 1 attribute '{}' not set explicitly. Using default value '{}'", attributeType.name, intVal); 866 } 867 if (intVal != null) { 868 val = intVal.toString(); 869 } else { 870 val = ""; 871 } 872 log.info("Value selected for attribute {}: {}", attributeType.name, val); 873 break; 874 case 5: 875 if (attribute != null) { 876 intVal = attribute.getInteger(); 877 log.debug("Read explicitly value for attribute '{}'", attributeType.name, intVal); 878 } 879 if (intVal == null && attributeType.def_int != null) { 880 intVal = attributeType.def_int; 881 log.debug("Viewtype 5 attribute '{}' not set explicitly. Using default value '{}'", attributeType.name, intVal); 882 } 883 if (intVal != null && intVal > 0) { 884 val = "true"; 885 } else { 886 val = "false"; 887 } 888 log.info("Value selected for attribute '{}': '{}'", attributeType.name, val); 889 break; 890 case 6: 891 if (attribute != null) { 892 intVal = attribute.getInteger(); 893 log.debug("Read explicitly value for attribute '{}'", attributeType.name, intVal); 894 } 895 if (intVal == null && attributeType.def_int != null) { 896 intVal = attributeType.def_int; 897 log.debug("Viewtype 6 attribute '{}' not set explicitly. Using default value '{}'", attributeType.name, intVal); 898 } 899 if (intVal != null && intVal > 0) { 900 val = "obey"; 901 } else { 902 val = "ignore"; 903 } 904 log.info("Value selected for attribute '{}': '{}'", attributeType.name, val); 905 break; 906 } 907 String placeholder = "%{" + attributeType.name.toUpperCase() + "}"; 908 if (template.contains(placeholder)) { 909 String templateNew = template.replace("%{" + attributeType.name.toUpperCase() + "}", val); 910 this.template = templateNew; 911 } else { 912 log.warn("Placeholder '{}' not found in template. Therefore not substituted by '{}' in this template", 913 placeholder, val); 914 } 915 } 916 } 917 918 @Override 919 public void writeTemplate(JspWriter out) throws IOFailure { 920 try { 921 out.write(template); 922 } catch (IOException e) { 923 throw new IOFailure("Unable to write to JspWriter", e); 924 } 925 } 926 927 /** 928 * Hack to remove existing placeholders, that is still present after template 929 * manipulation is completed. 930 */ 931 public void removePlaceholders() { 932 String[] optionalPlaceholders = new String[] { 933 METADATA_ITEMS_PLACEHOLDER, 934 CRAWLERTRAPS_PLACEHOLDER, 935 UMBRA_PUBLISH_BEAN_PLACEHOLDER, 936 UMBRA_SIMPLEOVERRIDES_PLACEHOLDER, 937 UMBRA_BEAN_REF_PLACEHOLDER, 938 UMBRA_RECEIVE_BEAN_PLACEHOLDER}; 939 for (String placeholder: optionalPlaceholders) { 940 template = template.replace(placeholder, ""); 941 } 942 } 943}