001/*
002 * #%L
003 * Netarchivesuite - harvester
004 * %%
005 * Copyright (C) 2005 - 2018 The Royal Danish Library, 
006 *             the National Library of France and the Austrian National Library.
007 * %%
008 * This program is free software: you can redistribute it and/or modify
009 * it under the terms of the GNU Lesser General Public License as
010 * published by the Free Software Foundation, either version 2.1 of the
011 * License, or (at your option) any later version.
012 * 
013 * This program is distributed in the hope that it will be useful,
014 * but WITHOUT ANY WARRANTY; without even the implied warranty of
015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
016 * GNU General Lesser Public License for more details.
017 * 
018 * You should have received a copy of the GNU General Lesser Public
019 * License along with this program.  If not, see
020 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
021 * #L%
022 */
023package dk.netarkivet.harvester.datamodel;
024
025import java.io.BufferedWriter;
026import java.io.File;
027import java.io.FileWriter;
028import java.io.IOException;
029import java.io.OutputStream;
030import java.io.Serializable;
031import java.nio.charset.Charset;
032import java.util.HashMap;
033import java.util.List;
034import java.util.Map;
035import java.util.regex.Matcher;
036import java.util.regex.Pattern;
037
038import javax.servlet.jsp.JspWriter;
039
040import org.apache.commons.io.IOUtils;
041import org.apache.commons.lang.StringEscapeUtils;
042import org.apache.commons.lang.StringUtils;
043import org.slf4j.Logger;
044import org.slf4j.LoggerFactory;
045
046import com.antiaction.raptor.dao.AttributeBase;
047import com.antiaction.raptor.dao.AttributeTypeBase;
048
049import dk.netarkivet.common.CommonSettings;
050import dk.netarkivet.common.exceptions.ArgumentNotValid;
051import dk.netarkivet.common.exceptions.IOFailure;
052import dk.netarkivet.common.exceptions.IllegalState;
053import dk.netarkivet.common.exceptions.NotImplementedException;
054import dk.netarkivet.common.utils.Settings;
055import dk.netarkivet.common.utils.archive.ArchiveDateConverter;
056import dk.netarkivet.harvester.HarvesterSettings;
057import dk.netarkivet.harvester.datamodel.eav.EAV.AttributeAndType;
058
059/**
060 * Class encapsulating the Heritrix crawler-beans.cxml file 
061 * <p>
062 * 
 * Heritrix3 has a new model based on Spring, so XPath is no good for processing.
 * Instead we use placeholders, marked by %{..} rather than the ${..} syntax that
 * Heritrix3 already uses itself.
066 * 
067 * The template is a H3 template if it contains the string: 
068 * 
069 * "xmlns="http://www.springframework.org/...."
070 * 
071 */
072public class H3HeritrixTemplate extends HeritrixTemplate implements Serializable {
073
074    private static final Logger log = LoggerFactory.getLogger(H3HeritrixTemplate.class);
075
076    private String template;
077
078    /** QuotaEnforcer states for this template. TODO necessary?? */
079    private Long forceMaxbytesPerDomain;
080    private Long forceMaxobjectsPerDomain;
081
082    /** Has this HeritrixTemplate been verified. */
083    private boolean verified;
084
085    public final static String METADATA_ITEMS_PLACEHOLDER = "%{METADATA_ITEMS_PLACEHOLDER}";
086    public static final String MAX_TIME_SECONDS_PLACEHOLDER = "%{MAX_TIME_SECONDS_PLACEHOLDER}";
087    public static final String CRAWLERTRAPS_PLACEHOLDER = "%{CRAWLERTRAPS_PLACEHOLDER}";
088
089    public static final Pattern DEDUPLICATION_BEAN_REFERENCE_PATTERN = Pattern.compile(".*ref.*bean.*DeDuplicator.*", Pattern.DOTALL);
090
091    public static final Pattern DEDUPLICATION_BEAN_PATTERN =  Pattern.compile(".*bean.*id.*DeDuplicator.*", Pattern.DOTALL);
092    public static final String DEDUPLICATION_INDEX_LOCATION_PLACEHOLDER
093        = "%{DEDUPLICATION_INDEX_LOCATION_PLACEHOLDER}";
094
095    public static final String ARCHIVE_FILE_PREFIX_PLACEHOLDER = "%{ARCHIVE_FILE_PREFIX_PLACEHOLDER}";
096
097    public static final String FRONTIER_QUEUE_TOTAL_BUDGET_PLACEHOLDER
098        = "%{FRONTIER_QUEUE_TOTAL_BUDGET_PLACEHOLDER}";
099
100    public static final String QUOTA_ENFORCER_GROUP_MAX_FETCH_SUCCES_PLACEHOLDER =
101                "%{QUOTA_ENFORCER_GROUP_MAX_FETCH_SUCCES_PLACEHOLDER}";
102
103    public static final String QUOTA_ENFORCER_MAX_BYTES_PLACEHOLDER
104        = "%{QUOTA_ENFORCER_MAX_BYTES_PLACEHOLDER}";
105
106        public static final String DEDUPLICATION_ENABLED_PLACEHOLDER = "%{DEDUPLICATION_ENABLED_PLACEHOLDER}";
107
108
109    // PLACEHOLDERS for archiver beans (Maybe not necessary)
110    final String ARCHIVER_BEAN_REFERENCE_PLACEHOLDER = "%{ARCHIVER_BEAN_REFERENCE_PLACEHOLDER}";
111        final String ARCHIVER_PROCESSOR_BEAN_PLACEHOLDER = "%{ARCHIVER_PROCESSOR_BEAN_PLACEHOLDER}";
112
113        // Placeholders for Umbra integration
114        public static final String UMBRA_SIMPLEOVERRIDES_PLACEHOLDER = "%{UMBRA_SIMPLEOVERRIDES_PLACEHOLDER}";
115        public static final String UMBRA_PUBLISH_BEAN_PLACEHOLDER = "%{UMBRA_PUBLISH_BEAN_PLACEHOLDER}";
116        public static final String UMBRA_RECEIVE_BEAN_PLACEHOLDER = "%{UMBRA_RECEIVE_BEAN_PLACEHOLDER}";
117        public static final String UMBRA_BEAN_REF_PLACEHOLDER ="%{UMBRA_BEAN_REF_PLACEHOLDER}";
118
/**
 * Properties of crawler-beans.cxml that are matched so they can be added to harvestInfo.xml
 * for preservation purposes. Each constant carries the regular expression that locates its
 * "metadata.xxx=value" line (terminated by a CR or LF) in the template text.
 */
public enum MetadataInfo {
    TEMPLATE_DESCRIPTION("metadata\\.description=.+[\\r\\n]"),
    TEMPLATE_UPDATE_DATE("metadata\\.date=.+[\\r\\n]"),
    OPERATOR("metadata\\.operator=.+[\\r\\n]");

    /** Regular expression matching the property line of this constant. */
    private final String regex;

    MetadataInfo(String regex) {
        this.regex = regex;
    }

    /**
     * @return the regular expression of this constant (not the constant's name)
     */
    @Override
    public String toString() {
        return regex;
    }
}
136
/** Values of the metadata properties found in the template, keyed by property kind; filled by the constructor. */
public Map<MetadataInfo, String> metadataInfoMap;

/**
 * Constructor for HeritrixTemplate class.
 * <p>
 * Besides storing its arguments, the constructor scans the template for each {@link MetadataInfo}
 * property line ("metadata.xxx=value") and records the value of the first match in
 * {@link #metadataInfoMap}. Properties that do not occur in the template get no entry.
 *
 * @param template_id The persistent id of the template in the database
 * @param template The template as String object
 * @throws ArgumentNotValid if template is null.
 */
public H3HeritrixTemplate(long template_id, String template) {
    ArgumentNotValid.checkNotNull(template, "String template");
    this.template_id = template_id;
    this.template = template;

    metadataInfoMap = new HashMap<MetadataInfo, String>();
    for (MetadataInfo metadataInfo : MetadataInfo.values()) {
        Matcher m = Pattern.compile(metadataInfo.regex).matcher(this.template);
        if (m.find()) {
            String property = m.group().trim();
            // Take everything after the FIRST '=' as the value. Splitting on every '=' would
            // truncate values that contain '=' themselves, and would fail with an
            // ArrayIndexOutOfBoundsException when the value is blank after trimming.
            // The regex guarantees that '=' is present, so indexOf never returns -1 here.
            String value = property.substring(property.indexOf('=') + 1);
            metadataInfoMap.put(metadataInfo, value);
        }
    }
}
162
163        /**
164     * return the template.
165     *
166     * @return the template
167     */
168    public HeritrixTemplate getTemplate() {
169        return this;
170    }
171
172    /**
173     * Has Template been verified?
174     *
175     * @return true, if verified on construction, otherwise false
176     */
177    public boolean isVerified() {
178        return verified;
179    }
180
181    /**
182     * Return HeritrixTemplate as XML.
183     * @return HeritrixTemplate as XML
184     */
185    @Override
186    public String getXML() {
187        return template;
188    }
189
190    /**
191     * Update the maxTimeSeconds property in the heritrix3 template, if possible.
192     * @param maxJobRunningTimeSecondsL Force the harvestJob to end after this number of seconds
193     * Property of the org.archive.crawler.framework.CrawlLimitEnforcer
194     * <!-- <property name="maxTimeSeconds" value="0" /> -->
195     */
196    @Override
197        public void setMaxJobRunningTime(Long maxJobRunningTimeSecondsL) {
198                if (template.contains(MAX_TIME_SECONDS_PLACEHOLDER)) {
199                this.template = template.replace(MAX_TIME_SECONDS_PLACEHOLDER,
200                                Long.toString(maxJobRunningTimeSecondsL));
201                } else {
202                        log.warn("The placeholder '" + MAX_TIME_SECONDS_PLACEHOLDER
203                                        + "' was not found in the template. Therefore maxRunningTime not set");
204                }
205        }
206
207        @Override
208        public void setMaxBytesPerDomain(Long maxbytesL) {
209                this.forceMaxbytesPerDomain = maxbytesL;
210        }
211
212
213        @Override
214        public Long getMaxBytesPerDomain() {
215                return this.forceMaxbytesPerDomain;
216        }
217
218        @Override
219        public void setMaxObjectsPerDomain(Long maxobjectsL) {
220                this.forceMaxobjectsPerDomain = maxobjectsL;
221        }
222
223        @Override
224        public Long getMaxObjectsPerDomain() {
225                return this.forceMaxobjectsPerDomain;
226        }
227
228        @Override
229        public boolean isValid() {
230                /*
231                StringBuilder errors = new StringBuilder();
232                // check for Deduplication index-location placeholder and DEDUPLICATION_BEAN_PATTERN
233                if (template.contains(DEDUPLICATION_BEAN_PATTERN)) {
234                        if (!template.contains(DEDUPLICATION_INDEX_LOCATION_PLACEHOLDER)) {
235                                errors.append("Has DefdMissing placeholder '" +  DEDUPLICATION_INDEX_LOCATION_PLACEHOLDER + "'"
236                        }
237                } 
238                template.contains(DEDUPLICATION_INDEX_LOCATION_PLACEHOLDER) 
239                && template.contains(deduplicationBeanPattern)
240                */
241                return true;
242        }
243
244        /**
245         * Inserts all nevessary umbra-related beans in this template.
246         * @param jobName a String representing the job - must be unique for the this NAS environment for all time
247         * @param rabbitMQUrl the URL of the rabbitMQ socket connection (amqp://) to which umbra requests are to be sent
248         * @param limitSearchRegEx the regular expression used to limit the heritrix search-path of urls to be sent to Umbra.
249         */
250        @Override
251        public void insertUmbrabean(String jobName, String rabbitMQUrl, String limitSearchRegEx)
252        {
253                this.template = this.template.replace(UMBRA_SIMPLEOVERRIDES_PLACEHOLDER,
254                                getUmbraBeanInformationInSimpleoverridesBean(jobName, rabbitMQUrl, limitSearchRegEx));
255                this.template = this.template.replace(UMBRA_PUBLISH_BEAN_PLACEHOLDER, getUmbrabeanPlaceholder());
256                this.template = this.template.replace(UMBRA_RECEIVE_BEAN_PLACEHOLDER, getAmqpUrlreceiverPlaceholder());
257                this.template = this.template.replace(UMBRA_BEAN_REF_PLACEHOLDER, getCallUmbrabean());
258        }
259
260
261        /**
262         * Umbrabean text from the current harvest job that will replace the placeholder in the Simpleoverride bean
263         * @param jobName a String representing the job - must be unique for the this NAS environment for all time
264         * @param rabbitMQUrl the URL of the rabbitMQ socket connection (amqp://) to which umbra requests are to be sent
265         * @param limitSearchRegEx the regular expression used to limit the heritrix search-path of urls to be sent to Umbra.
266         */
267        public String getUmbraBeanInformationInSimpleoverridesBean(String jobName, String rabbitMQUrl, String limitSearchRegEx) {
268                //      umbraBean.clientId=MySpecialJobName
269                //      umbraBean.amqpUri=amqp://guest:guest@activemq:5672/%2f
270                //      ## The following rule restricts umbra to processing only on seeds or links, leaving embeds and redirects
271                //      ## to be handled by the browser itself
272                //      umbraBean.shouldProcessRule.rules[1].regex=^$|.*L
273
274                StringBuilder umbrabeanBuilder = new StringBuilder();
275                umbrabeanBuilder.append("\n");
276                umbrabeanBuilder.append("umbraBean.clientId=" + Settings.get(CommonSettings.ENVIRONMENT_NAME) + "_" + jobName);
277                umbrabeanBuilder.append("\n");
278                umbrabeanBuilder.append("umbraBean.amqpUri="+rabbitMQUrl);
279                umbrabeanBuilder.append("\n");
280                umbrabeanBuilder.append("## The following rule restricts umbra to processing only on seeds or links, leaving embeds and redirects");
281                umbrabeanBuilder.append("## to be handled by the browser itself");
282                umbrabeanBuilder.append("\n");
283                umbrabeanBuilder.append("umbraBean.shouldProcessRule.rules[1].regex="+limitSearchRegEx);
284                umbrabeanBuilder.append("\n");
285                return umbrabeanBuilder.toString();
286        }
287
288        /**
289         * Umbrabean text that will replace UMBRA_BEAN_PLACEHOLDER in the template       *
290         */
291        public String getUmbrabeanPlaceholder() {
292                // <!--
293                //                              Bean that sends messages (urls) to umbra.
294                //              -->
295                // <bean id="umbraBean" class="org.archive.modules.AMQPPublishProcessor">
296                //  <property name="clientId" value="[see override]"/>
297                //  <property name="amqpUri" value="[see override]"/>
298                //  <property name="shouldProcessRule">
299                //   <bean class="org.archive.modules.deciderules.DecideRuleSequence">
300                //    <property name="rules">
301                //     <list>
302                //      <bean class="org.archive.modules.deciderules.RejectDecideRule" />
303                //      <bean class="org.archive.modules.deciderules.HopsPathMatchesRegexDecideRule">
304                //       <property name="regex" value="[see override]"/>
305                //      </bean>
306                //     </list>
307                //    </property>
308                //   </bean>
309                //  </property>
310                // </bean>
311
312                StringBuilder umbrabeanBuilder = new StringBuilder();
313                umbrabeanBuilder.append("<!-- Bean that sends messages (urls) to umbra. -->");
314                umbrabeanBuilder.append("<bean id=\"umbraBean\" class=\"org.archive.modules.AMQPPublishProcessor\">");
315                umbrabeanBuilder.append("<property name=\"clientId\" value=\"[see override]\"/>");
316                umbrabeanBuilder.append("<property name=\"amqpUri\" value=\"[see override]\"/>");
317                umbrabeanBuilder.append("  <property name=\"shouldProcessRule\">");
318                umbrabeanBuilder.append("   <bean class=\"org.archive.modules.deciderules.DecideRuleSequence\">");
319                umbrabeanBuilder.append("    <property name=\"rules\">");
320                umbrabeanBuilder.append("     <list>");
321                umbrabeanBuilder.append("      <bean class=\"org.archive.modules.deciderules.RejectDecideRule\" />");
322                umbrabeanBuilder.append("      <bean class=\"org.archive.modules.deciderules.HopsPathMatchesRegexDecideRule\">");
323                umbrabeanBuilder.append("       <property name=\"regex\" value=\"[see override]\"/>");
324                umbrabeanBuilder.append("      </bean>");
325                umbrabeanBuilder.append("     </list>");
326                umbrabeanBuilder.append("    </property>");
327                umbrabeanBuilder.append("   </bean>");
328                umbrabeanBuilder.append("  </property>");
329                umbrabeanBuilder.append(" </bean>");
330
331                return umbrabeanBuilder.toString();
332        }
333
334
335        /**
336         * AMQP url receiver text that will replace AMQP_URLRECEIVER_PLACEHOLDER in the template         *
337         */
338        public String getAmqpUrlreceiverPlaceholder() {
339                // <!--
340                //      Bean that receives messages (urls) from umbra and places them in the Heritrix frontier.
341                //                      -->
342                // <bean class="org.archive.crawler.frontier.AMQPUrlReceiver">
343                //  <property name="amqpUri">
344                //   <bean class="org.springframework.beans.factory.config.PropertyPathFactoryBean">
345                //          <property name="targetObject" ref="umbraBean"/>
346                //          <property name="propertyPath" value="amqpUri" />
347                //         </bean>
348                //  </property>
349                //  <property name="queueName">
350                //      <bean class="org.springframework.beans.factory.config.PropertyPathFactoryBean">
351                //       <property name="targetObject" ref="umbraBean"/>
352                //       <property name="propertyPath" value="clientId" />
353                //      </bean>
354                //  </property>
355                // </bean>
356
357                StringBuilder amqpUrlReceiverBeanBuilder = new StringBuilder();
358                amqpUrlReceiverBeanBuilder.append("<!-- Bean that receives messages (urls) from umbra and places them in the Heritrix frontier -->");
359                amqpUrlReceiverBeanBuilder.append("<bean class=\"org.archive.crawler.frontier.AMQPUrlReceiver\">");
360                amqpUrlReceiverBeanBuilder.append(" <property name=\"amqpUri\">");
361                amqpUrlReceiverBeanBuilder.append("   <bean class=\"org.springframework.beans.factory.config.PropertyPathFactoryBean\">");
362                amqpUrlReceiverBeanBuilder.append("    <property name=\"targetObject\" ref=\"umbraBean\"/>");
363                amqpUrlReceiverBeanBuilder.append("    <property name=\"propertyPath\" value=\"amqpUri\" />");
364                amqpUrlReceiverBeanBuilder.append("      </bean>");
365                amqpUrlReceiverBeanBuilder.append("    </property>");
366                amqpUrlReceiverBeanBuilder.append(" <property name=\"queueName\">");
367                amqpUrlReceiverBeanBuilder.append("   <bean class=\"org.springframework.beans.factory.config.PropertyPathFactoryBean\">");
368                amqpUrlReceiverBeanBuilder.append("    <property name=\"targetObject\" ref=\"umbraBean\"/>");
369                amqpUrlReceiverBeanBuilder.append("    <property name=\"propertyPath\" value=\"clientId\" />");
370                amqpUrlReceiverBeanBuilder.append("   </bean>");
371                amqpUrlReceiverBeanBuilder.append(" </property>");
372                amqpUrlReceiverBeanBuilder.append("</bean>");
373
374                return amqpUrlReceiverBeanBuilder.toString();
375        }
376
377
378        /**
379         * Call of the Umbra bean text that will replace CALL_UMBRABEAN_PLACEHOLDER in the template      *
380         */
381        public String getCallUmbrabean() {
382                //          <ref bean="umbraBean"/>
383
384                return "    <ref bean=\"umbraBean\"/>";
385        }
386
387        @Override
388        // This method is used to decide, whether to request a deduplication index or not.
389        // Done by checking, if both  
390        //   - a DeDuplicator bean is present in the template
391        // and
392        //   - a  DEDUPLICATION_INDEX_LOCATION_PLACEHOLDER is also present.
393        // and 
394        //   - a DeDuplicator reference bean is present in the template
395        public boolean IsDeduplicationEnabled() {
396                return (template.contains(DEDUPLICATION_INDEX_LOCATION_PLACEHOLDER) 
397                                && DEDUPLICATION_BEAN_PATTERN.matcher(template).matches()
398                                && DEDUPLICATION_BEAN_REFERENCE_PATTERN.matcher(template).matches());
399        }       
400
401        /**
402     * Configuring the quota-enforcer, depending on budget definition. Object limit can be defined either by
403     * using the queue-total-budget property or the quota enforcer. Which is chosen is set by the argument
404     * maxObjectsIsSetByQuotaEnforcer}'s value. So quota enforcer is set as follows:
405     * If all values in the quotaEnforcer is infinity, it is in effect disabled
406     * <ul>
407     * <li>Object limit is not set by quota enforcer, disabled only if there is no byte limit.</li>
408     * <li>Object limit is set by quota enforcer, so it should be enabled if a byte or object limit is set.</li>
409     * </ul>
410     *
411     * @param maxObjectsIsSetByQuotaEnforcer Decides whether the maxObjectsIsSetByQuotaEnforcer or not.
412     * @param forceMaxBytesPerDomain The number of max bytes per domain enforced (can be no limit)
413     * @param forceMaxObjectsPerDomain The number of max objects per domain enforced (can be no limit)
414     */
415        public void configureQuotaEnforcer(boolean maxObjectsIsSetByQuotaEnforcer,
416                        long forceMaxBytesPerDomain, long forceMaxObjectsPerDomain) {
417                this.forceMaxobjectsPerDomain = forceMaxObjectsPerDomain;
418                this.forceMaxbytesPerDomain = forceMaxBytesPerDomain;
419                String tmp = template;
420                if (!maxObjectsIsSetByQuotaEnforcer) {
421                        // SetMaxObjects in the global budget to forceMaxObjectsPerDomain??
422                        String tmp1 = tmp.replace(
423                                        FRONTIER_QUEUE_TOTAL_BUDGET_PLACEHOLDER, Long.toString( forceMaxObjectsPerDomain ));
424                        // SetMaxObjects to infinity in the quotaEnforcer
425                        tmp = tmp1.replace(QUOTA_ENFORCER_GROUP_MAX_FETCH_SUCCES_PLACEHOLDER, 
426                                        Long.toString(Constants.HERITRIX_MAXOBJECTS_INFINITY));
427                } else {
428                        // SetMaxObjects in the global budget to Infinity
429                        String tmp1 = tmp.replace(
430                                        FRONTIER_QUEUE_TOTAL_BUDGET_PLACEHOLDER, Long.toString( Constants.HERITRIX_MAXOBJECTS_INFINITY ));                      
431                        // SetMaxObjects to forceMaxObjectsPerDomain in the quotaEnforcer
432                        tmp = tmp1.replace(QUOTA_ENFORCER_GROUP_MAX_FETCH_SUCCES_PLACEHOLDER, 
433                                        Long.toString(forceMaxObjectsPerDomain));
434                }
435                
436                // SetMaxbytes in the QuotaEnforcer to forceMaxBytesPerDomain
437                // Divide by 1024 since Heritrix uses KB rather than bytes,
438                // and add 1 to avoid to low limit due to rounding.
439                String maxBytesStringValue = "-1";
440                if (forceMaxBytesPerDomain != Constants.HERITRIX_MAXBYTES_INFINITY) {
441                        maxBytesStringValue = Long.toString(( forceMaxBytesPerDomain 
442                                        / Constants.BYTES_PER_HERITRIX_BYTELIMIT_UNIT) + 1);
443                        log.debug("MaxbytesPerDomain set to {} Kbytes per domain", maxBytesStringValue);
444                } else {
445                        log.debug("MaxbytesPerDomain set to infinite number of Kbytes per domain");     
446                }
447                
448                this.template = tmp.replace(QUOTA_ENFORCER_MAX_BYTES_PLACEHOLDER, maxBytesStringValue);
449                
450        }
451        
452         /**
453     * Make sure that Heritrix will archive its data in the chosen archiveFormat.
454     *
455     * @param archiveFormat the chosen archiveformat ('arc' or 'warc' supported)
456     * @throws ArgumentNotValid If the chosen archiveFormat is not supported.
457     */
458        @Override
459        public void setArchiveFormat(String archiveFormat) {
460                if (!template.contains(ARCHIVER_BEAN_REFERENCE_PLACEHOLDER)){
461                throw new IllegalState("The placeholder '" + ARCHIVER_BEAN_REFERENCE_PLACEHOLDER 
462                                        + "' is missing. Unable to insert proper archive writer");
463        }
464        if (!template.contains(ARCHIVER_PROCESSOR_BEAN_PLACEHOLDER)) {
465                        throw new IllegalState("The placeholder '" + ARCHIVER_PROCESSOR_BEAN_PLACEHOLDER 
466                                        + "' is missing. Unable to insert proper archive writer");
467                }
468                if ("arc".equalsIgnoreCase(archiveFormat)) {
469                        log.debug("ARC format selected to be used by Heritrix3");
470                        setArcArchiveformat();
471                } else if ("warc".equalsIgnoreCase(archiveFormat)) {
472                        log.debug("WARC format selected to be used by Heritrix3");
473                        setWarcArchiveformat();
474                } else {
475                        throw new ArgumentNotValid("Configuration of '" + HarvesterSettings.HERITRIX_ARCHIVE_FORMAT
476                                        + "' is invalid! Unrecognized format '" + archiveFormat + "'.");
477                }
478        }
479
480        /**
481         * Set the archive-format as ARC. This means enabling the ARCWriterProcessor in the template
482         */
483        private void setArcArchiveformat(){
484                String arcWriterbeanReference = "<ref bean=\"arcWriter\"/>";
485        String templateNew = template.replace(ARCHIVER_BEAN_REFERENCE_PLACEHOLDER, arcWriterbeanReference);
486        template = templateNew.replace(ARCHIVER_PROCESSOR_BEAN_PLACEHOLDER, getArcWriterProcessor()); 
487    }
488                
489        private String getArcWriterProcessor() {
490
491            //      <bean id="arcWriter" class="org.archive.modules.writer.ARCWriterProcessor">
492            //            <!-- <property name="compress" value="true" /> -->
493            //            <!-- <property name="prefix" value="IAH" /> -->
494            //            <!-- <property name="suffix" value="${HOSTNAME}" /> -->
495            //            <!-- <property name="maxFileSizeBytes" value="100000000" /> -->
496            //            <!-- <property name="poolMaxActive" value="1" /> -->
497            //            <!-- <property name="poolMaxWaitMs" value="300000" /> -->
498            //            <!-- <property name="skipIdenticalDigests" value="false" /> -->
499            //            <!-- <property name="maxTotalBytesToWrite" value="0" /> -->
500            //            <!-- <property name="directory" value="." /> -->
501            //            <!-- <property name="storePaths">
502            //                  <list>
503            //                   <value>arcs</value>
504            //                  </list>
505            //                 </property> -->
506            //           </bean>
507            // "<bean id=\"arcWriter\" class=\"org.archive.modules.writer.ARCWriterProcessor\">";
508            String propertyName="\n<property name=\"";
509            String valuePrefix = "\" value=\"";
510            String valueSuffix = "\"";
511            String propertyEnd="/>";
512
513            StringBuilder arcWriterBeanBuilder = new StringBuilder();
514            arcWriterBeanBuilder.append("<bean id=\"arcWriter\" class=\"org.archive.modules.writer.ARCWriterProcessor\">\n");
515            arcWriterBeanBuilder.append(propertyName + "compress" + valuePrefix
516                    + Settings.get(HarvesterSettings.HERITRIX3_ARC_COMPRESSION) 
517                    + valueSuffix + propertyEnd); 
518            arcWriterBeanBuilder.append(propertyName + "prefix" + valuePrefix
519                    + ARCHIVE_FILE_PREFIX_PLACEHOLDER
520                    + valueSuffix + propertyEnd);
521//          arcWriterBeanBuilder.append(propertyName + "suffix" + valuePrefix
522//                  + Settings.get(HarvesterSettings.HERITRIX3_ARC_SUFFIX) 
523//                  + valueSuffix + propertyEnd); 
524            arcWriterBeanBuilder.append(propertyName + "maxFileSizeBytes" + valuePrefix
525                    + Settings.get(HarvesterSettings.HERITRIX3_ARC_MAXSIZE) 
526                    + valueSuffix + propertyEnd); 
527            arcWriterBeanBuilder.append(propertyName + "poolMaxActive" + valuePrefix
528                    + Settings.get(HarvesterSettings.HERITRIX3_ARC_POOL_MAXACTIVE) 
529                    + valueSuffix + propertyEnd); 
530            arcWriterBeanBuilder.append(propertyName + "skipIdenticalDigests" + valuePrefix 
531                                + Settings.get(HarvesterSettings.HERITRIX3_ARC_SKIP_IDENTICAL_DIGESTS)
532                                + valueSuffix + propertyEnd);
533
534            arcWriterBeanBuilder.append("</bean>");
535
536            return arcWriterBeanBuilder.toString();                           
537        }
538
539                
540        /** 
541         * Insert WARC-archiver beans and remove placeholder for ARC-Archiver-beans
542         * It is an error, if the WARC place-holders doesnt't exist.
543         * It is not an error, if the property placeholder does not exist.
544         */
545        private void setWarcArchiveformat() {           
546                String warcWriterbeanReference = "<ref bean=\"warcWriter\"/>";
547                String warcWriterProcessorBean = "<bean id=\"warcWriter\" class=\"dk.netarkivet.harvester.harvesting.NasWARCProcessor\">";
548                String propertyName="\n<property name=\"";
549                String valuePrefix = "\" value=\"";
550                String valueSuffix = "\"";
551                String propertyEnd="/>";
552                if (!template.contains(ARCHIVER_BEAN_REFERENCE_PLACEHOLDER)) {
553                        throw new IllegalState("The placeholder '" + ARCHIVER_BEAN_REFERENCE_PLACEHOLDER 
554                                        + "' is missing");
555                }
556                if (!template.contains(ARCHIVER_PROCESSOR_BEAN_PLACEHOLDER)) {
557                        throw new IllegalState("The placeholder '" + ARCHIVER_PROCESSOR_BEAN_PLACEHOLDER 
558                                        + "' is missing");
559                }
560                StringBuilder propertyBuilder = new StringBuilder();
561                propertyBuilder.append(propertyName + "template" + valuePrefix 
562                      + Settings.get(HarvesterSettings.HERITRIX3_WARC_TEMPLATE)
563              + valueSuffix + propertyEnd);                             
564                propertyBuilder.append(propertyName + "compress" + valuePrefix 
565                      + Settings.get(HarvesterSettings.HERITRIX3_WARC_COMPRESSION) 
566                                + valueSuffix + propertyEnd);
567                // Note: The prefix value will be replaced later by the setArchiveFilePrefix() method
568                propertyBuilder.append(propertyName + "prefix" + valuePrefix 
569                                + ARCHIVE_FILE_PREFIX_PLACEHOLDER
570                                + valueSuffix + propertyEnd);
571                propertyBuilder.append(propertyName + "maxFileSizeBytes" + valuePrefix
572                      + Settings.get(HarvesterSettings.HERITRIX3_WARC_MAXSIZE)
573                      + valueSuffix + propertyEnd);
574                propertyBuilder.append(propertyName + "poolMaxActive" + valuePrefix
575                + Settings.get(HarvesterSettings.HERITRIX3_WARC_POOL_MAXACTIVE)
576                + valueSuffix + propertyEnd);
577          
578                propertyBuilder.append(propertyName + "writeRequests" + valuePrefix 
579                                + Settings.get(HarvesterSettings.HERITRIX3_WARC_WRITE_REQUESTS)
580                                + valueSuffix + propertyEnd);
581                propertyBuilder.append(propertyName + "writeMetadata" + valuePrefix 
582                                + Settings.get(HarvesterSettings.HERITRIX3_WARC_WRITE_METADATA)
583                                + valueSuffix + propertyEnd);
584                propertyBuilder.append(propertyName + "writeMetadataOutlinks" + valuePrefix 
585                                + Settings.get(HarvesterSettings.HERITRIX3_WARC_WRITE_METADATA_OUTLINKS)
586                                + valueSuffix + propertyEnd);
587                propertyBuilder.append(propertyName + "skipIdenticalDigests" + valuePrefix 
588                                + Settings.get(HarvesterSettings.HERITRIX3_WARC_SKIP_IDENTICAL_DIGESTS)
589                                + valueSuffix + propertyEnd);
590                propertyBuilder.append(propertyName + "startNewFilesOnCheckpoint" + valuePrefix 
591                                + Settings.get(HarvesterSettings.HERITRIX3_WARC_START_NEW_FILES_ON_CHECKPOINT)
592                                + valueSuffix + propertyEnd);
593                
594                warcWriterProcessorBean += propertyBuilder.toString();
595                warcWriterProcessorBean += "\n\n%{METADATA_ITEMS_PLACEHOLDER}\n</bean>";
596                String templateNew = template.replace(
597                                ARCHIVER_BEAN_REFERENCE_PLACEHOLDER, warcWriterbeanReference);
598                this.template = templateNew.replace(ARCHIVER_PROCESSOR_BEAN_PLACEHOLDER,
599                                warcWriterProcessorBean);
600        }
601
602        @Override
603        /**
604         * With H3 template, we insert the crawlertraps into the template at once.
605         * They are inserted to be part of a org.archive.modules.deciderules.MatchesListRegexDecideRule
606         * bean.
607         * 
608         * @param elementName The elementName is currently not used with H3
609         * @param crawlertraps A list of crawlertraps to be inserted
610         */
611        public void insertCrawlerTraps(String elementName, List<String> crawlertraps) {
612//      <bean class="org.archive.modules.deciderules.MatchesListRegexDecideRule">
613//      <!-- <property name="listLogicalOr" value="true" /> -->
614//      <!-- <property name="regexList">
615//            <list>
616//            CRAWLERTRAPS_PLACEHOLDER 
617//            </list>
618//           </property> -->
619//     </bean>
620        if (crawlertraps.isEmpty()) {
621                log.debug("No crawlertraps yet. No insertion is done");
622                return;
623        } else if (!template.contains(CRAWLERTRAPS_PLACEHOLDER)) {      
624                log.warn("The placeholder '" + CRAWLERTRAPS_PLACEHOLDER 
625                                + "' is absent from the template. No insertion is done at all. {} traps were ignored", 
626                                crawlertraps);
627                return;
628        } else {
629                log.info("Inserting {} crawlertraps into the template", crawlertraps.size());
630                StringBuilder sb = new StringBuilder();
631                sb.append("<!-- crawlertraps from " + elementName + " -->\n");
632                for (String trap: crawlertraps) {
633                        sb.append("<value>" + trap + "</value>\n");
634                }
635                // Adding the placeholder again to be able to insert crawlertraps multiple times.
636                sb.append(CRAWLERTRAPS_PLACEHOLDER + "\n"); 
637                String templateNew = template.replace(CRAWLERTRAPS_PLACEHOLDER, sb.toString());
638                this.template = templateNew;
639        }
640        }
641        
642        public String getMetadataInfo(MetadataInfo info) {
643                String infoStr = null;
644                if(metadataInfoMap.containsKey(info)) {
645                        infoStr = metadataInfoMap.get(info);
646                }
647                return infoStr;
648        }
649
650        @Override
651        public void writeTemplate(OutputStream os) throws IOFailure {
652                try {
653                        os.write(template.getBytes(Charset.forName("UTF-8")));
654                } catch (IOException e) {
655                        throw new IOFailure("Unable to write template to outputstream", e);
656                }
657                
658        }
659
660        @Override
661        public boolean hasContent() {
662                throw new NotImplementedException("The hasContent method hasn't been implemented yet");
663        }
664
665        @Override
666        public void writeToFile(File orderXmlFile) {
667                BufferedWriter writer = null;
668                try {
669                        writer = new BufferedWriter( new FileWriter(orderXmlFile));
670                        writer.write(template);
671                } catch(IOException e) {
672                        throw new IOFailure("Unable to write template to file '" + orderXmlFile.getAbsolutePath() + "'.", e);
673                } finally {
674                        IOUtils.closeQuietly(writer);
675                }
676        }
677
678        @Override
679        public void setRecoverlogNode(File recoverlogGzFile) {
680                throw new NotImplementedException("This method has not yet been implemented");
681                
682        }
683
684        @Override
685        public void setDeduplicationIndexLocation(String absolutePath) {
686                if (!template.contains(DEDUPLICATION_INDEX_LOCATION_PLACEHOLDER)) {
687                        throw new IllegalState("The placeholder for the deduplication index location property '" +  DEDUPLICATION_INDEX_LOCATION_PLACEHOLDER 
688                                        + "' was not found. Maybe the placeholder has already been replaced with the correct value: " 
689                                        + template);
690                }
691        String templateNew = template.replace(DEDUPLICATION_INDEX_LOCATION_PLACEHOLDER, absolutePath); 
692        this.template = templateNew;
693        }
694
695        @Override
696        public void setSeedsFilePath(String absolutePath) {
697         log.debug("Note: SeedsFilePath is not set in h3");
698        }
699
700        @Override
701        public void setArchiveFilePrefix(String archiveFilePrefix) {
702                if (!template.contains(ARCHIVE_FILE_PREFIX_PLACEHOLDER)) {
703                        throw new IllegalState("The placeholder for the archive file prefix property '" 
704                                        + ARCHIVE_FILE_PREFIX_PLACEHOLDER 
705                                        + "' was not found. Maybe the placeholder has already been replaced with the correct value. The template looks like this: " 
706                                        + template);
707                }
708                String templateNew = template.replace(ARCHIVE_FILE_PREFIX_PLACEHOLDER, archiveFilePrefix);              
709        this.template = templateNew;
710                
711        }
712
713        @Override
714        public void setDiskPath(String absolutePath) {
715                // NOP
716                log.warn("The DiskPath is not settable in the H3 template");
717        }
718
719        @Override
720        public void removeDeduplicatorIfPresent() {
721                //NOP
722                log.debug("In H3 we don't remove the deduplicator, but just disable it.");
723        }
724
725        @Override public void enableOrDisableDeduplication(boolean enabled) {
726                final String replacement = Boolean.toString(enabled).toLowerCase();
727                log.debug("Replacing deduplication enabled placeholder {} with {}.", DEDUPLICATION_ENABLED_PLACEHOLDER, replacement);
728                this.template = template.replace(DEDUPLICATION_ENABLED_PLACEHOLDER, replacement);
729        }
730
731        //<property name="metadataItems">
732//  <map>
733//        <entry key="harvestInfo.version" value="1.03"/> <!-- TODO maybe not add this one -->
734//        <entry key="harvestInfo.jobId" value="1"/>
735//        <entry key="harvestInfo.channel" value="HIGH"/>
736//        <entry key="harvestInfo.harvestNum" value="1"/>
737//        <entry key="harvestInfo.origHarvestDefinitionID" value="1"/>
738//        <entry key="harvestInfo.maxBytesPerDomain" value="100000"/>
739//        <entry key="harvestInfo.maxObjectsPerDomain" value="-1"/>
740//        <entry key="harvestInfo.orderXMLName" value="defaultOrderXml"/>
741//        <entry key="harvestInfo.origHarvestDefinitionName" value="ddddddddd"/>
//        <entry key="harvestInfo.scheduleName" value="EveryHour"/> <!-- Optional. Only relevant for Selective Harvests - only inserted if not null and not-empty. -->
743//        <entry key="harvestInfo.harvestFilenamePrefix" value="netarkivet-1-1"/>
744//        <entry key="harvestInfo.jobSubmitDate" value="22. 10. 2014"/>
745//        <entry key="harvestInfo.performer" value="performer"/> <!-- Optional - only inserted if not null and not-empty. -->
746//        <entry key="harvestInfo.audience" value="audience"/> <!-- Optional - only inserted if not null and not-empty. -->
747//  </map>
748//  </property>
749
750        public void insertWarcInfoMetadata(Job ajob, String origHarvestdefinitionName, 
751                        String origHarvestdefinitionComments, String scheduleName, String performer) {
752                if (!template.contains(METADATA_ITEMS_PLACEHOLDER)) {
753                        throw new IllegalState("The placeholder for the property '" + METADATA_ITEMS_PLACEHOLDER  
754                                        + "' was not found. Maybe the placeholder has already been replaced with the correct value. The template looks like this: " 
755                                        + template); 
756                }
757                log.debug("Now in " + getClass().getName());
758                String startMetadataEntry = "\n<entry key=\"";
759                String endMetadataEntry = "\"/>";
760                String valuePart = "\" value=\"";
761                StringBuilder sb = new StringBuilder();
762                sb.append("<property name=\"metadataItems\">\n<map>\n");
763                
764                // <entry key="harvestInfo.version" value="1.03"/>
765                
766                sb.append(startMetadataEntry);
767                sb.append(HARVESTINFO_VERSION + valuePart + HARVESTINFO_VERSION_NUMBER + endMetadataEntry); 
768                sb.append(startMetadataEntry);
769                sb.append(HARVESTINFO_JOBID + valuePart + ajob.getJobID() + endMetadataEntry);
770
771                sb.append(startMetadataEntry);
772                sb.append(HARVESTINFO_CHANNEL + valuePart + ajob.getChannel() + endMetadataEntry);
773                sb.append(startMetadataEntry);
774                sb.append(HARVESTINFO_HARVESTNUM + valuePart + ajob.getHarvestNum() + endMetadataEntry);
775                sb.append(startMetadataEntry);
776                sb.append(HARVESTINFO_ORIGHARVESTDEFINITIONID + valuePart + ajob.getOrigHarvestDefinitionID() + endMetadataEntry);
777                sb.append(startMetadataEntry);
778                sb.append(HARVESTINFO_MAXBYTESPERDOMAIN + valuePart + ajob.getMaxBytesPerDomain() + endMetadataEntry);
779                sb.append(startMetadataEntry);
780                sb.append(HARVESTINFO_MAXOBJECTSPERDOMAIN + valuePart + ajob.getMaxObjectsPerDomain() + endMetadataEntry);
781                sb.append(startMetadataEntry);
782                sb.append(HARVESTINFO_ORDERXMLNAME + valuePart + ajob.getOrderXMLName() + endMetadataEntry);
783
784                /* orderxml update date - only inserted if not null and not-empty. */
785                /* take info from crawler-beans.cxml */
786                String tmp = getMetadataInfo(MetadataInfo.TEMPLATE_UPDATE_DATE);
787                if (tmp != null && !tmp.isEmpty()){
788                        sb.append(startMetadataEntry);
789                        sb.append(HARVESTINFO_ORDERXMLUPDATEDATE + valuePart + tmp  + endMetadataEntry);
790                }
791                /* orderxml description - only inserted if not null and not-empty. */
792                /* take info from crawler-beans.cxml */
793                tmp = getMetadataInfo(MetadataInfo.TEMPLATE_DESCRIPTION);
794                if (tmp != null && !tmp.isEmpty()){
795                        sb.append(startMetadataEntry);
796                        sb.append(HARVESTINFO_ORDERXMLDESCRIPTION + valuePart + StringEscapeUtils.escapeXml(tmp)  + endMetadataEntry);
797                }
798
799                sb.append(startMetadataEntry);
800                sb.append(HARVESTINFO_ORIGHARVESTDEFINITIONNAME + valuePart + 
801                                StringEscapeUtils.escapeXml(origHarvestdefinitionName) + endMetadataEntry);
802                
803                if(StringUtils.isNotEmpty(origHarvestdefinitionComments)) {
804                        sb.append(startMetadataEntry);
805                        sb.append(HARVESTINFO_ORIGHARVESTDEFINITIONCOMMENTS + valuePart + 
806                                        StringEscapeUtils.escapeXml(origHarvestdefinitionComments) + endMetadataEntry);
807                }
808                
809                /* optional schedule-name - only inserted if not null and not-empty. */
810                if (scheduleName != null && !scheduleName.isEmpty()) {
811                        sb.append(startMetadataEntry);
812                        sb.append(HARVESTINFO_SCHEDULENAME + valuePart + scheduleName + endMetadataEntry);
813                }
814                sb.append(startMetadataEntry);
815                sb.append(HARVESTINFO_HARVESTFILENAMEPREFIX + valuePart + ajob.getHarvestFilenamePrefix() + endMetadataEntry);
816                sb.append(startMetadataEntry);
817                sb.append(HARVESTINFO_JOBSUBMITDATE + valuePart + ArchiveDateConverter.getWarcDateFormat().format(ajob.getSubmittedDate()) + endMetadataEntry);
818                
819                /* optional HARVESTINFO_PERFORMER - only inserted if not null and not-empty. */
820                if (performer != null && !performer.isEmpty()){
821                        sb.append(startMetadataEntry);
822                        sb.append(HARVESTINFO_PERFORMER + valuePart + StringEscapeUtils.escapeXml(performer)  + endMetadataEntry);
823                }
824                
825                /* optional OPERATOR - only inserted if not null and not-empty. */
826                /* take info from crawler-beans.cxml */
827                String operator = getMetadataInfo(MetadataInfo.OPERATOR);
828                if (operator != null && !operator.isEmpty()){
829                        sb.append(startMetadataEntry);
830                        sb.append(HARVESTINFO_OPERATOR + valuePart + StringEscapeUtils.escapeXml(operator)  + endMetadataEntry);
831                }
832                
833                /* optional HARVESTINFO_AUDIENCE - only inserted if not null and not-empty. */
834                if (ajob.getHarvestAudience() != null && !ajob.getHarvestAudience().isEmpty()) {
835                        sb.append(startMetadataEntry);
836                        sb.append(HARVESTINFO_AUDIENCE + valuePart + StringEscapeUtils.escapeXml(ajob.getHarvestAudience()) + endMetadataEntry);
837                }
838                sb.append("\n</map>\n</property>\n");
839                
840                // Replace command
841                log.info("Adding WarcInfoMetadata " + sb.toString());
842                String templateNew = template.replace(METADATA_ITEMS_PLACEHOLDER, sb.toString());
843                this.template = templateNew;
844        }
845
846        @Override
847        public void insertAttributes(List<AttributeAndType> attributesAndTypes) {
848            ArgumentNotValid.checkNotNull(attributesAndTypes, "List<AttributeAndType> attributesAndTypes");
849            for (AttributeAndType attributeAndType: attributesAndTypes) {
850                // initialize temp variables
851                Integer intVal = null;
852                String val = null;
853                AttributeTypeBase attributeType = attributeAndType.attributeType;
854                AttributeBase attribute = attributeAndType.attribute;
855
856                log.debug("Trying to insert the attribute {} into the template", attributeType.name);
857                switch (attributeType.viewtype) {
858                case 1:
859                    if (attribute != null) {
860                        intVal = attribute.getInteger();
861                        log.debug("Read explicitly value for attribute '{}'", attributeType.name, intVal);
862                    }
863                    if (intVal == null && attributeType.def_int != null) {
864                        intVal = attributeType.def_int;
865                        log.debug("Viewtype 1 attribute '{}' not set explicitly. Using default value '{}'",  attributeType.name, intVal);
866                    }
867                    if (intVal != null) {
868                        val = intVal.toString();
869                    } else {
870                        val = "";
871                    }
872                    log.info("Value selected for attribute {}: {}", attributeType.name, val);
873                    break;
874                case 5:
875                    if (attribute != null) {
876                        intVal = attribute.getInteger();
877                        log.debug("Read explicitly value for attribute '{}'", attributeType.name, intVal);
878                    }
879                    if (intVal == null && attributeType.def_int != null) {
880                        intVal = attributeType.def_int;
881                        log.debug("Viewtype 5 attribute '{}' not set explicitly. Using default value '{}'", attributeType.name, intVal);
882                    }
883                    if (intVal != null && intVal > 0) {
884                        val = "true";
885                    } else {
886                        val = "false";
887                    }
888                    log.info("Value selected for attribute '{}': '{}'", attributeType.name, val);
889                    break;
890                case 6:
891                    if (attribute != null) {
892                        intVal = attribute.getInteger();
893                        log.debug("Read explicitly value for attribute '{}'", attributeType.name, intVal);
894                    }
895                    if (intVal == null && attributeType.def_int != null) {
896                        intVal = attributeType.def_int;
897                        log.debug("Viewtype 6 attribute '{}' not set explicitly. Using default value '{}'", attributeType.name, intVal);
898                    }
899                    if (intVal != null && intVal > 0) {
900                        val = "obey";
901                    } else {
902                        val = "ignore";
903                    }
904                    log.info("Value selected for attribute '{}': '{}'", attributeType.name, val);
905                    break;
906                }
907                String placeholder = "%{" + attributeType.name.toUpperCase() + "}";
908                if (template.contains(placeholder)) {
909                    String templateNew = template.replace("%{" + attributeType.name.toUpperCase() + "}", val);
910                    this.template = templateNew;
911                } else {
912                    log.warn("Placeholder '{}' not found in template. Therefore not substituted by '{}' in this template", 
913                            placeholder, val); 
914                }
915            }
916        }
917
918        @Override
919        public void writeTemplate(JspWriter out) throws IOFailure {
920                try {
921                        out.write(template);
922                } catch (IOException e) {
923                        throw new IOFailure("Unable to write to JspWriter", e);
924                }
925        }
926
927        /**
928         *  Hack to remove existing placeholders, that is still present after template 
929         *  manipulation is completed.
930         */
931        public void removePlaceholders() {
932                String[] optionalPlaceholders = new String[] {
933                                METADATA_ITEMS_PLACEHOLDER,
934                                CRAWLERTRAPS_PLACEHOLDER,
935                                UMBRA_PUBLISH_BEAN_PLACEHOLDER,
936                                UMBRA_SIMPLEOVERRIDES_PLACEHOLDER,
937                                UMBRA_BEAN_REF_PLACEHOLDER,
938                                UMBRA_RECEIVE_BEAN_PLACEHOLDER};
939                for (String placeholder: optionalPlaceholders) {
940                        template = template.replace(placeholder, "");
941                }
942        }
943}