001/*
002 * #%L
003 * Netarchivesuite - harvester
004 * %%
005 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
006 *             the National Library of France and the Austrian National Library.
007 * %%
008 * This program is free software: you can redistribute it and/or modify
009 * it under the terms of the GNU Lesser General Public License as
010 * published by the Free Software Foundation, either version 2.1 of the
011 * License, or (at your option) any later version.
012 * 
013 * This program is distributed in the hope that it will be useful,
014 * but WITHOUT ANY WARRANTY; without even the implied warranty of
015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
016 * GNU General Lesser Public License for more details.
017 * 
018 * You should have received a copy of the GNU General Lesser Public
019 * License along with this program.  If not, see
020 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
021 * #L%
022 */
023package dk.netarkivet.harvester.datamodel;
024
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.Serializable;
import java.nio.charset.Charset;
import java.util.List;

import javax.servlet.jsp.JspWriter;

import org.apache.commons.io.IOUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.antiaction.raptor.dao.AttributeBase;
import com.antiaction.raptor.dao.AttributeTypeBase;

import dk.netarkivet.common.exceptions.ArgumentNotValid;
import dk.netarkivet.common.exceptions.IOFailure;
import dk.netarkivet.common.exceptions.IllegalState;
import dk.netarkivet.common.exceptions.NotImplementedException;
import dk.netarkivet.common.utils.Settings;
import dk.netarkivet.common.utils.archive.ArchiveDateConverter;
import dk.netarkivet.harvester.HarvesterSettings;
import dk.netarkivet.harvester.datamodel.eav.EAV.AttributeAndType;
051
/**
 * Class encapsulating the Heritrix3 crawler-beans.cxml file.
 * <p>
 * Heritrix3 has a new configuration model based on Spring, so XPath is no good for processing the template.
 * Instead we use placeholders marked by %{..} rather than ${..}, because the latter form is already used by
 * Heritrix3 itself.
 * <p>
 * The template is a H3 template if it contains the string:
 * <pre>
 * xmlns="http://www.springframework.org/...."
 * </pre>
 */
065public class H3HeritrixTemplate extends HeritrixTemplate implements Serializable {
066
067    private static final Logger log = LoggerFactory.getLogger(H3HeritrixTemplate.class);
068
069    private String template;
070    
071    /** QuotaEnforcer states for this template. TODO necessary?? */
072    private Long forceMaxbytesPerDomain;
073    private Long forceMaxobjectsPerDomain; 
074   
075    /** Has this HeritrixTemplate been verified. */
076    private boolean verified;
077
078    public final static String METADATA_ITEMS_PLACEHOLDER = "%{METADATA_ITEMS_PLACEHOLDER}";
079    public static final String MAX_TIME_SECONDS_PLACEHOLDER = "%{MAX_TIME_SECONDS_PLACEHOLDER}";
080    public static final String CRAWLERTRAPS_PLACEHOLDER = "%{CRAWLERTRAPS_PLACEHOLDER}";
081    
082    public static final String DEDUPLICATION_BEAN_REFERENCE_PATTERN = "<ref bean=\"DeDuplicator\"/>";
083    public static final String DEDUPLICATION_BEAN_PATTERN =  "<bean id=\"DeDuplicator\"";
084    public static final String DEDUPLICATION_INDEX_LOCATION_PLACEHOLDER 
085        = "%{DEDUPLICATION_INDEX_LOCATION_PLACEHOLDER}"; 
086
087    public static final String ARCHIVE_FILE_PREFIX_PLACEHOLDER = "%{ARCHIVE_FILE_PREFIX_PLACEHOLDER}";
088        
089    public static final String FRONTIER_QUEUE_TOTAL_BUDGET_PLACEHOLDER 
090        = "%{FRONTIER_QUEUE_TOTAL_BUDGET_PLACEHOLDER}";
091    
092    public static final String QUOTA_ENFORCER_GROUP_MAX_FETCH_SUCCES_PLACEHOLDER = 
093                "%{QUOTA_ENFORCER_GROUP_MAX_FETCH_SUCCES_PLACEHOLDER}";
094    
095    public static final String QUOTA_ENFORCER_MAX_BYTES_PLACEHOLDER 
096        = "%{QUOTA_ENFORCER_MAX_BYTES_PLACEHOLDER}"; 
097    
098    
099    // PLACEHOLDERS for archiver beans (Maybe not necessary)
100    final String ARCHIVER_BEAN_REFERENCE_PLACEHOLDER = "%{ARCHIVER_BEAN_REFERENCE_PLACEHOLDER}";        
101        final String ARCHIVER_PROCESSOR_BEAN_PLACEHOLDER = "%{ARCHIVER_PROCESSOR_BEAN_PLACEHOLDER}";
102        
103    /**
104     * Constructor for HeritrixTemplate class.
105     *
106     * @param template_id The persistent id of the template in the database
107     * @param template The template as String object
108     * @throws ArgumentNotValid if template is null.
109     */
110    public H3HeritrixTemplate(long template_id, String template) {
111        ArgumentNotValid.checkNotNull(template, "String template");
112        this.template_id = template_id;
113        this.template = template;
114    }
115    
116        /**
117     * return the template.
118     *
119     * @return the template
120     */
121    public HeritrixTemplate getTemplate() {
122        return this;
123    }
124
125    /**
126     * Has Template been verified?
127     *
128     * @return true, if verified on construction, otherwise false
129     */
130    public boolean isVerified() {
131        return verified;
132    }
133
134    /**
135     * Return HeritrixTemplate as XML.
136     * @return HeritrixTemplate as XML
137     */
138    @Override
139    public String getXML() {
140        return template;
141    }
142    
143    /**
144     * Update the maxTimeSeconds property in the heritrix3 template, if possible.
145     * @param maxJobRunningTimeSecondsL Force the harvestJob to end after this number of seconds 
146     * Property of the org.archive.crawler.framework.CrawlLimitEnforcer
147     * <!-- <property name="maxTimeSeconds" value="0" /> -->
148     */
149    @Override
150        public void setMaxJobRunningTime(Long maxJobRunningTimeSecondsL) {
151                if (template.contains(MAX_TIME_SECONDS_PLACEHOLDER)) {
152                this.template = template.replace(MAX_TIME_SECONDS_PLACEHOLDER, 
153                                Long.toString(maxJobRunningTimeSecondsL));
154                } else {
155                        log.warn("The placeholder '" + MAX_TIME_SECONDS_PLACEHOLDER 
156                                        + "' was not found in the template. Therefore maxRunningTime not set");
157                }
158        }
159
160    
161        @Override
162        public void setMaxBytesPerDomain(Long maxbytesL) {
163                this.forceMaxbytesPerDomain = maxbytesL;                
164        }       
165  
166
167        @Override
168        public Long getMaxBytesPerDomain() {
169                return this.forceMaxbytesPerDomain;
170        }
171
172        @Override
173        public void setMaxObjectsPerDomain(Long maxobjectsL) {
174                this.forceMaxobjectsPerDomain = maxobjectsL;
175        }
176
177        @Override
178        public Long getMaxObjectsPerDomain() {
179                return this.forceMaxobjectsPerDomain;
180        }
181    
182        @Override
183        public boolean isValid() {
184                /*
185                StringBuilder errors = new StringBuilder();
186                // check for Deduplication index-location placeholder and DEDUPLICATION_BEAN_PATTERN
187                if (template.contains(DEDUPLICATION_BEAN_PATTERN)) {
188                        if (!template.contains(DEDUPLICATION_INDEX_LOCATION_PLACEHOLDER)) {
189                                errors.append("Has DefdMissing placeholder '" +  DEDUPLICATION_INDEX_LOCATION_PLACEHOLDER + "'"
190                        }
191                } 
192                template.contains(DEDUPLICATION_INDEX_LOCATION_PLACEHOLDER) 
193                && template.contains(deduplicationBeanPattern)
194                */
195                return true;
196        }
197
198        @Override
199        // This method is used to decide, whether to request a deduplication index or not.
200        // Done by checking, if both  
201        //   - a DeDuplicator bean is present in the template
202        // and
203        //   - a  DEDUPLICATION_INDEX_LOCATION_PLACEHOLDER is also present.
204        // and 
205        //   - a DeDuplicator reference bean is present in the template
206        public boolean IsDeduplicationEnabled() {
207                return (template.contains(DEDUPLICATION_INDEX_LOCATION_PLACEHOLDER) 
208                                && template.contains(DEDUPLICATION_BEAN_PATTERN)
209                                && template.contains(DEDUPLICATION_BEAN_REFERENCE_PATTERN)); 
210        }       
211
212        /**
213     * Configuring the quota-enforcer, depending on budget definition. Object limit can be defined either by
214     * using the queue-total-budget property or the quota enforcer. Which is chosen is set by the argument
215     * maxObjectsIsSetByQuotaEnforcer}'s value. So quota enforcer is set as follows:
216     * If all values in the quotaEnforcer is infinity, it is in effect disabled
217     * <ul>
218     * <li>Object limit is not set by quota enforcer, disabled only if there is no byte limit.</li>
219     * <li>Object limit is set by quota enforcer, so it should be enabled if a byte or object limit is set.</li>
220     * </ul>
221     *
222     * @param maxObjectsIsSetByQuotaEnforcer Decides whether the maxObjectsIsSetByQuotaEnforcer or not.
223     * @param forceMaxBytesPerDomain The number of max bytes per domain enforced (can be no limit)
224     * @param forceMaxObjectsPerDomain The number of max objects per domain enforced (can be no limit)
225     */
226        public void configureQuotaEnforcer(boolean maxObjectsIsSetByQuotaEnforcer,
227                        long forceMaxBytesPerDomain, long forceMaxObjectsPerDomain) {
228                this.forceMaxobjectsPerDomain = forceMaxObjectsPerDomain;
229                this.forceMaxbytesPerDomain = forceMaxBytesPerDomain;
230                String tmp = template;
231                if (!maxObjectsIsSetByQuotaEnforcer) {
232                        // SetMaxObjects in the global budget to forceMaxObjectsPerDomain??
233                        String tmp1 = tmp.replace(
234                                        FRONTIER_QUEUE_TOTAL_BUDGET_PLACEHOLDER, Long.toString( forceMaxObjectsPerDomain ));
235                        // SetMaxObjects to infinity in the quotaEnforcer
236                        tmp = tmp1.replace(QUOTA_ENFORCER_GROUP_MAX_FETCH_SUCCES_PLACEHOLDER, 
237                                        Long.toString(Constants.HERITRIX_MAXOBJECTS_INFINITY));
238                } else {
239                        // SetMaxObjects in the global budget to Infinity
240                        String tmp1 = tmp.replace(
241                                        FRONTIER_QUEUE_TOTAL_BUDGET_PLACEHOLDER, Long.toString( Constants.HERITRIX_MAXOBJECTS_INFINITY ));                      
242                        // SetMaxObjects to forceMaxObjectsPerDomain in the quotaEnforcer
243                        tmp = tmp1.replace(QUOTA_ENFORCER_GROUP_MAX_FETCH_SUCCES_PLACEHOLDER, 
244                                        Long.toString(forceMaxObjectsPerDomain));
245                }
246                
247                // SetMaxbytes in the QuotaEnforcer to forceMaxBytesPerDomain
248                // Divide by 1024 since Heritrix uses KB rather than bytes,
249                // and add 1 to avoid to low limit due to rounding.
250                String maxBytesStringValue = "-1";
251                if (forceMaxBytesPerDomain != Constants.HERITRIX_MAXBYTES_INFINITY) {
252                        maxBytesStringValue = Long.toString(( forceMaxBytesPerDomain 
253                                        / Constants.BYTES_PER_HERITRIX_BYTELIMIT_UNIT) + 1);
254                        log.debug("MaxbytesPerDomain set to {} Kbytes per domain", maxBytesStringValue);
255                } else {
256                        log.debug("MaxbytesPerDomain set to infinite number of Kbytes per domain");     
257                }
258                
259                this.template = tmp.replace(QUOTA_ENFORCER_MAX_BYTES_PLACEHOLDER, maxBytesStringValue);
260                
261        }
262        
263         /**
264     * Make sure that Heritrix will archive its data in the chosen archiveFormat.
265     *
266     * @param archiveFormat the chosen archiveformat ('arc' or 'warc' supported)
267     * @throws ArgumentNotValid If the chosen archiveFormat is not supported.
268     */
269        @Override
270        public void setArchiveFormat(String archiveFormat) {
271                if (!template.contains(ARCHIVER_BEAN_REFERENCE_PLACEHOLDER)){
272                throw new IllegalState("The placeholder '" + ARCHIVER_BEAN_REFERENCE_PLACEHOLDER 
273                                        + "' is missing. Unable to insert proper archive writer");
274        }
275        if (!template.contains(ARCHIVER_PROCESSOR_BEAN_PLACEHOLDER)) {
276                        throw new IllegalState("The placeholder '" + ARCHIVER_PROCESSOR_BEAN_PLACEHOLDER 
277                                        + "' is missing. Unable to insert proper archive writer");
278                }
279                if ("arc".equalsIgnoreCase(archiveFormat)) {
280                        log.debug("ARC format selected to be used by Heritrix3");
281                        setArcArchiveformat();
282                } else if ("warc".equalsIgnoreCase(archiveFormat)) {
283                        log.debug("WARC format selected to be used by Heritrix3");
284                        setWarcArchiveformat();
285                } else {
286                        throw new ArgumentNotValid("Configuration of '" + HarvesterSettings.HERITRIX_ARCHIVE_FORMAT
287                                        + "' is invalid! Unrecognized format '" + archiveFormat + "'.");
288                }
289        }
290
291        /**
292         * Set the archive-format as ARC. This means enabling the ARCWriterProcessor in the template
293         */
294        private void setArcArchiveformat(){
295                String arcWriterbeanReference = "<ref bean=\"arcWriter\"/>";
296        String templateNew = template.replace(ARCHIVER_BEAN_REFERENCE_PLACEHOLDER, arcWriterbeanReference);
297        template = templateNew.replace(ARCHIVER_PROCESSOR_BEAN_PLACEHOLDER, getArcWriterProcessor()); 
298    }
299                
300        private String getArcWriterProcessor() {
301
302            //      <bean id="arcWriter" class="org.archive.modules.writer.ARCWriterProcessor">
303            //            <!-- <property name="compress" value="true" /> -->
304            //            <!-- <property name="prefix" value="IAH" /> -->
305            //            <!-- <property name="suffix" value="${HOSTNAME}" /> -->
306            //            <!-- <property name="maxFileSizeBytes" value="100000000" /> -->
307            //            <!-- <property name="poolMaxActive" value="1" /> -->
308            //            <!-- <property name="poolMaxWaitMs" value="300000" /> -->
309            //            <!-- <property name="skipIdenticalDigests" value="false" /> -->
310            //            <!-- <property name="maxTotalBytesToWrite" value="0" /> -->
311            //            <!-- <property name="directory" value="." /> -->
312            //            <!-- <property name="storePaths">
313            //                  <list>
314            //                   <value>arcs</value>
315            //                  </list>
316            //                 </property> -->
317            //           </bean>
318            // "<bean id=\"arcWriter\" class=\"org.archive.modules.writer.ARCWriterProcessor\">";
319            String propertyName="\n<property name=\"";
320            String valuePrefix = "\" value=\"";
321            String valueSuffix = "\"";
322            String propertyEnd="/>";
323
324            StringBuilder arcWriterBeanBuilder = new StringBuilder();
325            arcWriterBeanBuilder.append("<bean id=\"arcWriter\" class=\"org.archive.modules.writer.ARCWriterProcessor\">\n");
326            arcWriterBeanBuilder.append(propertyName + "compress" + valuePrefix
327                    + Settings.get(HarvesterSettings.HERITRIX3_ARC_COMPRESSION) 
328                    + valueSuffix + propertyEnd); 
329            arcWriterBeanBuilder.append(propertyName + "prefix" + valuePrefix
330                    + ARCHIVE_FILE_PREFIX_PLACEHOLDER
331                    + valueSuffix + propertyEnd);
332//          arcWriterBeanBuilder.append(propertyName + "suffix" + valuePrefix
333//                  + Settings.get(HarvesterSettings.HERITRIX3_ARC_SUFFIX) 
334//                  + valueSuffix + propertyEnd); 
335            arcWriterBeanBuilder.append(propertyName + "maxFileSizeBytes" + valuePrefix
336                    + Settings.get(HarvesterSettings.HERITRIX3_ARC_MAXSIZE) 
337                    + valueSuffix + propertyEnd); 
338            arcWriterBeanBuilder.append(propertyName + "poolMaxActive" + valuePrefix
339                    + Settings.get(HarvesterSettings.HERITRIX3_ARC_POOL_MAXACTIVE) 
340                    + valueSuffix + propertyEnd); 
341            arcWriterBeanBuilder.append(propertyName + "skipIdenticalDigests" + valuePrefix 
342                                + Settings.get(HarvesterSettings.HERITRIX3_ARC_SKIP_IDENTICAL_DIGESTS)
343                                + valueSuffix + propertyEnd);
344
345            arcWriterBeanBuilder.append("</bean>");
346
347            return arcWriterBeanBuilder.toString();                           
348        }
349
350                
351        /** 
352         * Insert WARC-archiver beans and remove placeholder for ARC-Archiver-beans
353         * It is an error, if the WARC place-holders doesnt't exist.
354         * It is not an error, if the property placeholder does not exist.
355         */
356        private void setWarcArchiveformat() {           
357                String warcWriterbeanReference = "<ref bean=\"warcWriter\"/>";
358                String warcWriterProcessorBean = "<bean id=\"warcWriter\" class=\"dk.netarkivet.harvester.harvesting.NasWARCProcessor\">";
359                String propertyName="\n<property name=\"";
360                String valuePrefix = "\" value=\"";
361                String valueSuffix = "\"";
362                String propertyEnd="/>";
363                if (!template.contains(ARCHIVER_BEAN_REFERENCE_PLACEHOLDER)) {
364                        throw new IllegalState("The placeholder '" + ARCHIVER_BEAN_REFERENCE_PLACEHOLDER 
365                                        + "' is missing");
366                }
367                if (!template.contains(ARCHIVER_PROCESSOR_BEAN_PLACEHOLDER)) {
368                        throw new IllegalState("The placeholder '" + ARCHIVER_PROCESSOR_BEAN_PLACEHOLDER 
369                                        + "' is missing");
370                }
371                StringBuilder propertyBuilder = new StringBuilder();
372                propertyBuilder.append(propertyName + "template" + valuePrefix 
373                      + Settings.get(HarvesterSettings.HERITRIX3_WARC_TEMPLATE)
374              + valueSuffix + propertyEnd);                             
375                propertyBuilder.append(propertyName + "compress" + valuePrefix 
376                      + Settings.get(HarvesterSettings.HERITRIX3_WARC_COMPRESSION) 
377                                + valueSuffix + propertyEnd);
378                // Note: The prefix value will be replaced later by the setArchiveFilePrefix() method
379                propertyBuilder.append(propertyName + "prefix" + valuePrefix 
380                                + ARCHIVE_FILE_PREFIX_PLACEHOLDER
381                                + valueSuffix + propertyEnd);
382                propertyBuilder.append(propertyName + "maxFileSizeBytes" + valuePrefix
383                      + Settings.get(HarvesterSettings.HERITRIX3_WARC_MAXSIZE)
384                      + valueSuffix + propertyEnd);
385                propertyBuilder.append(propertyName + "poolMaxActive" + valuePrefix
386                + Settings.get(HarvesterSettings.HERITRIX3_WARC_POOL_MAXACTIVE)
387                + valueSuffix + propertyEnd);
388          
389                propertyBuilder.append(propertyName + "writeRequests" + valuePrefix 
390                                + Settings.get(HarvesterSettings.HERITRIX3_WARC_WRITE_REQUESTS)
391                                + valueSuffix + propertyEnd);
392                propertyBuilder.append(propertyName + "writeMetadata" + valuePrefix 
393                                + Settings.get(HarvesterSettings.HERITRIX3_WARC_WRITE_METADATA)
394                                + valueSuffix + propertyEnd);
395                propertyBuilder.append(propertyName + "writeMetadataOutlinks" + valuePrefix 
396                                + Settings.get(HarvesterSettings.HERITRIX3_WARC_WRITE_METADATA_OUTLINKS)
397                                + valueSuffix + propertyEnd);
398                propertyBuilder.append(propertyName + "skipIdenticalDigests" + valuePrefix 
399                                + Settings.get(HarvesterSettings.HERITRIX3_WARC_SKIP_IDENTICAL_DIGESTS)
400                                + valueSuffix + propertyEnd);
401                propertyBuilder.append(propertyName + "startNewFilesOnCheckpoint" + valuePrefix 
402                                + Settings.get(HarvesterSettings.HERITRIX3_WARC_START_NEW_FILES_ON_CHECKPOINT)
403                                + valueSuffix + propertyEnd);
404                
405                warcWriterProcessorBean += propertyBuilder.toString();
406                warcWriterProcessorBean += "\n\n%{METADATA_ITEMS_PLACEHOLDER}\n</bean>";
407                String templateNew = template.replace(
408                                ARCHIVER_BEAN_REFERENCE_PLACEHOLDER, warcWriterbeanReference);
409                this.template = templateNew.replace(ARCHIVER_PROCESSOR_BEAN_PLACEHOLDER,
410                                warcWriterProcessorBean);
411        }
412
413        @Override
414        /**
415         * With H3 template, we insert the crawlertraps into the template at once.
416         * They are inserted to be part of a org.archive.modules.deciderules.MatchesListRegexDecideRule
417         * bean.
418         * 
419         * @param elementName The elementName is currently not used with H3
420         * @param crawlertraps A list of crawlertraps to be inserted
421         */
422        public void insertCrawlerTraps(String elementName, List<String> crawlertraps) {
423//      <bean class="org.archive.modules.deciderules.MatchesListRegexDecideRule">
424//      <!-- <property name="listLogicalOr" value="true" /> -->
425//      <!-- <property name="regexList">
426//            <list>
427//            CRAWLERTRAPS_PLACEHOLDER 
428//            </list>
429//           </property> -->
430//     </bean>
431        if (crawlertraps.isEmpty()) {
432                log.debug("No crawlertraps yet. No insertion is done");
433                return;
434        } else if (!template.contains(CRAWLERTRAPS_PLACEHOLDER)) {      
435                log.warn("The placeholder '" + CRAWLERTRAPS_PLACEHOLDER 
436                                + "' is absent from the template. No insertion is done at all. {} traps were ignored", 
437                                crawlertraps);
438                return;
439        } else {
440                log.info("Inserting {} crawlertraps into the template", crawlertraps.size());
441                StringBuilder sb = new StringBuilder();
442                for (String trap: crawlertraps) {
443                        sb.append("<value>" + trap + "</value>\n");
444                }
445                // Adding the placeholder again to be able to insert crawlertraps multiple times.
446                sb.append(CRAWLERTRAPS_PLACEHOLDER + "\n"); 
447                String templateNew = template.replace(CRAWLERTRAPS_PLACEHOLDER, sb.toString());
448                this.template = templateNew;
449        }
450        }
451
452        @Override
453        public void writeTemplate(OutputStream os) throws IOFailure {
454                try {
455                        os.write(template.getBytes(Charset.forName("UTF-8")));
456                } catch (IOException e) {
457                        throw new IOFailure("Unable to write template to outputstream", e);
458                }
459                
460        }
461
462        @Override
463        public boolean hasContent() {
464                throw new NotImplementedException("The hasContent method hasn't been implemented yet");
465        }
466
467        @Override
468        public void writeToFile(File orderXmlFile) {
469                BufferedWriter writer = null;
470                try {
471                        writer = new BufferedWriter( new FileWriter(orderXmlFile));
472                        writer.write(template);
473                } catch(IOException e) {
474                        throw new IOFailure("Unable to write template to file '" + orderXmlFile.getAbsolutePath() + "'.", e);
475                } finally {
476                        IOUtils.closeQuietly(writer);
477                }
478        }
479
480        @Override
481        public void setRecoverlogNode(File recoverlogGzFile) {
482                throw new NotImplementedException("This method has not yet been implemented");
483                
484        }
485
486        @Override
487        public void setDeduplicationIndexLocation(String absolutePath) {
488                if (!template.contains(DEDUPLICATION_INDEX_LOCATION_PLACEHOLDER)) {
489                        throw new IllegalState("The placeholder for the deduplication index location property '" +  DEDUPLICATION_INDEX_LOCATION_PLACEHOLDER 
490                                        + "' was not found. Maybe the placeholder has already been replaced with the correct value: " 
491                                        + template);
492                }
493        String templateNew = template.replace(DEDUPLICATION_INDEX_LOCATION_PLACEHOLDER, absolutePath); 
494        this.template = templateNew;
495        }
496
497        @Override
498        public void setSeedsFilePath(String absolutePath) {
499         log.debug("Note: SeedsFilePath is not set in h3");
500        }
501
502        @Override
503        public void setArchiveFilePrefix(String archiveFilePrefix) {
504                if (!template.contains(ARCHIVE_FILE_PREFIX_PLACEHOLDER)) {
505                        throw new IllegalState("The placeholder for the archive file prefix property '" 
506                                        + ARCHIVE_FILE_PREFIX_PLACEHOLDER 
507                                        + "' was not found. Maybe the placeholder has already been replaced with the correct value. The template looks like this: " 
508                                        + template);
509                }
510                String templateNew = template.replace(ARCHIVE_FILE_PREFIX_PLACEHOLDER, archiveFilePrefix);              
511        this.template = templateNew;
512                
513        }
514
515        @Override
516        public void setDiskPath(String absolutePath) {
517                // NOP
518                log.warn("The DiskPath is not settable in the H3 template");
519        }
520
521        @Override
522        public void removeDeduplicatorIfPresent() {
523                //NOP
524                log.warn("Removing the Deduplicator is not possible with the H3 templates and should not be required with the H3 template.");
525        }
526        
527//<property name="metadataItems">
528//  <map>
529//        <entry key="harvestInfo.version" value="1.03"/> <!-- TODO maybe not add this one -->
530//        <entry key="harvestInfo.jobId" value="1"/>
531//        <entry key="harvestInfo.channel" value="HIGH"/>
532//        <entry key="harvestInfo.harvestNum" value="1"/>
533//        <entry key="harvestInfo.origHarvestDefinitionID" value="1"/>
534//        <entry key="harvestInfo.maxBytesPerDomain" value="100000"/>
535//        <entry key="harvestInfo.maxObjectsPerDomain" value="-1"/>
536//        <entry key="harvestInfo.orderXMLName" value="defaultOrderXml"/>
537//        <entry key="harvestInfo.origHarvestDefinitionName" value="ddddddddd"/>
538//        <entry key="harvestInfo.scheduleName" value="EveryHour"/> <!-- Optional. only relevant for Selective Harvests -- only inserted if not null and not-empty.->
539//        <entry key="harvestInfo.harvestFilenamePrefix" value="netarkivet-1-1"/>
540//        <entry key="harvestInfo.jobSubmitDate" value="22. 10. 2014"/>
541//        <entry key="harvestInfo.performer" value="performer"/> <!-- Optional - only inserted if not null and not-empty. -->
542//        <entry key="harvestInfo.audience" value="audience"/> <!-- Optional - only inserted if not null and not-empty. -->
543//  </map>
544//  </property>
545
546        public void insertWarcInfoMetadata(Job ajob, String origHarvestdefinitionName, 
547                        String scheduleName, String performer) {
548                if (!template.contains(METADATA_ITEMS_PLACEHOLDER)) {
549                        throw new IllegalState("The placeholder for the property '" + METADATA_ITEMS_PLACEHOLDER  
550                                        + "' was not found. Maybe the placeholder has already been replaced with the correct value. The template looks like this: " 
551                                        + template); 
552                }
553                log.debug("Now in " + getClass().getName());
554                String startMetadataEntry = "\n<entry key=\"";
555                String endMetadataEntry = "\"/>";
556                String valuePart = "\" value=\"";
557                StringBuilder sb = new StringBuilder();
558                sb.append("<property name=\"metadataItems\">\n<map>\n");
559                
560                // <entry key="harvestInfo.version" value="1.03"/>
561                
562                sb.append(startMetadataEntry);
563                sb.append(HARVESTINFO_VERSION + valuePart + HARVESTINFO_VERSION_NUMBER + endMetadataEntry); 
564                sb.append(startMetadataEntry);
565                sb.append(HARVESTINFO_JOBID + valuePart + ajob.getJobID() + endMetadataEntry);
566
567                sb.append(startMetadataEntry);
568                sb.append(HARVESTINFO_CHANNEL + valuePart + ajob.getChannel() + endMetadataEntry);
569                sb.append(startMetadataEntry);
570                sb.append(HARVESTINFO_HARVESTNUM + valuePart + ajob.getHarvestNum() + endMetadataEntry);
571                sb.append(startMetadataEntry);
572                sb.append(HARVESTINFO_ORIGHARVESTDEFINITIONID + valuePart + ajob.getOrigHarvestDefinitionID() + endMetadataEntry);
573                sb.append(startMetadataEntry);
574                sb.append(HARVESTINFO_MAXBYTESPERDOMAIN + valuePart + ajob.getMaxBytesPerDomain() + endMetadataEntry);
575                sb.append(startMetadataEntry);
576                sb.append(HARVESTINFO_MAXOBJECTSPERDOMAIN + valuePart + ajob.getMaxObjectsPerDomain() + endMetadataEntry);
577                sb.append(startMetadataEntry);
578                sb.append(HARVESTINFO_ORDERXMLNAME + valuePart + ajob.getOrderXMLName() + endMetadataEntry);
579                sb.append(startMetadataEntry);
580                sb.append(HARVESTINFO_ORIGHARVESTDEFINITIONNAME + valuePart + 
581                                origHarvestdefinitionName + endMetadataEntry);
582                
583                /* optional schedule-name - only inserted if not null and not-empty. */
584                if (scheduleName != null && !scheduleName.isEmpty()) {
585                        sb.append(startMetadataEntry);
586                        sb.append(HARVESTINFO_SCHEDULENAME + valuePart + scheduleName + endMetadataEntry);
587                }
588                sb.append(startMetadataEntry);
589                sb.append(HARVESTINFO_HARVESTFILENAMEPREFIX + valuePart + ajob.getHarvestFilenamePrefix() + endMetadataEntry);
590                sb.append(startMetadataEntry);
591                sb.append(HARVESTINFO_JOBSUBMITDATE + valuePart + ArchiveDateConverter.getWarcDateFormat().format(ajob.getSubmittedDate()) + endMetadataEntry);
592                
593                /* optional HARVESTINFO_PERFORMER - only inserted if not null and not-empty. */
594                if (performer != null && !performer.isEmpty()){
595                        sb.append(startMetadataEntry);
596                        sb.append(HARVESTINFO_PERFORMER + valuePart + performer  + endMetadataEntry);
597                }
598                
599                /* optional HARVESTINFO_AUDIENCE - only inserted if not null and not-empty. */
600                if (ajob.getHarvestAudience() != null && !ajob.getHarvestAudience().isEmpty()) {
601                        sb.append(startMetadataEntry);
602                        sb.append(HARVESTINFO_AUDIENCE + valuePart + ajob.getHarvestAudience() + endMetadataEntry);
603                }
604                sb.append("\n</map>\n</property>\n");
605                
606                // Replace command
607                log.info("Adding WarcInfoMetadata " + sb.toString());
608                String templateNew = template.replace(METADATA_ITEMS_PLACEHOLDER, sb.toString());
609                this.template = templateNew;
610        }
611
612        @Override
613        public void insertAttributes(List<AttributeAndType> attributesAndTypes) {
614            ArgumentNotValid.checkNotNull(attributesAndTypes, "List<AttributeAndType> attributesAndTypes");
615            for (AttributeAndType attributeAndType: attributesAndTypes) {
616                // initialize temp variables
617                Integer intVal = null;
618                String val = null;
619                AttributeTypeBase attributeType = attributeAndType.attributeType;
620                AttributeBase attribute = attributeAndType.attribute;
621
622                log.debug("Trying to insert the attribute {} into the template", attributeType.name);
623                switch (attributeType.viewtype) {
624                case 1:
625                    if (attribute != null) {
626                        intVal = attribute.getInteger();
627                        log.debug("Read explicitly value for attribute '{}'", attributeType.name, intVal);
628                    }
629                    if (intVal == null && attributeType.def_int != null) {
630                        intVal = attributeType.def_int;
631                        log.debug("Viewtype 1 attribute '{}' not set explicitly. Using default value '{}'",  attributeType.name, intVal);
632                    }
633                    if (intVal != null) {
634                        val = intVal.toString();
635                    } else {
636                        val = "";
637                    }
638                    log.info("Value selected for attribute {}: {}", attributeType.name, val);
639                    break;
640                case 5:
641                    if (attribute != null) {
642                        intVal = attribute.getInteger();
643                        log.debug("Read explicitly value for attribute '{}'", attributeType.name, intVal);
644                    }
645                    if (intVal == null && attributeType.def_int != null) {
646                        intVal = attributeType.def_int;
647                        log.debug("Viewtype 5 attribute '{}' not set explicitly. Using default value '{}'", attributeType.name, intVal);
648                    }
649                    if (intVal != null && intVal > 0) {
650                        val = "true";
651                    } else {
652                        val = "false";
653                    }
654                    log.info("Value selected for attribute '{}': '{}'", attributeType.name, val);
655                    break;
656                case 6:
657                    if (attribute != null) {
658                        intVal = attribute.getInteger();
659                        log.debug("Read explicitly value for attribute '{}'", attributeType.name, intVal);
660                    }
661                    if (intVal == null && attributeType.def_int != null) {
662                        intVal = attributeType.def_int;
663                        log.debug("Viewtype 6 attribute '{}' not set explicitly. Using default value '{}'", attributeType.name, intVal);
664                    }
665                    if (intVal != null && intVal > 0) {
666                        val = "obey";
667                    } else {
668                        val = "ignore";
669                    }
670                    log.info("Value selected for attribute '{}': '{}'", attributeType.name, val);
671                    break;
672                }
673                String placeholder = "%{" + attributeType.name.toUpperCase() + "}";
674                if (template.contains(placeholder)) {
675                    String templateNew = template.replace("%{" + attributeType.name.toUpperCase() + "}", val);
676                    this.template = templateNew;
677                } else {
678                    log.warn("Placeholder '{}' not found in template. Therefore not substituted by '{}' in this template", 
679                            placeholder, val); 
680                }
681            }
682        }
683
684        @Override
685        public void writeTemplate(JspWriter out) throws IOFailure {
686                try {
687                        out.write(template);
688                } catch (IOException e) {
689                        throw new IOFailure("Unable to write to JspWriter", e);
690                }
691        }
692        
693        /**
694         *  Hack to remove existing placeholders, that is still present after template 
695         *  manipulation is completed.
696         */
697        public void removePlaceholders() {
698                template = template.replace(METADATA_ITEMS_PLACEHOLDER, "");
699                template = template.replace(CRAWLERTRAPS_PLACEHOLDER, "");
700                
701                if (template.contains(METADATA_ITEMS_PLACEHOLDER)) {
702                        throw new IllegalState("The placeholder for the property '" + METADATA_ITEMS_PLACEHOLDER  
703                                        + "' should have been deleted now."); 
704                }
705                if (template.contains(CRAWLERTRAPS_PLACEHOLDER)) {
706                        throw new IllegalState("The placeholder for the property '" + CRAWLERTRAPS_PLACEHOLDER  
707                                        + "' should have been deleted now."); 
708                }               
709        }
710}