001/*
002 * #%L
003 * Netarchivesuite - harvester
004 * %%
005 * Copyright (C) 2005 - 2018 The Royal Danish Library, 
006 *             the National Library of France and the Austrian National Library.
007 * %%
008 * This program is free software: you can redistribute it and/or modify
009 * it under the terms of the GNU Lesser General Public License as
010 * published by the Free Software Foundation, either version 2.1 of the
011 * License, or (at your option) any later version.
012 * 
013 * This program is distributed in the hope that it will be useful,
014 * but WITHOUT ANY WARRANTY; without even the implied warranty of
015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
016 * GNU General Lesser Public License for more details.
017 * 
018 * You should have received a copy of the GNU General Lesser Public
019 * License along with this program.  If not, see
020 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
021 * #L%
022 */
023package dk.netarkivet.harvester.datamodel;
024
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.Serializable;
import java.nio.charset.Charset;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import javax.servlet.jsp.JspWriter;

import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.antiaction.raptor.dao.AttributeBase;
import com.antiaction.raptor.dao.AttributeTypeBase;

import dk.netarkivet.common.exceptions.ArgumentNotValid;
import dk.netarkivet.common.exceptions.IOFailure;
import dk.netarkivet.common.exceptions.IllegalState;
import dk.netarkivet.common.exceptions.NotImplementedException;
import dk.netarkivet.common.utils.Settings;
import dk.netarkivet.common.utils.archive.ArchiveDateConverter;
import dk.netarkivet.harvester.HarvesterSettings;
import dk.netarkivet.harvester.datamodel.eav.EAV.AttributeAndType;
056
/**
 * Class encapsulating the Heritrix crawler-beans.cxml file.
 * <p>
 *
 * Heritrix3 has a new model based on Spring, so XPath is no good for processing.
 * Instead we use placeholders marked by %{..} rather than ${..}, which is
 * already used by Heritrix3 itself.
 *
 * The template is a H3 template if it contains the string:
 *
 * "xmlns="http://www.springframework.org/...."
 *
 */
070public class H3HeritrixTemplate extends HeritrixTemplate implements Serializable {
071
    /** Logger of this class. */
    private static final Logger log = LoggerFactory.getLogger(H3HeritrixTemplate.class);

    /** The current text of the template. The setters of this class replace placeholders in it. */
    private String template;
    
    /** QuotaEnforcer states for this template. TODO necessary?? */
    private Long forceMaxbytesPerDomain;
    /** Object limit per domain, stored by setMaxObjectsPerDomain and configureQuotaEnforcer. */
    private Long forceMaxobjectsPerDomain; 
   
    /**
     * Has this HeritrixTemplate been verified.
     * NOTE(review): no assignment to this field is visible in this part of the class --
     * confirm whether it is ever set.
     */
    private boolean verified;

    /** Placeholder checked by insertWarcInfoMetadata; the WARC writer bean built here also contains it. */
    public final static String METADATA_ITEMS_PLACEHOLDER = "%{METADATA_ITEMS_PLACEHOLDER}";
    /** Placeholder replaced by setMaxJobRunningTime. */
    public static final String MAX_TIME_SECONDS_PLACEHOLDER = "%{MAX_TIME_SECONDS_PLACEHOLDER}";
    /** Placeholder replaced (and re-appended) by insertCrawlerTraps. */
    public static final String CRAWLERTRAPS_PLACEHOLDER = "%{CRAWLERTRAPS_PLACEHOLDER}";

    /** Matches a template containing "ref", "bean" and "DeDuplicator" in that order (newlines allowed). */
    public static final Pattern DEDUPLICATION_BEAN_REFERENCE_PATTERN = Pattern.compile(".*ref.*bean.*DeDuplicator.*", Pattern.DOTALL);

    /** Matches a template containing "bean", "id" and "DeDuplicator" in that order (newlines allowed). */
    public static final Pattern DEDUPLICATION_BEAN_PATTERN =  Pattern.compile(".*bean.*id.*DeDuplicator.*", Pattern.DOTALL);
    /** Placeholder replaced by setDeduplicationIndexLocation. */
    public static final String DEDUPLICATION_INDEX_LOCATION_PLACEHOLDER 
        = "%{DEDUPLICATION_INDEX_LOCATION_PLACEHOLDER}"; 

    /** Placeholder replaced by setArchiveFilePrefix; emitted as the "prefix" value of the generated writer beans. */
    public static final String ARCHIVE_FILE_PREFIX_PLACEHOLDER = "%{ARCHIVE_FILE_PREFIX_PLACEHOLDER}";
        
    /** Placeholder replaced by configureQuotaEnforcer. */
    public static final String FRONTIER_QUEUE_TOTAL_BUDGET_PLACEHOLDER 
        = "%{FRONTIER_QUEUE_TOTAL_BUDGET_PLACEHOLDER}";
    
    /** Placeholder replaced by configureQuotaEnforcer. */
    public static final String QUOTA_ENFORCER_GROUP_MAX_FETCH_SUCCES_PLACEHOLDER = 
                "%{QUOTA_ENFORCER_GROUP_MAX_FETCH_SUCCES_PLACEHOLDER}";
    
    /** Placeholder replaced by configureQuotaEnforcer. */
    public static final String QUOTA_ENFORCER_MAX_BYTES_PLACEHOLDER 
        = "%{QUOTA_ENFORCER_MAX_BYTES_PLACEHOLDER}";

        /** Placeholder replaced with "true" or "false" by enableOrDisableDeduplication. */
        public static final String DEDUPLICATION_ENABLED_PLACEHOLDER = "%{DEDUPLICATION_ENABLED_PLACEHOLDER}";
    
    
    // PLACEHOLDERS for archiver beans (Maybe not necessary)
    // Both are replaced by setArchiveFormat. Unlike the placeholders above, these two are
    // package-private instance fields (no 'static' modifier).
    // NOTE(review): making them static would presumably affect the serialized form of this
    // Serializable class -- confirm before changing.
    final String ARCHIVER_BEAN_REFERENCE_PLACEHOLDER = "%{ARCHIVER_BEAN_REFERENCE_PLACEHOLDER}";        
        final String ARCHIVER_PROCESSOR_BEAN_PLACEHOLDER = "%{ARCHIVER_PROCESSOR_BEAN_PLACEHOLDER}";
110        
111        //match theses properties in crawler-beans.cxml to add them into harvestInfo.xml
112        //for preservation purpose
113        public enum MetadataInfo {
114                TEMPLATE_DESCRIPTION("metadata\\.description=.+[\\r\\n]"),
115                TEMPLATE_UPDATE_DATE("metadata\\.date=.+[\\r\\n]"),
116                OPERATOR("metadata\\.operator=.+[\\r\\n]");
117                
118                private final String regex;
119                
120                private MetadataInfo(String regex) {
121                        this.regex = regex;
122                }
123                
124                public String toString() {
125                        return this.regex;
126                }
127        };
128        
        /**
         * Values of the metadata properties found in the template text when this object was
         * constructed. A property that was not found has no entry.
         */
        public Map<MetadataInfo, String> metadataInfoMap;
130        
131    /**
132     * Constructor for HeritrixTemplate class.
133     *
134     * @param template_id The persistent id of the template in the database
135     * @param template The template as String object
136     * @throws ArgumentNotValid if template is null.
137     */
138    public H3HeritrixTemplate(long template_id, String template) {
139        ArgumentNotValid.checkNotNull(template, "String template");
140        this.template_id = template_id;
141        this.template = template;
142        
143        metadataInfoMap = new HashMap<MetadataInfo, String> ();
144        for(MetadataInfo metadataInfo : MetadataInfo.values()) {
145            Pattern p = Pattern.compile(metadataInfo.regex);
146            Matcher m = p.matcher(this.template);
147            if(m.find()) {
148                String operator = this.template.substring(m.start(), m.end()).trim();
149                //return the value of the property after the =
150                metadataInfoMap.put(metadataInfo, operator.split("=")[1]);
151            }
152        }
153    }
154    
155        /**
156     * return the template.
157     *
158     * @return the template
159     */
160    public HeritrixTemplate getTemplate() {
161        return this;
162    }
163
164    /**
165     * Has Template been verified?
166     *
167     * @return true, if verified on construction, otherwise false
168     */
169    public boolean isVerified() {
170        return verified;
171    }
172
173    /**
174     * Return HeritrixTemplate as XML.
175     * @return HeritrixTemplate as XML
176     */
177    @Override
178    public String getXML() {
179        return template;
180    }
181    
182    /**
183     * Update the maxTimeSeconds property in the heritrix3 template, if possible.
184     * @param maxJobRunningTimeSecondsL Force the harvestJob to end after this number of seconds 
185     * Property of the org.archive.crawler.framework.CrawlLimitEnforcer
186     * <!-- <property name="maxTimeSeconds" value="0" /> -->
187     */
188    @Override
189        public void setMaxJobRunningTime(Long maxJobRunningTimeSecondsL) {
190                if (template.contains(MAX_TIME_SECONDS_PLACEHOLDER)) {
191                this.template = template.replace(MAX_TIME_SECONDS_PLACEHOLDER, 
192                                Long.toString(maxJobRunningTimeSecondsL));
193                } else {
194                        log.warn("The placeholder '" + MAX_TIME_SECONDS_PLACEHOLDER 
195                                        + "' was not found in the template. Therefore maxRunningTime not set");
196                }
197        }
198
199    
200        @Override
201        public void setMaxBytesPerDomain(Long maxbytesL) {
202                this.forceMaxbytesPerDomain = maxbytesL;                
203        }       
204  
205
206        @Override
207        public Long getMaxBytesPerDomain() {
208                return this.forceMaxbytesPerDomain;
209        }
210
211        @Override
212        public void setMaxObjectsPerDomain(Long maxobjectsL) {
213                this.forceMaxobjectsPerDomain = maxobjectsL;
214        }
215
216        @Override
217        public Long getMaxObjectsPerDomain() {
218                return this.forceMaxobjectsPerDomain;
219        }
220    
221        @Override
222        public boolean isValid() {
223                /*
224                StringBuilder errors = new StringBuilder();
225                // check for Deduplication index-location placeholder and DEDUPLICATION_BEAN_PATTERN
226                if (template.contains(DEDUPLICATION_BEAN_PATTERN)) {
227                        if (!template.contains(DEDUPLICATION_INDEX_LOCATION_PLACEHOLDER)) {
228                                errors.append("Has DefdMissing placeholder '" +  DEDUPLICATION_INDEX_LOCATION_PLACEHOLDER + "'"
229                        }
230                } 
231                template.contains(DEDUPLICATION_INDEX_LOCATION_PLACEHOLDER) 
232                && template.contains(deduplicationBeanPattern)
233                */
234                return true;
235        }
236
237        @Override
238        // This method is used to decide, whether to request a deduplication index or not.
239        // Done by checking, if both  
240        //   - a DeDuplicator bean is present in the template
241        // and
242        //   - a  DEDUPLICATION_INDEX_LOCATION_PLACEHOLDER is also present.
243        // and 
244        //   - a DeDuplicator reference bean is present in the template
245        public boolean IsDeduplicationEnabled() {
246                return (template.contains(DEDUPLICATION_INDEX_LOCATION_PLACEHOLDER) 
247                                && DEDUPLICATION_BEAN_PATTERN.matcher(template).matches()
248                                && DEDUPLICATION_BEAN_REFERENCE_PATTERN.matcher(template).matches());
249        }       
250
251        /**
252     * Configuring the quota-enforcer, depending on budget definition. Object limit can be defined either by
253     * using the queue-total-budget property or the quota enforcer. Which is chosen is set by the argument
254     * maxObjectsIsSetByQuotaEnforcer}'s value. So quota enforcer is set as follows:
255     * If all values in the quotaEnforcer is infinity, it is in effect disabled
256     * <ul>
257     * <li>Object limit is not set by quota enforcer, disabled only if there is no byte limit.</li>
258     * <li>Object limit is set by quota enforcer, so it should be enabled if a byte or object limit is set.</li>
259     * </ul>
260     *
261     * @param maxObjectsIsSetByQuotaEnforcer Decides whether the maxObjectsIsSetByQuotaEnforcer or not.
262     * @param forceMaxBytesPerDomain The number of max bytes per domain enforced (can be no limit)
263     * @param forceMaxObjectsPerDomain The number of max objects per domain enforced (can be no limit)
264     */
265        public void configureQuotaEnforcer(boolean maxObjectsIsSetByQuotaEnforcer,
266                        long forceMaxBytesPerDomain, long forceMaxObjectsPerDomain) {
267                this.forceMaxobjectsPerDomain = forceMaxObjectsPerDomain;
268                this.forceMaxbytesPerDomain = forceMaxBytesPerDomain;
269                String tmp = template;
270                if (!maxObjectsIsSetByQuotaEnforcer) {
271                        // SetMaxObjects in the global budget to forceMaxObjectsPerDomain??
272                        String tmp1 = tmp.replace(
273                                        FRONTIER_QUEUE_TOTAL_BUDGET_PLACEHOLDER, Long.toString( forceMaxObjectsPerDomain ));
274                        // SetMaxObjects to infinity in the quotaEnforcer
275                        tmp = tmp1.replace(QUOTA_ENFORCER_GROUP_MAX_FETCH_SUCCES_PLACEHOLDER, 
276                                        Long.toString(Constants.HERITRIX_MAXOBJECTS_INFINITY));
277                } else {
278                        // SetMaxObjects in the global budget to Infinity
279                        String tmp1 = tmp.replace(
280                                        FRONTIER_QUEUE_TOTAL_BUDGET_PLACEHOLDER, Long.toString( Constants.HERITRIX_MAXOBJECTS_INFINITY ));                      
281                        // SetMaxObjects to forceMaxObjectsPerDomain in the quotaEnforcer
282                        tmp = tmp1.replace(QUOTA_ENFORCER_GROUP_MAX_FETCH_SUCCES_PLACEHOLDER, 
283                                        Long.toString(forceMaxObjectsPerDomain));
284                }
285                
286                // SetMaxbytes in the QuotaEnforcer to forceMaxBytesPerDomain
287                // Divide by 1024 since Heritrix uses KB rather than bytes,
288                // and add 1 to avoid to low limit due to rounding.
289                String maxBytesStringValue = "-1";
290                if (forceMaxBytesPerDomain != Constants.HERITRIX_MAXBYTES_INFINITY) {
291                        maxBytesStringValue = Long.toString(( forceMaxBytesPerDomain 
292                                        / Constants.BYTES_PER_HERITRIX_BYTELIMIT_UNIT) + 1);
293                        log.debug("MaxbytesPerDomain set to {} Kbytes per domain", maxBytesStringValue);
294                } else {
295                        log.debug("MaxbytesPerDomain set to infinite number of Kbytes per domain");     
296                }
297                
298                this.template = tmp.replace(QUOTA_ENFORCER_MAX_BYTES_PLACEHOLDER, maxBytesStringValue);
299                
300        }
301        
302         /**
303     * Make sure that Heritrix will archive its data in the chosen archiveFormat.
304     *
305     * @param archiveFormat the chosen archiveformat ('arc' or 'warc' supported)
306     * @throws ArgumentNotValid If the chosen archiveFormat is not supported.
307     */
308        @Override
309        public void setArchiveFormat(String archiveFormat) {
310                if (!template.contains(ARCHIVER_BEAN_REFERENCE_PLACEHOLDER)){
311                throw new IllegalState("The placeholder '" + ARCHIVER_BEAN_REFERENCE_PLACEHOLDER 
312                                        + "' is missing. Unable to insert proper archive writer");
313        }
314        if (!template.contains(ARCHIVER_PROCESSOR_BEAN_PLACEHOLDER)) {
315                        throw new IllegalState("The placeholder '" + ARCHIVER_PROCESSOR_BEAN_PLACEHOLDER 
316                                        + "' is missing. Unable to insert proper archive writer");
317                }
318                if ("arc".equalsIgnoreCase(archiveFormat)) {
319                        log.debug("ARC format selected to be used by Heritrix3");
320                        setArcArchiveformat();
321                } else if ("warc".equalsIgnoreCase(archiveFormat)) {
322                        log.debug("WARC format selected to be used by Heritrix3");
323                        setWarcArchiveformat();
324                } else {
325                        throw new ArgumentNotValid("Configuration of '" + HarvesterSettings.HERITRIX_ARCHIVE_FORMAT
326                                        + "' is invalid! Unrecognized format '" + archiveFormat + "'.");
327                }
328        }
329
330        /**
331         * Set the archive-format as ARC. This means enabling the ARCWriterProcessor in the template
332         */
333        private void setArcArchiveformat(){
334                String arcWriterbeanReference = "<ref bean=\"arcWriter\"/>";
335        String templateNew = template.replace(ARCHIVER_BEAN_REFERENCE_PLACEHOLDER, arcWriterbeanReference);
336        template = templateNew.replace(ARCHIVER_PROCESSOR_BEAN_PLACEHOLDER, getArcWriterProcessor()); 
337    }
338                
339        private String getArcWriterProcessor() {
340
341            //      <bean id="arcWriter" class="org.archive.modules.writer.ARCWriterProcessor">
342            //            <!-- <property name="compress" value="true" /> -->
343            //            <!-- <property name="prefix" value="IAH" /> -->
344            //            <!-- <property name="suffix" value="${HOSTNAME}" /> -->
345            //            <!-- <property name="maxFileSizeBytes" value="100000000" /> -->
346            //            <!-- <property name="poolMaxActive" value="1" /> -->
347            //            <!-- <property name="poolMaxWaitMs" value="300000" /> -->
348            //            <!-- <property name="skipIdenticalDigests" value="false" /> -->
349            //            <!-- <property name="maxTotalBytesToWrite" value="0" /> -->
350            //            <!-- <property name="directory" value="." /> -->
351            //            <!-- <property name="storePaths">
352            //                  <list>
353            //                   <value>arcs</value>
354            //                  </list>
355            //                 </property> -->
356            //           </bean>
357            // "<bean id=\"arcWriter\" class=\"org.archive.modules.writer.ARCWriterProcessor\">";
358            String propertyName="\n<property name=\"";
359            String valuePrefix = "\" value=\"";
360            String valueSuffix = "\"";
361            String propertyEnd="/>";
362
363            StringBuilder arcWriterBeanBuilder = new StringBuilder();
364            arcWriterBeanBuilder.append("<bean id=\"arcWriter\" class=\"org.archive.modules.writer.ARCWriterProcessor\">\n");
365            arcWriterBeanBuilder.append(propertyName + "compress" + valuePrefix
366                    + Settings.get(HarvesterSettings.HERITRIX3_ARC_COMPRESSION) 
367                    + valueSuffix + propertyEnd); 
368            arcWriterBeanBuilder.append(propertyName + "prefix" + valuePrefix
369                    + ARCHIVE_FILE_PREFIX_PLACEHOLDER
370                    + valueSuffix + propertyEnd);
371//          arcWriterBeanBuilder.append(propertyName + "suffix" + valuePrefix
372//                  + Settings.get(HarvesterSettings.HERITRIX3_ARC_SUFFIX) 
373//                  + valueSuffix + propertyEnd); 
374            arcWriterBeanBuilder.append(propertyName + "maxFileSizeBytes" + valuePrefix
375                    + Settings.get(HarvesterSettings.HERITRIX3_ARC_MAXSIZE) 
376                    + valueSuffix + propertyEnd); 
377            arcWriterBeanBuilder.append(propertyName + "poolMaxActive" + valuePrefix
378                    + Settings.get(HarvesterSettings.HERITRIX3_ARC_POOL_MAXACTIVE) 
379                    + valueSuffix + propertyEnd); 
380            arcWriterBeanBuilder.append(propertyName + "skipIdenticalDigests" + valuePrefix 
381                                + Settings.get(HarvesterSettings.HERITRIX3_ARC_SKIP_IDENTICAL_DIGESTS)
382                                + valueSuffix + propertyEnd);
383
384            arcWriterBeanBuilder.append("</bean>");
385
386            return arcWriterBeanBuilder.toString();                           
387        }
388
389                
390        /** 
391         * Insert WARC-archiver beans and remove placeholder for ARC-Archiver-beans
392         * It is an error, if the WARC place-holders doesnt't exist.
393         * It is not an error, if the property placeholder does not exist.
394         */
395        private void setWarcArchiveformat() {           
396                String warcWriterbeanReference = "<ref bean=\"warcWriter\"/>";
397                String warcWriterProcessorBean = "<bean id=\"warcWriter\" class=\"dk.netarkivet.harvester.harvesting.NasWARCProcessor\">";
398                String propertyName="\n<property name=\"";
399                String valuePrefix = "\" value=\"";
400                String valueSuffix = "\"";
401                String propertyEnd="/>";
402                if (!template.contains(ARCHIVER_BEAN_REFERENCE_PLACEHOLDER)) {
403                        throw new IllegalState("The placeholder '" + ARCHIVER_BEAN_REFERENCE_PLACEHOLDER 
404                                        + "' is missing");
405                }
406                if (!template.contains(ARCHIVER_PROCESSOR_BEAN_PLACEHOLDER)) {
407                        throw new IllegalState("The placeholder '" + ARCHIVER_PROCESSOR_BEAN_PLACEHOLDER 
408                                        + "' is missing");
409                }
410                StringBuilder propertyBuilder = new StringBuilder();
411                propertyBuilder.append(propertyName + "template" + valuePrefix 
412                      + Settings.get(HarvesterSettings.HERITRIX3_WARC_TEMPLATE)
413              + valueSuffix + propertyEnd);                             
414                propertyBuilder.append(propertyName + "compress" + valuePrefix 
415                      + Settings.get(HarvesterSettings.HERITRIX3_WARC_COMPRESSION) 
416                                + valueSuffix + propertyEnd);
417                // Note: The prefix value will be replaced later by the setArchiveFilePrefix() method
418                propertyBuilder.append(propertyName + "prefix" + valuePrefix 
419                                + ARCHIVE_FILE_PREFIX_PLACEHOLDER
420                                + valueSuffix + propertyEnd);
421                propertyBuilder.append(propertyName + "maxFileSizeBytes" + valuePrefix
422                      + Settings.get(HarvesterSettings.HERITRIX3_WARC_MAXSIZE)
423                      + valueSuffix + propertyEnd);
424                propertyBuilder.append(propertyName + "poolMaxActive" + valuePrefix
425                + Settings.get(HarvesterSettings.HERITRIX3_WARC_POOL_MAXACTIVE)
426                + valueSuffix + propertyEnd);
427          
428                propertyBuilder.append(propertyName + "writeRequests" + valuePrefix 
429                                + Settings.get(HarvesterSettings.HERITRIX3_WARC_WRITE_REQUESTS)
430                                + valueSuffix + propertyEnd);
431                propertyBuilder.append(propertyName + "writeMetadata" + valuePrefix 
432                                + Settings.get(HarvesterSettings.HERITRIX3_WARC_WRITE_METADATA)
433                                + valueSuffix + propertyEnd);
434                propertyBuilder.append(propertyName + "writeMetadataOutlinks" + valuePrefix 
435                                + Settings.get(HarvesterSettings.HERITRIX3_WARC_WRITE_METADATA_OUTLINKS)
436                                + valueSuffix + propertyEnd);
437                propertyBuilder.append(propertyName + "skipIdenticalDigests" + valuePrefix 
438                                + Settings.get(HarvesterSettings.HERITRIX3_WARC_SKIP_IDENTICAL_DIGESTS)
439                                + valueSuffix + propertyEnd);
440                propertyBuilder.append(propertyName + "startNewFilesOnCheckpoint" + valuePrefix 
441                                + Settings.get(HarvesterSettings.HERITRIX3_WARC_START_NEW_FILES_ON_CHECKPOINT)
442                                + valueSuffix + propertyEnd);
443                
444                warcWriterProcessorBean += propertyBuilder.toString();
445                warcWriterProcessorBean += "\n\n%{METADATA_ITEMS_PLACEHOLDER}\n</bean>";
446                String templateNew = template.replace(
447                                ARCHIVER_BEAN_REFERENCE_PLACEHOLDER, warcWriterbeanReference);
448                this.template = templateNew.replace(ARCHIVER_PROCESSOR_BEAN_PLACEHOLDER,
449                                warcWriterProcessorBean);
450        }
451
452        @Override
453        /**
454         * With H3 template, we insert the crawlertraps into the template at once.
455         * They are inserted to be part of a org.archive.modules.deciderules.MatchesListRegexDecideRule
456         * bean.
457         * 
458         * @param elementName The elementName is currently not used with H3
459         * @param crawlertraps A list of crawlertraps to be inserted
460         */
461        public void insertCrawlerTraps(String elementName, List<String> crawlertraps) {
462//      <bean class="org.archive.modules.deciderules.MatchesListRegexDecideRule">
463//      <!-- <property name="listLogicalOr" value="true" /> -->
464//      <!-- <property name="regexList">
465//            <list>
466//            CRAWLERTRAPS_PLACEHOLDER 
467//            </list>
468//           </property> -->
469//     </bean>
470        if (crawlertraps.isEmpty()) {
471                log.debug("No crawlertraps yet. No insertion is done");
472                return;
473        } else if (!template.contains(CRAWLERTRAPS_PLACEHOLDER)) {      
474                log.warn("The placeholder '" + CRAWLERTRAPS_PLACEHOLDER 
475                                + "' is absent from the template. No insertion is done at all. {} traps were ignored", 
476                                crawlertraps);
477                return;
478        } else {
479                log.info("Inserting {} crawlertraps into the template", crawlertraps.size());
480                StringBuilder sb = new StringBuilder();
481                for (String trap: crawlertraps) {
482                        sb.append("<value>" + trap + "</value>\n");
483                }
484                // Adding the placeholder again to be able to insert crawlertraps multiple times.
485                sb.append(CRAWLERTRAPS_PLACEHOLDER + "\n"); 
486                String templateNew = template.replace(CRAWLERTRAPS_PLACEHOLDER, sb.toString());
487                this.template = templateNew;
488        }
489        }
490        
491        public String getMetadataInfo(MetadataInfo info) {
492                String infoStr = null;
493                if(metadataInfoMap.containsKey(info)) {
494                        infoStr = metadataInfoMap.get(info);
495                }
496                return infoStr;
497        }
498
499        @Override
500        public void writeTemplate(OutputStream os) throws IOFailure {
501                try {
502                        os.write(template.getBytes(Charset.forName("UTF-8")));
503                } catch (IOException e) {
504                        throw new IOFailure("Unable to write template to outputstream", e);
505                }
506                
507        }
508
509        @Override
510        public boolean hasContent() {
511                throw new NotImplementedException("The hasContent method hasn't been implemented yet");
512        }
513
514        @Override
515        public void writeToFile(File orderXmlFile) {
516                BufferedWriter writer = null;
517                try {
518                        writer = new BufferedWriter( new FileWriter(orderXmlFile));
519                        writer.write(template);
520                } catch(IOException e) {
521                        throw new IOFailure("Unable to write template to file '" + orderXmlFile.getAbsolutePath() + "'.", e);
522                } finally {
523                        IOUtils.closeQuietly(writer);
524                }
525        }
526
527        @Override
528        public void setRecoverlogNode(File recoverlogGzFile) {
529                throw new NotImplementedException("This method has not yet been implemented");
530                
531        }
532
533        @Override
534        public void setDeduplicationIndexLocation(String absolutePath) {
535                if (!template.contains(DEDUPLICATION_INDEX_LOCATION_PLACEHOLDER)) {
536                        throw new IllegalState("The placeholder for the deduplication index location property '" +  DEDUPLICATION_INDEX_LOCATION_PLACEHOLDER 
537                                        + "' was not found. Maybe the placeholder has already been replaced with the correct value: " 
538                                        + template);
539                }
540        String templateNew = template.replace(DEDUPLICATION_INDEX_LOCATION_PLACEHOLDER, absolutePath); 
541        this.template = templateNew;
542        }
543
544        @Override
545        public void setSeedsFilePath(String absolutePath) {
546         log.debug("Note: SeedsFilePath is not set in h3");
547        }
548
549        @Override
550        public void setArchiveFilePrefix(String archiveFilePrefix) {
551                if (!template.contains(ARCHIVE_FILE_PREFIX_PLACEHOLDER)) {
552                        throw new IllegalState("The placeholder for the archive file prefix property '" 
553                                        + ARCHIVE_FILE_PREFIX_PLACEHOLDER 
554                                        + "' was not found. Maybe the placeholder has already been replaced with the correct value. The template looks like this: " 
555                                        + template);
556                }
557                String templateNew = template.replace(ARCHIVE_FILE_PREFIX_PLACEHOLDER, archiveFilePrefix);              
558        this.template = templateNew;
559                
560        }
561
562        @Override
563        public void setDiskPath(String absolutePath) {
564                // NOP
565                log.warn("The DiskPath is not settable in the H3 template");
566        }
567
568        @Override
569        public void removeDeduplicatorIfPresent() {
570                //NOP
571                log.debug("In H3 we don't remove the deduplicator, but just disable it.");
572        }
573
574        @Override public void enableOrDisableDeduplication(boolean enabled) {
575                final String replacement = Boolean.toString(enabled).toLowerCase();
576                log.debug("Replacing deduplication enabled placeholder {} with {}.", DEDUPLICATION_ENABLED_PLACEHOLDER, replacement);
577                this.template = template.replace(DEDUPLICATION_ENABLED_PLACEHOLDER, replacement);
578        }
579
580        //<property name="metadataItems">
581//  <map>
582//        <entry key="harvestInfo.version" value="1.03"/> <!-- TODO maybe not add this one -->
583//        <entry key="harvestInfo.jobId" value="1"/>
584//        <entry key="harvestInfo.channel" value="HIGH"/>
585//        <entry key="harvestInfo.harvestNum" value="1"/>
586//        <entry key="harvestInfo.origHarvestDefinitionID" value="1"/>
587//        <entry key="harvestInfo.maxBytesPerDomain" value="100000"/>
588//        <entry key="harvestInfo.maxObjectsPerDomain" value="-1"/>
589//        <entry key="harvestInfo.orderXMLName" value="defaultOrderXml"/>
590//        <entry key="harvestInfo.origHarvestDefinitionName" value="ddddddddd"/>
//        <entry key="harvestInfo.scheduleName" value="EveryHour"/> <!-- Optional. Only relevant for Selective Harvests; only inserted if not null and not-empty. -->
592//        <entry key="harvestInfo.harvestFilenamePrefix" value="netarkivet-1-1"/>
593//        <entry key="harvestInfo.jobSubmitDate" value="22. 10. 2014"/>
594//        <entry key="harvestInfo.performer" value="performer"/> <!-- Optional - only inserted if not null and not-empty. -->
595//        <entry key="harvestInfo.audience" value="audience"/> <!-- Optional - only inserted if not null and not-empty. -->
596//  </map>
597//  </property>
598
599        public void insertWarcInfoMetadata(Job ajob, String origHarvestdefinitionName, 
600                        String origHarvestdefinitionComments, String scheduleName, String performer) {
601                if (!template.contains(METADATA_ITEMS_PLACEHOLDER)) {
602                        throw new IllegalState("The placeholder for the property '" + METADATA_ITEMS_PLACEHOLDER  
603                                        + "' was not found. Maybe the placeholder has already been replaced with the correct value. The template looks like this: " 
604                                        + template); 
605                }
606                log.debug("Now in " + getClass().getName());
607                String startMetadataEntry = "\n<entry key=\"";
608                String endMetadataEntry = "\"/>";
609                String valuePart = "\" value=\"";
610                StringBuilder sb = new StringBuilder();
611                sb.append("<property name=\"metadataItems\">\n<map>\n");
612                
613                // <entry key="harvestInfo.version" value="1.03"/>
614                
615                sb.append(startMetadataEntry);
616                sb.append(HARVESTINFO_VERSION + valuePart + HARVESTINFO_VERSION_NUMBER + endMetadataEntry); 
617                sb.append(startMetadataEntry);
618                sb.append(HARVESTINFO_JOBID + valuePart + ajob.getJobID() + endMetadataEntry);
619
620                sb.append(startMetadataEntry);
621                sb.append(HARVESTINFO_CHANNEL + valuePart + ajob.getChannel() + endMetadataEntry);
622                sb.append(startMetadataEntry);
623                sb.append(HARVESTINFO_HARVESTNUM + valuePart + ajob.getHarvestNum() + endMetadataEntry);
624                sb.append(startMetadataEntry);
625                sb.append(HARVESTINFO_ORIGHARVESTDEFINITIONID + valuePart + ajob.getOrigHarvestDefinitionID() + endMetadataEntry);
626                sb.append(startMetadataEntry);
627                sb.append(HARVESTINFO_MAXBYTESPERDOMAIN + valuePart + ajob.getMaxBytesPerDomain() + endMetadataEntry);
628                sb.append(startMetadataEntry);
629                sb.append(HARVESTINFO_MAXOBJECTSPERDOMAIN + valuePart + ajob.getMaxObjectsPerDomain() + endMetadataEntry);
630                sb.append(startMetadataEntry);
631                sb.append(HARVESTINFO_ORDERXMLNAME + valuePart + ajob.getOrderXMLName() + endMetadataEntry);
632
633                /* orderxml update date - only inserted if not null and not-empty. */
634                /* take info from crawler-beans.cxml */
635                String tmp = getMetadataInfo(MetadataInfo.TEMPLATE_UPDATE_DATE);
636                if (tmp != null && !tmp.isEmpty()){
637                        sb.append(startMetadataEntry);
638                        sb.append(HARVESTINFO_ORDERXMLUPDATEDATE + valuePart + tmp  + endMetadataEntry);
639                }
640                /* orderxml description - only inserted if not null and not-empty. */
641                /* take info from crawler-beans.cxml */
642                tmp = getMetadataInfo(MetadataInfo.TEMPLATE_DESCRIPTION);
643                if (tmp != null && !tmp.isEmpty()){
644                        sb.append(startMetadataEntry);
645                        sb.append(HARVESTINFO_ORDERXMLDESCRIPTION + valuePart + tmp  + endMetadataEntry);
646                }
647
648                sb.append(startMetadataEntry);
649                sb.append(HARVESTINFO_ORIGHARVESTDEFINITIONNAME + valuePart + 
650                                origHarvestdefinitionName + endMetadataEntry);
651                
652                if(StringUtils.isNotEmpty(origHarvestdefinitionComments)) {
653                        sb.append(startMetadataEntry);
654                        sb.append(HARVESTINFO_ORIGHARVESTDEFINITIONCOMMENTS + valuePart + 
655                                origHarvestdefinitionComments + endMetadataEntry);
656                }
657                
658                /* optional schedule-name - only inserted if not null and not-empty. */
659                if (scheduleName != null && !scheduleName.isEmpty()) {
660                        sb.append(startMetadataEntry);
661                        sb.append(HARVESTINFO_SCHEDULENAME + valuePart + scheduleName + endMetadataEntry);
662                }
663                sb.append(startMetadataEntry);
664                sb.append(HARVESTINFO_HARVESTFILENAMEPREFIX + valuePart + ajob.getHarvestFilenamePrefix() + endMetadataEntry);
665                sb.append(startMetadataEntry);
666                sb.append(HARVESTINFO_JOBSUBMITDATE + valuePart + ArchiveDateConverter.getWarcDateFormat().format(ajob.getSubmittedDate()) + endMetadataEntry);
667                
668                /* optional HARVESTINFO_PERFORMER - only inserted if not null and not-empty. */
669                if (performer != null && !performer.isEmpty()){
670                        sb.append(startMetadataEntry);
671                        sb.append(HARVESTINFO_PERFORMER + valuePart + performer  + endMetadataEntry);
672                }
673                
674                /* optional OPERATOR - only inserted if not null and not-empty. */
675                /* take info from crawler-beans.cxml */
676                String operator = getMetadataInfo(MetadataInfo.OPERATOR);
677                if (operator != null && !operator.isEmpty()){
678                        sb.append(startMetadataEntry);
679                        sb.append(HARVESTINFO_OPERATOR + valuePart + operator  + endMetadataEntry);
680                }
681                
682                /* optional HARVESTINFO_AUDIENCE - only inserted if not null and not-empty. */
683                if (ajob.getHarvestAudience() != null && !ajob.getHarvestAudience().isEmpty()) {
684                        sb.append(startMetadataEntry);
685                        sb.append(HARVESTINFO_AUDIENCE + valuePart + ajob.getHarvestAudience() + endMetadataEntry);
686                }
687                sb.append("\n</map>\n</property>\n");
688                
689                // Replace command
690                log.info("Adding WarcInfoMetadata " + sb.toString());
691                String templateNew = template.replace(METADATA_ITEMS_PLACEHOLDER, sb.toString());
692                this.template = templateNew;
693        }
694
695        @Override
696        public void insertAttributes(List<AttributeAndType> attributesAndTypes) {
697            ArgumentNotValid.checkNotNull(attributesAndTypes, "List<AttributeAndType> attributesAndTypes");
698            for (AttributeAndType attributeAndType: attributesAndTypes) {
699                // initialize temp variables
700                Integer intVal = null;
701                String val = null;
702                AttributeTypeBase attributeType = attributeAndType.attributeType;
703                AttributeBase attribute = attributeAndType.attribute;
704
705                log.debug("Trying to insert the attribute {} into the template", attributeType.name);
706                switch (attributeType.viewtype) {
707                case 1:
708                    if (attribute != null) {
709                        intVal = attribute.getInteger();
710                        log.debug("Read explicitly value for attribute '{}'", attributeType.name, intVal);
711                    }
712                    if (intVal == null && attributeType.def_int != null) {
713                        intVal = attributeType.def_int;
714                        log.debug("Viewtype 1 attribute '{}' not set explicitly. Using default value '{}'",  attributeType.name, intVal);
715                    }
716                    if (intVal != null) {
717                        val = intVal.toString();
718                    } else {
719                        val = "";
720                    }
721                    log.info("Value selected for attribute {}: {}", attributeType.name, val);
722                    break;
723                case 5:
724                    if (attribute != null) {
725                        intVal = attribute.getInteger();
726                        log.debug("Read explicitly value for attribute '{}'", attributeType.name, intVal);
727                    }
728                    if (intVal == null && attributeType.def_int != null) {
729                        intVal = attributeType.def_int;
730                        log.debug("Viewtype 5 attribute '{}' not set explicitly. Using default value '{}'", attributeType.name, intVal);
731                    }
732                    if (intVal != null && intVal > 0) {
733                        val = "true";
734                    } else {
735                        val = "false";
736                    }
737                    log.info("Value selected for attribute '{}': '{}'", attributeType.name, val);
738                    break;
739                case 6:
740                    if (attribute != null) {
741                        intVal = attribute.getInteger();
742                        log.debug("Read explicitly value for attribute '{}'", attributeType.name, intVal);
743                    }
744                    if (intVal == null && attributeType.def_int != null) {
745                        intVal = attributeType.def_int;
746                        log.debug("Viewtype 6 attribute '{}' not set explicitly. Using default value '{}'", attributeType.name, intVal);
747                    }
748                    if (intVal != null && intVal > 0) {
749                        val = "obey";
750                    } else {
751                        val = "ignore";
752                    }
753                    log.info("Value selected for attribute '{}': '{}'", attributeType.name, val);
754                    break;
755                }
756                String placeholder = "%{" + attributeType.name.toUpperCase() + "}";
757                if (template.contains(placeholder)) {
758                    String templateNew = template.replace("%{" + attributeType.name.toUpperCase() + "}", val);
759                    this.template = templateNew;
760                } else {
761                    log.warn("Placeholder '{}' not found in template. Therefore not substituted by '{}' in this template", 
762                            placeholder, val); 
763                }
764            }
765        }
766
767        @Override
768        public void writeTemplate(JspWriter out) throws IOFailure {
769                try {
770                        out.write(template);
771                } catch (IOException e) {
772                        throw new IOFailure("Unable to write to JspWriter", e);
773                }
774        }
775        
776        /**
777         *  Hack to remove existing placeholders, that is still present after template 
778         *  manipulation is completed.
779         */
780        public void removePlaceholders() {
781                template = template.replace(METADATA_ITEMS_PLACEHOLDER, "");
782                template = template.replace(CRAWLERTRAPS_PLACEHOLDER, "");
783                
784                if (template.contains(METADATA_ITEMS_PLACEHOLDER)) {
785                        throw new IllegalState("The placeholder for the property '" + METADATA_ITEMS_PLACEHOLDER  
786                                        + "' should have been deleted now."); 
787                }
788                if (template.contains(CRAWLERTRAPS_PLACEHOLDER)) {
789                        throw new IllegalState("The placeholder for the property '" + CRAWLERTRAPS_PLACEHOLDER  
790                                        + "' should have been deleted now."); 
791                }               
792        }
793}