001/*
002 * #%L
003 * Netarchivesuite - harvester
004 * %%
005 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
006 *             the National Library of France and the Austrian National Library.
007 * %%
008 * This program is free software: you can redistribute it and/or modify
009 * it under the terms of the GNU Lesser General Public License as
010 * published by the Free Software Foundation, either version 2.1 of the
011 * License, or (at your option) any later version.
012 * 
013 * This program is distributed in the hope that it will be useful,
014 * but WITHOUT ANY WARRANTY; without even the implied warranty of
015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
016 * GNU General Lesser Public License for more details.
017 * 
018 * You should have received a copy of the GNU General Lesser Public
019 * License along with this program.  If not, see
020 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
021 * #L%
022 */
023package dk.netarkivet.harvester.datamodel;
024
025import java.io.File;
026import java.io.IOException;
027import java.io.OutputStream;
028import java.io.Serializable;
029import java.io.UnsupportedEncodingException;
030import java.util.HashMap;
031import java.util.List;
032import java.util.Map;
033import java.util.regex.Matcher;
034import java.util.regex.Pattern;
035
036import javax.servlet.jsp.JspWriter;
037
038import org.dom4j.Document;
039import org.dom4j.DocumentException;
040import org.dom4j.Element;
041import org.dom4j.Node;
042import org.dom4j.io.XMLWriter;
043import org.slf4j.Logger;
044import org.slf4j.LoggerFactory;
045
046import dk.netarkivet.common.exceptions.ArgumentNotValid;
047import dk.netarkivet.common.exceptions.IOFailure;
048import dk.netarkivet.common.exceptions.IllegalState;
049import dk.netarkivet.common.exceptions.PermissionDenied;
050import dk.netarkivet.common.exceptions.UnknownID;
051import dk.netarkivet.common.utils.Settings;
052import dk.netarkivet.common.utils.XmlUtils;
053import dk.netarkivet.harvester.HarvesterSettings;
054import dk.netarkivet.harvester.datamodel.eav.EAV.AttributeAndType;
055import dk.netarkivet.harvester.harvesting.report.Heritrix1Constants;
056
057/**
058 * Class encapsulating the Heritrix order.xml. Enables verification that dom4j Document obey the constraints required by
059 * our software, specifically the Job class.
060 * <p>
061 * The class assumes the type of order.xml used in configuring Heritrix version 1.10+. Information about the Heritrix
 * crawler, and its processes and modules can be found in the Heritrix developer and user manuals found on <a
 * href="http://crawler.archive.org">http://crawler.archive.org</a>
064 *  
065 */
066public class H1HeritrixTemplate extends HeritrixTemplate implements Serializable {
067
068    private static final Logger log = LoggerFactory.getLogger(H1HeritrixTemplate.class);
069
070    /** the dom4j Document hiding behind this instance of HeritrixTemplate. */
071    private Document template;
072
073    /** has this HeritrixTemplate been verified. */
074    private boolean verified;
075
076    /** Xpath needed by Job.editOrderXML_maxBytesPerDomain(). */
077    public static final String QUOTA_ENFORCER_ENABLED_XPATH = "/crawl-order/controller/map[@name='pre-fetch-processors']"
078            + "/newObject[@name='QuotaEnforcer']" + "/boolean[@name='enabled']";;
079    /** Xpath needed by Job.editOrderXML_maxBytesPerDomain(). */
080    public static final String GROUP_MAX_ALL_KB_XPATH = "/crawl-order/controller/map[@name='pre-fetch-processors']"
081            + "/newObject[@name='QuotaEnforcer']" + "/long[@name='group-max-all-kb']";
082    /** Xpath needed by Job.editOrderXML_maxObjectsPerDomain(). */
083    public static final String GROUP_MAX_FETCH_SUCCESS_XPATH = "/crawl-order/controller/map[@name='pre-fetch-processors']"
084            + "/newObject[@name='QuotaEnforcer']" + "/long[@name='group-max-fetch-successes']";
085    /** Xpath needed by Job.editOrderXML_maxObjectsPerDomain(). */
086    public static final String QUEUE_TOTAL_BUDGET_XPATH = "/crawl-order/controller/newObject[@name='frontier']"
087            + "/long[@name='queue-total-budget']";
088    /** Xpath needed by Job.editOrderXML_crawlerTraps(). */
089    public static final String DECIDERULES_MAP_XPATH = "/crawl-order/controller/newObject"
090            + "/newObject[@name='decide-rules']" + "/map[@name='rules']";
091    /** Xpath needed by Job.editOrderXML_crawlerTraps(). */
092    public static final String DECIDERULES_ACCEPT_IF_PREREQUISITE_XPATH = "/crawl-order/controller/newObject"
093            + "/newObject[@name='decide-rules']" + "/map[@name='rules']/newObject[@class="
094            + "'org.archive.crawler.deciderules.PrerequisiteAcceptDecideRule']";
095
096    /** Xpath checked by Heritrix for correct user-agent field in requests. */
097    public static final String HERITRIX_USER_AGENT_XPATH = "/crawl-order/controller/map[@name='http-headers']"
098            + "/string[@name='user-agent']";
099    /** Xpath checked by Heritrix for correct mail address. */
100    public static final String HERITRIX_FROM_XPATH = "/crawl-order/controller/map[@name='http-headers']/"
101            + "string[@name='from']";
102    /** Xpath to check, that all templates use the DecidingScope. */
103    public static final String DECIDINGSCOPE_XPATH = "/crawl-order/controller/newObject[@name='scope']" + "[@class='"
104            + Heritrix1Constants.DECIDINGSCOPE_CLASSNAME + "']";
105    /**
106     * Xpath for the deduplicator node in order.xml documents.
107     */
108    public static final String DEDUPLICATOR_XPATH = "/crawl-order/controller/map[@name='write-processors']"
109            + "/newObject[@name='DeDuplicator']";
110
111    /**
112     * Xpath to check, that all templates use the same ARC archiver path,
113     * {@link dk.netarkivet.common.Constants#ARCDIRECTORY_NAME}. The archive path tells Heritrix to which directory it
114     * shall write its arc files.
115     */
116    public static final String ARC_ARCHIVER_PATH_XPATH = "/crawl-order/controller/map[@name='write-processors']/"
117            + "newObject[@name='Archiver']/stringList[@name='path']/string";
118
119    /**
120     * Xpath to check, that all templates use the same WARC archiver path,
     * {@link dk.netarkivet.common.Constants#WARCDIRECTORY_NAME}. The archive path tells Heritrix to which directory it
     * shall write its warc files.
123     */
124    public static final String WARC_ARCHIVER_PATH_XPATH = "/crawl-order/controller/map[@name='write-processors']/"
125            + "newObject[@name='WARCArchiver']/stringList[@name='path']/string";
126
127    /**
128     * Xpath for the deduplicator index directory node in order.xml documents.
129     */
130    public static final String DEDUPLICATOR_INDEX_LOCATION_XPATH = DEDUPLICATOR_XPATH
131            + "/string[@name='index-location']";
132
133    /**
134     * Xpath for the boolean telling if the deduplicator is enabled in order.xml documents.
135     */
136    public static final String DEDUPLICATOR_ENABLED = DEDUPLICATOR_XPATH + "/boolean[@name='enabled']";
137
138    /** Xpath for the 'disk-path' in the order.xml . */
139    public static final String DISK_PATH_XPATH = "//crawl-order/controller" + "/string[@name='disk-path']";
140    /** Xpath for the arcfile 'prefix' in the order.xml . */
141    public static final String ARCHIVEFILE_PREFIX_XPATH = "//crawl-order/controller" + "/map[@name='write-processors']"
142            + "/newObject/string[@name='prefix']";
143    /** Xpath for the ARCs dir in the order.xml. */
144    public static final String ARCSDIR_XPATH = "//crawl-order/controller" + "/map[@name='write-processors']"
145            + "/newObject[@name='Archiver']/stringList[@name='path']/string";
146
    /** Xpath for the 'WARCArchiver' write-processor node in the order.xml. */
    public static final String WARCWRITERPROCESSOR_XPATH = "//crawl-order/controller"
            + "/map[@name='write-processors']" + "/newObject[@name='WARCArchiver']";

    /** Xpath for the 'Archiver' (ARC) write-processor node in the order.xml. */
    public static final String ARCWRITERPROCESSOR_XPATH = "//crawl-order/controller"
            + "/map[@name='write-processors']" + "/newObject[@name='Archiver']";

    /** Xpath for the WARCs dir in the order.xml. */
    public static final String WARCSDIR_XPATH = WARCWRITERPROCESSOR_XPATH + "/stringList[@name='path']/string";

    /** Xpath for the 'seedsfile' in the order.xml. */
    public static final String SEEDS_FILE_XPATH = "//crawl-order/controller" + "/newObject[@name='scope']"
            + "/string[@name='seedsfile']";

    /** Xpath for the 'enabled' boolean of the ARC write-processor in the order.xml. */
    public static final String ARCS_ENABLED_XPATH = ARCWRITERPROCESSOR_XPATH + "/boolean[@name='enabled']";

    /** Xpath for the 'enabled' boolean of the WARC write-processor in the order.xml. */
    public static final String WARCS_ENABLED_XPATH = WARCWRITERPROCESSOR_XPATH + "/boolean[@name='enabled']";

    /** Xpath for the 'write-requests' boolean of the WARC write-processor in the order.xml. */
    public static final String WARCS_WRITE_REQUESTS_XPATH = WARCWRITERPROCESSOR_XPATH
            + "/boolean[@name='write-requests']";
    /** Xpath for the 'write-metadata' boolean of the WARC write-processor in the order.xml. */
    public static final String WARCS_WRITE_METADATA_XPATH = WARCWRITERPROCESSOR_XPATH
            + "/boolean[@name='write-metadata']";
    /** Xpath for the 'write-metadata-outlinks' boolean of the WARC write-processor in the order.xml. */
    public static final String WARCS_WRITE_METADATA_OUTLINKS_XPATH = WARCWRITERPROCESSOR_XPATH
                + "/boolean[@name='write-metadata-outlinks']";
    /** Xpath for the 'skip-identical-digests' boolean of the WARC write-processor in the order.xml. */
    public static final String WARCS_SKIP_IDENTICAL_DIGESTS_XPATH = WARCWRITERPROCESSOR_XPATH
            + "/boolean[@name='skip-identical-digests']";
    /** Xpath for the 'write-revisit-for-identical-digests' boolean of the WARC write-processor in the order.xml. */
    public static final String WARCS_WRITE_REVISIT_FOR_IDENTICAL_DIGESTS_XPATH = WARCWRITERPROCESSOR_XPATH
            + "/boolean[@name='write-revisit-for-identical-digests']";
    /** Xpath for the 'write-revisit-for-not-modified' boolean of the WARC write-processor in the order.xml. */
    public static final String WARCS_WRITE_REVISIT_FOR_NOT_MODIFIED_XPATH = WARCWRITERPROCESSOR_XPATH
            + "/boolean[@name='write-revisit-for-not-modified']";

    /** Xpath for the WARC metadata in the order.xml. */
    public static final String METADATA_ITEMS_XPATH = WARCWRITERPROCESSOR_XPATH + "/map[@name='metadata-items']";
180
181    /**
182     * Map from required xpaths to a regular expression describing legal content for the path text.
183     */
184    private static final Map<String, Pattern> requiredXpaths = new HashMap<String, Pattern>();
185
186    /**
187     * A regular expression that matches a whole number, possibly negative, and with optional whitespace around it.
188     */
189    private static final String WHOLE_NUMBER_REGEXP = "\\s*-?[0-9]+\\s*";
190    /**
191     * A regular expression that matches everything. Except newlines, unless DOTALL is given to Pattern.compile().
192     */
193    private static final String EVERYTHING_REGEXP = ".*";
194
195    // These two regexps are copied from
196    // org.archive.crawler.datamodel.CrawlOrder because they're private there.
197
198    /**
199     * A regular expression that matches Heritrix' specs for the user-agent field in order.xml. It should be used with
200     * DOTALL. An example match is "Org (ourCrawler, see +http://org.org/aPage for details) harvest".
201     */
202    private static final String USER_AGENT_REGEXP = "\\S+.*\\(.*\\+http(s)?://\\S+\\.\\S+.*\\).*";
203    /**
204     * A regular expression that matches Heritrix' specs for the from field. This should be a valid email address.
205     */
206    private static final String FROM_REGEXP = "\\S+@\\S+\\.\\S+";
207
208    /**
209     * Xpath to check, that all templates have the max-time-sec attribute.
210     */
211    public static final String MAXTIMESEC_PATH_XPATH = "/crawl-order/controller/long[@name='max-time-sec']";
212
213
214    static {
215        requiredXpaths.put(GROUP_MAX_FETCH_SUCCESS_XPATH, Pattern.compile(WHOLE_NUMBER_REGEXP));
216        requiredXpaths.put(QUEUE_TOTAL_BUDGET_XPATH, Pattern.compile(WHOLE_NUMBER_REGEXP));
217        requiredXpaths.put(GROUP_MAX_ALL_KB_XPATH, Pattern.compile(WHOLE_NUMBER_REGEXP));
218
219        // Required that we use DecidingScope
220        // requiredXpaths.put(DECIDINGSCOPE_XPATH,
221        // Pattern.compile(EVERYTHING_REGEXP));
222
223        // Required that we have a rules map used to add crawlertraps
224        requiredXpaths.put(DECIDERULES_MAP_XPATH, Pattern.compile(EVERYTHING_REGEXP, Pattern.DOTALL));
225
226        requiredXpaths.put(HERITRIX_USER_AGENT_XPATH, Pattern.compile(USER_AGENT_REGEXP, Pattern.DOTALL));
227        requiredXpaths.put(HERITRIX_FROM_XPATH, Pattern.compile(FROM_REGEXP));
228
229        // max-time-sec attribute needed, so we can't override it set
230        // a timelimit on broad crawls.
231        requiredXpaths.put(MAXTIMESEC_PATH_XPATH, Pattern.compile(WHOLE_NUMBER_REGEXP));
232    }
233
234    /**
235     * Constructor for HeritrixTemplate class.
236     *
237     * @param doc the order.xml
238     * @param verify If true, verifies if the given dom4j Document contains the elements required by our software.
239     * @throws ArgumentNotValid if doc is null, or verify is true and doc does not obey the constraints required by our
240     * software.
241     */
242    public H1HeritrixTemplate(Document doc, boolean verify) {
243        ArgumentNotValid.checkNotNull(doc, "Document doc");
244        String xpath;
245        Node node;
246        Pattern pattern;
247        Matcher matcher;
248        if (verify) {
249            for (Map.Entry<String, Pattern> required : requiredXpaths.entrySet()) {
250                xpath = required.getKey();
251                node = doc.selectSingleNode(xpath);
252                ArgumentNotValid.checkTrue(node != null, "Template error: Missing node: " + xpath 
253                                + ". The template looks like this: " + doc.asXML());
254
255                pattern = required.getValue();
256                matcher = pattern.matcher(node.getText().trim());
257
258                ArgumentNotValid.checkTrue(matcher.matches(), "Template error: Value '" + node.getText()
259                        + "' of node '" + xpath + "' does not match required regexp '" + pattern 
260                        + "'. The template looks like this: " + doc.asXML());
261            }
262            verified = true;
263            // Required that Heritrix write its ARC/WARC files to the correct dir
264            // relative to the crawldir. This dir is defined by the constant:
265            // dk.netarkivet.common.Constants.ARCDIRECTORY_NAME.
266            // dk.netarkivet.common.Constants.WARCDIRECTORY_NAME.
267            int validArchivePaths = 0;
268            node = doc.selectSingleNode(ARC_ARCHIVER_PATH_XPATH);
269            if (node != null) {
270                pattern = Pattern.compile(dk.netarkivet.common.Constants.ARCDIRECTORY_NAME);
271                matcher = pattern.matcher(node.getText().trim());
272                ArgumentNotValid.checkTrue(matcher.matches(), "Template error: Value '" + node.getText()
273                        + "' of node '" + ARC_ARCHIVER_PATH_XPATH + "' does not match required regexp '" + pattern
274                        + "'");
275                ++validArchivePaths;
276            }
277            node = doc.selectSingleNode(WARC_ARCHIVER_PATH_XPATH);
278            if (node != null) {
279                pattern = Pattern.compile(dk.netarkivet.common.Constants.WARCDIRECTORY_NAME);
280                matcher = pattern.matcher(node.getText().trim());
281                ArgumentNotValid.checkTrue(matcher.matches(), "Template error: Value '" + node.getText()
282                        + "' of node '" + WARC_ARCHIVER_PATH_XPATH + "' does not match required regexp '" + pattern
283                        + "'");
284                ++validArchivePaths;
285            }
286            ArgumentNotValid.checkTrue(validArchivePaths > 0, "Template error: "
287                    + "An ARC or WARC writer processor seems to be missing");
288        }
289        this.template = (Document) doc.clone();
290    }
291
292    /**
293     * Alternate constructor, which always verifies the given document.
294     *
295     * @param doc
296     */
297    public H1HeritrixTemplate(Document doc) {
298        this(doc, true);
299    }
300
301    public H1HeritrixTemplate(long template_id, String templateAsString) throws DocumentException {
302        ArgumentNotValid.checkNotNull(templateAsString, "String template");
303        this.template_id = template_id;
304        this.template = XmlUtils.documentFromString(templateAsString);
305        }
306
307        /**
308     * return the template.
309     *
310     * @return the template
311     */
312    public Document getTemplate() {
313        return (Document) template.clone();
314    }
315
316    /**
317     * Has Template been verified?
318     *
319     * @return true, if verified on construction, otherwise false
320     */
321    public boolean isVerified() {
322        return verified;
323    }
324
325    /**
326     * Return HeritrixTemplate as XML.
327     *
328     * @return HeritrixTemplate as XML
329     */
330    public String getXML() {
331        return template.asXML();
332    }
333
334    /**
335     * Method to add a list of crawler traps with a given element name. It is used both to add per-domain traps and
336     * global traps.
337     *
338     * @param elementName The name of the added element.
339     * @param crawlerTraps A list of crawler trap regular expressions to add to this job.
340     */
341    @SuppressWarnings("unchecked")
342    public static void editOrderXMLAddCrawlerTraps(Document orderXMLdoc, String elementName, List<String> crawlerTraps) {
343        if (crawlerTraps.size() == 0) {
344            return;
345        }
346
347        // Get the node to update
348        // If there is an acceptIfPrerequisite decideRule in the template, crawler traps should be
349        // placed before (cf. issue NAS-2205)
350        // If no such rule exists then we append the crawler traps as to the existing decideRuleds.
351
352        Node rulesMapNode = orderXMLdoc.selectSingleNode(DECIDERULES_MAP_XPATH);
353        if (rulesMapNode == null || !(rulesMapNode instanceof Element)) {
354            throw new IllegalState("Unable to update order.xml document. It does not have the right form to add"
355                    + "crawler trap deciderules.");
356        }
357
358        Element rulesMap = (Element) rulesMapNode;
359
360        // Create the root node and append it top existing rules
361        Element decideRule = rulesMap.addElement("newObject");
362
363        // If an acceptiIfPrerequisite node exists, detach and insert before it
364        Node acceptIfPrerequisiteNode = orderXMLdoc.selectSingleNode(DECIDERULES_ACCEPT_IF_PREREQUISITE_XPATH);
365        if (acceptIfPrerequisiteNode != null) {
366            List<Node> elements = rulesMap.elements();
367            int insertPosition = elements.indexOf(acceptIfPrerequisiteNode);
368            decideRule.detach();
369            elements.add(insertPosition, decideRule);
370        } else {
371            rulesMap.elements().size();
372        }
373
374        // Add all regexps in the list to a single MatchesListRegExpDecideRule
375        decideRule.addAttribute("name", elementName);
376        decideRule.addAttribute("class", Heritrix1Constants.MATCHESLISTREGEXPDECIDERULE_CLASSNAME);
377
378        Element decision = decideRule.addElement("string");
379        decision.addAttribute("name", "decision");
380        decision.addText("REJECT");
381
382        Element listlogic = decideRule.addElement("string");
383        listlogic.addAttribute("name", "list-logic");
384        listlogic.addText("OR");
385
386        Element regexpList = decideRule.addElement("stringList");
387        regexpList.addAttribute("name", "regexp-list");
388        for (String trap : crawlerTraps) {
389            regexpList.addElement("string").addText(trap);
390        }
391    }
392
393    /**
394     * Updates the order.xml to include a MatchesListRegExpDecideRule for each crawlertrap associated with for the given
395     * DomainConfiguration.
396     * <p>
397     * The added nodes have the form
398     * <p>
399     * <newObject name="domain.dk" class="org.archive.crawler.deciderules.MatchesListRegExpDecideRule"> <string
400     * name="decision">REJECT</string> <string name="list-logic">OR</string> <stringList name="regexp-list">
401     * <string>theFirstRegexp</string> <string>theSecondRegexp</string> </stringList> </newObject>
402     *
403     * @param cfg The DomainConfiguration for which to generate crawler trap deciderules
404     * @throws IllegalState If unable to update order.xml due to wrong order.xml format
405     */
406    // FIXME REMOVE IF NOT USED
407    /*
408    public static void editOrderXMLAddPerDomainCrawlerTraps(Document orderXmlDoc, DomainConfiguration cfg) {
409        // Get the regexps to exclude
410        List<String> crawlerTraps = cfg.getCrawlertraps();
411        String elementName = cfg.getDomainName();
412        H1HeritrixTemplate.editOrderXMLAddCrawlerTraps(orderXmlDoc, elementName, crawlerTraps);
413    }
414    */
415
416    private static void setIfFound(Document doc, String Xpath, String param, String value) {
417        if (doc.selectSingleNode(Xpath) != null) {
418            XmlUtils.setNode(doc, Xpath, value);
419        } else {
420            log.warn("Could not replace setting value of '" + param + "' in template. Xpath not found: " + Xpath);
421        }
422    }
423
424    /**
425     * Auxiliary method to modify the orderXMLdoc Document with respect to setting the maximum number of objects to be
426     * retrieved per domain. This method updates 'group-max-fetch-success' element of the QuotaEnforcer pre-fetch
427     * processor node (org.archive.crawler.frontier.BdbFrontier) with the value of the argument forceMaxObjectsPerDomain
428     *
429     * @param orderXMLdoc
430     * @param forceMaxObjectsPerDomain The maximum number of objects to retrieve per domain, or 0 for no limit.
431     * @throws PermissionDenied If unable to replace the frontier node of the orderXMLdoc Document
432     * @throws IOFailure If the group-max-fetch-success element is not found in the orderXml. TODO The
433     * group-max-fetch-success check should also be performed in TemplateDAO.create, TemplateDAO.update
434     */
435    public static void editOrderXML_maxObjectsPerDomain(Document orderXMLdoc, long forceMaxObjectsPerDomain,
436            boolean maxObjectsIsSetByQuotaEnforcer) {
437
438        String xpath = (maxObjectsIsSetByQuotaEnforcer ? GROUP_MAX_FETCH_SUCCESS_XPATH : QUEUE_TOTAL_BUDGET_XPATH);
439
440        Node orderXmlNode = orderXMLdoc.selectSingleNode(xpath);
441        if (orderXmlNode != null) {
442            orderXmlNode.setText(String.valueOf(forceMaxObjectsPerDomain));
443        } else {
444            throw new IOFailure("Unable to locate " + xpath + " element in order.xml: " + orderXMLdoc.asXML());
445        }
446    }
447
448    /**
449     * Activates or deactivate the quota-enforcer, depending on budget definition. Object limit can be defined either by
450     * using the queue-total-budget property or the quota enforcer. Which is chosen is set by the argument
451     * maxObjectsIsSetByQuotaEnforcer}'s value. So quota enforcer is set as follows:
452     * <ul>
453     * <li>Object limit is not set by quota enforcer, disabled only if there is no byte limit.</li>
454     * <li>Object limit is set by quota enforcer, so it should be enabled whether a byte or object limit is set.</li>
455     * </ul>
456     *
457     * @param orderXMLdoc the template to modify
458     * @param maxObjectsIsSetByQuotaEnforcer Decides whether the maxObjectsIsSetByQuotaEnforcer or not.
459     * @param forceMaxBytesPerDomain The number of max bytes per domain enforced (can be no limit)
460     * @param forceMaxObjectsPerDomain The number of max objects per domain enforced (can be no limit)
461     */
462    public static void editOrderXML_configureQuotaEnforcer(Document orderXMLdoc,
463            boolean maxObjectsIsSetByQuotaEnforcer, long forceMaxBytesPerDomain, long forceMaxObjectsPerDomain) {
464
465        boolean quotaEnabled = true;
466
467        if (!maxObjectsIsSetByQuotaEnforcer) {
468            // Object limit is not set by quota enforcer, so it should be disabled only
469            // if there is no byte limit.
470            quotaEnabled = forceMaxBytesPerDomain != Constants.HERITRIX_MAXBYTES_INFINITY;
471
472        } else {
473            // Object limit is set by quota enforcer, so it should be enabled whether
474            // a byte or object limit is set.
475            quotaEnabled = forceMaxObjectsPerDomain != Constants.HERITRIX_MAXOBJECTS_INFINITY
476                    || forceMaxBytesPerDomain != Constants.HERITRIX_MAXBYTES_INFINITY;
477        }
478
479        String xpath = QUOTA_ENFORCER_ENABLED_XPATH;
480        Node qeNode = orderXMLdoc.selectSingleNode(xpath);
481        if (qeNode != null) {
482            qeNode.setText(Boolean.toString(quotaEnabled));
483        } else {
484            throw new IOFailure("Unable to locate " + xpath + " element in order.xml: " + orderXMLdoc.asXML());
485        }
486    }
487
488    
489    
490        @Override
491        // Always return true
492        public boolean isValid() {
493                return true;
494        }
495
496        @Override
497        public void configureQuotaEnforcer(boolean maxObjectsIsSetByQuotaEnforcer,
498                        long forceMaxBytesPerDomain, long forceMaxObjectsPerDomain) {
499                Document orderXMLdoc = this.template;
500                boolean quotaEnabled = true;
501
502                if (!maxObjectsIsSetByQuotaEnforcer) {
503                        // Object limit is not set by quota enforcer, so it should be disabled only
504                        // if there is no byte limit.
505                        quotaEnabled = forceMaxBytesPerDomain != Constants.HERITRIX_MAXBYTES_INFINITY;
506
507                } else {
508                        // Object limit is set by quota enforcer, so it should be enabled whether
509                        // a byte or object limit is set.
510                        quotaEnabled = forceMaxObjectsPerDomain != Constants.HERITRIX_MAXOBJECTS_INFINITY
511                                        || forceMaxBytesPerDomain != Constants.HERITRIX_MAXBYTES_INFINITY;
512                }
513
514                String xpath = QUOTA_ENFORCER_ENABLED_XPATH;
515                Node qeNode = orderXMLdoc.selectSingleNode(xpath);
516                if (qeNode != null) {
517                        qeNode.setText(Boolean.toString(quotaEnabled));
518                } else {
519                        throw new IOFailure("Unable to locate " + xpath + " element in order.xml: " + orderXMLdoc.asXML());
520                }
521        }
522
523        /**
524    * Auxiliary method to modify the orderXMLdoc Document with respect to setting the maximum number of bytes to
525    * retrieve per domain. This method updates 'group-max-all-kb' element of the 'QuotaEnforcer' node, which again is a
526    * subelement of 'pre-fetch-processors' node. with the value of the argument forceMaxBytesPerDomain
527    *
528    * @param forceMaxBytesPerDomain The maximum number of byte to retrieve per domain, or -1 for no limit. Note that
529    * the number is divided by 1024 before being inserted into the orderXml, as Heritrix expects KB.
530    * @throws PermissionDenied If unable to replace the QuotaEnforcer node of the orderXMLdoc Document
531    * @throws IOFailure If the group-max-all-kb element cannot be found. TODO This group-max-all-kb check also be
532    * performed in TemplateDAO.create, TemplateDAO.update
533    */
534        @Override
535        public void setMaxBytesPerDomain(Long forceMaxBytesPerDomain) {
536                // get and set the group-max-all-kb Node of the orderXMLdoc:
537        String xpath = GROUP_MAX_ALL_KB_XPATH;
538        Node groupMaxSuccessKbNode = template.selectSingleNode(xpath);
539        if (groupMaxSuccessKbNode != null) {
540            if (forceMaxBytesPerDomain == 0) {
541                groupMaxSuccessKbNode.setText("0");
542            } else if (forceMaxBytesPerDomain != Constants.HERITRIX_MAXBYTES_INFINITY) {
543                // Divide by 1024 since Heritrix uses KB rather than bytes,
544                // and add 1 to avoid to low limit due to rounding.
545                groupMaxSuccessKbNode.setText(Long
546                        .toString((forceMaxBytesPerDomain / Constants.BYTES_PER_HERITRIX_BYTELIMIT_UNIT) + 1));
547            } else {
548                groupMaxSuccessKbNode.setText(String.valueOf(Constants.HERITRIX_MAXBYTES_INFINITY));
549            }
550        } else {
551            throw new IOFailure("Unable to locate QuotaEnforcer object in order.xml: " + template.asXML());
552        }       
553        }
554
555        @Override
556        public Long getMaxBytesPerDomain() {
557                // FIXME IMPLEMENT ME
558                return null;
559        }
560
561        @Override
562        public void setMaxObjectsPerDomain(Long maxobjectsL) {
563                // FIXME IMPLEMENT ME
564                
565        }
566
567        @Override
568        public Long getMaxObjectsPerDomain() {
569                // FIXME IMPLEMENT ME OR DELETE
570                return null;
571        }
572
573        /**
574     * Return true if the templatefile has deduplication enabled.
575     * @return True if Deduplicator is enabled.
576     */
577        @Override
578        public boolean IsDeduplicationEnabled() {
579        Node xpathNode = template.selectSingleNode(DEDUPLICATOR_ENABLED);
580        return xpathNode != null && xpathNode.getText().trim().equals("true");
581        }
582
583        @Override
584        public void setArchiveFormat(String archiveFormat) {
585                Document orderXML = this.template;
586        boolean arcMode = false;
587        boolean warcMode = false;
588
589        //System.out.println("Document: " + template.asXML()); 
590        
591        if ("arc".equalsIgnoreCase(archiveFormat)) {
592            arcMode = true;
593            log.debug("ARC format selected to be used by Heritrix");
594        } else if ("warc".equalsIgnoreCase(archiveFormat)) {
595            warcMode = true;
596            log.debug("WARC format selected to be used by Heritrix");
597        } else {
598            throw new ArgumentNotValid("Configuration of '" + HarvesterSettings.HERITRIX_ARCHIVE_FORMAT
599                    + "' is invalid! Unrecognized format '" + archiveFormat + "'.");
600        }
601
602        if (arcMode) {
603            // enable ARC writing in Heritrix and disable WARC writing if needed.
604            if (orderXML.selectSingleNode(ARCSDIR_XPATH) != null
605                    && orderXML.selectSingleNode(ARCS_ENABLED_XPATH) != null) {
606                XmlUtils.setNode(orderXML, ARCSDIR_XPATH,
607                        dk.netarkivet.common.Constants.ARCDIRECTORY_NAME);
608                XmlUtils.setNode(orderXML, ARCS_ENABLED_XPATH, "true");
609                if (orderXML.selectSingleNode(WARCS_ENABLED_XPATH) != null) {
610                    XmlUtils.setNode(orderXML, WARCS_ENABLED_XPATH, "false");
611                }
612            } else {
613                throw new IllegalState("Unable to choose ARC as Heritrix archive format because "
614                        + " one of the following xpaths are invalid in the given order.xml: "
615                        + ARCSDIR_XPATH + "," + ARCS_ENABLED_XPATH);
616            }
617        } else if (warcMode) { // WARCmode
618            // enable ARC writing in Heritrix and disable WARC writing if needed.
619            if (orderXML.selectSingleNode(WARCSDIR_XPATH) != null
620                    && orderXML.selectSingleNode(WARCS_ENABLED_XPATH) != null) {
621                XmlUtils.setNode(orderXML, WARCSDIR_XPATH,
622                        dk.netarkivet.common.Constants.WARCDIRECTORY_NAME);
623                XmlUtils.setNode(orderXML, WARCS_ENABLED_XPATH, "true");
624                if (orderXML.selectSingleNode(ARCS_ENABLED_XPATH) != null) {
625                    XmlUtils.setNode(orderXML, ARCS_ENABLED_XPATH, "false");
626                }
627
628                String warcParametersOverrideStr = null;
629                try {
630                        warcParametersOverrideStr = Settings.get(HarvesterSettings.HERITRIX_WARC_PARAMETERS_OVERRIDE);
631                } catch (UnknownID e) {
632                        //nothing
633                }
634                //if the parameter is not found or if it exists and equals to true
635                if (warcParametersOverrideStr == null || (warcParametersOverrideStr != null
636                                && "true".equals(warcParametersOverrideStr))) {
637
638                        // Update the WARCWriterProcessorSettings with settings values
639                        setIfFound(orderXML, WARCS_SKIP_IDENTICAL_DIGESTS_XPATH,
640                                HarvesterSettings.HERITRIX_WARC_SKIP_IDENTICAL_DIGESTS,
641                                Settings.get(HarvesterSettings.HERITRIX_WARC_SKIP_IDENTICAL_DIGESTS));
642        
643                        setIfFound(orderXML, WARCS_WRITE_METADATA_XPATH,
644                                HarvesterSettings.HERITRIX_WARC_WRITE_METADATA,
645                                Settings.get(HarvesterSettings.HERITRIX_WARC_WRITE_METADATA));
646                        setIfFound(orderXML, WARCS_WRITE_METADATA_OUTLINKS_XPATH,
647                                HarvesterSettings.HERITRIX_WARC_WRITE_METADATA_OUTLINKS,
648                                Settings.get(HarvesterSettings.HERITRIX_WARC_WRITE_METADATA_OUTLINKS));
649                        setIfFound(orderXML, WARCS_WRITE_REQUESTS_XPATH,
650                                HarvesterSettings.HERITRIX_WARC_WRITE_REQUESTS,
651                                Settings.get(HarvesterSettings.HERITRIX_WARC_WRITE_REQUESTS));
652        
653                        setIfFound(orderXML, WARCS_WRITE_REVISIT_FOR_IDENTICAL_DIGESTS_XPATH,
654                                HarvesterSettings.HERITRIX_WARC_WRITE_REVISIT_FOR_IDENTICAL_DIGESTS,
655                                Settings.get(HarvesterSettings.HERITRIX_WARC_WRITE_REVISIT_FOR_IDENTICAL_DIGESTS));
656                        setIfFound(orderXML, WARCS_WRITE_REVISIT_FOR_NOT_MODIFIED_XPATH,
657                                HarvesterSettings.HERITRIX_WARC_WRITE_REVISIT_FOR_NOT_MODIFIED,
658                                Settings.get(HarvesterSettings.HERITRIX_WARC_WRITE_REVISIT_FOR_NOT_MODIFIED));
659                }
660            } else {
661                throw new IllegalState("Unable to choose WARC as Heritrix archive format because "
662                        + " one of the following xpaths are invalid in the given order.xml: "
663                        + WARCSDIR_XPATH + "," + WARCS_ENABLED_XPATH
664                        + ". order.xml: " + orderXML.asXML());
665            }
666
667        } else {
668            throw new IllegalState("Unknown state: "
669                    + "Should have selected either ARC or WARC as heritrix archive format");
670        }               
671        }
672
673        @Override
674        public void setMaxJobRunningTime(Long maxJobRunningTimeSecondsL) {
675        // get and set the "max-time-sec" node of the orderXMLdoc
676        String xpath = MAXTIMESEC_PATH_XPATH;
677        Node groupMaxTimeSecNode = template.selectSingleNode(xpath);
678        if (groupMaxTimeSecNode != null) {
679            String currentMaxTimeSec = groupMaxTimeSecNode.getText();
680            groupMaxTimeSecNode.setText(Long.toString(maxJobRunningTimeSecondsL));
681            log.trace("Value of groupMaxTimeSecNode changed from " + currentMaxTimeSec + " to " + maxJobRunningTimeSecondsL);
682        } else {
683            throw new IOFailure("Unable to locate xpath '" + xpath + "' in the order.xml: " + template.asXML());
684        }
685        }
686        
687        
688        @Override
689        public void writeTemplate(OutputStream os) throws IOException, ArgumentNotValid{
690        XMLWriter writer;
691                try {
692                        writer = new XMLWriter(os);
693                        writer.write(this.template);
694                } catch (UnsupportedEncodingException e) {
695                        String errMsg = "The encoding of this template is unsupported by this environment";
696                        log.error(errMsg, e);
697                        throw new ArgumentNotValid(errMsg, e);
698                } 
699        }
700
701        /**
702         * Only available for H1 templates.     
703         * @return the template as a String.
704         */
705        public String getText()  {
706                return this.template.getText();
707        }
708
709        @Override
710        public void insertCrawlerTraps(String elementName, List<String> crawlerTraps) {
711                if (crawlerTraps.size() == 0) {
712            return;
713        }
714        
715                //System.out.println("Calling insertCrawlerTraps(String elementName, List<String> crawlerTraps) ");
716        // Get the node to update
717        // If there is an acceptIfPrerequisite decideRule in the template, crawler traps should be
718        // placed before (cf. issue NAS-2205)
719        // If no such rule exists then we append the crawler traps as to the existing decideRuleds.
720
721        Node rulesMapNode = template.selectSingleNode(DECIDERULES_MAP_XPATH);
722        if (rulesMapNode == null || !(rulesMapNode instanceof Element)) {
723            throw new IllegalState("Unable to update order.xml document. It does not have the right form to add"
724                    + "crawler trap deciderules.");
725        }
726
727        Element rulesMap = (Element) rulesMapNode;
728
729        // Create the root node and append it top existing rules
730        Element decideRule = rulesMap.addElement("newObject");
731
732        // If an acceptiIfPrerequisite node exists, detach and insert before it
733        Node acceptIfPrerequisiteNode = template
734                .selectSingleNode(DECIDERULES_ACCEPT_IF_PREREQUISITE_XPATH);
735        if (acceptIfPrerequisiteNode != null) {
736            List<Node> elements = rulesMap.elements();
737            int insertPosition = elements.indexOf(acceptIfPrerequisiteNode);
738            decideRule.detach();
739            elements.add(insertPosition, decideRule);
740        } else {
741            rulesMap.elements().size();
742        }
743
744        // Add all regexps in the list to a single MatchesListRegExpDecideRule
745        decideRule.addAttribute("name", elementName);
746        decideRule.addAttribute("class", Heritrix1Constants.MATCHESLISTREGEXPDECIDERULE_CLASSNAME);
747
748        Element decision = decideRule.addElement("string");
749        decision.addAttribute("name", "decision");
750        decision.addText("REJECT");
751
752        Element listlogic = decideRule.addElement("string");
753        listlogic.addAttribute("name", "list-logic");
754        listlogic.addText("OR");
755
756        Element regexpList = decideRule.addElement("stringList");
757        regexpList.addAttribute("name", "regexp-list");
758        for (String trap : crawlerTraps) {
759            regexpList.addElement("string").addText(trap);
760        }
761                
762        }
763        
764        @Override
765        public boolean hasContent() {
766                return this.template.hasContent();
767        }
768
769        @Override
770        public void writeToFile(File orderXmlFile) {
771                XmlUtils.writeXmlToFile(this.template, orderXmlFile);
772        }
773
774        @Override
775        public void setRecoverlogNode(File recoverlogGzFile) {
776        final String RECOVERLOG_PATH_XPATH = "/crawl-order/controller/string[@name='recover-path']";
777        Node orderXmlNode = template.selectSingleNode(RECOVERLOG_PATH_XPATH);
778        if (orderXmlNode != null) {
779            orderXmlNode.setText(recoverlogGzFile.getAbsolutePath());
780            log.debug("The Heritrix recover path now refers to '{}'.", recoverlogGzFile.getAbsolutePath());
781        } else {
782            throw new IOFailure("Unable to locate the '" + RECOVERLOG_PATH_XPATH + "' element in order.xml: "
783                    + template.asXML());
784        }
785        }
786
787        @Override
788        public void setDeduplicationIndexLocation(String absolutePath) {
789                XmlUtils.setNode(template, DEDUPLICATOR_INDEX_LOCATION_XPATH, absolutePath);            
790        }
791
792        @Override
793        public void setSeedsFilePath(String absolutePath) {
794                XmlUtils.setNode(template, SEEDS_FILE_XPATH, absolutePath);
795        }
796
797        @Override
798        public void setArchiveFilePrefix(String archiveFilePrefix) {
799                XmlUtils.setNodes(template, ARCHIVEFILE_PREFIX_XPATH, archiveFilePrefix);
800        }
801
802        @Override
803        public void setDiskPath(String absolutePath) {
804                XmlUtils.setNode(template, DISK_PATH_XPATH, absolutePath);
805        }
806
807        @Override
808        public void removeDeduplicatorIfPresent() {
809                Node xpathNode = template.selectSingleNode(DEDUPLICATOR_XPATH);
810            if (xpathNode != null) {
811                xpathNode.detach();
812            }
813        }
814
815        @Override
816        public void insertWarcInfoMetadata(Job ajob, String origHarvestdefinitionName, 
817                        String scheduleName, String performer) {
818                
819                Node WARCWRITERNODE = template.selectSingleNode(WARCWRITERPROCESSOR_XPATH);
820                if (WARCWRITERNODE == null) {
821                        throw new IOFailure("Unable to locate the '" + WARCWRITERPROCESSOR_XPATH + "' element in order.xml: "
822                    + template.asXML());
823        } 
824                
825                Element warcwriterElement = (Element) WARCWRITERNODE;
826                Element metadataMap = warcwriterElement.addElement("map");
827        metadataMap.addAttribute("name", "metadata-items");
828        
829        Element metadataItem = null;
830        
831        metadataItem = metadataMap.addElement("string");
832        metadataItem.addAttribute("name", HARVESTINFO_VERSION);
833        metadataItem.addText(HARVESTINFO_VERSION_NUMBER);
834        
835        metadataItem = metadataMap.addElement("string");
836        metadataItem.addAttribute("name", HARVESTINFO_JOBID);
837        metadataItem.addText("" + ajob.getJobID());
838        
839        metadataItem = metadataMap.addElement("string");
840        metadataItem.addAttribute("name", HARVESTINFO_CHANNEL);
841        metadataItem.addText(ajob.getChannel());
842        
843        metadataItem = metadataMap.addElement("string");
844        metadataItem.addAttribute("name", HARVESTINFO_HARVESTNUM);
845        metadataItem.addText("" + ajob.getHarvestNum());
846        
847        metadataItem = metadataMap.addElement("string");
848        metadataItem.addAttribute("name", HARVESTINFO_ORIGHARVESTDEFINITIONID);
849        metadataItem.addText("" + ajob.getOrigHarvestDefinitionID());
850        
851        metadataItem = metadataMap.addElement("string");
852        metadataItem.addAttribute("name", HARVESTINFO_MAXBYTESPERDOMAIN);
853        metadataItem.addText("" + ajob.getMaxBytesPerDomain());
854        
855        metadataItem = metadataMap.addElement("string");
856        metadataItem.addAttribute("name", HARVESTINFO_MAXOBJECTSPERDOMAIN);
857        metadataItem.addText("" + ajob.getMaxObjectsPerDomain());
858        
859        metadataItem = metadataMap.addElement("string");
860        metadataItem.addAttribute("name", HARVESTINFO_ORDERXMLNAME);
861        metadataItem.addText(ajob.getOrderXMLName());
862        
863        metadataItem = metadataMap.addElement("string");
864        metadataItem.addAttribute("name", HARVESTINFO_ORIGHARVESTDEFINITIONNAME);
865        metadataItem.addText(origHarvestdefinitionName);
866        
867        /* optional schedule-name, only for selective harvests. */
868                if (scheduleName != null) {
869                        metadataItem = metadataMap.addElement("string");
870                metadataItem.addAttribute("name", HARVESTINFO_SCHEDULENAME);
871                metadataItem.addText(scheduleName);
872                }
873
874                metadataItem = metadataMap.addElement("string");
875        metadataItem.addAttribute("name", HARVESTINFO_HARVESTFILENAMEPREFIX);
876        metadataItem.addText(ajob.getHarvestFilenamePrefix());
877        
878        metadataItem = metadataMap.addElement("string");
879        metadataItem.addAttribute("name", HARVESTINFO_JOBSUBMITDATE);
880        metadataItem.addText("" + ajob.getSubmittedDate());
881        
882                /* optional HARVESTINFO_PERFORMER */
883                if (performer != null) {
884                        metadataItem = metadataMap.addElement("string");
885                metadataItem.addAttribute("name", HARVESTINFO_PERFORMER);
886                metadataItem.addText(performer);
887                }
888
889                /* optional HARVESTINFO_AUDIENCE */
890                if (ajob.getHarvestAudience() != null) {
891                        metadataItem = metadataMap.addElement("string");
892                metadataItem.addAttribute("name", HARVESTINFO_AUDIENCE);
893                metadataItem.addText(ajob.getHarvestAudience());
894                } 
895        }
896
897        @Override
898        public void insertAttributes(List<AttributeAndType> attributesAndTypes) {
899                // Unsupported for Heritrix 1 templates at this point.
900            log.warn("No attribute insertion is done for H1 templates");
901        }
902
903        @Override
904        public void writeTemplate(JspWriter out) throws IOFailure {
905                try {
906                        out.write(template.asXML());
907                } catch (IOException e) {
908                        throw new IOFailure("Unable to write to JspWriter", e);
909                }
910                
911        }
912
913}