001package dk.netarkivet.harvester.datamodel;
002
003import java.io.BufferedReader;
004import java.io.File;
005import java.io.FileNotFoundException;
006import java.io.FileReader;
007import java.io.IOException;
008import java.io.OutputStream;
009import java.io.Reader;
010import java.io.Serializable;
011import java.util.List;
012
013import javax.servlet.jsp.JspWriter;
014
015import org.dom4j.DocumentException;
016import org.slf4j.Logger;
017import org.slf4j.LoggerFactory;
018
019import dk.netarkivet.common.exceptions.ArgumentNotValid;
020import dk.netarkivet.common.exceptions.IOFailure;
021import dk.netarkivet.common.exceptions.IllegalState;
022
023/**
024 * Abstract class for manipulating Heritrix Templates.
025 *
026 */
027public abstract class HeritrixTemplate implements Serializable {
028
029        private static final Logger log = LoggerFactory.getLogger(HeritrixTemplate.class);
030
031        private static final CharSequence H1_SIGNATURE = "<crawl-order xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance"; 
032        //private static final CharSequence H3_SIGNATURE = "xmlns=\"http://www.springframework.org/";
033        private static final CharSequence H3_SIGNATURE = "http://www.springframework.org/";
034
035        // Constants for the metadata added to the warcinfo record when using WARC
036
037        protected static final String HARVESTINFO_VERSION_NUMBER = "0.5";
038        protected static final String HARVESTINFO_VERSION = "harvestInfo.version";
039        protected static final String HARVESTINFO_JOBID = "harvestInfo.jobId";
040        protected static final String HARVESTINFO_CHANNEL = "harvestInfo.channel";      
041        protected static final String HARVESTINFO_HARVESTNUM = "harvestInfo.harvestNum";
042        protected static final String HARVESTINFO_ORIGHARVESTDEFINITIONID = "harvestInfo.origHarvestDefinitionID";
043        protected static final String HARVESTINFO_MAXBYTESPERDOMAIN = "harvestInfo.maxBytesPerDomain";
044        protected static final String HARVESTINFO_MAXOBJECTSPERDOMAIN = "harvestInfo.maxObjectsPerDomain";
045        protected static final String HARVESTINFO_ORDERXMLNAME = "harvestInfo.orderXMLName";
046        protected static final String HARVESTINFO_ORIGHARVESTDEFINITIONNAME = "harvestInfo.origHarvestDefinitionName";
047        protected static final String HARVESTINFO_SCHEDULENAME = "harvestInfo.scheduleName";
048        protected static final String HARVESTINFO_HARVESTFILENAMEPREFIX = "harvestInfo.harvestFilenamePrefix";
049        protected static final String HARVESTINFO_JOBSUBMITDATE = "harvestInfo.jobSubmitDate";
050        protected static final String HARVESTINFO_PERFORMER = "harvestInfo.performer";
051        protected static final String HARVESTINFO_AUDIENCE = "harvestInfo.audience";
052
053
054        /** insertion-methods 
055         * 
056         * Two methods for adding domain quotas to the quotaEnforcer bean.
057         * maxBytesPerDomain()
058         * maxObjectsPerDomain()
059         * 
060         * One or two methods for inserting crawlertraps
061         * insertGlobalCrawlerTraps
062         * insertDomainSpecificCrawlerTraps 
063         */     
064
065        /**
066         * Activates or deactivate the quota-enforcer, depending on budget definition. Object limit can be defined either by
067         * using the queue-total-budget property or the quota enforcer. Which is chosen is set by the argument
068         * maxObjectsIsSetByQuotaEnforcer}'s value. So quota enforcer is set as follows:
069         * <ul>
070         * <li>Object limit is not set by quota enforcer, disabled only if there is no byte limit.</li>
071         * <li>Object limit is set by quota enforcer, so it should be enabled whether a byte or object limit is set.</li>
072         * </ul>
073         *
074         * @param maxObjectsIsSetByQuotaEnforcer Decides whether the maxObjectsIsSetByQuotaEnforcer or not.
075         * @param forceMaxBytesPerDomain The number of max bytes per domain enforced (can be no limit)
076         * @param forceMaxObjectsPerDomain The number of max objects per domain enforced (can be no limit)
077         */
078        public abstract void configureQuotaEnforcer(
079                        boolean maxObjectsIsSetByQuotaEnforcer, long forceMaxBytesPerDomain, long forceMaxObjectsPerDomain);
080
081
082        // Getter/Setter for MaxBytesPerDomain value
083        public abstract void setMaxBytesPerDomain(Long maxbytesL);
084        public abstract Long getMaxBytesPerDomain(); // TODO Is necessary? 
085
086        // Getter/Setter for MaxObjectsPerDomain value
087        public abstract void setMaxObjectsPerDomain(Long maxobjectsL);
088        public abstract Long getMaxObjectsPerDomain(); // TODO Is necessary? 
089
090        /**
091         * 
092         * @return true, if deduplication is enabled in the template (used for determine whether or not to request a deduplication index from the indexserver)
093         */
094        public abstract boolean IsDeduplicationEnabled();
095
096        /**
097         * @return true, if the template is valid, otherwise false
098         */
099        public abstract boolean isValid();
100
101        /**
102         * @return the XML behind this template
103         */
104        public abstract String getXML();
105
106        /**
107         * Method to add a list of crawler traps with a given element name. It is used both to add per-domain traps and
108         * global traps.
109         *
110         * @param elementName The name of the added element.
111         * @param crawlerTraps A list of crawler trap regular expressions to add to this job.
112         */
113
114        public abstract void insertCrawlerTraps(String elementName, List<String> crawlertraps);
115
116        /**
117         * Make sure that Heritrix will archive its data in the chosen archiveFormat.
118         *
119         * @param archiveFormat the chosen archiveformat ('arc' or 'warc' supported) Throws ArgumentNotValid If the chosen
120         * archiveFormat is not supported.
121         */
122        public abstract void setArchiveFormat(String archiveFormat);
123
124
125        /**
126         * Set the maxRunning time for the harvest
127         * @param maxJobRunningTimeSecondsL Limit the harvest to this number of seconds 
128         */
129        public abstract void setMaxJobRunningTime(Long maxJobRunningTimeSecondsL);
130
131        /**
132         * Updates the order.xml to include a MatchesListRegExpDecideRule for each crawler-trap associated with for the given
133         * DomainConfiguration.
134         * <p>
135         * The added nodes have the form
136         * <p>
137         * <newObject name="domain.dk" class="org.archive.crawler.deciderules.MatchesListRegExpDecideRule"> <string
138         * name="decision">REJECT</string> <string name="list-logic">OR</string> <stringList name="regexp-list">
139         * <string>theFirstRegexp</string> <string>theSecondRegexp</string> </stringList> </newObject>
140         *
141         * @param cfg The DomainConfiguration for which to generate crawler trap deciderules
142         * @throws IllegalState If unable to update order.xml due to wrong order.xml format
143         */
144        public void editOrderXMLAddPerDomainCrawlerTraps(DomainConfiguration cfg) {
145                List<String> crawlerTraps = cfg.getCrawlertraps();
146                String elementName = cfg.getDomainName();
147                if (!crawlerTraps.isEmpty()) {
148                        log.info("Inserting {} crawlertraps for domain '{}' into the template", crawlerTraps.size(), elementName);
149                        insertCrawlerTraps(elementName, crawlerTraps);
150                }
151        }
152
153
154        public abstract void setDeduplicationIndexLocation(String absolutePath);
155        public abstract void setSeedsFilePath(String absolutePath);
156
157        public abstract void setArchiveFilePrefix(String archiveFilePrefix);
158        public abstract void setDiskPath(String absolutePath);
159
160
161        public abstract void writeTemplate(OutputStream os) throws IOException, ArgumentNotValid;
162        public abstract void writeTemplate(JspWriter out);
163        public abstract boolean hasContent();
164
165        public abstract void writeToFile(File orderXmlFile);
166        public abstract void setRecoverlogNode(File recoverlogGzFile);
167
168        public static HeritrixTemplate getTemplateFromString(String templateAsString){
169                if (templateAsString.contains(H1_SIGNATURE)) {
170                        try {
171                                return new H1HeritrixTemplate(templateAsString);
172                        } catch (DocumentException e) {
173                                throw new IOFailure("Unable to recognize as a valid dom4j Document the following string: " 
174                                                + templateAsString, e);
175                        }
176                } else if (templateAsString.contains(H3_SIGNATURE)) {
177                        return new H3HeritrixTemplate(templateAsString);
178                } else {
179                        throw new ArgumentNotValid("The given template is neither H1 or H3: " + templateAsString);
180                }
181        }
182
183        /** 
184         * Read the given template from file.
185         * @param orderXmlFile a given HeritrixTemplate (H1 or H3) as a File
186         * @return the given HeritrixTemplate (H1 or H3) as a HeritrixTemplate object
187         */
188        public static HeritrixTemplate read(File orderXmlFile){
189                try {
190                        return read(new FileReader(orderXmlFile));
191                } catch (FileNotFoundException e) {
192                        throw new IOFailure("The file '" + orderXmlFile.getAbsolutePath() + "' was not found", e);
193                }
194        }
195
196        /**
197         * Read the template using the given Reader
198         * @param reader A given Reader
199         * @return a HeritrixTemplate object
200         */
201        public static HeritrixTemplate read(Reader orderTemplateReader) {
202                StringBuilder sb = new StringBuilder();
203                BufferedReader in = new BufferedReader(orderTemplateReader);
204                String line;
205                try {
206                        while ((line = in.readLine()) != null) {
207                                sb.append(line);
208                                sb.append('\n');
209                        }
210                } catch (IOException e) {
211                        throw new IOFailure("IOException thrown", e);
212                }
213                return getTemplateFromString((sb.toString()));
214        }
215
216
217        /**
218         * Try to remove the deduplicator, if present in the template.
219         */
220        public abstract void removeDeduplicatorIfPresent();
221
222        /**
223         * Method to add settings to the WARCWriterProcesser, so that it can generate a proper WARCINFO record. 
224         * @param ajob a HarvestJob
225         * @param origHarvestdefinitionName The name of the harvestdefinition behind this job
226         * @param scheduleName The name of the schedule used. (Will be null, if the job is not a selectiveHarvest).
227         * @param performer The name of organisation/person doing this harvest 
228         */
229        public abstract void insertWarcInfoMetadata(Job ajob,
230                        String origHarvestdefinitionName, String scheduleName,
231                        String performer);
232
233}