001package dk.netarkivet.harvester.datamodel;
002
003import java.io.BufferedReader;
004import java.io.File;
005import java.io.FileNotFoundException;
006import java.io.FileReader;
007import java.io.IOException;
008import java.io.OutputStream;
009import java.io.Reader;
010import java.io.Serializable;
011import java.util.List;
012
013import javax.servlet.jsp.JspWriter;
014
015import org.dom4j.DocumentException;
016import org.slf4j.Logger;
017import org.slf4j.LoggerFactory;
018
019import dk.netarkivet.common.exceptions.ArgumentNotValid;
020import dk.netarkivet.common.exceptions.IOFailure;
021import dk.netarkivet.common.exceptions.IllegalState;
022import dk.netarkivet.harvester.datamodel.eav.EAV.AttributeAndType;
023
024/**
025 * Abstract class for manipulating Heritrix Templates.
026 *
027 */
028public abstract class HeritrixTemplate implements Serializable {
029
030        private static final Logger log = LoggerFactory.getLogger(HeritrixTemplate.class);
031
032        private static final CharSequence H1_SIGNATURE = "<crawl-order xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance"; 
033        //private static final CharSequence H3_SIGNATURE = "xmlns=\"http://www.springframework.org/";
034        private static final CharSequence H3_SIGNATURE = "http://www.springframework.org/";
035
036        /**
037         * Templates for which isActive is false will be hidden in the web-gui by default.
038         */
039        private boolean isActive = true;
040
041        // Constants for the metadata added to the warcinfo record when using WARC
042
043        protected static final String HARVESTINFO_VERSION_NUMBER = "0.6";
044        protected static final String HARVESTINFO_VERSION = "harvestInfo.version";
045        protected static final String HARVESTINFO_JOBID = "harvestInfo.jobId";
046        protected static final String HARVESTINFO_CHANNEL = "harvestInfo.channel";      
047        protected static final String HARVESTINFO_HARVESTNUM = "harvestInfo.harvestNum";
048        protected static final String HARVESTINFO_ORIGHARVESTDEFINITIONID = "harvestInfo.origHarvestDefinitionID";
049        protected static final String HARVESTINFO_MAXBYTESPERDOMAIN = "harvestInfo.maxBytesPerDomain";
050        protected static final String HARVESTINFO_MAXOBJECTSPERDOMAIN = "harvestInfo.maxObjectsPerDomain";
051        protected static final String HARVESTINFO_ORDERXMLNAME = "harvestInfo.templateName";
052        protected static final String HARVESTINFO_ORDERXMLUPDATEDATE = "harvestInfo.templateLastUpdateDate";
053        protected static final String HARVESTINFO_ORDERXMLDESCRIPTION = "harvestInfo.templateDescription";
054        protected static final String HARVESTINFO_ORIGHARVESTDEFINITIONNAME = "harvestInfo.origHarvestDefinitionName";
055        protected static final String HARVESTINFO_ORIGHARVESTDEFINITIONCOMMENTS = "harvestInfo.origHarvestDefinitionComments";
056        protected static final String HARVESTINFO_SCHEDULENAME = "harvestInfo.scheduleName";
057        protected static final String HARVESTINFO_HARVESTFILENAMEPREFIX = "harvestInfo.harvestFilenamePrefix";
058        protected static final String HARVESTINFO_JOBSUBMITDATE = "harvestInfo.jobSubmitDate";
059        protected static final String HARVESTINFO_PERFORMER = "harvestInfo.performer";
060        protected static final String HARVESTINFO_OPERATOR = "harvestInfo.operator";
061        protected static final String HARVESTINFO_AUDIENCE = "harvestInfo.audience";
062
063
064        /** insertion-methods 
065         * 
066         * Two methods for adding domain quotas to the quotaEnforcer bean.
067         * maxBytesPerDomain()
068         * maxObjectsPerDomain()
069         * 
070         * One or two methods for inserting crawlertraps
071         * insertGlobalCrawlerTraps
072         * insertDomainSpecificCrawlerTraps 
073         */     
074
075        /**
076         * Activates or deactivate the quota-enforcer, depending on budget definition. Object limit can be defined either by
077         * using the queue-total-budget property or the quota enforcer. Which is chosen is set by the argument
078         * maxObjectsIsSetByQuotaEnforcer}'s value. So quota enforcer is set as follows:
079         * <ul>
080         * <li>Object limit is not set by quota enforcer, disabled only if there is no byte limit.</li>
081         * <li>Object limit is set by quota enforcer, so it should be enabled whether a byte or object limit is set.</li>
082         * </ul>
083         *
084         * @param maxObjectsIsSetByQuotaEnforcer Decides whether the maxObjectsIsSetByQuotaEnforcer or not.
085         * @param forceMaxBytesPerDomain The number of max bytes per domain enforced (can be no limit)
086         * @param forceMaxObjectsPerDomain The number of max objects per domain enforced (can be no limit)
087         */
088        public abstract void configureQuotaEnforcer(
089                        boolean maxObjectsIsSetByQuotaEnforcer, long forceMaxBytesPerDomain, long forceMaxObjectsPerDomain);
090
091        public boolean isActive() {
092                return isActive;
093        }
094
095        public void setIsActive(boolean isActive) {
096                this.isActive = isActive;
097        }
098
099        // Getter/Setter for MaxBytesPerDomain value
100        public abstract void setMaxBytesPerDomain(Long maxbytesL);
101        public abstract Long getMaxBytesPerDomain(); // TODO Is necessary? 
102
103        // Getter/Setter for MaxObjectsPerDomain value
104        public abstract void setMaxObjectsPerDomain(Long maxobjectsL);
105        public abstract Long getMaxObjectsPerDomain(); // TODO Is necessary? 
106
107        /** We need the persistent template id if we want to attach any attributes to it. */
108        public long template_id;
109
110        /**
111         * 
112         * @return true, if deduplication is enabled in the template (used for determine whether or not to request a deduplication index from the indexserver)
113         */
114        public abstract boolean IsDeduplicationEnabled();
115
116        /**
117         * @return true, if the template is valid, otherwise false
118         */
119        public abstract boolean isValid();
120
121        /**
122         * @return the XML behind this template
123         */
124        public abstract String getXML();
125
126        /**
127         * Method to add a list of crawler traps with a given element name. It is used both to add per-domain traps and
128         * global traps.
129         *
130         * @param elementName The name of the added element.
131         * @param crawlertraps A list of crawler trap regular expressions to add to this job.
132         */
133        public abstract void insertCrawlerTraps(String elementName, List<String> crawlertraps);
134
135        /**
136         * Make sure that Heritrix will archive its data in the chosen archiveFormat.
137         *
138         * @param archiveFormat the chosen archiveformat ('arc' or 'warc' supported) Throws ArgumentNotValid If the chosen
139         * archiveFormat is not supported.
140         */
141        public abstract void setArchiveFormat(String archiveFormat);
142
143
144        /**
145         * Set the maxRunning time for the harvest
146         * @param maxJobRunningTimeSecondsL Limit the harvest to this number of seconds 
147         */
148        public abstract void setMaxJobRunningTime(Long maxJobRunningTimeSecondsL);
149
150        /**
151         * Try to insert the given list of attributes into the template.
152         * @param attributesAndTypes
153         */
154        public abstract void insertAttributes(List<AttributeAndType> attributesAndTypes);
155
156        /**
157         * Updates the order.xml to include a MatchesListRegExpDecideRule for each crawler-trap associated with for the given
158         * DomainConfiguration.
159         * <p>
160         * The added nodes have the form
161         * <p>
162         * <newObject name="domain.dk" class="org.archive.crawler.deciderules.MatchesListRegExpDecideRule"> <string
163         * name="decision">REJECT</string> <string name="list-logic">OR</string> <stringList name="regexp-list">
164         * <string>theFirstRegexp</string> <string>theSecondRegexp</string> </stringList> </newObject>
165         *
166         * @param cfg The DomainConfiguration for which to generate crawler trap deciderules
167         * @throws IllegalState If unable to update order.xml due to wrong order.xml format
168         */
169        public void editOrderXMLAddPerDomainCrawlerTraps(DomainConfiguration cfg) {
170                List<String> crawlerTraps = cfg.getCrawlertraps();
171                String elementName = cfg.getDomainName();
172                int trapCount=crawlerTraps.size();
173                for (String trap: crawlerTraps){
174                    if (trap.isEmpty()) { // Ignore empty traps in the trapcount (NAS-2480)
175                        log.warn("Found empty trap for domain {}", cfg.getDomainName());
176                        trapCount--; 
177                    }
178                }
179                if (trapCount > 0) {
180                        log.info("Inserting {} crawlertraps for domain '{}' into the template", crawlerTraps.size(), elementName);
181                        insertCrawlerTraps(elementName, crawlerTraps);
182                }
183        }
184
185        public abstract void setDeduplicationIndexLocation(String absolutePath);
186        public abstract void setSeedsFilePath(String absolutePath);
187
188        public abstract void setArchiveFilePrefix(String archiveFilePrefix);
189        public abstract void setDiskPath(String absolutePath);
190
191
192        public abstract void writeTemplate(OutputStream os) throws IOException, ArgumentNotValid;
193        public abstract void writeTemplate(JspWriter out);
194        public abstract boolean hasContent();
195
196        public abstract void writeToFile(File orderXmlFile);
197        public abstract void setRecoverlogNode(File recoverlogGzFile);
198
199        /**
200         * Construct a H1HeritrixTemplate or H3HeritrixTemplate based on the signature of the given string.
201         * @param template_id The id of the template
202         * @param templateAsString The template as a String object
203         * @return a HeritrixTemplate based on the signature of the given string.
204         */
205        public static HeritrixTemplate getTemplateFromString(long template_id, String templateAsString){
206                if (templateAsString.contains(H1_SIGNATURE)) {
207                        try {
208                                return new H1HeritrixTemplate(template_id, templateAsString);
209                        } catch (DocumentException e) {
210                                throw new IOFailure("Unable to recognize as a valid dom4j Document the following string: " 
211                                                + templateAsString, e);
212                        }
213                } else if (templateAsString.contains(H3_SIGNATURE)) {
214                        return new H3HeritrixTemplate(template_id, templateAsString);
215                } else {
216                        throw new ArgumentNotValid("The given template is neither H1 or H3: " + templateAsString);
217                }
218        }
219
220        /** 
221         * Read the given template from file.
222         * @param orderXmlFile a given HeritrixTemplate (H1 or H3) as a File
223         * @return the given HeritrixTemplate (H1 or H3) as a HeritrixTemplate object
224         */
225        public static HeritrixTemplate read(File orderXmlFile){
226                try {
227                        return read(-1, new FileReader(orderXmlFile));
228                } catch (FileNotFoundException e) {
229                        throw new IOFailure("The file '" + orderXmlFile.getAbsolutePath() + "' was not found", e);
230                }
231        }
232
233        /**
234         * Read the template using the given Reader.
235         * 
236         * @param template_id The id of the template
237         * @param orderTemplateReader A given Reader to read a template
238         * @return a HeritrixTemplate object
239         */
240        public static HeritrixTemplate read(long template_id, Reader orderTemplateReader) {
241                StringBuilder sb = new StringBuilder();
242                BufferedReader in = new BufferedReader(orderTemplateReader);
243                String line;
244                try {
245                        while ((line = in.readLine()) != null) {
246                                sb.append(line);
247                                sb.append('\n');
248                        }
249                } catch (IOException e) {
250                        throw new IOFailure("IOException thrown", e);
251                }
252                return getTemplateFromString(template_id, sb.toString());
253        }
254
255
256        /**
257         * Try to remove the deduplicator, if present in the template.
258         */
259        public abstract void removeDeduplicatorIfPresent();
260
261        /**
262         *
263         */
264        public abstract void enableOrDisableDeduplication(boolean enabled);
265
266        /**
267         * Method to add settings to the WARCWriterProcesser, so that it can generate a proper WARCINFO record. 
268         * @param ajob a HarvestJob
269         * @param origHarvestdefinitionName The name of the harvestdefinition behind this job
270         * @param scheduleName The name of the schedule used. (Will be null, if the job is not a selectiveHarvest).
271         * @param performer The name of organisation/person doing this harvest 
272         */
273        public abstract void insertWarcInfoMetadata(Job ajob,
274                        String origHarvestdefinitionName, String origHarvestdefinitionComments,
275                        String scheduleName, String performer);
276
277}