001/*
002 * #%L
003 * Netarchivesuite - harvester
004 * %%
005 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
006 *             the National Library of France and the Austrian National Library.
007 * %%
008 * This program is free software: you can redistribute it and/or modify
009 * it under the terms of the GNU Lesser General Public License as
010 * published by the Free Software Foundation, either version 2.1 of the
011 * License, or (at your option) any later version.
012 * 
013 * This program is distributed in the hope that it will be useful,
014 * but WITHOUT ANY WARRANTY; without even the implied warranty of
015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
016 * GNU General Lesser Public License for more details.
017 * 
018 * You should have received a copy of the GNU General Lesser Public
019 * License along with this program.  If not, see
020 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
021 * #L%
022 */
023package dk.netarkivet.harvester.datamodel;
024
025import gnu.inet.encoding.IDNA;
026import gnu.inet.encoding.IDNAException;
027
028import java.io.BufferedReader;
029import java.io.File;
030import java.io.IOException;
031import java.io.Serializable;
032import java.io.StringReader;
033import java.net.MalformedURLException;
034import java.net.URL;
035import java.util.ArrayList;
036import java.util.Collections;
037import java.util.Date;
038import java.util.HashMap;
039import java.util.HashSet;
040import java.util.Iterator;
041import java.util.List;
042import java.util.Map;
043import java.util.Set;
044import java.util.TreeSet;
045import java.util.regex.Pattern;
046
047import org.apache.commons.io.IOUtils;
048import org.slf4j.Logger;
049import org.slf4j.LoggerFactory;
050
051import dk.netarkivet.common.exceptions.ArgumentNotValid;
052import dk.netarkivet.common.exceptions.IOFailure;
053import dk.netarkivet.common.exceptions.IllegalState;
054import dk.netarkivet.common.utils.DomainUtils;
055import dk.netarkivet.common.utils.Settings;
056import dk.netarkivet.common.utils.StringUtils;
057import dk.netarkivet.harvester.HarvesterSettings;
058import dk.netarkivet.harvester.datamodel.eav.EAV.AttributeAndType;
059import dk.netarkivet.harvester.harvesting.ArchiveFileNaming;
060import dk.netarkivet.harvester.harvesting.ArchiveFileNamingFactory;
061import dk.netarkivet.harvester.harvesting.JobInfo;
062
063/**
 * This class represents one job to be run by Heritrix. It is based on a number of configurations that all use the same
 * order.xml, with at most one configuration for each domain. Each job consists of configurations of approximately the
 * same size; that is, the expected sizes of the smallest and the largest configuration are within a factor
 * limMaxRelSize of each other (although differences smaller than limMinAbsSize are ignored). There is
 * a limit, limMaxTotalSize, on the total size of the job in objects.
069 * <p>
070 * A job may also be limited on bytes or objects, defined either by the configurations in the job or the harvest
071 * definition the job is generated by.
072 * <p>
073 * The job contains the order file, the seedlist and the current status of the job, as well as the ID of the harvest
074 * definition that defined it and names of all the configurations it is based on.
075 */
076@SuppressWarnings({"serial"})
077public class Job implements Serializable, JobInfo {
078    private transient static final Logger log = LoggerFactory.getLogger(Job.class);
079
080    // Persistent fields stored in and read from DAO
081    /** The persistent ID of this job. */
082    private Long jobID;
083    /** The Id of the harvestdefinition, that generated this job. */
084    protected Long origHarvestDefinitionID;
085    /** The status of the job. See the JobStatus class for the possible states. */
086    protected JobStatus status;
087    /** The name of the {@link HarvestChannel} on which this job will be posted. */
088    private String channel;
089
090    /** Whether the job belongs to a snapshot or partial harvest. */
091    private boolean isSnapshot;
092    /**
093     * Overrides the individual configurations maximum setting for objects retrieved from a domain when set to a
094     * positive value.
095     */
096    private long forceMaxObjectsPerDomain = Constants.HERITRIX_MAXOBJECTS_INFINITY;
097    /**
098     * Overrides the individual configurations maximum setting for bytes retrieved from a domain when set to other than
099     * -1.
100     */
101    private long forceMaxBytesPerDomain = Constants.HERITRIX_MAXBYTES_INFINITY;
102    /** The name of the harvest template used by the job. */
103    private String orderXMLname;
104    /** The harvest template used by the job. */
105    private HeritrixTemplate orderXMLdoc;
106    /** The list of Heritrix settings files. */
107    private File[] settingsXMLfiles;
108    
109    /** The corresponding Dom4j Documents for these files. */
110    //private Document[] settingsXMLdocs;
111   
112    /**
113     * A set of seeds involved in this job. Outside the SetSeedList() method, the set of seeds is updated in the
114     * addConfiguration() method.
115     */
116    private Set<String> seedListSet = new HashSet<String>();
117    /** Which run of the harvest definition this is. */
118    private int harvestNum;
119    /** Errors during harvesting. */
120    private String harvestErrors;
121    /** Details about errors during harvesting. */
122    private String harvestErrorDetails;
123    /** Errors during upload of the harvested data. */
124    private String uploadErrors;
125    /** Details about errors during upload of the harvested data. */
126    private String uploadErrorDetails;
127    /** The starting point of the job. */
128    private Date actualStart;
129    /** The ending point of the job. */
130    private Date actualStop;
131    /** The time when this job was submitted. */
132    private Date submittedDate;
133    /** The time when this job was created. */
134    private Date creationDate;
135
136    /** Edition is used by the DAO to keep track of changes. */
137    private long edition = -1;
138
139    /** Resubmitted as the Job with this ID. If null, this job has not been resubmitted. */
140    private Long resubmittedAsJobWithID;
141
142    /** Continuation of this job. */
143    private Long continuationOF;
144
145    /**
146     * A map (domainName, domainConfigurationName), must be accessible in order to update job information (see Ass.
147     * 2.4.3)
148     */
149    private Map<String, String> domainConfigurationMap;
150    /**
151     * A hint to the DAO that configurations have changed. Since configurations are large, the DAO can use that this is
152     * false to avoid updating the config list. The DAO can set it to false after saving configurations.
153     */
154    boolean configsChanged = false;
155
156    // Intermediate fields, non-persistent and only used while building objects
157
158    /**
159     * Whether the maxObjects field was defined by the harvest definition or the configuration limit. This is deciding
160     * for whether we accept smaller configurations or not when building jobs. True means the limit is defined by the
161     * configuration, false means that it is defined by the harvest definition.
162     */
163    private boolean configurationSetsObjectLimit;
164
165    /**
166     * Whether the maxBytes field was defined by the harvest definition or the configuration limit. This is deciding for
167     * whether we accept smaller configurations or not when building jobs. True means the limit is defined by the
168     * configuration, false means by the harvest definition.
169     */
170    private boolean configurationSetsByteLimit;
171
172    /** The lowest number of objects expected by a configuration. */
173    private long minCountObjects;
174
175    /** The highest number of objects expected by a configuration. */
176    private long maxCountObjects;
177
178    /** The total number of objects expected by all added configurations. */
179    private long totalCountObjects;
180
181    /**
182     * The max time in seconds given to the harvester for this job. 0 is unlimited.
183     */
184    private long forceMaxRunningTime;
185
186    /**
187     * If true, this job object is still undergoing changes due to having more configurations added. When set to false,
188     * the object is no longer considered immutable except for updating status.
189     * <p>
190     * Jobs loaded from the DAO are never under construction anymore.
191     */
192    private boolean underConstruction = true;
193
194    // Constants
195
196    // Note: The following constants are intentionally left non-static for easy
197    // unit testing
198
199    private boolean maxObjectsIsSetByQuotaEnforcer = Settings
200            .getBoolean(HarvesterSettings.OBJECT_LIMIT_SET_BY_QUOTA_ENFORCER);
201
202    /**
203     * The harvestname prefix used in the files generated by Heritrix. Is set using an ArchiveFileNaming class when the
204     * jobID is available.
205     */
206    private String harvestnamePrefix;
207
208    /** This variable is right now the same as harvestdefinitions.audience field. */
209    private String harvestAudience;
210
211    protected Job() {
212        this.status = JobStatus.NEW;
213    }
214
215    /**
216     * Package private constructor for common initialisation.
217     *
218     * @param harvestID the id of the harvestdefinition
219     * @param cfg the configuration to base the Job on
220     * @param orderXMLdoc
221     * @param channel the channel on which the job will be submitted.
222     * @param forceMaxObjectsPerDomain the maximum number of objects harvested from a domain, overrides individual
223     * configuration settings. -1 means no limit
224     * @param forceMaxBytesPerDomain The maximum number of objects harvested from a domain, or -1 for no limit.
225     * @param forceMaxJobRunningTime The max time in seconds given to the harvester for this job
226     * @param harvestNum the run number of the harvest definition
227     * @throws ArgumentNotValid if cfg or priority is null or harvestID is invalid, or if any limit < -1
228     */
229    public Job(Long harvestID, DomainConfiguration cfg, HeritrixTemplate orderXMLdoc, HarvestChannel channel,
230            long forceMaxObjectsPerDomain,
231            long forceMaxBytesPerDomain, long forceMaxJobRunningTime, int harvestNum) throws ArgumentNotValid {
232        ArgumentNotValid.checkNotNull(cfg, "cfg");
233        ArgumentNotValid.checkNotNull(harvestID, "harvestID");
234        ArgumentNotValid.checkNotNegative(harvestID, "harvestID");
235        ArgumentNotValid.checkNotNull(channel, "channel");
236
237        if (forceMaxObjectsPerDomain < -1) {
238            String msg = "forceMaxObjectsPerDomain must be either -1 or positive";
239            log.debug(msg);
240            throw new ArgumentNotValid(msg);
241        }
242        if (forceMaxBytesPerDomain < -1) {
243            String msg = "forceMaxBytesPerDomain must be either -1 or positive";
244            log.debug(msg);
245            throw new ArgumentNotValid(msg);
246        }
247
248        if (forceMaxBytesPerDomain == 0L) {
249            log.warn("forceMaxBytesPerDomain should probably not be 0.Means 0 bytes downloaded per domain");
250        }
251
252        if (forceMaxObjectsPerDomain == 0L) {
253            log.warn("forceMaxObjectsPerDomain should probably not be 0.Means 0 objects downloaded per domain");
254        }
255
256        // setup initial members
257        domainConfigurationMap = new HashMap<>();
258        origHarvestDefinitionID = harvestID;
259        orderXMLname = cfg.getOrderXmlName();
260        this.orderXMLdoc = orderXMLdoc;
261
262        setHarvestChannel(channel);
263
264        long maxObjects = NumberUtils.minInf(forceMaxObjectsPerDomain, cfg.getMaxObjects());
265        setMaxObjectsPerDomain(maxObjects);
266        configurationSetsObjectLimit = (maxObjects != forceMaxObjectsPerDomain);
267
268        long maxBytes = NumberUtils.minInf(forceMaxBytesPerDomain, cfg.getMaxBytes());
269        setMaxBytesPerDomain(maxBytes);
270        configurationSetsByteLimit = (maxBytes != forceMaxBytesPerDomain);
271
272        long expectation = cfg.getExpectedNumberOfObjects(forceMaxObjectsPerDomain, forceMaxBytesPerDomain);
273        maxCountObjects = expectation;
274        minCountObjects = expectation;
275        this.harvestNum = harvestNum;
276
277        addConfiguration(cfg);
278
279        setMaxJobRunningTime(forceMaxJobRunningTime);
280
281        setArchiveFormatInTemplate(Settings.get(HarvesterSettings.HERITRIX_ARCHIVE_FORMAT));
282
283        setAttributes(cfg.getAttributesAndTypes());
284
285        status = JobStatus.NEW;
286    }
287
288        public void setAttributes(List<AttributeAndType> attributesAndTypes) {
289        orderXMLdoc.insertAttributes(attributesAndTypes);
290        }
291
292    /**
293     * Update the order template according to the chosen archive format (arc/warc).
294     */
295    private void setArchiveFormatInTemplate(String archiveFormat) {
296        if (!underConstruction) {
297            final String msg = "Cannot modify job " + this + " as it is no longer under construction";
298            log.debug(msg);
299            throw new IllegalState(msg);
300        }
301        orderXMLdoc.setArchiveFormat(archiveFormat);
302    }
303
304    /**
305     * Create a new Job object from basic information stored in the DAO.
306     *
307     * @param harvestID the id of the harvestdefinition
308     * @param configurations the configurations to base the Job on
309     * @param channel the name of the channel on which the job will be submitted.
310     * @param snapshot whether the job belongs to a snapshot harvest
311     * @param forceMaxObjectsPerDomain the maximum number of objects harvested from a domain, overrides individual
312     * configuration settings. 0 means no limit.
313     * @param forceMaxBytesPerDomain The maximum number of objects harvested from a domain, or -1 for no limit.
314     * @param forceMaxJobRunningTime The max time in seconds given to the harvester for this job
315     * @param status the current status of the job.
316     * @param orderXMLname the name of the order template used.
317     * @param orderXMLdoc the (possibly modified) template
318     * @param seedlist the combined seedlist from all configs.
319     * @param harvestNum the run number of the harvest definition
320     */
321    Job(Long harvestID, Map<String, String> configurations, String channel, boolean snapshot,
322            long forceMaxObjectsPerDomain, long forceMaxBytesPerDomain, long forceMaxJobRunningTime, JobStatus status,
323            String orderXMLname, HeritrixTemplate orderXMLdoc, String seedlist, int harvestNum, Long continuationOf) {
324        origHarvestDefinitionID = harvestID;
325        domainConfigurationMap = configurations;
326        this.channel = channel;
327        this.isSnapshot = snapshot;
328        this.forceMaxBytesPerDomain = forceMaxBytesPerDomain;
329        this.forceMaxObjectsPerDomain = forceMaxObjectsPerDomain;
330        this.forceMaxRunningTime = forceMaxJobRunningTime;
331        this.status = status;
332        this.orderXMLname = orderXMLname;
333        this.orderXMLdoc = orderXMLdoc;
334        this.setSeedList(seedlist);
335        this.harvestNum = harvestNum;
336        this.continuationOF = continuationOf;
337        
338        underConstruction = false;
339    }
340
341
342        /**
343     * Adds a configuration to this Job. Seedlists and settings are updated accordingly.
344     *
345     * @param cfg the configuration to add
346     * @throws ArgumentNotValid if cfg is null or cfg uses a different orderxml than this job or if this job already
347     * contains a configuration associated with domain of configuration cfg.
348     */
349    public void addConfiguration(DomainConfiguration cfg) {
350        ArgumentNotValid.checkNotNull(cfg, "cfg");
351        if (domainConfigurationMap.containsKey(cfg.getDomainName())) {
352            throw new ArgumentNotValid("Job already has a configuration for Domain " + cfg.getDomainName());
353        }
354
355        if (log.isTraceEnabled()) {
356            log.trace("Adding configuration '{}' to job '{}'", cfg, cfg.getName());
357        }
358
359        if (!underConstruction) {
360            final String msg = "Cannot modify job " + this + " as it is no longer under construction";
361            log.debug(msg);
362            throw new IllegalState(msg);
363        }
364
365        if (!cfg.getOrderXmlName().equals(getOrderXMLName())) {
366            throw new ArgumentNotValid("Job requires the orderxml file:'" + getOrderXMLName() + "' not:'"
367                    + cfg.getOrderXmlName() + "' used by the configuration:'" + cfg.getName());
368        }
369
370        domainConfigurationMap.put(cfg.getDomainName(), cfg.getName());
371
372        // Add the seeds from the configuration to the Job seeds.
373        // Take care of duplicates.
374        for (Iterator<SeedList> itt = cfg.getSeedLists(); itt.hasNext();) {
375            SeedList seed = itt.next();
376            List<String> seeds = seed.getSeeds();
377            for (String seedUrl : seeds) {
378                seedListSet.add(seedUrl); // duplicates is silently ignored
379
380                // TODO remove when heritrix implements this functionality
381                // try to convert a seed into a Internationalized Domain Name
382                try {
383                    String seedASCII = seedUrl;
384                    // It is rare to see these seeds, but they need to be
385                    // correctly idnaized
386                    if (seedUrl.contains(":") || seedUrl.contains("/")) {
387                        String normalizedUrl = seedUrl;
388                        if (!normalizedUrl.matches("^[a-zA-Z]+:.*")) {
389                            // If no protocol is given, assume http
390                            normalizedUrl = "http://" + normalizedUrl;
391                        }
392                        URL url = new URL(normalizedUrl);
393                        String domainName = url.getHost();
394                        String domainNameASCII = IDNA.toASCII(domainName);
395                        if (!domainName.equals(domainNameASCII)) {
396                            // If the domain name changed, replace that in the
397                            // seed.
398                            seedASCII = seedUrl.replaceFirst(Pattern.quote(domainName), domainNameASCII);
399                        }
400                    } else {
401                        seedASCII = IDNA.toASCII(seedUrl);
402                    }
403                    if (!seedASCII.equals(seedUrl)) {
404                        log.trace("Converted {} to {}", seedUrl, seedASCII);
405                        // Note that duplicates is silently ignored
406                        seedListSet.add(seedASCII);
407                    }
408                } catch (IDNAException e) {
409                    log.trace("Cannot convert seed {} to ASCII", seedUrl, e);
410                } catch (MalformedURLException e) {
411                    log.trace("Cannot convert seed {} to ASCII", seedUrl, e);
412                }
413            }
414        }
415
416        orderXMLdoc.editOrderXMLAddPerDomainCrawlerTraps(cfg);
417
418        // TODO update limits in settings files - see also bug 269
419
420        // Update estimates of job size
421        long expectation = cfg.getExpectedNumberOfObjects(forceMaxObjectsPerDomain, forceMaxBytesPerDomain);
422        maxCountObjects = Math.max(expectation, maxCountObjects);
423        minCountObjects = Math.min(expectation, minCountObjects);
424        totalCountObjects += expectation;
425
426        configsChanged = true;
427
428        assert (maxCountObjects >= minCountObjects) : "basic invariant";
429    }
430
431    /**
432     * Get the name of the order XML file used by this Job.
433     *
434     * @return the name of the orderXML file
435     */
436    public String getOrderXMLName() {
437        return orderXMLname;
438    }
439
440    /**
441     * Get the actual time when this job was stopped/completed.
442     *
443     * @return the time as Date
444     */
445    public Date getActualStop() {
446        return actualStop;
447    }
448
449    /**
450     * Get the actual time when this job was started.
451     *
452     * @return the time as Date
453     */
454    public Date getActualStart() {
455        return actualStart;
456    }
457
458    /**
459     * Get the time when this job was submitted.
460     *
461     * @return the time as Date
462     */
463    public Date getSubmittedDate() {
464        return submittedDate;
465    }
466
467    /**
468     * Get the time when this job was created.
469     *
470     * @return the creation time as a <code>Date</code>
471     */
472    public Date getCreationDate() {
473        return creationDate;
474    }
475
476    /**
477     * Get a list of Heritrix settings.xml files. Note that these files have nothing to do with NetarchiveSuite settings
478     * files. They are files that supplement the Heritrix order.xml files, and contain overrides for specific domains.
479     *
480     * @return the list of Files as an array
481     */
482    public File[] getSettingsXMLfiles() {
483        return settingsXMLfiles;
484    }
485
486    /**
487     * Get the id of the HarvestDefinition from which this job originates.
488     *
489     * @return the id as a Long
490     */
491    public Long getOrigHarvestDefinitionID() {
492        return origHarvestDefinitionID;
493    }
494
495    /**
496     * Get the id of this Job.
497     *
498     * @return the id as a Long
499     */
500    public Long getJobID() {
501        return jobID;
502    }
503
504    /**
505     * Set the id of this Job.
506     *
507     * @param id The Id for this job.
508     */
509    public void setJobID(Long id) {
510        jobID = id;
511    }
512
513    /**
514     * Get's the total number of different domains harvested by this job.
515     *
516     * @return the number of configurations added to this domain
517     */
518    public int getCountDomains() {
519        return domainConfigurationMap.size();
520    }
521
522    /**
523     * Set the actual time when this job was started.
524     * <p>
525     * Sends a notification, if actualStart is set to a time after actualStop.
526     *
527     * @param actualStart A Date object representing the time when this job was started.
528     */
529    public void setActualStart(Date actualStart) {
530        ArgumentNotValid.checkNotNull(actualStart, "actualStart");
531        if (actualStop != null && actualStop.before(actualStart)) {
532            log.warn("Job(" + getJobID()+ "): Start time (" + actualStart + ") is after end time: " + actualStop);
533        }
534        this.actualStart = (Date) actualStart.clone();
535    }
536
537    /**
538     * Set the actual time when this job was stopped/completed. Sends a notification, if actualStop is set to a time
539     * before actualStart.
540     *
541     * @param actualStop A Date object representing the time when this job was stopped.
542     * @throws ArgumentNotValid
543     */
544    public void setActualStop(Date actualStop) throws ArgumentNotValid {
545        ArgumentNotValid.checkNotNull(actualStop, "actualStop");
546        if (actualStart == null) {
547            log.warn("Job(" + getJobID()+ "): actualStart should be defined before setting actualStop");
548        } else if (actualStop.before(actualStart)) {
549            log.warn("Job(" + getJobID()+ "): actualStop (" + actualStop + ") is before actualStart: " + actualStart);
550        }
551        this.actualStop = (Date) actualStop.clone();
552    }
553
554    /**
555     * Set the orderxml for this job.
556     *
557     * @param doc A orderxml to be used by this job
558     */
559    public void setOrderXMLDoc(HeritrixTemplate doc) {
560        ArgumentNotValid.checkNotNull(doc, "doc");
561        this.orderXMLdoc = doc;
562    }
563
564    /**
565     * Gets a document representation of the order.xml associated with this Job.
566     *
567     * @return the XML as a org.dom4j.Document
568     */
569    public HeritrixTemplate getOrderXMLdoc() {
570        return orderXMLdoc;
571    }
572
573//    /**
574//     * Gets a list of document representations of the settings.xml's associated with this Job.
575//     *
576//     * @return the XML as an array of org.dom4j.Document
577//     */
578//    public Document[] getSettingsXMLdocs() {
579//        return settingsXMLdocs;
580//    }
581
582    /**
583     * Set the seedlist of the job from the seedList argument. Individual seeds are separated by a '\n' character.
584     * Duplicate seeds are removed.
585     *
586     * @param seedList List of seeds as one String
587     */
588    public void setSeedList(String seedList) {
589        ArgumentNotValid.checkNotNullOrEmpty(seedList, "seedList");
590        seedListSet = new HashSet<>();
591        BufferedReader reader = new BufferedReader(new StringReader(seedList));
592        String seed;
593        try {
594            while ((seed = reader.readLine()) != null) {
595                seedListSet.add(seed); // add to seedlist if not already there
596            }
597        } catch (IOException e) {
598            // This never happens, as we're reading from a string!
599            throw new IOFailure("IOException reading from seed string", e);
600        } finally {
601            IOUtils.closeQuietly(reader);
602        }
603    }
604
605    /**
606     * Get the seedlist as a String. The individual seeds are separated by the character '\n'. The order of the seeds
607     * are unknown.
608     *
609     * @return the seedlist as a String
610     */
611    public String getSeedListAsString() {
612        return StringUtils.conjoin("\n", seedListSet);
613    }
614
615    /**
616     * Get the current status of this Job.
617     *
618     * @return the status as an int in the range 0 to 4.
619     */
620    public JobStatus getStatus() {
621        return status;
622    }
623
624    /**
625     * Sets status of this job.
626     *
627     * @param newStatus Must be one of the values STATUS_NEW, ..., STATUS_FAILED
628     * @throws ArgumentNotValid in case of invalid status argument or invalid status change
629     */
630    public void setStatus(JobStatus newStatus) {
631        ArgumentNotValid.checkNotNull(newStatus, "newStatus");
632        if (!status.legalChange(newStatus)) {
633            final String message = "Status change from " + status + " to " + newStatus + " is not allowed";
634            log.debug(message);
635            throw new ArgumentNotValid(message);
636        }
637
638        if ((this.status == JobStatus.NEW || this.status == JobStatus.RESUBMITTED) && newStatus == JobStatus.SUBMITTED) {
639            orderXMLdoc.configureQuotaEnforcer(maxObjectsIsSetByQuotaEnforcer, forceMaxBytesPerDomain, forceMaxObjectsPerDomain);
640        }
641            
642
643        if (this.status == JobStatus.SUBMITTED && newStatus == JobStatus.STARTED) {
644            setActualStart(new Date());
645        }
646        if (this.status == JobStatus.STARTED && (newStatus == JobStatus.DONE || newStatus == JobStatus.FAILED)) {
647            setActualStop(new Date());
648        }
649        status = newStatus;
650    }
651
652    /**
653     * Returns a map of domain names and name of their corresponding configuration.
654     * <p>
655     * The returned Map cannot be changed.
656     *
657     * @return a read-only Map (<String>, <String>)
658     */
659    public Map<String, String> getDomainConfigurationMap() {
660        return Collections.unmodifiableMap(domainConfigurationMap);
661    }
662
663    /**
664     * Gets the maximum number of objects harvested per domain.
665     *
666     * @return The maximum number of objects harvested per domain. 0 means no limit.
667     */
668    public long getMaxObjectsPerDomain() {
669        return forceMaxObjectsPerDomain;
670    }
671
672    /**
673     * Gets the maximum number of bytes harvested per domain.
674     *
675     * @return The maximum number of bytes harvested per domain. -1 means no limit.
676     */
677    public long getMaxBytesPerDomain() {
678        return forceMaxBytesPerDomain;
679    }
680
681    /**
682     * Get the edition number.
683     *
684     * @return The edition number
685     */
686    long getEdition() {
687        return edition;
688    }
689
690    /**
691     * Set the edition number.
692     *
693     * @param edition the new edition number
694     */
695    void setEdition(long edition) {
696        this.edition = edition;
697    }
698
699    public void setHarvestChannel(HarvestChannel harvestChannel) {
700        this.channel = harvestChannel.getName();
701        this.isSnapshot = harvestChannel.isSnapshot();
702    }
703
704    /**
705     * @return the associated {@link HarvestChannel} name.
706     */
707    public String getChannel() {
708        return channel;
709    }
710
711    /**
712     * Sets the associated {@link HarvestChannel} name.
713     *
714     * @param channel the channel name
715     */
716    public void setChannel(String channel) {
717        this.channel = channel;
718    }
719
720    /**
721     * @return true if the job belongs to a snapshot harvest, false if it belongs to a focused harvest.
722     */
723    public boolean isSnapshot() {
724        return isSnapshot;
725    }
726
727    /**
728     * Sets whether job belongs to a snapshot or focused harvest.
729     *
730     * @param isSnapshot true if the job belongs to a snapshot harvest, false if it belongs to a focused harvest.
731     */
732    public void setSnapshot(boolean isSnapshot) {
733        this.isSnapshot = isSnapshot;
734    }
735
736    @Override
737    public String toString() {
738        return "Job " + getJobID() + " (state = " + getStatus() + ", HD = " + getOrigHarvestDefinitionID()
739                + ", channel = " + getChannel() + ", snapshot = " + isSnapshot() + ", forcemaxcount = "
740                + getForceMaxObjectsPerDomain() + ", forcemaxbytes = " + getMaxBytesPerDomain()
741                + ", forcemaxrunningtime = " + forceMaxRunningTime + ", orderxml = " + getOrderXMLName()
742                + ", numconfigs = " + getDomainConfigurationMap().size() + ", created = " + getCreationDate()
743                + (getSubmittedDate() != null ? ", submitted = " + getSubmittedDate() : "")
744                + (getActualStart() != null ? ", started = " + getActualStart() : "")
745                + (getActualStop() != null ? ", stopped = " + getActualStop() : "") + ")";
746    }
747
748    /**
749     * @return Returns the forceMaxObjectsPerDomain. 0 means no limit.
750     */
751    public long getForceMaxObjectsPerDomain() {
752        return forceMaxObjectsPerDomain;
753    }
754
755    /**
756     * Sets the maxObjectsPerDomain value.
757     *
758     * @param maxObjectsPerDomain The forceMaxObjectsPerDomain to set. 0 means no limit.
759     * @throws IOFailure Thrown from auxiliary method editOrderXML_maxObjectsPerDomain.
760     */
761    protected void setMaxObjectsPerDomain(long maxObjectsPerDomain) {
762        if (!underConstruction) {
763            final String msg = "Cannot modify job " + this + " as it is no longer under construction";
764            log.debug(msg);
765            throw new IllegalState(msg);
766        }
767        
768        this.forceMaxObjectsPerDomain = maxObjectsPerDomain;
769        orderXMLdoc.setMaxObjectsPerDomain(maxObjectsPerDomain); // FIXME? add argument to maxObjectsIsSetByQuotaEnforcer to method setMaxObjectsPerDomain  
770        //orderXMLdoc.editOrderXML_maxObjectsPerDomain(orderXMLdoc, maxObjectsPerDomain,
771        //        maxObjectsIsSetByQuotaEnforcer);
772
773        if (0L == maxObjectsPerDomain && 0L != forceMaxBytesPerDomain) {
774            setMaxBytesPerDomain(0L);
775        }
776    }
777
778    /**
779     * Set the maxbytes per domain value.
780     *
781     * @param maxBytesPerDomain The maxBytesPerDomain to set, or -1 for no limit.
782     */
783    protected void setMaxBytesPerDomain(long maxBytesPerDomain) {
784        if (!underConstruction) {
785            final String msg = "Cannot modify job " + this + " as it is no longer under construction";
786            log.debug(msg);
787            throw new IllegalState(msg);
788        }
789        this.forceMaxBytesPerDomain = maxBytesPerDomain;
790        orderXMLdoc.setMaxBytesPerDomain(maxBytesPerDomain);
791
792        if (0L == maxBytesPerDomain && 0L != forceMaxObjectsPerDomain) {
793            setMaxObjectsPerDomain(0L);
794        }
795    }
796
797    /**
798     * Set the maxJobRunningTime value.
799     *
800     * @param maxJobRunningTime The maxJobRunningTime in seconds to set, or 0 for no limit.
801     */
802    protected void setMaxJobRunningTime(long maxJobRunningTime) {
803        if (!underConstruction) {
804            final String msg = "Cannot modify job " + this + " as it is no longer under construction";
805            log.debug(msg);
806            throw new IllegalState(msg);
807        }
808        this.forceMaxRunningTime = maxJobRunningTime;
809        orderXMLdoc.setMaxJobRunningTime(maxJobRunningTime);
810    }
811
812    /**
813     * @return Returns the MaxJobRunningTime. 0 means no limit.
814     */
815    public long getMaxJobRunningTime() {
816        return forceMaxRunningTime;
817    }
818
819    /**
820     * Get the harvestNum for this job. The number reflects which run of the harvest definition this is.
821     *
822     * @return the harvestNum for this job.
823     */
824    public int getHarvestNum() {
825        return harvestNum;
826    }
827
828    /**
829     * Set the harvestNum for this job. The number reflects which run of the harvest definition this is. ONLY TO BE USED
830     * IN THE CONSTRUCTION PHASE.
831     *
832     * @param harvestNum a given harvestNum
833     */
834    public void setHarvestNum(int harvestNum) {
835        if (!underConstruction) {
836            final String msg = "Cannot modify job " + this + " as it is no longer under construction";
837            log.debug(msg);
838            throw new IllegalState(msg);
839        }
840        this.harvestNum = harvestNum;
841    }
842
843    /**
844     * Get the list of harvest errors for this job. If no harvest errors, null is returned This value is not meaningful
845     * until the job is finished (FAILED,DONE, RESUBMITTED)
846     *
847     * @return the harvest errors for this job or null if no harvest errors.
848     */
849    public String getHarvestErrors() {
850        return harvestErrors;
851    }
852
853    /**
854     * Append to the list of harvest errors for this job. Nothing happens, if argument harvestErrors is null.
855     *
856     * @param harvestErrors a string containing harvest errors (may be null)
857     */
858    public void appendHarvestErrors(String harvestErrors) {
859        if (harvestErrors != null) {
860            if (this.harvestErrors == null) {
861                this.harvestErrors = harvestErrors;
862            } else {
863                this.harvestErrors += "\n" + harvestErrors;
864            }
865        }
866    }
867
868    /**
869     * Get the list of harvest error details for this job. If no harvest error details, null is returned This value is
870     * not meaningful until the job is finished (FAILED,DONE, RESUBMITTED)
871     *
872     * @return the list of harvest error details for this job or null if no harvest error details.
873     */
874
875    public String getHarvestErrorDetails() {
876        return harvestErrorDetails;
877    }
878
879    /**
880     * Append to the list of harvest error details for this job. Nothing happens, if argument harvestErrorDetails is
881     * null.
882     *
883     * @param harvestErrorDetails a string containing harvest error details.
884     */
885    public void appendHarvestErrorDetails(String harvestErrorDetails) {
886        if (harvestErrorDetails != null) {
887            if (this.harvestErrorDetails == null) {
888                this.harvestErrorDetails = harvestErrorDetails;
889            } else {
890                this.harvestErrorDetails += "\n" + harvestErrorDetails;
891            }
892        }
893    }
894
895    /**
896     * Get the list of upload errors. If no upload errors, null is returned. This value is not meaningful until the job
897     * is finished (FAILED,DONE, RESUBMITTED)
898     *
899     * @return the list of upload errors as String, or null if no upload errors.
900     */
901    public String getUploadErrors() {
902        return uploadErrors;
903    }
904
905    /**
906     * Append to the list of upload errors. Nothing happens, if argument uploadErrors is null.
907     *
908     * @param uploadErrors a string containing upload errors.
909     */
910    public void appendUploadErrors(String uploadErrors) {
911        if (uploadErrors != null) {
912            if (this.uploadErrors == null) {
913                this.uploadErrors = uploadErrors;
914            } else {
915                this.uploadErrors += "\n" + uploadErrors;
916            }
917        }
918    }
919
920    /**
921     * Get the list of upload error details. If no upload error details, null is returned. This value is not meaningful
922     * until the job is finished (FAILED,DONE, RESUBMITTED)
923     *
924     * @return the list of upload error details as String, or null if no upload error details
925     */
926    public String getUploadErrorDetails() {
927        return uploadErrorDetails;
928    }
929
930    /**
931     * Append to the list of upload error details. Nothing happens, if argument uploadErrorDetails is null.
932     *
933     * @param uploadErrorDetails a string containing upload error details.
934     */
935    public void appendUploadErrorDetails(String uploadErrorDetails) {
936        if (uploadErrorDetails != null) {
937            if (this.uploadErrorDetails == null) {
938                this.uploadErrorDetails = uploadErrorDetails;
939            } else {
940                this.uploadErrorDetails += "\n" + uploadErrorDetails;
941            }
942        }
943    }
944
945    /**
946     * Get the ID for the job which this job was resubmitted as. If null, this job has not been resubmitted.
947     *
948     * @return this ID.
949     */
950    public Long getResubmittedAsJob() {
951        return resubmittedAsJobWithID;
952    }
953
954    /**
955     * Set the Date for when this job was submitted. If null, this job has not been submitted.
956     *
957     * @param submittedDate The date when this was submitted
958     */
959    public void setSubmittedDate(Date submittedDate) {
960        this.submittedDate = submittedDate;
961    }
962
963    /**
964     * Set the Date for when this job was created. If null, this job has not been created.
965     *
966     * @param creationDate The date when this was created
967     */
968    public void setCreationDate(Date creationDate) {
969        this.creationDate = creationDate;
970    }
971
972    /**
973     * Set the ID for the job which this job was resubmitted as.
974     *
975     * @param resubmittedAsJob An Id for a new job.
976     */
977    public void setResubmittedAsJob(Long resubmittedAsJob) {
978        this.resubmittedAsJobWithID = resubmittedAsJob;
979    }
980
981    /**
982     * @return id of the job that this job is supposed to continue using Heritrix recover-log or null if it starts from
983     * scratch.
984     */
985    public Long getContinuationOf() {
986        return this.continuationOF;
987    }
988
989    @Override
990    public String getHarvestFilenamePrefix() {
991        if (this.harvestnamePrefix == null) {
992            log.warn("HarvestnamePrefix not yet set for job {}. Set it by using the naming scheme. "
993                    + "This should only happen for old jobs being read", this.jobID);
994            setDefaultHarvestNamePrefix();
995        }
996        return this.harvestnamePrefix;
997    }
998
999    /**
1000     * @param prefix
1001     */
1002    public void setHarvestFilenamePrefix(String prefix) {
1003        this.harvestnamePrefix = prefix;
1004    }
1005
1006    /**
1007     * @return the forceMaxBytesPerDomain
1008     */
1009    public long getForceMaxBytesPerDomain() {
1010        return forceMaxBytesPerDomain;
1011    }
1012
1013    /**
1014     * @return the configurationSetsObjectLimit
1015     */
1016    public boolean isConfigurationSetsObjectLimit() {
1017        return configurationSetsObjectLimit;
1018    }
1019
1020    /**
1021     * @return the configurationSetsByteLimit
1022     */
1023    public boolean isConfigurationSetsByteLimit() {
1024        return configurationSetsByteLimit;
1025    }
1026
1027    /**
1028     * @return the minCountObjects
1029     */
1030    public long getMinCountObjects() {
1031        return minCountObjects;
1032    }
1033
1034    /**
1035     * @return the maxCountObjects
1036     */
1037    public long getMaxCountObjects() {
1038        return maxCountObjects;
1039    }
1040
1041    /**
1042     * @return the totalCountObjects
1043     */
1044    public long getTotalCountObjects() {
1045        return totalCountObjects;
1046    }
1047
1048    void setDefaultHarvestNamePrefix() {
1049        if (getJobID() != null) {
1050                ArchiveFileNaming naming = ArchiveFileNamingFactory.getInstance();
1051                log.debug("Applying the default ArchiveFileNaming class '{}'.", naming.getClass().getName());
1052                final String prefix = naming.getPrefix(this);
1053                setHarvestFilenamePrefix(prefix);
1054                log.debug("The harvestPrefix of this job is: {}", prefix);
1055        } else {
1056                log.warn("The harvestnamePrefix is not set now, as it depends on the JobID, which is not set yet");
1057        }
1058    }
1059
1060    /** @return the harvest-audience. */
1061    public String getHarvestAudience() {
1062        return harvestAudience;
1063    }
1064
1065    /**
1066     * Set the harvest audience for this job. Taken from the harvestdefinition that generated this job.
1067     *
1068     * @param theAudience the harvest-audience.
1069     */
1070    public void setHarvestAudience(String theAudience) {
1071        this.harvestAudience = theAudience;
1072    }
1073
1074    ///////////// The following two methods are needed by harvestStatus-jobdetails.jsp ////////////////////////////////////
1075    /**
1076     * Returns a list of sorted seeds for this job.
1077     * The sorting is by domain, and inside each domain,
1078     * the list is sorted by url
1079     * @return a list of sorted seeds for this job.
1080     */
1081    public List<String> getSortedSeedList() {
1082        Map<String, Set<String>> urlMap = new HashMap<String, Set<String>>();
1083        for (String seed : seedListSet) {
1084            String url;
1085            // Assume the protocol is http://, if it is missing
1086            if (!seed.matches(Constants.PROTOCOL_REGEXP)) {
1087                url = "http://" + seed;
1088            } else {
1089                url = seed;
1090            }
1091            String domain = getDomain(url);
1092            if (domain == null) {
1093                // stop processing this url, and continue to the next seed
1094                continue; 
1095            }
1096            Set<String> set;
1097            if (urlMap.containsKey(domain)) {
1098                set = urlMap.get(domain);
1099            } else {
1100                set = new TreeSet<String>();
1101                urlMap.put(domain, set);
1102            }
1103            set.add(seed);
1104
1105        }
1106       List<String> result = new ArrayList<String>();
1107       for (Set<String> set: urlMap.values()) {
1108           result.addAll(set);
1109       }
1110       return result;
1111    }
1112    /**
1113     * Get the domain, that the given URL belongs to.
1114     * @param url an URL
1115     * @return the domain, that the given URL belongs to, or 
1116     * null if unable to do so.
1117     */
1118    private String getDomain(String url) {
1119        try {
1120            URL uri = new URL(url);
1121            return DomainUtils.domainNameFromHostname(uri.getHost());
1122        } catch (MalformedURLException e) {
1123            log.warn("The string '{}' is not a valid URL", url);
1124            return null;
1125        }
1126    }
1127    
1128}