001/*
002 * #%L
003 * Netarchivesuite - harvester
004 * %%
005 * Copyright (C) 2005 - 2018 The Royal Danish Library, 
006 *             the National Library of France and the Austrian National Library.
007 * %%
008 * This program is free software: you can redistribute it and/or modify
009 * it under the terms of the GNU Lesser General Public License as
010 * published by the Free Software Foundation, either version 2.1 of the
011 * License, or (at your option) any later version.
012 * 
013 * This program is distributed in the hope that it will be useful,
014 * but WITHOUT ANY WARRANTY; without even the implied warranty of
015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
016 * GNU General Lesser Public License for more details.
017 * 
018 * You should have received a copy of the GNU General Lesser Public
019 * License along with this program.  If not, see
020 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
021 * #L%
022 */
023package dk.netarkivet.harvester.datamodel;
024
025import gnu.inet.encoding.IDNA;
026import gnu.inet.encoding.IDNAException;
027
028import java.io.BufferedReader;
029import java.io.File;
030import java.io.IOException;
031import java.io.Serializable;
032import java.io.StringReader;
033import java.net.MalformedURLException;
034import java.net.URL;
035import java.util.ArrayList;
036import java.util.Collections;
037import java.util.Date;
038import java.util.HashMap;
039import java.util.HashSet;
040import java.util.Iterator;
041import java.util.List;
042import java.util.Map;
043import java.util.Set;
044import java.util.TreeSet;
045import java.util.regex.Pattern;
046
047import org.apache.commons.io.IOUtils;
048import org.slf4j.Logger;
049import org.slf4j.LoggerFactory;
050
051import dk.netarkivet.common.exceptions.ArgumentNotValid;
052import dk.netarkivet.common.exceptions.IOFailure;
053import dk.netarkivet.common.exceptions.IllegalState;
054import dk.netarkivet.common.utils.DomainUtils;
055import dk.netarkivet.common.utils.Settings;
056import dk.netarkivet.common.utils.StringUtils;
057import dk.netarkivet.harvester.HarvesterSettings;
058import dk.netarkivet.harvester.datamodel.eav.EAV.AttributeAndType;
059import dk.netarkivet.harvester.harvesting.ArchiveFileNaming;
060import dk.netarkivet.harvester.harvesting.ArchiveFileNamingFactory;
061import dk.netarkivet.harvester.harvesting.JobInfo;
062
063/**
064 * This class represents one job to run by Heritrix. It's based on a number of configurations all based on the same
065 * order.xml and at most one configuration for each domain. Each job consists of configurations of the approximate same
066 * size; that is the difference in expectation from the smallest configuration to the largest configuration is within a
067 * factor of each other defined as limMaxRelSize (although differences smaller than limMinAbsSize are ignored) There is
068 * a limit limMaxTotalSize on the total size of the job in objects.
069 * <p>
070 * A job may also be limited on bytes or objects, defined either by the configurations in the job or the harvest
071 * definition the job is generated by.
072 * <p>
073 * The job contains the order file, the seedlist and the current status of the job, as well as the ID of the harvest
074 * definition that defined it and names of all the configurations it is based on.
075 */
076@SuppressWarnings({"serial"})
077public class Job implements Serializable, JobInfo {
078    private transient static final Logger log = LoggerFactory.getLogger(Job.class);
079
080    // Persistent fields stored in and read from DAO
081    /** The persistent ID of this job. */
082    private Long jobID;
083    /** The Id of the harvestdefinition, that generated this job. */
084    protected Long origHarvestDefinitionID;
085    /** The status of the job. See the JobStatus class for the possible states. */
086    protected JobStatus status;
087    /** The name of the {@link HarvestChannel} on which this job will be posted. */
088    private String channel;
089
090    /** Whether the job belongs to a snapshot or partial harvest. */
091    private boolean isSnapshot;
092    /**
093     * Overrides the individual configurations maximum setting for objects retrieved from a domain when set to a
094     * positive value.
095     */
096    private long forceMaxObjectsPerDomain = Constants.HERITRIX_MAXOBJECTS_INFINITY;
097    /**
098     * Overrides the individual configurations maximum setting for bytes retrieved from a domain when set to other than
099     * -1.
100     */
101    private long forceMaxBytesPerDomain = Constants.HERITRIX_MAXBYTES_INFINITY;
102    /** The name of the harvest template used by the job. */
103    private String orderXMLname;
104    /** The harvest template used by the job. */
105    private HeritrixTemplate orderXMLdoc;
106    /** The list of Heritrix settings files. */
107    private File[] settingsXMLfiles;
108    
109    /** The corresponding Dom4j Documents for these files. */
110    //private Document[] settingsXMLdocs;
111   
112    /**
113     * A set of seeds involved in this job. Outside the SetSeedList() method, the set of seeds is updated in the
114     * addConfiguration() method.
115     */
116    private Set<String> seedListSet = new HashSet<String>();
117    /** Which run of the harvest definition this is. */
118    private int harvestNum;
119    /** Errors during harvesting. */
120    private String harvestErrors;
121    /** Details about errors during harvesting. */
122    private String harvestErrorDetails;
123    /** Errors during upload of the harvested data. */
124    private String uploadErrors;
125    /** Details about errors during upload of the harvested data. */
126    private String uploadErrorDetails;
127    /** The starting point of the job. */
128    private Date actualStart;
129    /** The ending point of the job. */
130    private Date actualStop;
131    /** The time when this job was submitted. */
132    private Date submittedDate;
133    /** The time when this job was created. */
134    private Date creationDate;
135
136    /** Edition is used by the DAO to keep track of changes. */
137    private long edition = -1;
138
139    /** Resubmitted as the Job with this ID. If null, this job has not been resubmitted. */
140    private Long resubmittedAsJobWithID;
141
142    /** Continuation of this job. */
143    private Long continuationOF;
144
145    /**
146     * A map (domainName, domainConfigurationName), must be accessible in order to update job information (see Ass.
147     * 2.4.3)
148     */
149    private Map<String, String> domainConfigurationMap;
150    /**
151     * A hint to the DAO that configurations have changed. Since configurations are large, the DAO can use that this is
152     * false to avoid updating the config list. The DAO can set it to false after saving configurations.
153     */
154    boolean configsChanged = false;
155
156    // Intermediate fields, non-persistent and only used while building objects
157
158    /**
159     * Whether the maxObjects field was defined by the harvest definition or the configuration limit. This is deciding
160     * for whether we accept smaller configurations or not when building jobs. True means the limit is defined by the
161     * configuration, false means that it is defined by the harvest definition.
162     */
163    private boolean configurationSetsObjectLimit;
164
165    /**
166     * Whether the maxBytes field was defined by the harvest definition or the configuration limit. This is deciding for
167     * whether we accept smaller configurations or not when building jobs. True means the limit is defined by the
168     * configuration, false means by the harvest definition.
169     */
170    private boolean configurationSetsByteLimit;
171
172    /** The lowest number of objects expected by a configuration. */
173    private long minCountObjects;
174
175    /** The highest number of objects expected by a configuration. */
176    private long maxCountObjects;
177
178    /** The total number of objects expected by all added configurations. */
179    private long totalCountObjects;
180
181    /**
182     * The max time in seconds given to the harvester for this job. 0 is unlimited.
183     */
184    private long forceMaxRunningTime;
185
    /**
     * If true, this job object is still undergoing changes due to having more configurations added. When set to false,
     * the object is considered immutable except for updating status.
     * <p>
     * Jobs loaded from the DAO are never under construction anymore.
     */
192    private boolean underConstruction = true;
193
194    // Constants
195
196    // Note: The following constants are intentionally left non-static for easy
197    // unit testing
198
199    private boolean maxObjectsIsSetByQuotaEnforcer = Settings
200            .getBoolean(HarvesterSettings.OBJECT_LIMIT_SET_BY_QUOTA_ENFORCER);
201
202    /**
203     * The harvestname prefix used in the files generated by Heritrix. Is set using an ArchiveFileNaming class when the
204     * jobID is available.
205     */
206    private String harvestnamePrefix;
207
208    /** This variable is right now the same as harvestdefinitions.audience field. */
209    private String harvestAudience;
210
211    protected Job() {
212        this.status = JobStatus.NEW;
213    }
214
215    /**
216     * Package private constructor for common initialisation.
217     *
218     * @param harvestID the id of the harvestdefinition
219     * @param cfg the configuration to base the Job on
220     * @param orderXMLdoc
221     * @param channel the channel on which the job will be submitted.
222     * @param forceMaxObjectsPerDomain the maximum number of objects harvested from a domain, overrides individual
223     * configuration settings. -1 means no limit
224     * @param forceMaxBytesPerDomain The maximum number of objects harvested from a domain, or -1 for no limit.
225     * @param forceMaxJobRunningTime The max time in seconds given to the harvester for this job
226     * @param harvestNum the run number of the harvest definition
227     * @throws ArgumentNotValid if cfg or priority is null or harvestID is invalid, or if any limit < -1
228     */
229    public Job(Long harvestID, DomainConfiguration cfg, HeritrixTemplate orderXMLdoc, HarvestChannel channel,
230            long forceMaxObjectsPerDomain,
231            long forceMaxBytesPerDomain, long forceMaxJobRunningTime, int harvestNum) throws ArgumentNotValid {
232        ArgumentNotValid.checkNotNull(cfg, "cfg");
233        ArgumentNotValid.checkNotNull(harvestID, "harvestID");
234        ArgumentNotValid.checkNotNegative(harvestID, "harvestID");
235        ArgumentNotValid.checkNotNull(channel, "channel");
236
237        if (forceMaxObjectsPerDomain < -1) {
238            String msg = "forceMaxObjectsPerDomain must be either -1 or positive";
239            log.debug(msg);
240            throw new ArgumentNotValid(msg);
241        }
242        if (forceMaxBytesPerDomain < -1) {
243            String msg = "forceMaxBytesPerDomain must be either -1 or positive";
244            log.debug(msg);
245            throw new ArgumentNotValid(msg);
246        }
247
248        if (forceMaxBytesPerDomain == 0L) {
249            log.warn("forceMaxBytesPerDomain should probably not be 0.Means 0 bytes downloaded per domain");
250        }
251
252        if (forceMaxObjectsPerDomain == 0L) {
253            log.warn("forceMaxObjectsPerDomain should probably not be 0.Means 0 objects downloaded per domain");
254        }
255
256        // setup initial members
257        domainConfigurationMap = new HashMap<>();
258        origHarvestDefinitionID = harvestID;
259        orderXMLname = cfg.getOrderXmlName();
260        this.orderXMLdoc = orderXMLdoc;
261
262        setHarvestChannel(channel);
263
264        long maxObjects = NumberUtils.minInf(forceMaxObjectsPerDomain, cfg.getMaxObjects());
265        setMaxObjectsPerDomain(maxObjects);
266        configurationSetsObjectLimit = (maxObjects != forceMaxObjectsPerDomain);
267
268        long maxBytes = NumberUtils.minInf(forceMaxBytesPerDomain, cfg.getMaxBytes());
269        setMaxBytesPerDomain(maxBytes);
270        configurationSetsByteLimit = (maxBytes != forceMaxBytesPerDomain);
271
272        long expectation = cfg.getExpectedNumberOfObjects(forceMaxObjectsPerDomain, forceMaxBytesPerDomain);
273        maxCountObjects = expectation;
274        minCountObjects = expectation;
275        this.harvestNum = harvestNum;
276
277        addConfiguration(cfg);
278
279        setMaxJobRunningTime(forceMaxJobRunningTime);
280
281        setArchiveFormatInTemplate(Settings.get(HarvesterSettings.HERITRIX_ARCHIVE_FORMAT));
282
283        setAttributes(cfg.getAttributesAndTypes());
284
285        orderXMLdoc.enableOrDisableDeduplication(Settings.getBoolean(HarvesterSettings.DEDUPLICATION_ENABLED));
286
287        status = JobStatus.NEW;
288    }
289
290        public void setAttributes(List<AttributeAndType> attributesAndTypes) {
291        orderXMLdoc.insertAttributes(attributesAndTypes);
292        }
293
294    /**
295     * Update the order template according to the chosen archive format (arc/warc).
296     */
297    private void setArchiveFormatInTemplate(String archiveFormat) {
298        if (!underConstruction) {
299            final String msg = "Cannot modify job " + this + " as it is no longer under construction";
300            log.debug(msg);
301            throw new IllegalState(msg);
302        }
303        orderXMLdoc.setArchiveFormat(archiveFormat);
304    }
305
306    /**
307     * Create a new Job object from basic information stored in the DAO.
308     *
309     * @param harvestID the id of the harvestdefinition
310     * @param configurations the configurations to base the Job on
311     * @param channel the name of the channel on which the job will be submitted.
312     * @param snapshot whether the job belongs to a snapshot harvest
313     * @param forceMaxObjectsPerDomain the maximum number of objects harvested from a domain, overrides individual
314     * configuration settings. 0 means no limit.
315     * @param forceMaxBytesPerDomain The maximum number of objects harvested from a domain, or -1 for no limit.
316     * @param forceMaxJobRunningTime The max time in seconds given to the harvester for this job
317     * @param status the current status of the job.
318     * @param orderXMLname the name of the order template used.
319     * @param orderXMLdoc the (possibly modified) template
320     * @param seedlist the combined seedlist from all configs.
321     * @param harvestNum the run number of the harvest definition
322     */
323    Job(Long harvestID, Map<String, String> configurations, String channel, boolean snapshot,
324            long forceMaxObjectsPerDomain, long forceMaxBytesPerDomain, long forceMaxJobRunningTime, JobStatus status,
325            String orderXMLname, HeritrixTemplate orderXMLdoc, String seedlist, int harvestNum, Long continuationOf) {
326        origHarvestDefinitionID = harvestID;
327        domainConfigurationMap = configurations;
328        this.channel = channel;
329        this.isSnapshot = snapshot;
330        this.forceMaxBytesPerDomain = forceMaxBytesPerDomain;
331        this.forceMaxObjectsPerDomain = forceMaxObjectsPerDomain;
332        this.forceMaxRunningTime = forceMaxJobRunningTime;
333        this.status = status;
334        this.orderXMLname = orderXMLname;
335        this.orderXMLdoc = orderXMLdoc;
336        this.setSeedList(seedlist);
337        this.harvestNum = harvestNum;
338        this.continuationOF = continuationOf;
339        
340        underConstruction = false;
341    }
342
343
344        /**
345     * Adds a configuration to this Job. Seedlists and settings are updated accordingly.
346     *
347     * @param cfg the configuration to add
348     * @throws ArgumentNotValid if cfg is null or cfg uses a different orderxml than this job or if this job already
349     * contains a configuration associated with domain of configuration cfg.
350     */
351    public void addConfiguration(DomainConfiguration cfg) {
352        ArgumentNotValid.checkNotNull(cfg, "cfg");
353        if (domainConfigurationMap.containsKey(cfg.getDomainName())) {
354            throw new ArgumentNotValid("Job already has a configuration for Domain " + cfg.getDomainName());
355        }
356
357        if (log.isTraceEnabled()) {
358            log.trace("Adding configuration '{}' to job '{}'", cfg, cfg.getName());
359        }
360
361        if (!underConstruction) {
362            final String msg = "Cannot modify job " + this + " as it is no longer under construction";
363            log.debug(msg);
364            throw new IllegalState(msg);
365        }
366
367        if (!cfg.getOrderXmlName().equals(getOrderXMLName())) {
368            throw new ArgumentNotValid("Job requires the orderxml file:'" + getOrderXMLName() + "' not:'"
369                    + cfg.getOrderXmlName() + "' used by the configuration:'" + cfg.getName());
370        }
371
372        domainConfigurationMap.put(cfg.getDomainName(), cfg.getName());
373
374        // Add the seeds from the configuration to the Job seeds.
375        // Take care of duplicates.
376        for (Iterator<SeedList> itt = cfg.getSeedLists(); itt.hasNext();) {
377            SeedList seed = itt.next();
378            List<String> seeds = seed.getSeeds();
379            for (String seedUrl : seeds) {
380                seedListSet.add(seedUrl); // duplicates is silently ignored
381
382                // TODO remove when heritrix implements this functionality
383                // try to convert a seed into a Internationalized Domain Name
384                try {
385                    String seedASCII = seedUrl;
386                    // It is rare to see these seeds, but they need to be
387                    // correctly idnaized
388                    if (seedUrl.contains(":") || seedUrl.contains("/")) {
389                        String normalizedUrl = seedUrl;
390                        if (!normalizedUrl.matches("^[a-zA-Z]+:.*")) {
391                            // If no protocol is given, assume http
392                            normalizedUrl = "http://" + normalizedUrl;
393                        }
394                        URL url = new URL(normalizedUrl);
395                        String domainName = url.getHost();
396                        String domainNameASCII = IDNA.toASCII(domainName);
397                        if (!domainName.equals(domainNameASCII)) {
398                            // If the domain name changed, replace that in the
399                            // seed.
400                            seedASCII = seedUrl.replaceFirst(Pattern.quote(domainName), domainNameASCII);
401                        }
402                    } else {
403                        seedASCII = IDNA.toASCII(seedUrl);
404                    }
405                    if (!seedASCII.equals(seedUrl)) {
406                        log.trace("Converted {} to {}", seedUrl, seedASCII);
407                        // Note that duplicates is silently ignored
408                        seedListSet.add(seedASCII);
409                    }
410                } catch (IDNAException e) {
411                    log.trace("Cannot convert seed {} to ASCII", seedUrl, e);
412                } catch (MalformedURLException e) {
413                    log.trace("Cannot convert seed {} to ASCII", seedUrl, e);
414                }
415            }
416        }
417
418        orderXMLdoc.editOrderXMLAddPerDomainCrawlerTraps(cfg);
419
420        // TODO update limits in settings files - see also bug 269
421
422        // Update estimates of job size
423        long expectation = cfg.getExpectedNumberOfObjects(forceMaxObjectsPerDomain, forceMaxBytesPerDomain);
424        maxCountObjects = Math.max(expectation, maxCountObjects);
425        minCountObjects = Math.min(expectation, minCountObjects);
426        totalCountObjects += expectation;
427
428        configsChanged = true;
429
430        assert (maxCountObjects >= minCountObjects) : "basic invariant";
431    }
432
433    /**
434     * Get the name of the order XML file used by this Job.
435     *
436     * @return the name of the orderXML file
437     */
438    public String getOrderXMLName() {
439        return orderXMLname;
440    }
441
442    /**
443     * Get the actual time when this job was stopped/completed.
444     *
445     * @return the time as Date
446     */
447    public Date getActualStop() {
448        return actualStop;
449    }
450
451    /**
452     * Get the actual time when this job was started.
453     *
454     * @return the time as Date
455     */
456    public Date getActualStart() {
457        return actualStart;
458    }
459
460    /**
461     * Get the time when this job was submitted.
462     *
463     * @return the time as Date
464     */
465    public Date getSubmittedDate() {
466        return submittedDate;
467    }
468
469    /**
470     * Get the time when this job was created.
471     *
472     * @return the creation time as a <code>Date</code>
473     */
474    public Date getCreationDate() {
475        return creationDate;
476    }
477
478    /**
479     * Get a list of Heritrix settings.xml files. Note that these files have nothing to do with NetarchiveSuite settings
480     * files. They are files that supplement the Heritrix order.xml files, and contain overrides for specific domains.
481     *
482     * @return the list of Files as an array
483     */
484    public File[] getSettingsXMLfiles() {
485        return settingsXMLfiles;
486    }
487
488    /**
489     * Get the id of the HarvestDefinition from which this job originates.
490     *
491     * @return the id as a Long
492     */
493    public Long getOrigHarvestDefinitionID() {
494        return origHarvestDefinitionID;
495    }
496
497    /**
498     * Get the id of this Job.
499     *
500     * @return the id as a Long
501     */
502    public Long getJobID() {
503        return jobID;
504    }
505
506    /**
507     * Set the id of this Job.
508     *
509     * @param id The Id for this job.
510     */
511    public void setJobID(Long id) {
512        jobID = id;
513    }
514
515    /**
516     * Get's the total number of different domains harvested by this job.
517     *
518     * @return the number of configurations added to this domain
519     */
520    public int getCountDomains() {
521        return domainConfigurationMap.size();
522    }
523
524    /**
525     * Set the actual time when this job was started.
526     * <p>
527     * Sends a notification, if actualStart is set to a time after actualStop.
528     *
529     * @param actualStart A Date object representing the time when this job was started.
530     */
531    public void setActualStart(Date actualStart) {
532        ArgumentNotValid.checkNotNull(actualStart, "actualStart");
533        if (actualStop != null && actualStop.before(actualStart)) {
534            log.warn("Job(" + getJobID()+ "): Start time (" + actualStart + ") is after end time: " + actualStop);
535        }
536        this.actualStart = (Date) actualStart.clone();
537    }
538
539    /**
540     * Set the actual time when this job was stopped/completed. Sends a notification, if actualStop is set to a time
541     * before actualStart.
542     *
543     * @param actualStop A Date object representing the time when this job was stopped.
544     * @throws ArgumentNotValid
545     */
546    public void setActualStop(Date actualStop) throws ArgumentNotValid {
547        ArgumentNotValid.checkNotNull(actualStop, "actualStop");
548        if (actualStart == null) {
549            log.warn("Job(" + getJobID()+ "): actualStart should be defined before setting actualStop");
550        } else if (actualStop.before(actualStart)) {
551            log.warn("Job(" + getJobID()+ "): actualStop (" + actualStop + ") is before actualStart: " + actualStart);
552        }
553        this.actualStop = (Date) actualStop.clone();
554    }
555
556    /**
557     * Set the orderxml for this job.
558     *
559     * @param doc A orderxml to be used by this job
560     */
561    public void setOrderXMLDoc(HeritrixTemplate doc) {
562        ArgumentNotValid.checkNotNull(doc, "doc");
563        this.orderXMLdoc = doc;
564    }
565
566    /**
567     * Gets a document representation of the order.xml associated with this Job.
568     *
569     * @return the XML as a org.dom4j.Document
570     */
571    public HeritrixTemplate getOrderXMLdoc() {
572        return orderXMLdoc;
573    }
574
575//    /**
576//     * Gets a list of document representations of the settings.xml's associated with this Job.
577//     *
578//     * @return the XML as an array of org.dom4j.Document
579//     */
580//    public Document[] getSettingsXMLdocs() {
581//        return settingsXMLdocs;
582//    }
583
584    /**
585     * Set the seedlist of the job from the seedList argument. Individual seeds are separated by a '\n' character.
586     * Duplicate seeds are removed.
587     *
588     * @param seedList List of seeds as one String
589     */
590    public void setSeedList(String seedList) {
591        ArgumentNotValid.checkNotNullOrEmpty(seedList, "seedList");
592        seedListSet = new HashSet<>();
593        BufferedReader reader = new BufferedReader(new StringReader(seedList));
594        String seed;
595        try {
596            while ((seed = reader.readLine()) != null) {
597                seedListSet.add(seed); // add to seedlist if not already there
598            }
599        } catch (IOException e) {
600            // This never happens, as we're reading from a string!
601            throw new IOFailure("IOException reading from seed string", e);
602        } finally {
603            IOUtils.closeQuietly(reader);
604        }
605    }
606
607    /**
608     * Get the seedlist as a String. The individual seeds are separated by the character '\n'. The order of the seeds
609     * are unknown.
610     *
611     * @return the seedlist as a String
612     */
613    public String getSeedListAsString() {
614        return StringUtils.conjoin("\n", seedListSet);
615    }
616
617    /**
618     * Get the current status of this Job.
619     *
620     * @return the status as an int in the range 0 to 4.
621     */
622    public JobStatus getStatus() {
623        return status;
624    }
625
626    /**
627     * Sets status of this job.
628     *
629     * @param newStatus Must be one of the values STATUS_NEW, ..., STATUS_FAILED
630     * @throws ArgumentNotValid in case of invalid status argument or invalid status change
631     */
632    public void setStatus(JobStatus newStatus) {
633        ArgumentNotValid.checkNotNull(newStatus, "newStatus");
634        if (!status.legalChange(newStatus)) {
635            final String message = "Status change from " + status + " to " + newStatus + " is not allowed";
636            log.debug(message);
637            throw new ArgumentNotValid(message);
638        }
639
640        if ((this.status == JobStatus.NEW || this.status == JobStatus.RESUBMITTED) && newStatus == JobStatus.SUBMITTED) {
641            orderXMLdoc.configureQuotaEnforcer(maxObjectsIsSetByQuotaEnforcer, forceMaxBytesPerDomain, forceMaxObjectsPerDomain);
642        }
643            
644
645        if (this.status == JobStatus.SUBMITTED && newStatus == JobStatus.STARTED) {
646            setActualStart(new Date());
647        }
648        if (this.status == JobStatus.STARTED && (newStatus == JobStatus.DONE || newStatus == JobStatus.FAILED)) {
649            setActualStop(new Date());
650        }
651        status = newStatus;
652    }
653
654    /**
655     * Returns a map of domain names and name of their corresponding configuration.
656     * <p>
657     * The returned Map cannot be changed.
658     *
659     * @return a read-only Map (<String>, <String>)
660     */
661    public Map<String, String> getDomainConfigurationMap() {
662        return Collections.unmodifiableMap(domainConfigurationMap);
663    }
664
665    /**
666     * Gets the maximum number of objects harvested per domain.
667     *
668     * @return The maximum number of objects harvested per domain. 0 means no limit.
669     */
670    public long getMaxObjectsPerDomain() {
671        return forceMaxObjectsPerDomain;
672    }
673
674    /**
675     * Gets the maximum number of bytes harvested per domain.
676     *
677     * @return The maximum number of bytes harvested per domain. -1 means no limit.
678     */
679    public long getMaxBytesPerDomain() {
680        return forceMaxBytesPerDomain;
681    }
682
683    /**
684     * Get the edition number.
685     *
686     * @return The edition number
687     */
688    long getEdition() {
689        return edition;
690    }
691
692    /**
693     * Set the edition number.
694     *
695     * @param edition the new edition number
696     */
697    void setEdition(long edition) {
698        this.edition = edition;
699    }
700
701    public void setHarvestChannel(HarvestChannel harvestChannel) {
702        this.channel = harvestChannel.getName();
703        this.isSnapshot = harvestChannel.isSnapshot();
704    }
705
706    /**
707     * @return the associated {@link HarvestChannel} name.
708     */
709    public String getChannel() {
710        return channel;
711    }
712
713    /**
714     * Sets the associated {@link HarvestChannel} name.
715     *
716     * @param channel the channel name
717     */
718    public void setChannel(String channel) {
719        this.channel = channel;
720    }
721
722    /**
723     * @return true if the job belongs to a snapshot harvest, false if it belongs to a focused harvest.
724     */
725    public boolean isSnapshot() {
726        return isSnapshot;
727    }
728
729    /**
730     * Sets whether job belongs to a snapshot or focused harvest.
731     *
732     * @param isSnapshot true if the job belongs to a snapshot harvest, false if it belongs to a focused harvest.
733     */
734    public void setSnapshot(boolean isSnapshot) {
735        this.isSnapshot = isSnapshot;
736    }
737
738    @Override
739    public String toString() {
740        return "Job " + getJobID() + " (state = " + getStatus() + ", HD = " + getOrigHarvestDefinitionID()
741                + ", channel = " + getChannel() + ", snapshot = " + isSnapshot() + ", forcemaxcount = "
742                + getForceMaxObjectsPerDomain() + ", forcemaxbytes = " + getMaxBytesPerDomain()
743                + ", forcemaxrunningtime = " + forceMaxRunningTime + ", orderxml = " + getOrderXMLName()
744                + ", numconfigs = " + getDomainConfigurationMap().size() + ", created = " + getCreationDate()
745                + (getSubmittedDate() != null ? ", submitted = " + getSubmittedDate() : "")
746                + (getActualStart() != null ? ", started = " + getActualStart() : "")
747                + (getActualStop() != null ? ", stopped = " + getActualStop() : "") + ")";
748    }
749
750    /**
751     * @return Returns the forceMaxObjectsPerDomain. 0 means no limit.
752     */
753    public long getForceMaxObjectsPerDomain() {
754        return forceMaxObjectsPerDomain;
755    }
756
757    /**
758     * Sets the maxObjectsPerDomain value.
759     *
760     * @param maxObjectsPerDomain The forceMaxObjectsPerDomain to set. 0 means no limit.
761     * @throws IOFailure Thrown from auxiliary method editOrderXML_maxObjectsPerDomain.
762     */
763    protected void setMaxObjectsPerDomain(long maxObjectsPerDomain) {
764        if (!underConstruction) {
765            final String msg = "Cannot modify job " + this + " as it is no longer under construction";
766            log.debug(msg);
767            throw new IllegalState(msg);
768        }
769        
770        this.forceMaxObjectsPerDomain = maxObjectsPerDomain;
771        orderXMLdoc.setMaxObjectsPerDomain(maxObjectsPerDomain); // FIXME? add argument to maxObjectsIsSetByQuotaEnforcer to method setMaxObjectsPerDomain  
772        //orderXMLdoc.editOrderXML_maxObjectsPerDomain(orderXMLdoc, maxObjectsPerDomain,
773        //        maxObjectsIsSetByQuotaEnforcer);
774
775        if (0L == maxObjectsPerDomain && 0L != forceMaxBytesPerDomain) {
776            setMaxBytesPerDomain(0L);
777        }
778    }
779
780    /**
781     * Set the maxbytes per domain value.
782     *
783     * @param maxBytesPerDomain The maxBytesPerDomain to set, or -1 for no limit.
784     */
785    protected void setMaxBytesPerDomain(long maxBytesPerDomain) {
786        if (!underConstruction) {
787            final String msg = "Cannot modify job " + this + " as it is no longer under construction";
788            log.debug(msg);
789            throw new IllegalState(msg);
790        }
791        this.forceMaxBytesPerDomain = maxBytesPerDomain;
792        orderXMLdoc.setMaxBytesPerDomain(maxBytesPerDomain);
793
794        if (0L == maxBytesPerDomain && 0L != forceMaxObjectsPerDomain) {
795            setMaxObjectsPerDomain(0L);
796        }
797    }
798
799    /**
800     * Set the maxJobRunningTime value.
801     *
802     * @param maxJobRunningTime The maxJobRunningTime in seconds to set, or 0 for no limit.
803     */
804    protected void setMaxJobRunningTime(long maxJobRunningTime) {
805        if (!underConstruction) {
806            final String msg = "Cannot modify job " + this + " as it is no longer under construction";
807            log.debug(msg);
808            throw new IllegalState(msg);
809        }
810        this.forceMaxRunningTime = maxJobRunningTime;
811        orderXMLdoc.setMaxJobRunningTime(maxJobRunningTime);
812    }
813
814    /**
815     * @return Returns the MaxJobRunningTime. 0 means no limit.
816     */
817    public long getMaxJobRunningTime() {
818        return forceMaxRunningTime;
819    }
820
821    /**
822     * Get the harvestNum for this job. The number reflects which run of the harvest definition this is.
823     *
824     * @return the harvestNum for this job.
825     */
826    public int getHarvestNum() {
827        return harvestNum;
828    }
829
830    /**
831     * Set the harvestNum for this job. The number reflects which run of the harvest definition this is. ONLY TO BE USED
832     * IN THE CONSTRUCTION PHASE.
833     *
834     * @param harvestNum a given harvestNum
835     */
836    public void setHarvestNum(int harvestNum) {
837        if (!underConstruction) {
838            final String msg = "Cannot modify job " + this + " as it is no longer under construction";
839            log.debug(msg);
840            throw new IllegalState(msg);
841        }
842        this.harvestNum = harvestNum;
843    }
844
845    /**
846     * Get the list of harvest errors for this job. If no harvest errors, null is returned This value is not meaningful
847     * until the job is finished (FAILED,DONE, RESUBMITTED)
848     *
849     * @return the harvest errors for this job or null if no harvest errors.
850     */
851    public String getHarvestErrors() {
852        return harvestErrors;
853    }
854
855    /**
856     * Append to the list of harvest errors for this job. Nothing happens, if argument harvestErrors is null.
857     *
858     * @param harvestErrors a string containing harvest errors (may be null)
859     */
860    public void appendHarvestErrors(String harvestErrors) {
861        if (harvestErrors != null) {
862            if (this.harvestErrors == null) {
863                this.harvestErrors = harvestErrors;
864            } else {
865                this.harvestErrors += "\n" + harvestErrors;
866            }
867        }
868    }
869
870    /**
871     * Get the list of harvest error details for this job. If no harvest error details, null is returned This value is
872     * not meaningful until the job is finished (FAILED,DONE, RESUBMITTED)
873     *
874     * @return the list of harvest error details for this job or null if no harvest error details.
875     */
876
877    public String getHarvestErrorDetails() {
878        return harvestErrorDetails;
879    }
880
881    /**
882     * Append to the list of harvest error details for this job. Nothing happens, if argument harvestErrorDetails is
883     * null.
884     *
885     * @param harvestErrorDetails a string containing harvest error details.
886     */
887    public void appendHarvestErrorDetails(String harvestErrorDetails) {
888        if (harvestErrorDetails != null) {
889            if (this.harvestErrorDetails == null) {
890                this.harvestErrorDetails = harvestErrorDetails;
891            } else {
892                this.harvestErrorDetails += "\n" + harvestErrorDetails;
893            }
894        }
895    }
896
897    /**
898     * Get the list of upload errors. If no upload errors, null is returned. This value is not meaningful until the job
899     * is finished (FAILED,DONE, RESUBMITTED)
900     *
901     * @return the list of upload errors as String, or null if no upload errors.
902     */
903    public String getUploadErrors() {
904        return uploadErrors;
905    }
906
907    /**
908     * Append to the list of upload errors. Nothing happens, if argument uploadErrors is null.
909     *
910     * @param uploadErrors a string containing upload errors.
911     */
912    public void appendUploadErrors(String uploadErrors) {
913        if (uploadErrors != null) {
914            if (this.uploadErrors == null) {
915                this.uploadErrors = uploadErrors;
916            } else {
917                this.uploadErrors += "\n" + uploadErrors;
918            }
919        }
920    }
921
922    /**
923     * Get the list of upload error details. If no upload error details, null is returned. This value is not meaningful
924     * until the job is finished (FAILED,DONE, RESUBMITTED)
925     *
926     * @return the list of upload error details as String, or null if no upload error details
927     */
928    public String getUploadErrorDetails() {
929        return uploadErrorDetails;
930    }
931
932    /**
933     * Append to the list of upload error details. Nothing happens, if argument uploadErrorDetails is null.
934     *
935     * @param uploadErrorDetails a string containing upload error details.
936     */
937    public void appendUploadErrorDetails(String uploadErrorDetails) {
938        if (uploadErrorDetails != null) {
939            if (this.uploadErrorDetails == null) {
940                this.uploadErrorDetails = uploadErrorDetails;
941            } else {
942                this.uploadErrorDetails += "\n" + uploadErrorDetails;
943            }
944        }
945    }
946
947    /**
948     * Get the ID for the job which this job was resubmitted as. If null, this job has not been resubmitted.
949     *
950     * @return this ID.
951     */
952    public Long getResubmittedAsJob() {
953        return resubmittedAsJobWithID;
954    }
955
956    /**
957     * Set the Date for when this job was submitted. If null, this job has not been submitted.
958     *
959     * @param submittedDate The date when this was submitted
960     */
961    public void setSubmittedDate(Date submittedDate) {
962        this.submittedDate = submittedDate;
963    }
964
965    /**
966     * Set the Date for when this job was created. If null, this job has not been created.
967     *
968     * @param creationDate The date when this was created
969     */
970    public void setCreationDate(Date creationDate) {
971        this.creationDate = creationDate;
972    }
973
974    /**
975     * Set the ID for the job which this job was resubmitted as.
976     *
977     * @param resubmittedAsJob An Id for a new job.
978     */
979    public void setResubmittedAsJob(Long resubmittedAsJob) {
980        this.resubmittedAsJobWithID = resubmittedAsJob;
981    }
982
983    /**
984     * @return id of the job that this job is supposed to continue using Heritrix recover-log or null if it starts from
985     * scratch.
986     */
987    public Long getContinuationOf() {
988        return this.continuationOF;
989    }
990
991    @Override
992    public String getHarvestFilenamePrefix() {
993        if (this.harvestnamePrefix == null) {
994            log.warn("HarvestnamePrefix not yet set for job {}. Set it by using the naming scheme. "
995                    + "This should only happen for old jobs being read", this.jobID);
996            setDefaultHarvestNamePrefix();
997        }
998        return this.harvestnamePrefix;
999    }
1000
1001    /**
1002     * @param prefix
1003     */
1004    public void setHarvestFilenamePrefix(String prefix) {
1005        this.harvestnamePrefix = prefix;
1006    }
1007
1008    /**
1009     * @return the forceMaxBytesPerDomain
1010     */
1011    public long getForceMaxBytesPerDomain() {
1012        return forceMaxBytesPerDomain;
1013    }
1014
1015    /**
1016     * @return the configurationSetsObjectLimit
1017     */
1018    public boolean isConfigurationSetsObjectLimit() {
1019        return configurationSetsObjectLimit;
1020    }
1021
1022    /**
1023     * @return the configurationSetsByteLimit
1024     */
1025    public boolean isConfigurationSetsByteLimit() {
1026        return configurationSetsByteLimit;
1027    }
1028
1029    /**
1030     * @return the minCountObjects
1031     */
1032    public long getMinCountObjects() {
1033        return minCountObjects;
1034    }
1035
1036    /**
1037     * @return the maxCountObjects
1038     */
1039    public long getMaxCountObjects() {
1040        return maxCountObjects;
1041    }
1042
1043    /**
1044     * @return the totalCountObjects
1045     */
1046    public long getTotalCountObjects() {
1047        return totalCountObjects;
1048    }
1049
1050    void setDefaultHarvestNamePrefix() {
1051        if (getJobID() != null) {
1052                ArchiveFileNaming naming = ArchiveFileNamingFactory.getInstance();
1053                log.debug("Applying the default ArchiveFileNaming class '{}'.", naming.getClass().getName());
1054                final String prefix = naming.getPrefix(this);
1055                setHarvestFilenamePrefix(prefix);
1056                log.debug("The harvestPrefix of this job is: {}", prefix);
1057        } else {
1058                log.warn("The harvestnamePrefix is not set now, as it depends on the JobID, which is not set yet");
1059        }
1060    }
1061
1062    /** @return the harvest-audience. */
1063    public String getHarvestAudience() {
1064        return harvestAudience;
1065    }
1066
1067    /**
1068     * Set the harvest audience for this job. Taken from the harvestdefinition that generated this job.
1069     *
1070     * @param theAudience the harvest-audience.
1071     */
1072    public void setHarvestAudience(String theAudience) {
1073        this.harvestAudience = theAudience;
1074    }
1075
1076    ///////////// The following two methods are needed by harvestStatus-jobdetails.jsp ////////////////////////////////////
1077    /**
1078     * Returns a list of sorted seeds for this job.
1079     * The sorting is by domain, and inside each domain,
1080     * the list is sorted by url
1081     * @return a list of sorted seeds for this job.
1082     */
1083    public List<String> getSortedSeedList() {
1084        Map<String, Set<String>> urlMap = new HashMap<String, Set<String>>();
1085        for (String seed : seedListSet) {
1086            String url;
1087            // Assume the protocol is http://, if it is missing
1088            if (!seed.matches(Constants.PROTOCOL_REGEXP)) {
1089                url = "http://" + seed;
1090            } else {
1091                url = seed;
1092            }
1093            String domain = getDomain(url);
1094            if (domain == null) {
1095                // stop processing this url, and continue to the next seed
1096                continue; 
1097            }
1098            Set<String> set;
1099            if (urlMap.containsKey(domain)) {
1100                set = urlMap.get(domain);
1101            } else {
1102                set = new TreeSet<String>();
1103                urlMap.put(domain, set);
1104            }
1105            set.add(seed);
1106
1107        }
1108       List<String> result = new ArrayList<String>();
1109       for (Set<String> set: urlMap.values()) {
1110           result.addAll(set);
1111       }
1112       return result;
1113    }
1114    /**
1115     * Get the domain, that the given URL belongs to.
1116     * @param url an URL
1117     * @return the domain, that the given URL belongs to, or 
1118     * null if unable to do so.
1119     */
1120    private String getDomain(String url) {
1121        try {
1122            URL uri = new URL(url);
1123            return DomainUtils.domainNameFromHostname(uri.getHost());
1124        } catch (MalformedURLException e) {
1125            log.warn("The string '{}' is not a valid URL", url);
1126            return null;
1127        }
1128    }
1129    
1130}