/*
 * #%L
 * Netarchivesuite - harvester
 * %%
 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
 * the National Library of France and the Austrian National Library.
 * %%
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation, either version 2.1 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this program. If not, see
 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
 * #L%
 */
package dk.netarkivet.harvester.harvesting.report;

import java.util.Date;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import dk.netarkivet.common.utils.StringUtils;
import dk.netarkivet.common.utils.TimeUtils;
import dk.netarkivet.harvester.datamodel.Domain;
import dk.netarkivet.harvester.datamodel.DomainDAO;
import dk.netarkivet.harvester.datamodel.HarvestInfo;
import dk.netarkivet.harvester.datamodel.Job;
import dk.netarkivet.harvester.datamodel.NumberUtils;
import dk.netarkivet.harvester.datamodel.StopReason;

/**
 * Class responsible for representing a domain harvest report from crawl logs created by Heritrix, and for presenting
 * the relevant information to clients.
 */
@SuppressWarnings({"serial"})
public class LegacyHarvestReport extends AbstractHarvestReport {

    /** The logger for this class. */
    private static final Logger log = LoggerFactory.getLogger(LegacyHarvestReport.class);

    /**
     * Constructor.
     *
     * @param dsr a DomainStatsReport for a harvest
     */
    public LegacyHarvestReport(DomainStatsReport dsr) {
        super(dsr);
    }

    /** Default constructor. */
    public LegacyHarvestReport() {
        super();
    }

    /**
     * Post-processing happens on the scheduler side when ARC files have been uploaded.
     *
     * @param job the actual job.
     */
    @Override
    public void postProcess(Job job) {
        log.info("Starting post-processing of harvest report for job {}", job.getJobID());
        long startTime = System.currentTimeMillis();

        // Get the map from domain names to domain configurations
        Map<String, String> configurationMap = job.getDomainConfigurationMap();

        // For each domain harvested, check whether it corresponds to a
        // domain configuration for this job, and if so add a new HarvestInfo
        // to the DomainHistory of the corresponding Domain object.
        // TODO Information about domains harvested by the crawler
        // without a domain configuration for this job is deleted!
        // Should this information be saved in some way (perhaps stored
        // in metadata.arc-files?)
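
        // The set intersection below keeps only those harvested domain names
        // that also appear as keys in this job's configuration map; anything
        // the crawler visited outside that set is dropped (see the TODO above).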
        final Set<String> domainNames = new HashSet<>();
        domainNames.addAll(getDomainNames());
        domainNames.retainAll(configurationMap.keySet());
        final DomainDAO dao = DomainDAO.getInstance();
        for (String domainName : domainNames) {
            Domain domain = dao.read(domainName);

            // Retrieve crawl data from the log and add it to the HarvestInfo
            StopReason stopReason = getStopReason(domainName);
            if (stopReason == null) {
                log.warn("No stop reason found for domain '{}'", domainName);
            }
            Long countObjectRetrieved = getObjectCount(domainName);
            if (countObjectRetrieved == null) {
                log.warn("No count of objects retrieved found for domain '{}'", domainName);
                countObjectRetrieved = -1L;
            }
            Long bytesReceived = getByteCount(domainName);
            if (bytesReceived == null) {
                log.warn("No count of bytes received found for domain '{}'", domainName);
                bytesReceived = -1L;
            }

            // If the StopReason is SIZE_LIMIT, check whether it was the harvest's
            // size limit that was hit, or rather a configuration size limit.

            // A harvest is considered to have hit the configuration limit if
            // 1) the configuration limit is the lowest of the two, or
            // 2) the number of harvested bytes is greater than the configuration limit.

            // Note: Even though the per-config byte limit might have changed
            // between the time the job was calculated and now, it is okay to
            // compare with the new limit, since that gives the most accurate
            // answer to whether we want to harvest any more.
            if (stopReason == StopReason.SIZE_LIMIT) {
                long maxBytesPerDomain = job.getMaxBytesPerDomain();
                long configMaxBytes = domain.getConfiguration(configurationMap.get(domainName)).getMaxBytes();
                if (NumberUtils.compareInf(configMaxBytes, maxBytesPerDomain) <= 0
                        || NumberUtils.compareInf(configMaxBytes, bytesReceived) <= 0) {
                    stopReason = StopReason.CONFIG_SIZE_LIMIT;
                }
            } else if (stopReason == StopReason.OBJECT_LIMIT) {
                long maxObjectsPerDomain = job.getMaxObjectsPerDomain();
                long configMaxObjects = domain.getConfiguration(configurationMap.get(domainName)).getMaxObjects();
                if (NumberUtils.compareInf(configMaxObjects, maxObjectsPerDomain) <= 0) {
                    stopReason = StopReason.CONFIG_OBJECT_LIMIT;
                }
            }

            // Create the HarvestInfo object
            HarvestInfo hi = new HarvestInfo(job.getOrigHarvestDefinitionID(), job.getJobID(), domain.getName(),
                    configurationMap.get(domain.getName()), new Date(), bytesReceived, countObjectRetrieved,
                    stopReason);

            // Add the HarvestInfo to the Domain and make the data persistent
            // by updating the DAO
            domain.getHistory().addHarvestInfo(hi);
            dao.update(domain);
        }

        if (log.isInfoEnabled()) {
            long time = System.currentTimeMillis() - startTime;
            log.info("Finished post-processing of harvest report for job {}, operation took {}", job.getJobID(),
                    StringUtils.formatDuration(time / TimeUtils.SECOND_IN_MILLIS));
        }

    }

}
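
/*
 * Usage sketch (illustrative only; the 'domainStatsReport' and 'job' variables
 * below are hypothetical and would be supplied by the scheduler side once a
 * job's (W)ARC files have been uploaded):
 *
 *   LegacyHarvestReport report = new LegacyHarvestReport(domainStatsReport);
 *   report.postProcess(job); // persists one HarvestInfo per harvested domain
 *
 * Only domains that both appear in the crawl logs and have a configuration in
 * the job are recorded; per-domain stop reasons may be narrowed to
 * CONFIG_SIZE_LIMIT or CONFIG_OBJECT_LIMIT as described in postProcess().
 */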