/*
 * #%L
 * Netarchivesuite - harvester
 * %%
 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
 * the National Library of France and the Austrian National Library.
 * %%
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation, either version 2.1 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Lesser Public License for more details.
 *
 * You should have received a copy of the GNU General Lesser Public
 * License along with this program. If not, see
 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
 * #L%
 */
package dk.netarkivet.harvester.harvesting.report;

import java.util.Date;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import dk.netarkivet.common.exceptions.ArgumentNotValid;
import dk.netarkivet.common.exceptions.IOFailure;
import dk.netarkivet.common.utils.StringUtils;
import dk.netarkivet.common.utils.TimeUtils;
import dk.netarkivet.harvester.HarvesterSettings;
import dk.netarkivet.harvester.datamodel.Domain;
import dk.netarkivet.harvester.datamodel.DomainConfiguration;
import dk.netarkivet.harvester.datamodel.DomainDAO;
import dk.netarkivet.harvester.datamodel.HarvestDefinitionDAO;
import dk.netarkivet.harvester.datamodel.HarvestInfo;
import dk.netarkivet.harvester.datamodel.Job;
import dk.netarkivet.harvester.datamodel.SparseFullHarvest;
import dk.netarkivet.harvester.datamodel.StopReason;
import dk.netarkivet.harvester.harvesting.distribute.DomainStats;

/**
 * This implementation of the harvest report has the same pre-processing as {@link LegacyHarvestReport}, but is intended
 * to be used with a crawl order that sets budget using "queue-total-budget" instead of the QuotaEnforcer (see
 * {@link HarvesterSettings#OBJECT_LIMIT_SET_BY_QUOTA_ENFORCER}). Hence post-processing can no longer rely on
 * annotations added by the QuotaEnforcer and thus simply compares actual document counts to crawl and configuration
 * budgets.
 */
@SuppressWarnings({"serial"})
public class BnfHarvestReport extends AbstractHarvestReport {

    /** The logger for this class. */
    private static final Logger LOG = LoggerFactory.getLogger(BnfHarvestReport.class);

    /**
     * Constructor for this class.
     *
     * @param dsr A DomainStatsReport containing the domain statistics gathered during the crawl.
     * @throws IOFailure If the processing of the report goes wrong
     */
    public BnfHarvestReport(DomainStatsReport dsr) throws IOFailure {
        super(dsr);
    }

    /**
     * Post-processing happens on the scheduler side when ARC files have been uploaded. For every domain that was part
     * of the harvest definition, determines the final {@link StopReason} by comparing the actual byte/object counts
     * against the harvest-wide and per-configuration limits, and persists a {@link HarvestInfo} record on the domain.
     *
     * @param job the actual job.
     */
    @Override
    public void postProcess(Job job) {
        ArgumentNotValid.checkNotNull(job, "job");

        LOG.info("Starting post-processing of harvest report for job {}", job.getJobID());
        long startTime = System.currentTimeMillis();

        long harvestObjectLimit = -1L;
        long harvestByteLimit = -1L;

        // First find if it's a full harvest job,
        // and if so get actual byte and object limits.
        if (job.isSnapshot()) {
            HarvestDefinitionDAO dao = HarvestDefinitionDAO.getInstance();
            String harvestName = dao.getHarvestName(job.getOrigHarvestDefinitionID());
            SparseFullHarvest harvest = dao.getSparseFullHarvest(harvestName);

            harvestObjectLimit = harvest.getMaxCountObjects();
            harvestByteLimit = harvest.getMaxBytes();
        }

        DomainDAO domDao = DomainDAO.getInstance();
        Map<String, String> domConfMap = job.getDomainConfigurationMap();

        // Process only domains from the harvest definition: intersect the
        // domains seen in the report with those configured for the job.
        final Set<String> harvestDomainNames = new HashSet<String>(getDomainNames());
        harvestDomainNames.retainAll(domConfMap.keySet());

        for (String domainName : harvestDomainNames) {
            Domain domain = domDao.read(domainName);
            String confName = domConfMap.get(domainName);
            DomainConfiguration conf = domain.getConfiguration(confName);

            long confByteLimit = conf.getMaxBytes();
            long confObjectLimit = conf.getMaxObjects();

            DomainStats ds = getOrCreateDomainStats(domainName);
            long actualByteCount = ds.getByteCount();
            long actualObjectCount = ds.getObjectCount();

            StopReason finalStopReason = ds.getStopReason();

            // Harvest-wide limits take precedence over per-configuration
            // limits; a non-positive limit means "no limit".
            if (harvestByteLimit > 0 && (actualByteCount >= harvestByteLimit)) {
                finalStopReason = StopReason.SIZE_LIMIT;
            } else if (harvestObjectLimit > 0 && (actualObjectCount >= harvestObjectLimit)) {
                finalStopReason = StopReason.OBJECT_LIMIT;
            } else if (confByteLimit > 0 && (actualByteCount >= confByteLimit)) {
                finalStopReason = StopReason.CONFIG_SIZE_LIMIT;
            } else if (confObjectLimit > 0 && (actualObjectCount >= confObjectLimit)) {
                finalStopReason = StopReason.CONFIG_OBJECT_LIMIT;
            }

            ds.setStopReason(finalStopReason);

            // Create the HarvestInfo object
            HarvestInfo hi = new HarvestInfo(job.getOrigHarvestDefinitionID(), job.getJobID(), domainName, confName,
                    new Date(), actualByteCount, actualObjectCount, finalStopReason);

            // Add HarvestInfo to Domain and make data persistent
            // by updating DAO
            domain.getHistory().addHarvestInfo(hi);
            domDao.update(domain);
        }

        if (LOG.isInfoEnabled()) {
            long time = System.currentTimeMillis() - startTime;
            LOG.info("Finished post-processing of harvest report for job {}, operation took {}", job.getJobID(),
                    StringUtils.formatDuration(time / TimeUtils.SECOND_IN_MILLIS));
        }

    }
}