/*
 * #%L
 * Netarchivesuite - harvester
 * %%
 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
 * the National Library of France and the Austrian National Library.
 * %%
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation, either version 2.1 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Lesser Public License for more details.
 *
 * You should have received a copy of the GNU General Lesser Public
 * License along with this program. If not, see
 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
 * #L%
 */
package dk.netarkivet.harvester.harvesting.report;

import java.util.Date;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import dk.netarkivet.common.exceptions.ArgumentNotValid;
import dk.netarkivet.common.exceptions.IOFailure;
import dk.netarkivet.common.utils.StringUtils;
import dk.netarkivet.common.utils.TimeUtils;
import dk.netarkivet.harvester.HarvesterSettings;
import dk.netarkivet.harvester.datamodel.Domain;
import dk.netarkivet.harvester.datamodel.DomainConfiguration;
import dk.netarkivet.harvester.datamodel.DomainDAO;
import dk.netarkivet.harvester.datamodel.HarvestDefinitionDAO;
import dk.netarkivet.harvester.datamodel.HarvestInfo;
import dk.netarkivet.harvester.datamodel.Job;
import dk.netarkivet.harvester.datamodel.SparseFullHarvest;
import dk.netarkivet.harvester.datamodel.StopReason;
import dk.netarkivet.harvester.harvesting.distribute.DomainStats;

/**
 * This implementation of the harvest report has the same pre-processing as {@link LegacyHarvestReport}, but is intended
 * to be used with a crawl order that sets budget using "queue-total-budget" instead of the QuotaEnforcer (see
 * {@link HarvesterSettings#OBJECT_LIMIT_SET_BY_QUOTA_ENFORCER}). Hence post-processing can no longer rely on
 * annotations added by the QuotaEnforcer and thus simply compares actual document counts to crawl and configuration
 * budgets.
 */
@SuppressWarnings({"serial"})
public class BnfHarvestReport extends AbstractHarvestReport {

    /** The logger for this class. */
    private static final Logger LOG = LoggerFactory.getLogger(BnfHarvestReport.class);

    /**
     * Constructor for this class.
     *
     * @param dsr A DomainStatsReport containing the domain statistics gathered during the crawl.
     * @throws IOFailure If the processing of the report goes wrong
     */
    public BnfHarvestReport(DomainStatsReport dsr) throws IOFailure {
        super(dsr);
    }

    /**
     * Post-processing happens on the scheduler side when ARC files have been uploaded. For every domain that was part
     * of the harvest definition, determines the final {@link StopReason} by comparing the actual byte/object counts
     * against the harvest-wide and per-configuration limits, and persists a {@link HarvestInfo} record on the domain.
     *
     * @param job the actual job.
     */
    @Override
    public void postProcess(Job job) {
        ArgumentNotValid.checkNotNull(job, "job");

        LOG.info("Starting post-processing of harvest report for job {}", job.getJobID());
        long startTime = System.currentTimeMillis();

        long harvestObjectLimit = -1L;
        long harvestByteLimit = -1L;

        // First find if it's a full harvest job,
        // and if so get actual byte and object limits.
        if (job.isSnapshot()) {
            HarvestDefinitionDAO dao = HarvestDefinitionDAO.getInstance();
            String harvestName = dao.getHarvestName(job.getOrigHarvestDefinitionID());
            SparseFullHarvest harvest = dao.getSparseFullHarvest(harvestName);

            harvestObjectLimit = harvest.getMaxCountObjects();
            harvestByteLimit = harvest.getMaxBytes();
        }

        DomainDAO domDao = DomainDAO.getInstance();
        Map<String, String> domConfMap = job.getDomainConfigurationMap();

        // Process only domains from the harvest definition: intersect the
        // domains seen in the report with those configured for the job.
        final Set<String> harvestDomainNames = new HashSet<String>(getDomainNames());
        harvestDomainNames.retainAll(domConfMap.keySet());

        for (String domainName : harvestDomainNames) {
            Domain domain = domDao.read(domainName);
            String confName = domConfMap.get(domainName);
            DomainConfiguration conf = domain.getConfiguration(confName);

            long confByteLimit = conf.getMaxBytes();
            long confObjectLimit = conf.getMaxObjects();

            DomainStats ds = getOrCreateDomainStats(domainName);
            long actualByteCount = ds.getByteCount();
            long actualObjectCount = ds.getObjectCount();

            StopReason finalStopReason = ds.getStopReason();

            // Harvest-wide limits take precedence over per-configuration
            // limits; a non-positive limit means "no limit".
            if (harvestByteLimit > 0 && (actualByteCount >= harvestByteLimit)) {
                finalStopReason = StopReason.SIZE_LIMIT;
            } else if (harvestObjectLimit > 0 && (actualObjectCount >= harvestObjectLimit)) {
                finalStopReason = StopReason.OBJECT_LIMIT;
            } else if (confByteLimit > 0 && (actualByteCount >= confByteLimit)) {
                finalStopReason = StopReason.CONFIG_SIZE_LIMIT;
            } else if (confObjectLimit > 0 && (actualObjectCount >= confObjectLimit)) {
                finalStopReason = StopReason.CONFIG_OBJECT_LIMIT;
            }

            ds.setStopReason(finalStopReason);

            // Create the HarvestInfo object
            HarvestInfo hi = new HarvestInfo(job.getOrigHarvestDefinitionID(), job.getJobID(), domainName, confName,
                    new Date(), actualByteCount, actualObjectCount, finalStopReason);

            // Add HarvestInfo to Domain and make data persistent
            // by updating DAO
            domain.getHistory().addHarvestInfo(hi);
            domDao.update(domain);
        }

        if (LOG.isInfoEnabled()) {
            long time = System.currentTimeMillis() - startTime;
            LOG.info("Finished post-processing of harvest report for job {}, operation took {}", job.getJobID(),
                    StringUtils.formatDuration(time / TimeUtils.SECOND_IN_MILLIS));
        }

    }
}