/*
 * #%L
 * Netarchivesuite - harvester
 * %%
 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
 * the National Library of France and the Austrian National Library.
 * %%
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation, either version 2.1 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this program. If not, see
 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
 * #L%
 */
package dk.netarkivet.harvester.harvesting.report;

import java.util.Date;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import dk.netarkivet.common.utils.StringUtils;
import dk.netarkivet.common.utils.TimeUtils;
import dk.netarkivet.harvester.datamodel.Domain;
import dk.netarkivet.harvester.datamodel.DomainDAO;
import dk.netarkivet.harvester.datamodel.HarvestInfo;
import dk.netarkivet.harvester.datamodel.Job;
import dk.netarkivet.harvester.datamodel.NumberUtils;
import dk.netarkivet.harvester.datamodel.StopReason;

/**
 * Class responsible for representing a domain harvest report from crawl logs created by Heritrix, and for presenting
 * the relevant information to clients.
 */
@SuppressWarnings({"serial"})
public class LegacyHarvestReport extends AbstractHarvestReport {

    /** The logger for this class. */
    private static final Logger log = LoggerFactory.getLogger(LegacyHarvestReport.class);

    /**
     * Constructor.
     *
     * @param dsr a DomainStatsReport for a harvest
     */
    public LegacyHarvestReport(DomainStatsReport dsr) {
        super(dsr);
    }

    /** Default constructor. */
    public LegacyHarvestReport() {
        super();
    }

    /**
     * Post-processing happens on the scheduler side when ARC files have been uploaded.
     *
     * @param job the actual job.
     */
    @Override
    public void postProcess(Job job) {
        log.info("Starting post-processing of harvest report for job {}", job.getJobID());
        long startTime = System.currentTimeMillis();

        // Get the map from domain names to domain configurations
        Map<String, String> configurationMap = job.getDomainConfigurationMap();

        // For each domain harvested, check whether it corresponds to a
        // domain configuration for this job, and if so add a new HarvestInfo
        // to the DomainHistory of the corresponding Domain object.
        // TODO Information about domains harvested by the crawler
        // without a domain configuration for this job is deleted!
        // Should this information be saved in some way (perhaps stored
        // in metadata.arc-files?)
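
        // The set intersection below keeps only those harvested domain names
        // that also appear as keys in this job's configuration map; anything
        // the crawler visited outside that set is dropped (see the TODO above).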
        final Set<String> domainNames = new HashSet<>();
        domainNames.addAll(getDomainNames());
        domainNames.retainAll(configurationMap.keySet());
        final DomainDAO dao = DomainDAO.getInstance();
        for (String domainName : domainNames) {
            Domain domain = dao.read(domainName);

            // Retrieve crawl data from the log and add it to the HarvestInfo
            StopReason stopReason = getStopReason(domainName);
            if (stopReason == null) {
                log.warn("No stop reason found for domain '{}'", domainName);
            }
            Long countObjectRetrieved = getObjectCount(domainName);
            if (countObjectRetrieved == null) {
                log.warn("No count of objects retrieved found for domain '{}'", domainName);
                countObjectRetrieved = -1L;
            }
            Long bytesReceived = getByteCount(domainName);
            if (bytesReceived == null) {
                log.warn("No count of bytes received found for domain '{}'", domainName);
                bytesReceived = -1L;
            }

            // If the StopReason is SIZE_LIMIT, check whether it was the harvest's
            // size limit that was hit, or rather a configuration size limit.

            // A harvest is considered to have hit the configuration limit if
            // 1) the configuration limit is the lowest of the two, or
            // 2) the number of harvested bytes is greater than the configuration limit.

            // Note: Even though the per-config byte limit might have changed
            // between the time the job was calculated and now, it is okay to
            // compare with the new limit, since that gives the most accurate
            // answer to whether we want to harvest any more.
            if (stopReason == StopReason.SIZE_LIMIT) {
                long maxBytesPerDomain = job.getMaxBytesPerDomain();
                long configMaxBytes = domain.getConfiguration(configurationMap.get(domainName)).getMaxBytes();
                if (NumberUtils.compareInf(configMaxBytes, maxBytesPerDomain) <= 0
                        || NumberUtils.compareInf(configMaxBytes, bytesReceived) <= 0) {
                    stopReason = StopReason.CONFIG_SIZE_LIMIT;
                }
            } else if (stopReason == StopReason.OBJECT_LIMIT) {
                long maxObjectsPerDomain = job.getMaxObjectsPerDomain();
                long configMaxObjects = domain.getConfiguration(configurationMap.get(domainName)).getMaxObjects();
                if (NumberUtils.compareInf(configMaxObjects, maxObjectsPerDomain) <= 0) {
                    stopReason = StopReason.CONFIG_OBJECT_LIMIT;
                }
            }

            // Create the HarvestInfo object
            HarvestInfo hi = new HarvestInfo(job.getOrigHarvestDefinitionID(), job.getJobID(), domain.getName(),
                    configurationMap.get(domain.getName()), new Date(), bytesReceived, countObjectRetrieved,
                    stopReason);

            // Add the HarvestInfo to the Domain and make the data persistent
            // by updating the DAO
            domain.getHistory().addHarvestInfo(hi);
            dao.update(domain);
        }

        if (log.isInfoEnabled()) {
            long time = System.currentTimeMillis() - startTime;
            log.info("Finished post-processing of harvest report for job {}, operation took {}", job.getJobID(),
                    StringUtils.formatDuration(time / TimeUtils.SECOND_IN_MILLIS));
        }

    }

}
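
/*
 * Usage sketch (illustrative only; the 'domainStatsReport' and 'job' variables
 * below are hypothetical and would be supplied by the scheduler side once a
 * job's (W)ARC files have been uploaded):
 *
 *   LegacyHarvestReport report = new LegacyHarvestReport(domainStatsReport);
 *   report.postProcess(job); // persists one HarvestInfo per harvested domain
 *
 * Only domains that both appear in the crawl logs and have a configuration in
 * the job are recorded; per-domain stop reasons may be narrowed to
 * CONFIG_SIZE_LIMIT or CONFIG_OBJECT_LIMIT as described in postProcess().
 */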