/*
 * #%L
 * Netarchivesuite - harvester
 * %%
 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
 *             the National Library of France and the Austrian National Library.
 * %%
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation, either version 2.1 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Lesser Public License for more details.
 *
 * You should have received a copy of the GNU General Lesser Public
 * License along with this program.  If not, see
 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
 * #L%
 */
package dk.netarkivet.harvester.datamodel;

import java.util.Date;
import java.util.Iterator;

import javax.inject.Provider;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import dk.netarkivet.common.exceptions.ArgumentNotValid;
import dk.netarkivet.common.exceptions.IOFailure;
import dk.netarkivet.common.exceptions.UnknownID;
import dk.netarkivet.common.utils.FilterIterator;
import dk.netarkivet.harvester.datamodel.extendedfield.ExtendedFieldDAO;

/**
 * This class contains the specific properties and operations of snapshot harvest definitions.
 * <p>
 * A FullHarvest is either the first snapshot harvest (no previous harvest definition) or an iterative one that
 * continues from what a previous harvest definition left unfinished.
 */
public class FullHarvest extends HarvestDefinition {

    /** The class logger. */
    private static final Logger log = LoggerFactory.getLogger(FullHarvest.class);

    /** The maximum number of objects retrieved from each domain during a snapshot harvest. */
    private long maxCountObjects;

    /** The maximum number of bytes retrieved from each domain during a snapshot harvest. */
    private long maxBytes;

    /** The maximum time in seconds to run for each job generated by this definition. */
    private long maxJobRunningTime;

    /** The ID for the harvestdefinition, this FullHarvest is based upon. May be null. */
    private Long previousHarvestDefinitionOid;

    /** a boolean to indicate whether the deduplication index is ready. */
    private boolean indexReady;

    /** Supplies the DAO used to look up harvest definitions and snapshot configurations. */
    private final Provider<HarvestDefinitionDAO> hdDaoProvider;
    /** Stored from the constructor but not read anywhere in this class. */
    private final Provider<JobDAO> jobDaoProvider; // Not used
    /** Supplies the DAO used to read domains and harvest info for iterative harvests. */
    private final Provider<DomainDAO> domainDAOProvider;

    /**
     * Create new instance of FullHarvest configured according to the properties of the supplied DomainConfiguration.
     * Should only be used by the HarvestFactory class.
     * <p>
     * The name is checked with {@code ArgumentNotValid.checkNotNullOrEmpty} and the comments with
     * {@code ArgumentNotValid.checkNotNull}. The number of events is initialised to 0.
     *
     * @param harvestDefName the name of the harvest definition
     * @param comments comments
     * @param previousHarvestDefinitionOid This harvestDefinition is used to create this Fullharvest definition.
     * May be null, in which case this is treated as the first snapshot harvest.
     * @param maxCountObjects Limit for how many objects can be fetched per domain
     * @param maxBytes Limit for how many bytes can be fetched per domain
     * @param maxJobRunningTime Limit on how much time can be spent on each job. 0 means no limit
     * @param isIndexReady Is the deduplication index ready for this harvest.
     * @param hdDaoProvider provider of the DAO for harvest definitions
     * @param jobDaoProvider provider of the DAO for jobs (stored, but currently unused by this class)
     * @param extendedFieldDAOProvide provider of the DAO for extended fields, handed to the superclass constructor
     * @param domainDAOProvider provider of the DAO for domains
     */
    public FullHarvest(String harvestDefName, String comments, Long previousHarvestDefinitionOid, long maxCountObjects,
            long maxBytes, long maxJobRunningTime, boolean isIndexReady, Provider<HarvestDefinitionDAO> hdDaoProvider,
            Provider<JobDAO> jobDaoProvider, Provider<ExtendedFieldDAO> extendedFieldDAOProvide,
            Provider<DomainDAO> domainDAOProvider) {
        super(extendedFieldDAOProvide);
        ArgumentNotValid.checkNotNullOrEmpty(harvestDefName, "harvestDefName");
        ArgumentNotValid.checkNotNull(comments, "comments");
        this.previousHarvestDefinitionOid = previousHarvestDefinitionOid;
        this.harvestDefName = harvestDefName;
        this.comments = comments;
        this.maxCountObjects = maxCountObjects;
        this.numEvents = 0;
        this.maxBytes = maxBytes;
        this.maxJobRunningTime = maxJobRunningTime;
        this.indexReady = isIndexReady;
        this.hdDaoProvider = hdDaoProvider;
        this.jobDaoProvider = jobDaoProvider;
        this.domainDAOProvider = domainDAOProvider;
    }

    /**
     * Get the previous HarvestDefinition which is used to base this.
     *
     * @return The previous HarvestDefinition, read through the harvest definition DAO, or null if no previous
     * harvest definition id is set
     */
    public HarvestDefinition getPreviousHarvestDefinition() {
        if (previousHarvestDefinitionOid != null) {
            return hdDaoProvider.get().read(previousHarvestDefinitionOid);
        }
        return null;
    }

    /**
     * Set the previous HarvestDefinition which is used to base this.
     *
     * @param prev The id of a HarvestDefinition
     */
    public void setPreviousHarvestDefinition(Long prev) {
        previousHarvestDefinitionOid = prev;
    }

    /** @return Returns the maxCountObjects. */
    public long getMaxCountObjects() {
        return maxCountObjects;
    }

    /** @param maxCountObjects The maxCountObjects to set. */
    public void setMaxCountObjects(long maxCountObjects) {
        this.maxCountObjects = maxCountObjects;
    }

    /**
     * Get the maximum number of bytes that this fullharvest will harvest per domain, or -1 for no limit.
     * <p>
     * NOTE(review): this getter used to document 0 as "no limit" while {@link #setMaxBytes(long)} documents -1;
     * the wording here follows the setter. Confirm against the semantics of NumberUtils.compareInf.
     *
     * @return Total download limit in bytes per domain.
     */
    public long getMaxBytes() {
        return maxBytes;
    }

    /**
     * Set the limit for how many bytes this fullharvest will harvest per domain, or -1 for no limit.
     *
     * @param maxBytes Number of bytes to stop harvesting at.
     */
    public void setMaxBytes(long maxBytes) {
        this.maxBytes = maxBytes;
    }

    /**
     * Returns an iterator of domain configurations for this harvest definition.
     * <p>
     * If there is no previous harvest definition, the snapshot configurations from the harvest definition DAO are
     * returned unfiltered. Otherwise the harvest infos of the previous harvest definition are filtered; a domain
     * is left out if, on the previous harvest, it:
     * <ol>
     * <li>was completed,</li>
     * <li>died uncleanly (e.g. due to a manual shutdown of heritrix),</li>
     * <li>can no longer be read (the domain was deleted or reading it failed),</li>
     * <li>reached its maxBytes limit and the configured maxBytes limit has not been raised since,</li>
     * <li>reached its maxObjects limit and the configured maxObjects limit has not been raised since.</li>
     * </ol>
     * For the remaining stop reasons, domains are also excluded if they are a non-expired alias of another domain.
     * Note that the alias check is not reached for domains stopped by the size or object limit.
     *
     * @return Iterator containing information about the domain configurations
     */
    public Iterator<DomainConfiguration> getDomainConfigurations() {
        if (previousHarvestDefinitionOid == null) {
            // The first snapshot harvest
            return hdDaoProvider.get().getSnapShotConfigurations();
        }

        // An iterative snapshot harvest
        final DomainDAO dao = domainDAOProvider.get();
        // Get what has been harvested
        Iterator<HarvestInfo> i = dao.getHarvestInfoBasedOnPreviousHarvestDefinition(getPreviousHarvestDefinition());
        // Returning null from filter() leaves the element out of the resulting iterator.
        return new FilterIterator<HarvestInfo, DomainConfiguration>(i) {
            protected DomainConfiguration filter(HarvestInfo harvestInfo) {

                if (harvestInfo.getStopReason() == StopReason.DOWNLOAD_COMPLETE
                        || harvestInfo.getStopReason() == StopReason.DOWNLOAD_UNFINISHED) {
                    // Don't include the ones that finished or died
                    // in an unclean fashion
                    return null;
                }

                DomainConfiguration config = getConfigurationFromPreviousHarvest(harvestInfo, dao);
                if (config == null) {
                    // The domain no longer exists or could not be read (already logged by
                    // getConfigurationFromPreviousHarvest). Skip it instead of failing with a
                    // NullPointerException on the dereferences below.
                    return null;
                }

                if (harvestInfo.getStopReason() == StopReason.CONFIG_SIZE_LIMIT) {
                    // Check if MaxBytes limit for DomainConfiguration have
                    // been raised since previous harvest.
                    // If this is the case, return the configuration
                    int compare = NumberUtils.compareInf(config.getMaxBytes(), harvestInfo.getSizeDataRetrieved());
                    if (compare < 1) {
                        return null;
                    } else {
                        return config;
                    }
                }

                if (harvestInfo.getStopReason() == StopReason.CONFIG_OBJECT_LIMIT) {
                    // Check if MaxObjects limit for DomainConfiguration have
                    // been raised since previous harvest.
                    // If this is the case, return the configuration
                    int compare = NumberUtils.compareInf(config.getMaxObjects(), harvestInfo.getCountObjectRetrieved());
                    if (compare < 1) {
                        return null;
                    } else {
                        return config;
                    }
                }
                Domain d = dao.read(config.getDomainName());

                if (d.getAliasInfo() != null && !d.getAliasInfo().isExpired()) {
                    // Don't include aliases
                    return null;
                } else {
                    return config;
                }
            }
        };
    }

    /**
     * Get the configuration used in a previous harvest. If the configuration in the harvestinfo cannot be found
     * (deleted), uses the default configuration.
     *
     * @param harvestInfo A harvest info object from a previous harvest.
     * @param dao The dao to read configurations from.
     * @return The configuration named in the harvest info, or the domain's default configuration if that one no
     * longer exists; null if the domain itself is unknown or cannot be read. Callers must handle null.
     */
    private DomainConfiguration getConfigurationFromPreviousHarvest(final HarvestInfo harvestInfo, DomainDAO dao) {
        try {
            // Read the domain
            Domain domain = dao.read(harvestInfo.getDomainName());
            DomainConfiguration configuration;
            // Read the configuration
            try {
                configuration = domain.getConfiguration(harvestInfo.getDomainConfigurationName());
            } catch (UnknownID e) {
                // If the old configuration cannot be found, fall
                // back on default configuration
                configuration = domain.getDefaultConfiguration();
                log.debug(
                        "Previous configuration '{}' for harvesting domain '{}' not found. Using default '{}' instead.",
                        harvestInfo.getDomainConfigurationName(), harvestInfo.getDomainName(), configuration.getName(),
                        e);
            }
            return configuration;
        } catch (UnknownID e) {
            // If the domain doesn't exist, log it (at debug level) and fall through to return null
            log.debug("Previously harvested domain '{}' no longer exists. Ignoring this domain.",
                    harvestInfo.getDomainName(), e);
        } catch (IOFailure e) {
            // If the domain can't be read, log it (at debug level) and fall through to return null
            log.debug("Previously harvested domain '{}' can't be read. Ignoring this domain.",
                    harvestInfo.getDomainName(), e);
        }
        return null;
    }

    /**
     * Check if this harvest definition should be run, given the time now.
     *
     * @param now The current time (not consulted by this implementation)
     * @return true if this harvest definition is active and has not yet had any events (numEvents &lt; 1)
     */
    public boolean runNow(Date now) {
        return getActive() && (numEvents < 1);
    }

    /**
     * Returns whether this HarvestDefinition represents a snapshot harvest.
     *
     * @return Returns true
     */
    public boolean isSnapShot() {
        return true;
    }

    /**
     * @return Returns the max job running time
     */
    public long getMaxJobRunningTime() {
        return maxJobRunningTime;
    }

    /**
     * Set the limit for how many seconds each crawljob in this fullharvest will run, or 0 for no limit.
     *
     * @param maxJobRunningtime max number of seconds
     */
    public void setMaxJobRunningTime(long maxJobRunningtime) {
        this.maxJobRunningTime = maxJobRunningtime;
    }

    /**
     * Is index ready. Used to check, whether or a FullHarvest is ready for scheduling. The scheduling requires, that
     * the deduplication index used by the jobs in the FullHarvest, has already been prepared by the IndexServer.
     *
     * @return true, if the deduplication index is ready. Otherwise false.
     */
    public boolean getIndexReady() {
        return this.indexReady;
    }

    /**
     * Set the indexReady field.
     *
     * @param isIndexReady The new value of the indexReady field.
     */
    public void setIndexReady(boolean isIndexReady) {
        this.indexReady = isIndexReady;
    }

}