001/* 002 * #%L 003 * Netarchivesuite - harvester 004 * %% 005 * Copyright (C) 2005 - 2018 The Royal Danish Library, 006 * the National Library of France and the Austrian National Library. 007 * %% 008 * This program is free software: you can redistribute it and/or modify 009 * it under the terms of the GNU Lesser General Public License as 010 * published by the Free Software Foundation, either version 2.1 of the 011 * License, or (at your option) any later version. 012 * 013 * This program is distributed in the hope that it will be useful, 014 * but WITHOUT ANY WARRANTY; without even the implied warranty of 015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 016 * GNU General Lesser Public License for more details. 017 * 018 * You should have received a copy of the GNU General Lesser Public 019 * License along with this program. If not, see 020 * <http://www.gnu.org/licenses/lgpl-2.1.html>. 021 * #L% 022 */ 023package dk.netarkivet.harvester.datamodel; 024 025import java.util.Date; 026import java.util.Iterator; 027 028import javax.inject.Provider; 029 030import org.slf4j.Logger; 031import org.slf4j.LoggerFactory; 032 033import dk.netarkivet.common.exceptions.ArgumentNotValid; 034import dk.netarkivet.common.exceptions.IOFailure; 035import dk.netarkivet.common.exceptions.UnknownID; 036import dk.netarkivet.common.utils.FilterIterator; 037import dk.netarkivet.common.utils.Settings; 038import dk.netarkivet.harvester.HarvesterSettings; 039import dk.netarkivet.harvester.datamodel.extendedfield.ExtendedFieldDAO; 040 041/** 042 * This class contains the specific properties and operations of snapshot harvest definitions. 043 */ 044public class FullHarvest extends HarvestDefinition { 045 046 /** The class logger. */ 047 private static final Logger log = LoggerFactory.getLogger(FullHarvest.class); 048 049 /** The maximum number of objects retrieved from each domain during a snapshot harvest. */ 050 private long maxCountObjects; 051 052 /** The maximum number of bytes retrieved from each domain during a snapshot harvest. */ 053 private long maxBytes; 054 055 /** The maximum time in seconds to run for each job generated by this definition. */ 056 private long maxJobRunningTime; 057 058 /** The ID for the harvestdefinition, this FullHarvest is based upon. */ 059 private Long previousHarvestDefinitionOid; 060 061 /** a boolean to indicate whether the deduplication index is ready. */ 062 private boolean indexReady; 063 064 private final Provider<HarvestDefinitionDAO> hdDaoProvider; 065 private final Provider<JobDAO> jobDaoProvider; // Not used 066 private final Provider<DomainDAO> domainDAOProvider; 067 068 /** 069 * Create new instance of FullHarvest configured according to the properties of the supplied DomainConfiguration. 070 * Should only be used by the HarvestFactory class. 071 * 072 * @param harvestDefName the name of the harvest definition 073 * @param comments comments 074 * @param previousHarvestDefinitionOid This harvestDefinition is used to create this Fullharvest definition. 075 * @param maxCountObjects Limit for how many objects can be fetched per domain 076 * @param maxBytes Limit for how many bytes can be fetched per domain 077 * @param maxJobRunningTime Limit on how much time can be spent on each job. 0 means no limit 078 * @param isIndexReady Is the deduplication index ready for this harvest. 079 */ 080 public FullHarvest(String harvestDefName, String comments, Long previousHarvestDefinitionOid, long maxCountObjects, 081 long maxBytes, long maxJobRunningTime, boolean isIndexReady, Provider<HarvestDefinitionDAO> hdDaoProvider, 082 Provider<JobDAO> jobDaoProvider, Provider<ExtendedFieldDAO> extendedFieldDAOProvide, 083 Provider<DomainDAO> domainDAOProvider) { 084 super(extendedFieldDAOProvide); 085 ArgumentNotValid.checkNotNullOrEmpty(harvestDefName, "harvestDefName"); 086 ArgumentNotValid.checkNotNull(comments, "comments"); 087 this.previousHarvestDefinitionOid = previousHarvestDefinitionOid; 088 this.harvestDefName = harvestDefName; 089 this.comments = comments; 090 this.maxCountObjects = maxCountObjects; 091 this.numEvents = 0; 092 this.maxBytes = maxBytes; 093 this.maxJobRunningTime = maxJobRunningTime; 094 this.indexReady = isIndexReady; 095 this.hdDaoProvider = hdDaoProvider; 096 this.jobDaoProvider = jobDaoProvider; 097 this.domainDAOProvider = domainDAOProvider; 098 } 099 100 /** 101 * Get the previous HarvestDefinition which is used to base this. 102 * 103 * @return The previous HarvestDefinition 104 */ 105 public HarvestDefinition getPreviousHarvestDefinition() { 106 if (previousHarvestDefinitionOid != null) { 107 return hdDaoProvider.get().read(previousHarvestDefinitionOid); 108 } 109 return null; 110 } 111 112 /** 113 * Set the previous HarvestDefinition which is used to base this. 114 * 115 * @param prev The id of a HarvestDefinition 116 */ 117 public void setPreviousHarvestDefinition(Long prev) { 118 previousHarvestDefinitionOid = prev; 119 } 120 121 /** @return Returns the maxCountObjects. */ 122 public long getMaxCountObjects() { 123 return maxCountObjects; 124 } 125 126 /** @param maxCountObjects The maxCountObjects to set. */ 127 public void setMaxCountObjects(long maxCountObjects) { 128 this.maxCountObjects = maxCountObjects; 129 } 130 131 /** 132 * Get the maximum number of bytes that this fullharvest will harvest per domain, 0 for no limit. 133 * 134 * @return Total download limit in bytes per domain. 135 */ 136 public long getMaxBytes() { 137 return maxBytes; 138 } 139 140 /** 141 * Set the limit for how many bytes this fullharvest will harvest per domain, or -1 for no limit. 142 * 143 * @param maxBytes Number of bytes to stop harvesting at. 144 */ 145 public void setMaxBytes(long maxBytes) { 146 this.maxBytes = maxBytes; 147 } 148 149 /** 150 * Returns an iterator of domain configurations for this harvest definition. Domains are filtered out if, on the 151 * previous harvest, they: 1) were completed 2) reached their maxBytes limit (and the maxBytes limit has not changed 152 * since time of harvest) 3) reached their maxObjects limit (and the maxObjects limit has not changed since time of 153 * harvest) 4) died uncleanly (e.g. due to a manual shutdown of heritrix) on their last harvest. 154 * <p> 155 * Domains are also excluded if they are aliases of another domain. 156 * 157 * @return Iterator containing information about the domain configurations 158 */ 159 public Iterator<DomainConfiguration> getDomainConfigurations() { 160 if (previousHarvestDefinitionOid == null) { 161 // The first snapshot harvest 162 return hdDaoProvider.get().getSnapShotConfigurations(); 163 } else { // An iterative snapshot harvest 164 return getDomainConfigurationsForIterativeHarvest(); 165 } 166 } 167 168 /** 169 * @return a iterator of DomainConfigurations not finished in previous SnapShot harvest 170 */ 171 public Iterator<DomainConfiguration> getDomainConfigurationsForIterativeHarvest() { 172 final DomainDAO dao = domainDAOProvider.get(); 173 final HarvestDefinition previousHd = getPreviousHarvestDefinition(); 174 boolean useAlternateMethod = Settings.getBoolean(HarvesterSettings.USE_ALTERNATE_SNAPSHOT_JOBGENERATION_METHOD); 175 log.debug("Retrieving a list of domainconfigurations to continue SnapshotHarvest HD #{}({}) in HD #{} ({}). Using alternative snapshot generation method='{}'", 176 previousHd.getOid(), previousHd.getName(), getOid(), getName(), useAlternateMethod); 177 if (useAlternateMethod) { 178 return getAlternativeSnapshotJobGenerationMethod(dao, previousHd); 179 } else { 180 return getExistingSnapshotJobGenerationMethod(dao, previousHd); 181 } 182 } 183 184 /** 185 * Implements the old way of finding the DomainConfigurations for a iterative snapshot harvest. 186 * It fetches all the HarvestInfo records for the previous harvest, and then checks for each record 187 * if the domain was fully harvested in the previous harvest. If it was, the domain is skipped in the next harvest. 188 * 189 * @param dao a DomainDAO object. 190 * @param previousHd the previousHD for this fullharvest 191 * @return a iterator of DomainConfigurations for a iterative snapshot harvest. 192 */ 193 private Iterator<DomainConfiguration> getExistingSnapshotJobGenerationMethod(final DomainDAO dao, final HarvestDefinition previousHd) { 194 log.info("Running existing method for finding domainconfigs for iterative harvest #{} continuing harvest #{}", getOid(), previousHd.getOid()); 195 // Get a iterator of what has been harvested in the previous harvestdefinition 196 Iterator<HarvestInfo> i = dao.getHarvestInfoBasedOnPreviousHarvestDefinition(previousHd); 197 log.info("Completed making iterator of HarvestInfo records from HD#{} to be used for HD#{}", previousHd.getOid(), getOid()); 198 return new FilterIterator<HarvestInfo, DomainConfiguration>(i) { 199 protected DomainConfiguration filter(HarvestInfo harvestInfo) { 200 201 if (harvestInfo.getStopReason() == StopReason.DOWNLOAD_COMPLETE 202 || harvestInfo.getStopReason() == StopReason.DOWNLOAD_UNFINISHED) { 203 // Don't include the ones that finished or died 204 // in an unclean fashion 205 return null; 206 } 207 208 DomainConfiguration config = getConfigurationFromPreviousHarvest(harvestInfo, dao); 209 if (harvestInfo.getStopReason() == StopReason.CONFIG_SIZE_LIMIT) { 210 // Check if MaxBytes limit for DomainConfiguration have 211 // been raised since previous harvest. 212 // If this is the case, return the configuration 213 int compare = NumberUtils.compareInf(config.getMaxBytes(), harvestInfo.getSizeDataRetrieved()); 214 if (compare < 1) { 215 return null; 216 } else { 217 return config; 218 } 219 } 220 221 if (harvestInfo.getStopReason() == StopReason.CONFIG_OBJECT_LIMIT) { 222 // Check if MaxObjects limit for DomainConfiguration have 223 // been raised since previous harvest. 224 // If this is the case, return the configuration 225 int compare = NumberUtils.compareInf(config.getMaxObjects(), harvestInfo.getCountObjectRetrieved()); 226 if (compare < 1) { 227 return null; 228 } else { 229 return config; 230 } 231 } 232 Domain d = dao.read(config.getDomainName()); 233 234 if (d.getAliasInfo() != null && !d.getAliasInfo().isExpired()) { 235 // Don't include aliases 236 return null; 237 } else { 238 return config; 239 } 240 } 241 }; 242 } 243 244 245 /** 246 * Implements a new way of finding the DomainConfigurations for a iterative snapshot harvest. 247 * It identifies the domains harvested in the previous harvest, and then looks up the harvestInfo for this domain for that harvest. 248 * @param dao a DomainDAO object. 249 * @param previousHD the previousHD for this fullharvest 250 * @return a iterator of DomainConfigurations for a iterative snapshot harvest. 251 */ 252 private Iterator<DomainConfiguration> getAlternativeSnapshotJobGenerationMethod(final DomainDAO dao, final HarvestDefinition previousHd) { 253 log.info("Running alternate method for finding domainconfigs for iterative harvest #{} continuing harvest #{}", getOid(), previousHd.getOid()); 254 Iterator<Domain> j = dao.getDomainsInSnapshotHarvestOrder(previousHd.getOid()); 255 return new FilterIterator<Domain, DomainConfiguration>(j) { 256 @Override 257 protected DomainConfiguration filter(Domain d) { 258 HarvestInfo harvestInfo = dao.getHarvestInfoForDomainInHarvest(previousHd, d); 259 if (harvestInfo == null) { // Domain not found in HarvestInfo 260 return null; 261 } 262 log.trace("Found harvestInfo for domain '{}'", d.getName()); 263 if (harvestInfo.getStopReason() == StopReason.DOWNLOAD_COMPLETE 264 || harvestInfo.getStopReason() == StopReason.DOWNLOAD_UNFINISHED) { 265 // Don't include the ones that finished or died 266 // in an unclean fashion 267 return null; 268 } 269 DomainConfiguration config = getConfigurationFromPreviousHarvest(harvestInfo, dao); 270 // Check if max_bytes was reached 271 if (harvestInfo.getStopReason() == StopReason.CONFIG_SIZE_LIMIT) { 272 // Check if MaxBytes limit for DomainConfiguration have 273 // been raised since previous harvest. 274 // If this is the case, return the configuration 275 int compare = NumberUtils.compareInf(config.getMaxBytes(), harvestInfo.getSizeDataRetrieved()); 276 if (compare < 1) { 277 return null; 278 } else { 279 return config; 280 } 281 } 282 if (harvestInfo.getStopReason() == StopReason.CONFIG_OBJECT_LIMIT) { 283 // Check if MaxObjects limit for DomainConfiguration have 284 // been raised since previous harvest. 285 // If this is the case, return the configuration 286 int compare = NumberUtils.compareInf(config.getMaxObjects(), harvestInfo.getCountObjectRetrieved()); 287 if (compare < 1) { 288 return null; 289 } else { 290 return config; 291 } 292 } 293 if (d.getAliasInfo() != null && !d.getAliasInfo().isExpired()) { 294 // Don't include aliases 295 return null; 296 } else { 297 return config; 298 } 299 } 300 }; 301 } 302 303 /** 304 * Get the configuration used in a previous harvest. If the configuration in the harvestinfo cannot be found 305 * (deleted), uses the default configuration. 306 * 307 * @param harvestInfo A harvest info object from a previous harvest. 308 * @param dao The dao to read configurations from. 309 * @return A configuration if found and the download in this harvestinfo was complete, null otherwise 310 */ 311 private DomainConfiguration getConfigurationFromPreviousHarvest(final HarvestInfo harvestInfo, DomainDAO dao) { 312 // For each bit of harvest info that did not complete 313 try { 314 Domain domain = dao.read(harvestInfo.getDomainName()); 315 // Read the domain 316 DomainConfiguration configuration; 317 // Read the configuration 318 try { 319 configuration = domain.getConfiguration(harvestInfo.getDomainConfigurationName()); 320 } catch (UnknownID e) { 321 // If the old configuration cannot be found, fall 322 // back on default configuration 323 configuration = domain.getDefaultConfiguration(); 324 log.debug( 325 "Previous configuration '{}' for harvesting domain '{}' not found. Using default '{}' instead.", 326 harvestInfo.getDomainConfigurationName(), harvestInfo.getDomainName(), configuration.getName(), 327 e); 328 } 329 // Add the configuration to the list to harvest 330 return configuration; 331 } catch (UnknownID e) { 332 // If the domain doesn't exist, warn 333 log.debug("Previously harvested domain '{}' no longer exists. Ignoring this domain.", 334 harvestInfo.getDomainName(), e); 335 } catch (IOFailure e) { 336 // If the domain can't be read, warn 337 log.debug("Previously harvested domain '{}' can't be read. Ignoring this domain.", 338 harvestInfo.getDomainName(), e); 339 } 340 return null; 341 } 342 343 /** 344 * Check if this harvest definition should be run, given the time now. 345 * 346 * @param now The current time 347 * @return true if harvest definition should be run 348 */ 349 public boolean runNow(Date now) { 350 return getActive() && (numEvents < 1); 351 } 352 353 /** 354 * Returns whether this HarvestDefinition represents a snapshot harvest. 355 * 356 * @return Returns true 357 */ 358 public boolean isSnapShot() { 359 return true; 360 } 361 362 /** 363 * @return Returns the max job running time 364 */ 365 public long getMaxJobRunningTime() { 366 return maxJobRunningTime; 367 } 368 369 /** 370 * Set the limit for how many seconds each crawljob in this fullharvest will run, or 0 for no limit. 371 * 372 * @param maxJobRunningtime max number of seconds 373 */ 374 public void setMaxJobRunningTime(long maxJobRunningtime) { 375 this.maxJobRunningTime = maxJobRunningtime; 376 } 377 378 /** 379 * Is index ready. Used to check, whether or a FullHarvest is ready for scheduling. The scheduling requires, that 380 * the deduplication index used by the jobs in the FullHarvest, has already been prepared by the IndexServer. 381 * 382 * @return true, if the deduplication index is ready. Otherwise false. 383 */ 384 public boolean getIndexReady() { 385 return this.indexReady; 386 } 387 388 /** 389 * Set the indexReady field. 390 * 391 * @param isIndexReady The new value of the indexReady field. 392 */ 393 public void setIndexReady(boolean isIndexReady) { 394 this.indexReady = isIndexReady; 395 } 396 397}