001/*
002 * #%L
003 * Netarchivesuite - harvester
004 * %%
005 * Copyright (C) 2005 - 2018 The Royal Danish Library, 
006 *             the National Library of France and the Austrian National Library.
007 * %%
008 * This program is free software: you can redistribute it and/or modify
009 * it under the terms of the GNU Lesser General Public License as
010 * published by the Free Software Foundation, either version 2.1 of the
011 * License, or (at your option) any later version.
012 * 
013 * This program is distributed in the hope that it will be useful,
014 * but WITHOUT ANY WARRANTY; without even the implied warranty of
015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
016 * GNU General Lesser Public License for more details.
017 * 
018 * You should have received a copy of the GNU General Lesser Public
019 * License along with this program.  If not, see
020 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
021 * #L%
022 */
023package dk.netarkivet.harvester.datamodel;
024
025import java.util.Date;
026import java.util.Iterator;
027
028import javax.inject.Provider;
029
030import org.slf4j.Logger;
031import org.slf4j.LoggerFactory;
032
033import dk.netarkivet.common.exceptions.ArgumentNotValid;
034import dk.netarkivet.common.exceptions.IOFailure;
035import dk.netarkivet.common.exceptions.UnknownID;
036import dk.netarkivet.common.utils.FilterIterator;
037import dk.netarkivet.common.utils.Settings;
038import dk.netarkivet.harvester.HarvesterSettings;
039import dk.netarkivet.harvester.datamodel.extendedfield.ExtendedFieldDAO;
040
041/**
042 * This class contains the specific properties and operations of snapshot harvest definitions.
043 */
044public class FullHarvest extends HarvestDefinition {
045
046    /** The class logger. */
047    private static final Logger log = LoggerFactory.getLogger(FullHarvest.class);
048
049    /** The maximum number of objects retrieved from each domain during a snapshot harvest. */
050    private long maxCountObjects;
051
052    /** The maximum number of bytes retrieved from each domain during a snapshot harvest. */
053    private long maxBytes;
054
055    /** The maximum time in seconds to run for each job generated by this definition. */
056    private long maxJobRunningTime;
057
058    /** The ID for the harvestdefinition, this FullHarvest is based upon. */
059    private Long previousHarvestDefinitionOid;
060
061    /** a boolean to indicate whether the deduplication index is ready. */
062    private boolean indexReady;
063
064    private final Provider<HarvestDefinitionDAO> hdDaoProvider;
065    private final Provider<JobDAO> jobDaoProvider; // Not used
066    private final Provider<DomainDAO> domainDAOProvider;
067
068    /**
069     * Create new instance of FullHarvest configured according to the properties of the supplied DomainConfiguration.
070     * Should only be used by the HarvestFactory class.
071     *
072     * @param harvestDefName the name of the harvest definition
073     * @param comments comments
074     * @param previousHarvestDefinitionOid This harvestDefinition is used to create this Fullharvest definition.
075     * @param maxCountObjects Limit for how many objects can be fetched per domain
076     * @param maxBytes Limit for how many bytes can be fetched per domain
077     * @param maxJobRunningTime Limit on how much time can be spent on each job. 0 means no limit
078     * @param isIndexReady Is the deduplication index ready for this harvest.
079     */
080    public FullHarvest(String harvestDefName, String comments, Long previousHarvestDefinitionOid, long maxCountObjects,
081            long maxBytes, long maxJobRunningTime, boolean isIndexReady, Provider<HarvestDefinitionDAO> hdDaoProvider,
082            Provider<JobDAO> jobDaoProvider, Provider<ExtendedFieldDAO> extendedFieldDAOProvide,
083            Provider<DomainDAO> domainDAOProvider) {
084        super(extendedFieldDAOProvide);
085        ArgumentNotValid.checkNotNullOrEmpty(harvestDefName, "harvestDefName");
086        ArgumentNotValid.checkNotNull(comments, "comments");
087        this.previousHarvestDefinitionOid = previousHarvestDefinitionOid;
088        this.harvestDefName = harvestDefName;
089        this.comments = comments;
090        this.maxCountObjects = maxCountObjects;
091        this.numEvents = 0;
092        this.maxBytes = maxBytes;
093        this.maxJobRunningTime = maxJobRunningTime;
094        this.indexReady = isIndexReady;
095        this.hdDaoProvider = hdDaoProvider;
096        this.jobDaoProvider = jobDaoProvider;
097        this.domainDAOProvider = domainDAOProvider;
098    }
099
100    /**
101     * Get the previous HarvestDefinition which is used to base this.
102     *
103     * @return The previous HarvestDefinition
104     */
105    public HarvestDefinition getPreviousHarvestDefinition() {
106        if (previousHarvestDefinitionOid != null) {
107            return hdDaoProvider.get().read(previousHarvestDefinitionOid);
108        }
109        return null;
110    }
111
112    /**
113     * Set the previous HarvestDefinition which is used to base this.
114     *
115     * @param prev The id of a HarvestDefinition
116     */
117    public void setPreviousHarvestDefinition(Long prev) {
118        previousHarvestDefinitionOid = prev;
119    }
120
121    /** @return Returns the maxCountObjects. */
122    public long getMaxCountObjects() {
123        return maxCountObjects;
124    }
125
126    /** @param maxCountObjects The maxCountObjects to set. */
127    public void setMaxCountObjects(long maxCountObjects) {
128        this.maxCountObjects = maxCountObjects;
129    }
130
131    /**
132     * Get the maximum number of bytes that this fullharvest will harvest per domain, 0 for no limit.
133     *
134     * @return Total download limit in bytes per domain.
135     */
136    public long getMaxBytes() {
137        return maxBytes;
138    }
139
140    /**
141     * Set the limit for how many bytes this fullharvest will harvest per domain, or -1 for no limit.
142     *
143     * @param maxBytes Number of bytes to stop harvesting at.
144     */
145    public void setMaxBytes(long maxBytes) {
146        this.maxBytes = maxBytes;
147    }
148
149    /**
150     * Returns an iterator of domain configurations for this harvest definition. Domains are filtered out if, on the
151     * previous harvest, they: 1) were completed 2) reached their maxBytes limit (and the maxBytes limit has not changed
152     * since time of harvest) 3) reached their maxObjects limit (and the maxObjects limit has not changed since time of
153     * harvest) 4) died uncleanly (e.g. due to a manual shutdown of heritrix) on their last harvest.
154     * <p>
155     * Domains are also excluded if they are aliases of another domain.
156     *
157     * @return Iterator containing information about the domain configurations
158     */
159    public Iterator<DomainConfiguration> getDomainConfigurations() {
160        if (previousHarvestDefinitionOid == null) {
161            // The first snapshot harvest
162            return hdDaoProvider.get().getSnapShotConfigurations();
163        } else { // An iterative snapshot harvest
164            return getDomainConfigurationsForIterativeHarvest();
165        }
166    }
167    
168    /**
169     * @return a iterator of DomainConfigurations not finished in previous SnapShot harvest  
170     */
171    public Iterator<DomainConfiguration> getDomainConfigurationsForIterativeHarvest() {
172        final DomainDAO dao = domainDAOProvider.get();
173        final HarvestDefinition previousHd = getPreviousHarvestDefinition();
174        boolean useAlternateMethod = Settings.getBoolean(HarvesterSettings.USE_ALTERNATE_SNAPSHOT_JOBGENERATION_METHOD);
175        log.debug("Retrieving a list of domainconfigurations to continue SnapshotHarvest HD #{}({}) in HD #{} ({}). Using alternative snapshot generation method='{}'", 
176                previousHd.getOid(), previousHd.getName(), getOid(), getName(), useAlternateMethod);
177        if (useAlternateMethod) {
178            return getAlternativeSnapshotJobGenerationMethod(dao, previousHd);
179        } else {
180            return getExistingSnapshotJobGenerationMethod(dao, previousHd);
181        }
182    }
183    
184    /**
185     * Implements the old way of finding the DomainConfigurations for a iterative snapshot harvest.
186     * It fetches all the HarvestInfo records for the previous harvest, and then checks for each record 
187     * if the domain was fully harvested in the previous harvest. If it was, the domain is skipped in the next harvest.
188     * 
189     * @param dao a DomainDAO object.
190     * @param previousHd the previousHD for this fullharvest
191     * @return a iterator of DomainConfigurations for a iterative snapshot harvest.
192     */
193    private Iterator<DomainConfiguration> getExistingSnapshotJobGenerationMethod(final DomainDAO dao, final HarvestDefinition previousHd) {
194        log.info("Running existing method for finding domainconfigs for iterative harvest #{} continuing harvest #{}", getOid(), previousHd.getOid());
195        // Get a iterator of what has been harvested in the previous harvestdefinition
196        Iterator<HarvestInfo> i = dao.getHarvestInfoBasedOnPreviousHarvestDefinition(previousHd);
197        log.info("Completed making iterator of HarvestInfo records from HD#{} to be used for HD#{}", previousHd.getOid(),  getOid());
198        return new FilterIterator<HarvestInfo, DomainConfiguration>(i) {
199            protected DomainConfiguration filter(HarvestInfo harvestInfo) {
200
201                if (harvestInfo.getStopReason() == StopReason.DOWNLOAD_COMPLETE
202                        || harvestInfo.getStopReason() == StopReason.DOWNLOAD_UNFINISHED) {
203                    // Don't include the ones that finished or died
204                    // in an unclean fashion
205                    return null;
206                }
207
208                DomainConfiguration config = getConfigurationFromPreviousHarvest(harvestInfo, dao);
209                if (harvestInfo.getStopReason() == StopReason.CONFIG_SIZE_LIMIT) {
210                    // Check if MaxBytes limit for DomainConfiguration have
211                    // been raised since previous harvest.
212                    // If this is the case, return the configuration
213                    int compare = NumberUtils.compareInf(config.getMaxBytes(), harvestInfo.getSizeDataRetrieved());
214                    if (compare < 1) {
215                        return null;
216                    } else {
217                        return config;
218                    }
219                }
220
221                if (harvestInfo.getStopReason() == StopReason.CONFIG_OBJECT_LIMIT) {
222                    // Check if MaxObjects limit for DomainConfiguration have
223                    // been raised since previous harvest.
224                    // If this is the case, return the configuration
225                    int compare = NumberUtils.compareInf(config.getMaxObjects(), harvestInfo.getCountObjectRetrieved());
226                    if (compare < 1) {
227                        return null;
228                    } else {
229                        return config;
230                    }
231                }
232                Domain d = dao.read(config.getDomainName());
233
234                if (d.getAliasInfo() != null && !d.getAliasInfo().isExpired()) {
235                    // Don't include aliases
236                    return null;
237                } else {
238                    return config;
239                }
240            }
241        };
242    }
243   
244    
245    /**
246     * Implements a new way of finding the DomainConfigurations for a iterative snapshot harvest.
247     * It identifies the domains harvested in the previous harvest, and then looks up the harvestInfo for this domain for that harvest.
248     * @param dao a DomainDAO object.
249     * @param previousHD the previousHD for this fullharvest
250     * @return a iterator of DomainConfigurations for a iterative snapshot harvest.
251     */
252    private Iterator<DomainConfiguration> getAlternativeSnapshotJobGenerationMethod(final DomainDAO dao, final HarvestDefinition previousHd) {
253        log.info("Running alternate method for finding domainconfigs for iterative harvest #{} continuing harvest #{}", getOid(), previousHd.getOid());
254        Iterator<Domain> j = dao.getDomainsInSnapshotHarvestOrder(previousHd.getOid());
255        return new FilterIterator<Domain, DomainConfiguration>(j) {
256            @Override
257            protected DomainConfiguration filter(Domain d) {
258                HarvestInfo harvestInfo = dao.getHarvestInfoForDomainInHarvest(previousHd, d);
259                if (harvestInfo == null) { // Domain not found in HarvestInfo
260                    return null;
261                }
262                log.trace("Found harvestInfo for domain '{}'", d.getName());
263                if (harvestInfo.getStopReason() == StopReason.DOWNLOAD_COMPLETE
264                        || harvestInfo.getStopReason() == StopReason.DOWNLOAD_UNFINISHED) {
265                    // Don't include the ones that finished or died
266                    // in an unclean fashion
267                    return null;
268                }
269                DomainConfiguration config = getConfigurationFromPreviousHarvest(harvestInfo, dao);
270                // Check if max_bytes was reached
271                if (harvestInfo.getStopReason() == StopReason.CONFIG_SIZE_LIMIT) {
272                    // Check if MaxBytes limit for DomainConfiguration have
273                    // been raised since previous harvest.
274                    // If this is the case, return the configuration
275                    int compare = NumberUtils.compareInf(config.getMaxBytes(), harvestInfo.getSizeDataRetrieved());
276                    if (compare < 1) {
277                        return null;
278                    } else {
279                        return config;
280                    }
281                }
282                if (harvestInfo.getStopReason() == StopReason.CONFIG_OBJECT_LIMIT) {
283                    // Check if MaxObjects limit for DomainConfiguration have
284                    // been raised since previous harvest.
285                    // If this is the case, return the configuration
286                    int compare = NumberUtils.compareInf(config.getMaxObjects(), harvestInfo.getCountObjectRetrieved());
287                    if (compare < 1) {
288                        return null;
289                    } else {
290                        return config;
291                    }
292                }
293                if (d.getAliasInfo() != null && !d.getAliasInfo().isExpired()) {
294                    // Don't include aliases
295                    return null;
296                } else {
297                    return config;
298                }
299            }
300        };
301    }
302 
303    /**
304     * Get the configuration used in a previous harvest. If the configuration in the harvestinfo cannot be found
305     * (deleted), uses the default configuration.
306     *
307     * @param harvestInfo A harvest info object from a previous harvest.
308     * @param dao The dao to read configurations from.
309     * @return A configuration if found and the download in this harvestinfo was complete, null otherwise
310     */
311    private DomainConfiguration getConfigurationFromPreviousHarvest(final HarvestInfo harvestInfo, DomainDAO dao) {
312        // For each bit of harvest info that did not complete
313        try {
314            Domain domain = dao.read(harvestInfo.getDomainName());
315            // Read the domain
316            DomainConfiguration configuration;
317            // Read the configuration
318            try {
319                configuration = domain.getConfiguration(harvestInfo.getDomainConfigurationName());
320            } catch (UnknownID e) {
321                // If the old configuration cannot be found, fall
322                // back on default configuration
323                configuration = domain.getDefaultConfiguration();
324                log.debug(
325                        "Previous configuration '{}' for harvesting domain '{}' not found. Using default '{}' instead.",
326                        harvestInfo.getDomainConfigurationName(), harvestInfo.getDomainName(), configuration.getName(),
327                        e);
328            }
329            // Add the configuration to the list to harvest
330            return configuration;
331        } catch (UnknownID e) {
332            // If the domain doesn't exist, warn
333            log.debug("Previously harvested domain '{}' no longer exists. Ignoring this domain.",
334                    harvestInfo.getDomainName(), e);
335        } catch (IOFailure e) {
336            // If the domain can't be read, warn
337            log.debug("Previously harvested domain '{}' can't be read. Ignoring this domain.",
338                    harvestInfo.getDomainName(), e);
339        }
340        return null;
341    }
342
343    /**
344     * Check if this harvest definition should be run, given the time now.
345     *
346     * @param now The current time
347     * @return true if harvest definition should be run
348     */
349    public boolean runNow(Date now) {
350        return getActive() && (numEvents < 1);
351    }
352
353    /**
354     * Returns whether this HarvestDefinition represents a snapshot harvest.
355     *
356     * @return Returns true
357     */
358    public boolean isSnapShot() {
359        return true;
360    }
361
362    /**
363     * @return Returns the max job running time
364     */
365    public long getMaxJobRunningTime() {
366        return maxJobRunningTime;
367    }
368
369    /**
370     * Set the limit for how many seconds each crawljob in this fullharvest will run, or 0 for no limit.
371     *
372     * @param maxJobRunningtime max number of seconds
373     */
374    public void setMaxJobRunningTime(long maxJobRunningtime) {
375        this.maxJobRunningTime = maxJobRunningtime;
376    }
377
378    /**
379     * Is index ready. Used to check, whether or a FullHarvest is ready for scheduling. The scheduling requires, that
380     * the deduplication index used by the jobs in the FullHarvest, has already been prepared by the IndexServer.
381     *
382     * @return true, if the deduplication index is ready. Otherwise false.
383     */
384    public boolean getIndexReady() {
385        return this.indexReady;
386    }
387
388    /**
389     * Set the indexReady field.
390     *
391     * @param isIndexReady The new value of the indexReady field.
392     */
393    public void setIndexReady(boolean isIndexReady) {
394        this.indexReady = isIndexReady;
395    }
396
397}