001/*
002 * #%L
003 * Netarchivesuite - harvester
004 * %%
005 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
006 *             the National Library of France and the Austrian National Library.
007 * %%
008 * This program is free software: you can redistribute it and/or modify
009 * it under the terms of the GNU Lesser General Public License as
010 * published by the Free Software Foundation, either version 2.1 of the
011 * License, or (at your option) any later version.
012 * 
013 * This program is distributed in the hope that it will be useful,
014 * but WITHOUT ANY WARRANTY; without even the implied warranty of
015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
016 * GNU General Lesser Public License for more details.
017 * 
018 * You should have received a copy of the GNU General Lesser Public
019 * License along with this program.  If not, see
020 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
021 * #L%
022 */
023package dk.netarkivet.harvester.datamodel;
024
025import java.util.Date;
026import java.util.Iterator;
027
028import javax.inject.Provider;
029
030import org.slf4j.Logger;
031import org.slf4j.LoggerFactory;
032
033import dk.netarkivet.common.exceptions.ArgumentNotValid;
034import dk.netarkivet.common.exceptions.IOFailure;
035import dk.netarkivet.common.exceptions.UnknownID;
036import dk.netarkivet.common.utils.FilterIterator;
037import dk.netarkivet.harvester.datamodel.extendedfield.ExtendedFieldDAO;
038
039/**
040 * This class contains the specific properties and operations of snapshot harvest definitions.
041 */
042public class FullHarvest extends HarvestDefinition {
043
044    /** The class logger. */
045    private static final Logger log = LoggerFactory.getLogger(FullHarvest.class);
046
047    /** The maximum number of objects retrieved from each domain during a snapshot harvest. */
048    private long maxCountObjects;
049
050    /** The maximum number of bytes retrieved from each domain during a snapshot harvest. */
051    private long maxBytes;
052
053    /** The maximum time in seconds to run for each job generated by this definition. */
054    private long maxJobRunningTime;
055
056    /** The ID for the harvestdefinition, this FullHarvest is based upon. */
057    private Long previousHarvestDefinitionOid;
058
059    /** a boolean to indicate whether the deduplication index is ready. */
060    private boolean indexReady;
061
062    private final Provider<HarvestDefinitionDAO> hdDaoProvider;
063    private final Provider<JobDAO> jobDaoProvider; // Not used
064    private final Provider<DomainDAO> domainDAOProvider;
065
066    /**
067     * Create new instance of FullHarvest configured according to the properties of the supplied DomainConfiguration.
068     * Should only be used by the HarvestFactory class.
069     *
070     * @param harvestDefName the name of the harvest definition
071     * @param comments comments
072     * @param previousHarvestDefinitionOid This harvestDefinition is used to create this Fullharvest definition.
073     * @param maxCountObjects Limit for how many objects can be fetched per domain
074     * @param maxBytes Limit for how many bytes can be fetched per domain
075     * @param maxJobRunningTime Limit on how much time can be spent on each job. 0 means no limit
076     * @param isIndexReady Is the deduplication index ready for this harvest.
077     */
078    public FullHarvest(String harvestDefName, String comments, Long previousHarvestDefinitionOid, long maxCountObjects,
079            long maxBytes, long maxJobRunningTime, boolean isIndexReady, Provider<HarvestDefinitionDAO> hdDaoProvider,
080            Provider<JobDAO> jobDaoProvider, Provider<ExtendedFieldDAO> extendedFieldDAOProvide,
081            Provider<DomainDAO> domainDAOProvider) {
082        super(extendedFieldDAOProvide);
083        ArgumentNotValid.checkNotNullOrEmpty(harvestDefName, "harvestDefName");
084        ArgumentNotValid.checkNotNull(comments, "comments");
085        this.previousHarvestDefinitionOid = previousHarvestDefinitionOid;
086        this.harvestDefName = harvestDefName;
087        this.comments = comments;
088        this.maxCountObjects = maxCountObjects;
089        this.numEvents = 0;
090        this.maxBytes = maxBytes;
091        this.maxJobRunningTime = maxJobRunningTime;
092        this.indexReady = isIndexReady;
093        this.hdDaoProvider = hdDaoProvider;
094        this.jobDaoProvider = jobDaoProvider;
095        this.domainDAOProvider = domainDAOProvider;
096    }
097
098    /**
099     * Get the previous HarvestDefinition which is used to base this.
100     *
101     * @return The previous HarvestDefinition
102     */
103    public HarvestDefinition getPreviousHarvestDefinition() {
104        if (previousHarvestDefinitionOid != null) {
105            return hdDaoProvider.get().read(previousHarvestDefinitionOid);
106        }
107        return null;
108    }
109
110    /**
111     * Set the previous HarvestDefinition which is used to base this.
112     *
113     * @param prev The id of a HarvestDefinition
114     */
115    public void setPreviousHarvestDefinition(Long prev) {
116        previousHarvestDefinitionOid = prev;
117    }
118
119    /** @return Returns the maxCountObjects. */
120    public long getMaxCountObjects() {
121        return maxCountObjects;
122    }
123
124    /** @param maxCountObjects The maxCountObjects to set. */
125    public void setMaxCountObjects(long maxCountObjects) {
126        this.maxCountObjects = maxCountObjects;
127    }
128
129    /**
130     * Get the maximum number of bytes that this fullharvest will harvest per domain, 0 for no limit.
131     *
132     * @return Total download limit in bytes per domain.
133     */
134    public long getMaxBytes() {
135        return maxBytes;
136    }
137
138    /**
139     * Set the limit for how many bytes this fullharvest will harvest per domain, or -1 for no limit.
140     *
141     * @param maxBytes Number of bytes to stop harvesting at.
142     */
143    public void setMaxBytes(long maxBytes) {
144        this.maxBytes = maxBytes;
145    }
146
147    /**
148     * Returns an iterator of domain configurations for this harvest definition. Domains are filtered out if, on the
149     * previous harvest, they: 1) were completed 2) reached their maxBytes limit (and the maxBytes limit has not changed
150     * since time of harvest) 3) reached their maxObjects limit (and the maxObjects limit has not changed since time of
151     * harvest) 4) died uncleanly (e.g. due to a manual shutdown of heritrix) on their last harvest.
152     * <p>
153     * Domains are also excluded if they are aliases of another domain.
154     *
155     * @return Iterator containing information about the domain configurations
156     */
157    public Iterator<DomainConfiguration> getDomainConfigurations() {
158        if (previousHarvestDefinitionOid == null) {
159            // The first snapshot harvest
160            //HarvestDefinitionDAO hdDao = HarvestDefinitionDAO.getInstance();
161            return hdDaoProvider.get().getSnapShotConfigurations();
162        }
163
164        // An iterative snapshot harvest
165        final DomainDAO dao = domainDAOProvider.get();
166        // Get what has been harvested
167        Iterator<HarvestInfo> i = dao.getHarvestInfoBasedOnPreviousHarvestDefinition(getPreviousHarvestDefinition());
168        //
169        return new FilterIterator<HarvestInfo, DomainConfiguration>(i) {
170            protected DomainConfiguration filter(HarvestInfo harvestInfo) {
171
172                if (harvestInfo.getStopReason() == StopReason.DOWNLOAD_COMPLETE
173                        || harvestInfo.getStopReason() == StopReason.DOWNLOAD_UNFINISHED) {
174                    // Don't include the ones that finished or died
175                    // in an unclean fashion
176                    return null;
177                }
178
179                DomainConfiguration config = getConfigurationFromPreviousHarvest(harvestInfo, dao);
180                if (harvestInfo.getStopReason() == StopReason.CONFIG_SIZE_LIMIT) {
181                    // Check if MaxBytes limit for DomainConfiguration have
182                    // been raised since previous harvest.
183                    // If this is the case, return the configuration
184                    int compare = NumberUtils.compareInf(config.getMaxBytes(), harvestInfo.getSizeDataRetrieved());
185                    if (compare < 1) {
186                        return null;
187                    } else {
188                        return config;
189                    }
190                }
191
192                if (harvestInfo.getStopReason() == StopReason.CONFIG_OBJECT_LIMIT) {
193                    // Check if MaxObjects limit for DomainConfiguration have
194                    // been raised since previous harvest.
195                    // If this is the case, return the configuration
196                    int compare = NumberUtils.compareInf(config.getMaxObjects(), harvestInfo.getCountObjectRetrieved());
197                    if (compare < 1) {
198                        return null;
199                    } else {
200                        return config;
201                    }
202                }
203                Domain d = dao.read(config.getDomainName());
204
205                if (d.getAliasInfo() != null && !d.getAliasInfo().isExpired()) {
206                    // Don't include aliases
207                    return null;
208                } else {
209                    return config;
210                }
211            }
212        };
213    }
214
215    /**
216     * Get the configuration used in a previous harvest. If the configuration in the harvestinfo cannot be found
217     * (deleted), uses the default configuration.
218     *
219     * @param harvestInfo A harvest info object from a previous harvest.
220     * @param dao The dao to read configurations from.
221     * @return A configuration if found and the download in this harvestinfo was complete, null otherwise
222     */
223    private DomainConfiguration getConfigurationFromPreviousHarvest(final HarvestInfo harvestInfo, DomainDAO dao) {
224        // For each bit of harvest info that did not complete
225        try {
226            Domain domain = dao.read(harvestInfo.getDomainName());
227            // Read the domain
228            DomainConfiguration configuration;
229            // Read the configuration
230            try {
231                configuration = domain.getConfiguration(harvestInfo.getDomainConfigurationName());
232            } catch (UnknownID e) {
233                // If the old configuration cannot be found, fall
234                // back on default configuration
235                configuration = domain.getDefaultConfiguration();
236                log.debug(
237                        "Previous configuration '{}' for harvesting domain '{}' not found. Using default '{}' instead.",
238                        harvestInfo.getDomainConfigurationName(), harvestInfo.getDomainName(), configuration.getName(),
239                        e);
240            }
241            // Add the configuration to the list to harvest
242            return configuration;
243        } catch (UnknownID e) {
244            // If the domain doesn't exist, warn
245            log.debug("Previously harvested domain '{}' no longer exists. Ignoring this domain.",
246                    harvestInfo.getDomainName(), e);
247        } catch (IOFailure e) {
248            // If the domain can't be read, warn
249            log.debug("Previously harvested domain '{}' can't be read. Ignoring this domain.",
250                    harvestInfo.getDomainName(), e);
251        }
252        return null;
253    }
254
255    /**
256     * Check if this harvest definition should be run, given the time now.
257     *
258     * @param now The current time
259     * @return true if harvest definition should be run
260     */
261    public boolean runNow(Date now) {
262        return getActive() && (numEvents < 1);
263    }
264
265    /**
266     * Returns whether this HarvestDefinition represents a snapshot harvest.
267     *
268     * @return Returns true
269     */
270    public boolean isSnapShot() {
271        return true;
272    }
273
274    /**
275     * @return Returns the max job running time
276     */
277    public long getMaxJobRunningTime() {
278        return maxJobRunningTime;
279    }
280
281    /**
282     * Set the limit for how many seconds each crawljob in this fullharvest will run, or 0 for no limit.
283     *
284     * @param maxJobRunningtime max number of seconds
285     */
286    public void setMaxJobRunningTime(long maxJobRunningtime) {
287        this.maxJobRunningTime = maxJobRunningtime;
288    }
289
290    /**
291     * Is index ready. Used to check, whether or a FullHarvest is ready for scheduling. The scheduling requires, that
292     * the deduplication index used by the jobs in the FullHarvest, has already been prepared by the IndexServer.
293     *
294     * @return true, if the deduplication index is ready. Otherwise false.
295     */
296    public boolean getIndexReady() {
297        return this.indexReady;
298    }
299
300    /**
301     * Set the indexReady field.
302     *
303     * @param isIndexReady The new value of the indexReady field.
304     */
305    public void setIndexReady(boolean isIndexReady) {
306        this.indexReady = isIndexReady;
307    }
308
309}