001/*
002 * #%L
003 * Netarchivesuite - harvester
004 * %%
005 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
006 *             the National Library of France and the Austrian National Library.
007 * %%
008 * This program is free software: you can redistribute it and/or modify
009 * it under the terms of the GNU Lesser General Public License as
010 * published by the Free Software Foundation, either version 2.1 of the
011 * License, or (at your option) any later version.
012 * 
013 * This program is distributed in the hope that it will be useful,
014 * but WITHOUT ANY WARRANTY; without even the implied warranty of
015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
016 * GNU General Lesser Public License for more details.
017 * 
018 * You should have received a copy of the GNU General Lesser Public
019 * License along with this program.  If not, see
020 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
021 * #L%
022 */
023package dk.netarkivet.harvester.datamodel;
024
025import java.util.ArrayList;
026import java.util.Iterator;
027import java.util.List;
028
029import org.slf4j.Logger;
030import org.slf4j.LoggerFactory;
031
032import dk.netarkivet.common.exceptions.ArgumentNotValid;
033import dk.netarkivet.common.exceptions.PermissionDenied;
034import dk.netarkivet.common.exceptions.UnknownID;
035import dk.netarkivet.common.utils.Named;
036import dk.netarkivet.common.utils.Settings;
037import dk.netarkivet.harvester.HarvesterSettings;
038
039/**
040 * This class describes a configuration for harvesting a domain. It combines a number of seedlists, a number of
041 * passwords, an order template, and some specialised settings to define the way to harvest a domain.
042 */
043public class DomainConfiguration implements Named {
044
045    /** The class logger. */
046    private static final Logger log = LoggerFactory.getLogger(DomainConfiguration.class);
047
048    /** The name of the configuration. */
049    private String configName;
050    /** The name of the order.xml (Heritrix template) used by this configuration. */
051    private String orderXmlName = "";
052    /** maximum number of objects harvested for this configuration in a snapshot harvest. */
053    private long maxObjects;
054    /** The maximum request rate. */
055    private int maxRequestRate;
056    /** Maximum number of bytes to download in a harvest. */
057    private long maxBytes;
058    /** The domain associated with this configuration. */
059    private String domainName;
060
061    /** The list of seedlists. */
062    private List<SeedList> seedlists;
063
064    /** The list of passwords that apply in this configuration. */
065    private List<Password> passwords;
066    /** The comments associated with this configuration. */
067    private String comments;
068
069    /** ID autogenerated by DB. */
070    private Long id;
071
072    /** The domainhistory associated with the domain. */
073    private DomainHistory domainhistory;
074
075    /** The crawlertraps associated with the domain. */
076    private List<String> crawlertraps;
077
078    /**
079     * How many objects should be harvested in a harvest to trust that our expected size of objects is less than the
080     * default number.
081     */
082    private static final long MIN_OBJECTS_TO_TRUST_SMALL_EXPECTATION = 50L;
083    /** The smallest number of bytes we accept per object. */
084    private static final int MIN_EXPECTATION = 1;
085
086    /**
087     * Create a new configuration for a domain.
088     *
089     * @param theConfigName The name of this configuration
090     * @param domain The domain that this configuration is for
091     * @param seedlists Seedlists to use in this configuration.
092     * @param passwords Passwords to use in this configuration.
093     */
094    public DomainConfiguration(String theConfigName, Domain domain, List<SeedList> seedlists, List<Password> passwords) {
095        this(theConfigName, domain.getName(), domain.getHistory(), domain.getCrawlerTraps(), seedlists, passwords);
096    }
097
098    /**
099     * Alternate constructor. TODO Filter all history not relevant for this configuration
100     *
101     * @param theConfigName theConfigName The name of this configuration
102     * @param domainName The name of the domain that this configuration is for
103     * @param history The domainhistory of the given domain
104     * @param crawlertraps The crawlertraps of the given domain
105     * @param seedlists Seedlists to use in this configuration
106     * @param passwords Passwords to use in this configuration.
107     */
108    public DomainConfiguration(String theConfigName, String domainName, DomainHistory history,
109            List<String> crawlertraps, List<SeedList> seedlists, List<Password> passwords) {
110        ArgumentNotValid.checkNotNullOrEmpty(theConfigName, "theConfigName");
111        ArgumentNotValid.checkNotNullOrEmpty(domainName, "domainName");
112        ArgumentNotValid.checkNotNull(passwords, "passwords");
113        ArgumentNotValid.checkNotNullOrEmpty(seedlists, "seedlists");
114
115        this.configName = theConfigName;
116        this.domainName = domainName;
117        this.domainhistory = history; // TODO Filter all history not relevant
118        // for this configuration
119        this.crawlertraps = crawlertraps;
120        this.seedlists = seedlists;
121        this.passwords = passwords;
122        this.comments = "";
123        this.maxRequestRate = Constants.DEFAULT_MAX_REQUEST_RATE;
124        this.maxObjects = Constants.DEFAULT_MAX_OBJECTS;
125        this.maxBytes = Constants.DEFAULT_MAX_BYTES;
126    }
127
128    /**
129     * Specify the name of the order.xml template to use.
130     *
131     * @param ordername order.xml template name
132     * @throws ArgumentNotValid if filename null or empty
133     */
134    public void setOrderXmlName(String ordername) {
135        ArgumentNotValid.checkNotNullOrEmpty(ordername, "ordername");
136        orderXmlName = ordername;
137    }
138
139    /**
140     * Specify the maximum number of objects to retrieve from the domain.
141     *
142     * @param max maximum number of objects to retrieve
143     * @throws ArgumentNotValid if max<-1
144     */
145    public void setMaxObjects(long max) {
146        if (max < -MIN_EXPECTATION) {
147            String msg = "maxObjects must be either -1 or positive, but was " + max;
148            log.debug(msg);
149            throw new ArgumentNotValid(msg);
150        }
151
152        maxObjects = max;
153    }
154
155    /**
156     * Specify the maximum request rate to use when harvesting data.
157     *
158     * @param maxrate the maximum request rate
159     * @throws ArgumentNotValid if maxrate<0
160     */
161    public void setMaxRequestRate(int maxrate) {
162        ArgumentNotValid.checkNotNegative(maxrate, "maxrate");
163
164        maxRequestRate = maxrate;
165    }
166
167    /**
168     * Specify the maximum number of bytes to download from a domain in a single harvest.
169     *
170     * @param maxBytes Maximum number of bytes to download, or -1 for no limit.
171     * @throws ArgumentNotValid if maxBytes < -1
172     */
173    public void setMaxBytes(long maxBytes) {
174        if (maxBytes < -MIN_EXPECTATION) {
175            String msg = "DomainConfiguration.maxBytes must be -1 or positive.";
176            log.debug(msg);
177            throw new ArgumentNotValid(msg);
178        }
179        this.maxBytes = maxBytes;
180    }
181
182    /**
183     * Get the configuration name.
184     *
185     * @return the configuration name
186     */
187    public String getName() {
188        return configName;
189    }
190
191    /**
192     * Returns comments.
193     *
194     * @return string containing comments
195     */
196    public String getComments() {
197        return comments;
198    }
199
200    /**
201     * Returns the name of the order xml file used by the domain.
202     *
203     * @return name of the order.xml file that should be used when harvesting the domain
204     */
205    public String getOrderXmlName() {
206        return orderXmlName;
207    }
208
209    /**
210     * Returns the maximum number of objects to harvest from the domain.
211     *
212     * @return maximum number of objects to harvest
213     */
214    public long getMaxObjects() {
215        return maxObjects;
216    }
217
218    /**
219     * Returns the maximum request rate to use when harvesting the domain.
220     *
221     * @return maximum request rate
222     */
223    public int getMaxRequestRate() {
224        return maxRequestRate;
225    }
226
227    /**
228     * Returns the maximum number of bytes to download during a single harvest of a domain.
229     *
230     * @return Maximum bytes limit, or -1 for no limit.
231     */
232    public long getMaxBytes() {
233        return maxBytes;
234    }
235
236    /**
237     * Returns the name of the domain aggregating this configuration.
238     *
239     * @return the name of the domain aggregating this configuration.
240     */
241    public String getDomainName() {
242        return domainName;
243    }
244
245    /**
246     * Get an iterator of seedlists used in this configuration.
247     *
248     * @return seedlists as iterator
249     */
250    public Iterator<SeedList> getSeedLists() {
251        return seedlists.iterator();
252    }
253
254    /**
255     * Add a new seedlist to the configuration. Must exist in the associated domain and the equal to that seedlist.
256     *
257     * @param seedlist the seedlist to add
258     * @param domain The domain to check if the seedlist exists
259     * @throws ArgumentNotValid if the seedlist is null
260     * @throws UnknownID if the seedlist is not defined on the domain
261     * @throws PermissionDenied if the seedlist is different from the one on the domain.
262     */
263    public void addSeedList(Domain domain, SeedList seedlist) {
264        ArgumentNotValid.checkNotNull(seedlist, "seedlist");
265        SeedList domainSeedlist = domain.getSeedList(seedlist.getName());
266        if (domainSeedlist == null || !domainSeedlist.equals(seedlist)) {
267            String message = "Cannot add seedlist " + seedlist + " to " + this + " as it differs from the one defined "
268                    + "for " + domain + ": " + domainSeedlist;
269            log.debug(message);
270            throw new PermissionDenied(message);
271        }
272        seedlists.add(domainSeedlist);
273    }
274
275    /**
276     * Sets the used seedlists to the given list. Note: list is copied.
277     *
278     * @param newSeedlists The seedlists to use.
279     * @param domain The domain where the seedlists should come from
280     * @throws ArgumentNotValid if the seedslists are null
281     */
282    public void setSeedLists(Domain domain, List<SeedList> newSeedlists) {
283        ArgumentNotValid.checkNotNull(newSeedlists, "newSeedlists");
284        this.seedlists = new ArrayList<SeedList>(newSeedlists.size());
285        for (SeedList s : newSeedlists) {
286            addSeedList(domain, s);
287        }
288    }
289
290    /**
291     * Get an iterator of passwords used in this configuration.
292     *
293     * @return The passwords in an iterator
294     */
295    public Iterator<Password> getPasswords() {
296        return passwords.iterator();
297    }
298
299    /**
300     * Add password to the configuration.
301     *
302     * @param password to add (must exist in the domain)
303     * @param domain the domain where the password should come from.
304     */
305    public void addPassword(Domain domain, Password password) {
306        ArgumentNotValid.checkNotNull(password, "password");
307        Password domainPassword = domain.getPassword(password.getName());
308        if (!domainPassword.equals(password)) {
309            String message = "Cannot add password " + password + " to " + this + " as it differs from the one defined "
310                    + "for " + domain + ": " + domainPassword;
311            log.debug(message);
312            throw new PermissionDenied(message);
313        }
314        passwords.add(domainPassword);
315    }
316
317    /**
318     * Gets the best expectation for how many objects a harvest using this configuration will retrieve, given a job with
319     * a maximum limit pr. domain
320     *
321     * @param objectLimit The maximum limit, or Constants.HERITRIX_MAXOBJECTS_INFINITY for no limit. This limit
322     * overrides the limit set on the configuration, unless override is in effect.
323     * @param byteLimit The maximum number of bytes that will be used as limit in the harvest. This limit overrides the
324     * limit set on the configuration, unless override is in effect.
325     * @return The expected number of objects.
326     */
327    public long getExpectedNumberOfObjects(long objectLimit, long byteLimit) {
328        long prevresultfactor = Settings.getLong(HarvesterSettings.ERRORFACTOR_PERMITTED_PREVRESULT);
329        HarvestInfo best = DomainHistory.getBestHarvestInfoExpectation(configName, this.domainhistory);
330
331        log.trace("Using domain info '{}' for configuration '{}'", best, toString());
332
333        long expectedObjectSize = getExpectedBytesPerObject(best);
334        // The maximum number of objects that the maxBytes or MAX_DOMAIN_SIZE
335        // setting gives.
336        long maximum;
337        if (objectLimit != Constants.HERITRIX_MAXOBJECTS_INFINITY || byteLimit != Constants.HERITRIX_MAXBYTES_INFINITY) {
338            maximum = minObjectsBytesLimit(objectLimit, byteLimit, expectedObjectSize);
339        } else if (maxObjects != Constants.HERITRIX_MAXOBJECTS_INFINITY
340                || maxBytes != Constants.HERITRIX_MAXBYTES_INFINITY) {
341            maximum = minObjectsBytesLimit(maxObjects, maxBytes, expectedObjectSize);
342        } else {
343            maximum = Settings.getLong(HarvesterSettings.MAX_DOMAIN_SIZE);
344        }
345        // get last number of objects harvested
346        long minimum;
347        if (best != null) {
348            minimum = best.getCountObjectRetrieved();
349        } else {
350            minimum = NumberUtils.minInf(Constants.HERITRIX_MAXOBJECTS_INFINITY, maxObjects);
351        }
352        // Calculate the expected number of objects we will harvest.
353        long expectation;
354        if (best != null) {
355            if (best.getStopReason() == StopReason.DOWNLOAD_COMPLETE && maximum != -1) {
356                // We set the expectation, so our harvest will exceed the
357                // expectation at most <factor> times if the domain is a lot
358                // larger than our best guess.
359                expectation = minimum + ((maximum - minimum) / prevresultfactor);
360            } else {
361                // if stopped for different reason than DOWNLOAD_COMPLETE we
362                // add half the harvested size to expectation
363                expectation = minimum + ((maximum - minimum) / 2);
364            }
365        } else {
366            // Best guess: minimum of default max domain size and domain object
367            // limit
368            expectation = NumberUtils.minInf(Settings.getLong(HarvesterSettings.MAX_DOMAIN_SIZE), maxObjects);
369        }
370        // Always limit to domain specifics if set to do so. We always expect
371        // to actually hit this limit
372        if ((maxObjects > Constants.HERITRIX_MAXOBJECTS_INFINITY && maximum > maxObjects)
373                || (maxBytes > Constants.HERITRIX_MAXBYTES_INFINITY && maximum > maxBytes / expectedObjectSize)) {
374            maximum = minObjectsBytesLimit(maxObjects, maxBytes, expectedObjectSize);
375        }
376        // Never return more than allowed maximum
377        expectation = Math.min(expectation, maximum);
378
379        log.trace("Expected number of objects for configuration '{}' is {}", toString(), expectation);
380
381        return expectation;
382    }
383
384    /**
385     * Return the lowest limit for the two values, or MAX_DOMAIN_SIZE if both are infinite, which is the max size we
386     * harvest from this domain.
387     *
388     * @param objectLimit A long value defining an object limit, or 0 for infinite
389     * @param byteLimit A long value defining a byte limit, or HarvesterSettings.MAX_DOMAIN_SIZE for infinite.
390     * @param expectedObjectSize The expected number of bytes per object
391     * @return The lowest of the two boundaries, or MAX_DOMAIN_SIZE if both are unlimited.
392     */
393    public long minObjectsBytesLimit(long objectLimit, long byteLimit, long expectedObjectSize) {
394        long maxObjectsByBytes = byteLimit / expectedObjectSize;
395        if (objectLimit != Constants.HERITRIX_MAXOBJECTS_INFINITY) {
396            if (byteLimit != Constants.HERITRIX_MAXBYTES_INFINITY) {
397                return Math.min(objectLimit, maxObjectsByBytes);
398            } else {
399                return objectLimit;
400            }
401        } else {
402            if (byteLimit != Constants.HERITRIX_MAXBYTES_INFINITY) {
403                return maxObjectsByBytes;
404            } else {
405                return Settings.getLong(HarvesterSettings.MAX_DOMAIN_SIZE);
406            }
407        }
408    }
409
410    /**
411     * How many bytes we can expect the average object of a domain to be. If we have harvested no objects from this
412     * domain before, we use a setting EXPECTED_AVERAGE_BYTES_PER_OBJECT. If we have objects, we use the harvestinfo
413     * from previous harvests to calculate the harvest, but we only accept a low estimate if the number of harvested
414     * objects is greater than the setting MIN_OBJECTS_TO_TRUST_SMALL_EXPECTATION.
415     *
416     * @param bestInfo The best (newest complete or biggest, as per getBestHarvestInfoExpectation()) harvest info we
417     * have for the domain.
418     * @return How large we expect the average object to be. This number will be >= MIN_EXPECTATION (unless nothing is
419     * harvested and is EXPECTED_AVERAGE_BYTES_PER_OBJECT <= 0).
420     */
421    private long getExpectedBytesPerObject(HarvestInfo bestInfo) {
422        long defaultExpectation = Settings.getLong(HarvesterSettings.EXPECTED_AVERAGE_BYTES_PER_OBJECT);
423        if (bestInfo != null && bestInfo.getCountObjectRetrieved() > 0) {
424            long expectation = Math.max(MIN_EXPECTATION,
425                    bestInfo.getSizeDataRetrieved() / bestInfo.getCountObjectRetrieved());
426            if (expectation < defaultExpectation
427                    && bestInfo.getCountObjectRetrieved() < MIN_OBJECTS_TO_TRUST_SMALL_EXPECTATION) {
428                return defaultExpectation;
429            }
430            return expectation;
431        } else {
432            return defaultExpectation;
433        }
434    }
435
436    /**
437     * Set the comments field.
438     *
439     * @param comments User-entered free-form comments.
440     */
441    public void setComments(String comments) {
442        ArgumentNotValid.checkNotNull(comments, "comments");
443        this.comments = comments;
444    }
445
446    /**
447     * Remove a password from the list of passwords used in this domain.
448     *
449     * @param passwordName Password to Remove.
450     */
451    public void removePassword(String passwordName) {
452        ArgumentNotValid.checkNotNullOrEmpty(passwordName, "passwordName");
453        if (!usesPassword(passwordName)) {
454            throw new UnknownID("No password named '" + passwordName + "' found in '" + this + "'");
455        }
456        for (Iterator<Password> i = passwords.iterator(); i.hasNext();) {
457            Password p = i.next();
458            if (p.getName().equals(passwordName)) {
459                i.remove();
460            }
461        }
462    }
463
464    /**
465     * Check whether this domain uses a given password.
466     *
467     * @param passwordName The given password
468     * @return whether the given password is used
469     */
470    public boolean usesPassword(String passwordName) {
471        ArgumentNotValid.checkNotNullOrEmpty(passwordName, "passwordName");
472        for (Password p : passwords) {
473            if (p.getName().equals(passwordName)) {
474                return true;
475            }
476        }
477        return false;
478    }
479
480    /**
481     * Sets the used passwords to the given list. Note: list is copied.
482     *
483     * @param newPasswords The passwords to use.
484     * @param domain The domain where the passwords should come from
485     * @throws ArgumentNotValid if the passwords are null
486     */
487    public void setPasswords(Domain domain, List<Password> newPasswords) {
488        ArgumentNotValid.checkNotNull(newPasswords, "newPasswords");
489        this.passwords = new ArrayList<Password>(newPasswords.size());
490        for (Password p : newPasswords) {
491            addPassword(domain, p);
492        }
493    }
494
495    /**
496     * Get the ID of this configuration.
497     *
498     * @return the ID of this configuration
499     */
500    public long getID() {
501        return id;
502    }
503
504    /**
505     * Set the ID of this configuration. Only for use by DBDAO
506     *
507     * @param anId use this id for this configuration
508     */
509    void setID(long anId) {
510        this.id = anId;
511    }
512
513    /**
514     * Check if this configuration has an ID set yet (doesn't happen until the DBDAO persists it).
515     *
516     * @return true, if the configuration has an ID
517     */
518    boolean hasID() {
519        return id != null;
520    }
521
522    /**
523     * ToString of DomainConfiguration class.
524     *
525     * @return a string with info about the instance of this class.
526     */
527    public String toString() {
528        return "Configuration '" + getName() + "' of domain '" + domainName + "'";
529    }
530
531    /**
532     * Set the crawlerltraps for this configuration.
533     *
534     * @param someCrawlertraps a list of crawlertraps
535     */
536    public void setCrawlertraps(List<String> someCrawlertraps) {
537        this.crawlertraps = someCrawlertraps;
538    }
539
540    /**
541     * @return the known crawlertraps for this configuration.
542     */
543    public List<String> getCrawlertraps() {
544        return this.crawlertraps;
545    }
546
547    /**
548     * @return the domainhistory for this configuration
549     */
550    public DomainHistory getDomainhistory() {
551        return domainhistory;
552    }
553
554    /**
555     * Set the domainHistory for this configuration.
556     *
557     * @param newDomainhistory the new domainHistory for this configuration( null is accepted for no History)
558     */
559    public void setDomainhistory(DomainHistory newDomainhistory) {
560        this.domainhistory = newDomainhistory;
561    }
562
563}