001/*
002 * #%L
003 * Netarchivesuite - harvester
004 * %%
005 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
006 *             the National Library of France and the Austrian National Library.
007 * %%
008 * This program is free software: you can redistribute it and/or modify
009 * it under the terms of the GNU Lesser General Public License as
010 * published by the Free Software Foundation, either version 2.1 of the
011 * License, or (at your option) any later version.
012 * 
013 * This program is distributed in the hope that it will be useful,
014 * but WITHOUT ANY WARRANTY; without even the implied warranty of
015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
016 * GNU General Lesser Public License for more details.
017 * 
018 * You should have received a copy of the GNU General Lesser Public
019 * License along with this program.  If not, see
020 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
021 * #L%
022 */
023package dk.netarkivet.harvester.datamodel;
024
025import java.util.ArrayList;
026import java.util.Iterator;
027import java.util.List;
028
029import org.slf4j.Logger;
030import org.slf4j.LoggerFactory;
031
032import com.antiaction.raptor.dao.AttributeBase;
033
034import dk.netarkivet.common.exceptions.ArgumentNotValid;
035import dk.netarkivet.common.exceptions.PermissionDenied;
036import dk.netarkivet.common.exceptions.UnknownID;
037import dk.netarkivet.common.utils.Named;
038import dk.netarkivet.common.utils.Settings;
039import dk.netarkivet.harvester.HarvesterSettings;
040import dk.netarkivet.harvester.datamodel.eav.EAV.AttributeAndType;
041
042/**
043 * This class describes a configuration for harvesting a domain. It combines a number of seedlists, a number of
044 * passwords, an order template, and some specialised settings to define the way to harvest a domain.
045 */
046public class DomainConfiguration implements Named {
047
048    /** The class logger. */
049    private static final Logger log = LoggerFactory.getLogger(DomainConfiguration.class);
050
051    /** The name of the configuration. */
052    private String configName;
053    /** The name of the order.xml (Heritrix template) used by this configuration. */
054    private String orderXmlName = "";
055    /** maximum number of objects harvested for this configuration in a snapshot harvest. */
056    private long maxObjects;
057    /** The maximum request rate. */
058    private int maxRequestRate;
059    /** Maximum number of bytes to download in a harvest. */
060    private long maxBytes;
061    /** The domain associated with this configuration. */
062    private String domainName;
063
064    /** The list of seedlists. */
065    private List<SeedList> seedlists;
066
067    /** The list of passwords that apply in this configuration. */
068    private List<Password> passwords;
069    /** The comments associated with this configuration. */
070    private String comments;
071
072    /** ID autogenerated by DB. */
073    private Long id;
074
075    /** The domainhistory associated with the domain. */
076    private DomainHistory domainhistory;
077
078    /** The crawlertraps associated with the domain. */
079    private List<String> crawlertraps;
080
081    /** This configurations EAV attributes and attribute types. */
082    private List<AttributeAndType> attributesAndTypes;
083
084    /**
085     * How many objects should be harvested in a harvest to trust that our expected size of objects is less than the
086     * default number.
087     */
088    private static final long MIN_OBJECTS_TO_TRUST_SMALL_EXPECTATION = 50L;
089    /** The smallest number of bytes we accept per object. */
090    private static final int MIN_EXPECTATION = 1;
091
092    /**
093     * Create a new configuration for a domain.
094     *
095     * @param theConfigName The name of this configuration
096     * @param domain The domain that this configuration is for
097     * @param seedlists Seedlists to use in this configuration.
098     * @param passwords Passwords to use in this configuration.
099     */
100    public DomainConfiguration(String theConfigName, Domain domain, List<SeedList> seedlists, List<Password> passwords) {
101        this(theConfigName, domain.getName(), domain.getHistory(), domain.getCrawlerTraps(), seedlists, passwords);
102    }
103
104    /**
105     * Alternate constructor. TODO Filter all history not relevant for this configuration
106     *
107     * @param theConfigName theConfigName The name of this configuration
108     * @param domainName The name of the domain that this configuration is for
109     * @param history The domainhistory of the given domain
110     * @param crawlertraps The crawlertraps of the given domain
111     * @param seedlists Seedlists to use in this configuration
112     * @param passwords Passwords to use in this configuration.
113     */
114    public DomainConfiguration(String theConfigName, String domainName, DomainHistory history,
115            List<String> crawlertraps, List<SeedList> seedlists, List<Password> passwords) {
116        ArgumentNotValid.checkNotNullOrEmpty(theConfigName, "theConfigName");
117        ArgumentNotValid.checkNotNullOrEmpty(domainName, "domainName");
118        ArgumentNotValid.checkNotNull(passwords, "passwords");
119        ArgumentNotValid.checkNotNullOrEmpty(seedlists, "seedlists");
120
121        this.configName = theConfigName;
122        this.domainName = domainName;
123        this.domainhistory = history; // TODO Filter all history not relevant
124        // for this configuration
125        this.crawlertraps = crawlertraps;
126        this.seedlists = seedlists;
127        this.passwords = passwords;
128        this.comments = "";
129        this.maxRequestRate = Constants.DEFAULT_MAX_REQUEST_RATE;
130        this.maxObjects = Constants.DEFAULT_MAX_OBJECTS;
131        this.maxBytes = Constants.DEFAULT_MAX_BYTES;
132    }
133
134    public static String cfgToString(DomainConfiguration cfg) {
135        if (cfg == null) {
136            return "cfg{null}";
137        }
138        String result = "cfg{" + cfg.getDomainName() + "," + cfg.getName() + ","+cfg.getMaxBytes()+","+cfg.getMaxObjects()+",";
139        if (cfg.getAttributesAndTypes() != null) {
140            for (AttributeAndType aat : cfg.getAttributesAndTypes()) {
141                AttributeBase ab = aat.attribute;
142                if (ab != null) {
143                    result += "(" + ab.id + "," + ab.entity_id + "," + ab.type_id + "," + ab.getInteger() + ")";
144                }
145            }
146        }
147        result += "}";
148        return result;
149    }
150
151    /**
152     * Specify the name of the order.xml template to use.
153     *
154     * @param ordername order.xml template name
155     * @throws ArgumentNotValid if filename null or empty
156     */
157    public void setOrderXmlName(String ordername) {
158        ArgumentNotValid.checkNotNullOrEmpty(ordername, "ordername");
159        orderXmlName = ordername;
160    }
161
162    /**
163     * Specify the maximum number of objects to retrieve from the domain.
164     *
165     * @param max maximum number of objects to retrieve
166     * @throws ArgumentNotValid if max<-1
167     */
168    public void setMaxObjects(long max) {
169        if (max < -MIN_EXPECTATION) {
170            String msg = "maxObjects must be either -1 or positive, but was " + max;
171            log.debug(msg);
172            throw new ArgumentNotValid(msg);
173        }
174
175        maxObjects = max;
176    }
177
178    /**
179     * Specify the maximum request rate to use when harvesting data.
180     *
181     * @param maxrate the maximum request rate
182     * @throws ArgumentNotValid if maxrate<0
183     */
184    public void setMaxRequestRate(int maxrate) {
185        ArgumentNotValid.checkNotNegative(maxrate, "maxrate");
186
187        maxRequestRate = maxrate;
188    }
189
190    /**
191     * Specify the maximum number of bytes to download from a domain in a single harvest.
192     *
193     * @param maxBytes Maximum number of bytes to download, or -1 for no limit.
194     * @throws ArgumentNotValid if maxBytes < -1
195     */
196    public void setMaxBytes(long maxBytes) {
197        if (maxBytes < -MIN_EXPECTATION) {
198            String msg = "DomainConfiguration.maxBytes must be -1 or positive.";
199            log.debug(msg);
200            throw new ArgumentNotValid(msg);
201        }
202        this.maxBytes = maxBytes;
203    }
204
205    /**
206     * Get the configuration name.
207     *
208     * @return the configuration name
209     */
210    public String getName() {
211        return configName;
212    }
213
214    /**
215     * Returns comments.
216     *
217     * @return string containing comments
218     */
219    public String getComments() {
220        return comments;
221    }
222
223    /**
224     * Returns the name of the order xml file used by the domain.
225     *
226     * @return name of the order.xml file that should be used when harvesting the domain
227     */
228    public String getOrderXmlName() {
229        return orderXmlName;
230    }
231
232    /**
233     * Returns the maximum number of objects to harvest from the domain.
234     *
235     * @return maximum number of objects to harvest
236     */
237    public long getMaxObjects() {
238        return maxObjects;
239    }
240
241    /**
242     * Returns the maximum request rate to use when harvesting the domain.
243     *
244     * @return maximum request rate
245     */
246    public int getMaxRequestRate() {
247        return maxRequestRate;
248    }
249
250    /**
251     * Returns the maximum number of bytes to download during a single harvest of a domain.
252     *
253     * @return Maximum bytes limit, or -1 for no limit.
254     */
255    public long getMaxBytes() {
256        return maxBytes;
257    }
258
259    /**
260     * Returns the name of the domain aggregating this configuration.
261     *
262     * @return the name of the domain aggregating this configuration.
263     */
264    public String getDomainName() {
265        return domainName;
266    }
267
268    /**
269     * Get an iterator of seedlists used in this configuration.
270     *
271     * @return seedlists as iterator
272     */
273    public Iterator<SeedList> getSeedLists() {
274        return seedlists.iterator();
275    }
276
277    /**
278     * Add a new seedlist to the configuration. Must exist in the associated domain and the equal to that seedlist.
279     *
280     * @param seedlist the seedlist to add
281     * @param domain The domain to check if the seedlist exists
282     * @throws ArgumentNotValid if the seedlist is null
283     * @throws UnknownID if the seedlist is not defined on the domain
284     * @throws PermissionDenied if the seedlist is different from the one on the domain.
285     */
286    public void addSeedList(Domain domain, SeedList seedlist) {
287        ArgumentNotValid.checkNotNull(seedlist, "seedlist");
288        SeedList domainSeedlist = domain.getSeedList(seedlist.getName());
289        if (domainSeedlist == null || !domainSeedlist.equals(seedlist)) {
290            String message = "Cannot add seedlist " + seedlist + " to " + this + " as it differs from the one defined "
291                    + "for " + domain + ": " + domainSeedlist;
292            log.debug(message);
293            throw new PermissionDenied(message);
294        }
295        seedlists.add(domainSeedlist);
296    }
297
298    /**
299     * Sets the used seedlists to the given list. Note: list is copied.
300     *
301     * @param newSeedlists The seedlists to use.
302     * @param domain The domain where the seedlists should come from
303     * @throws ArgumentNotValid if the seedslists are null
304     */
305    public void setSeedLists(Domain domain, List<SeedList> newSeedlists) {
306        ArgumentNotValid.checkNotNull(newSeedlists, "newSeedlists");
307        this.seedlists = new ArrayList<SeedList>(newSeedlists.size());
308        for (SeedList s : newSeedlists) {
309            addSeedList(domain, s);
310        }
311    }
312
313    /**
314     * Get an iterator of passwords used in this configuration.
315     *
316     * @return The passwords in an iterator
317     */
318    public Iterator<Password> getPasswords() {
319        return passwords.iterator();
320    }
321
322    /**
323     * Add password to the configuration.
324     *
325     * @param password to add (must exist in the domain)
326     * @param domain the domain where the password should come from.
327     */
328    public void addPassword(Domain domain, Password password) {
329        ArgumentNotValid.checkNotNull(password, "password");
330        Password domainPassword = domain.getPassword(password.getName());
331        if (!domainPassword.equals(password)) {
332            String message = "Cannot add password " + password + " to " + this + " as it differs from the one defined "
333                    + "for " + domain + ": " + domainPassword;
334            log.debug(message);
335            throw new PermissionDenied(message);
336        }
337        passwords.add(domainPassword);
338    }
339
340    /**
341     * Gets the best expectation for how many objects a harvest using this configuration will retrieve, given a job with
342     * a maximum limit pr. domain
343     *
344     * @param objectLimit The maximum limit, or Constants.HERITRIX_MAXOBJECTS_INFINITY for no limit. This limit
345     * overrides the limit set on the configuration, unless override is in effect.
346     * @param byteLimit The maximum number of bytes that will be used as limit in the harvest. This limit overrides the
347     * limit set on the configuration, unless override is in effect.
348     * @return The expected number of objects.
349     */
350    public long getExpectedNumberOfObjects(long objectLimit, long byteLimit) {
351        long prevresultfactor = Settings.getLong(HarvesterSettings.ERRORFACTOR_PERMITTED_PREVRESULT);
352        HarvestInfo best = DomainHistory.getBestHarvestInfoExpectation(configName, this.domainhistory);
353
354        log.trace("Getting expectation, using domain info '{}' for configuration '{}'", best, cfgToString(this));
355
356        long expectedObjectSize = getExpectedBytesPerObject(best);
357        // The maximum number of objects that the maxBytes or MAX_DOMAIN_SIZE
358        // setting gives.
359        long maximum;
360        if (objectLimit != Constants.HERITRIX_MAXOBJECTS_INFINITY || byteLimit != Constants.HERITRIX_MAXBYTES_INFINITY) {
361            maximum = minObjectsBytesLimit(objectLimit, byteLimit, expectedObjectSize);
362        } else if (maxObjects != Constants.HERITRIX_MAXOBJECTS_INFINITY
363                || maxBytes != Constants.HERITRIX_MAXBYTES_INFINITY) {
364            maximum = minObjectsBytesLimit(maxObjects, maxBytes, expectedObjectSize);
365        } else {
366            maximum = Settings.getLong(HarvesterSettings.MAX_DOMAIN_SIZE);
367        }
368        log.trace("Initial maximum: {}", maximum);
369        // get last number of objects harvested
370        long minimum;
371        if (best != null) {
372            minimum = best.getCountObjectRetrieved();
373        } else {
374            minimum = NumberUtils.minInf(Constants.HERITRIX_MAXOBJECTS_INFINITY, maxObjects);
375        }
376        log.trace("Initial minimum: {}", minimum);
377        // Calculate the expected number of objects we will harvest.
378        long expectation;
379        if (best != null) {
380            if (best.getStopReason() == StopReason.DOWNLOAD_COMPLETE && maximum != -1) {
381                // We set the expectation, so our harvest will exceed the
382                // expectation at most <factor> times if the domain is a lot
383                // larger than our best guess.
384                expectation = minimum + ((maximum - minimum) / prevresultfactor);
385            } else {
386                // if stopped for different reason than DOWNLOAD_COMPLETE we
387                // add half the harvested size to expectation
388                expectation = minimum + ((maximum - minimum) / 2);
389            }
390        } else {
391            // Best guess: minimum of default max domain size and domain object
392            // limit
393            expectation = NumberUtils.minInf(Settings.getLong(HarvesterSettings.MAX_DOMAIN_SIZE), maxObjects);
394        }
395        log.trace("Initial expectation: {}", expectation);
396        // Always limit to domain specifics if set to do so. We always expect
397        // to actually hit this limit
398        if ((maxObjects > Constants.HERITRIX_MAXOBJECTS_INFINITY && maximum > maxObjects)
399                || (maxBytes > Constants.HERITRIX_MAXBYTES_INFINITY && maximum > maxBytes / expectedObjectSize)) {
400            log.trace("Using domain limits for {}", cfgToString(this));
401            maximum = minObjectsBytesLimit(maxObjects, maxBytes, expectedObjectSize);
402            log.trace("New maximum: {}", maximum);
403        }
404        // Never return more than allowed maximum
405        expectation = Math.min(expectation, maximum);
406
407        log.trace("Expected number of objects for configuration '{}' is {}", cfgToString(this), expectation);
408
409        return expectation;
410    }
411
412    /**
413     * Return the lowest limit for the two values, or MAX_DOMAIN_SIZE if both are infinite, which is the max size we
414     * harvest from this domain.
415     *
416     * @param objectLimit A long value defining an object limit, or 0 for infinite
417     * @param byteLimit A long value defining a byte limit, or HarvesterSettings.MAX_DOMAIN_SIZE for infinite.
418     * @param expectedObjectSize The expected number of bytes per object
419     * @return The lowest of the two boundaries, or MAX_DOMAIN_SIZE if both are unlimited.
420     */
421    public long minObjectsBytesLimit(long objectLimit, long byteLimit, long expectedObjectSize) {
422        long maxObjectsByBytes = byteLimit / expectedObjectSize;
423        if (objectLimit != Constants.HERITRIX_MAXOBJECTS_INFINITY) {
424            if (byteLimit != Constants.HERITRIX_MAXBYTES_INFINITY) {
425                return Math.min(objectLimit, maxObjectsByBytes);
426            } else {
427                return objectLimit;
428            }
429        } else {
430            if (byteLimit != Constants.HERITRIX_MAXBYTES_INFINITY) {
431                return maxObjectsByBytes;
432            } else {
433                return Settings.getLong(HarvesterSettings.MAX_DOMAIN_SIZE);
434            }
435        }
436    }
437
438    /**
439     * How many bytes we can expect the average object of a domain to be. If we have harvested no objects from this
440     * domain before, we use a setting EXPECTED_AVERAGE_BYTES_PER_OBJECT. If we have objects, we use the harvestinfo
441     * from previous harvests to calculate the harvest, but we only accept a low estimate if the number of harvested
442     * objects is greater than the setting MIN_OBJECTS_TO_TRUST_SMALL_EXPECTATION.
443     *
444     * @param bestInfo The best (newest complete or biggest, as per getBestHarvestInfoExpectation()) harvest info we
445     * have for the domain.
446     * @return How large we expect the average object to be. This number will be >= MIN_EXPECTATION (unless nothing is
447     * harvested and is EXPECTED_AVERAGE_BYTES_PER_OBJECT <= 0).
448     */
449    private long getExpectedBytesPerObject(HarvestInfo bestInfo) {
450        long defaultExpectation = Settings.getLong(HarvesterSettings.EXPECTED_AVERAGE_BYTES_PER_OBJECT);
451        if (bestInfo != null && bestInfo.getCountObjectRetrieved() > 0) {
452            long expectation = Math.max(MIN_EXPECTATION,
453                    bestInfo.getSizeDataRetrieved() / bestInfo.getCountObjectRetrieved());
454            if (expectation < defaultExpectation
455                    && bestInfo.getCountObjectRetrieved() < MIN_OBJECTS_TO_TRUST_SMALL_EXPECTATION) {
456                return defaultExpectation;
457            }
458            return expectation;
459        } else {
460            return defaultExpectation;
461        }
462    }
463
464    /**
465     * Set the comments field.
466     *
467     * @param comments User-entered free-form comments.
468     */
469    public void setComments(String comments) {
470        ArgumentNotValid.checkNotNull(comments, "comments");
471        this.comments = comments;
472    }
473
474    /**
475     * Remove a password from the list of passwords used in this domain.
476     *
477     * @param passwordName Password to Remove.
478     */
479    public void removePassword(String passwordName) {
480        ArgumentNotValid.checkNotNullOrEmpty(passwordName, "passwordName");
481        if (!usesPassword(passwordName)) {
482            throw new UnknownID("No password named '" + passwordName + "' found in '" + this + "'");
483        }
484        for (Iterator<Password> i = passwords.iterator(); i.hasNext();) {
485            Password p = i.next();
486            if (p.getName().equals(passwordName)) {
487                i.remove();
488            }
489        }
490    }
491
492    /**
493     * Check whether this domain uses a given password.
494     *
495     * @param passwordName The given password
496     * @return whether the given password is used
497     */
498    public boolean usesPassword(String passwordName) {
499        ArgumentNotValid.checkNotNullOrEmpty(passwordName, "passwordName");
500        for (Password p : passwords) {
501            if (p.getName().equals(passwordName)) {
502                return true;
503            }
504        }
505        return false;
506    }
507
508    /**
509     * Sets the used passwords to the given list. Note: list is copied.
510     *
511     * @param newPasswords The passwords to use.
512     * @param domain The domain where the passwords should come from
513     * @throws ArgumentNotValid if the passwords are null
514     */
515    public void setPasswords(Domain domain, List<Password> newPasswords) {
516        ArgumentNotValid.checkNotNull(newPasswords, "newPasswords");
517        this.passwords = new ArrayList<Password>(newPasswords.size());
518        for (Password p : newPasswords) {
519            addPassword(domain, p);
520        }
521    }
522
523    /**
524     * Get the ID of this configuration.
525     *
526     * @return the ID of this configuration
527     */
528    public Long getID() {
529        return id;
530    }
531
532    /**
533     * Set the ID of this configuration. Only for use by DBDAO
534     *
535     * @param anId use this id for this configuration
536     */
537    void setID(long anId) {
538        this.id = anId;
539    }
540
541    /**
542     * Check if this configuration has an ID set yet (doesn't happen until the DBDAO persists it).
543     *
544     * @return true, if the configuration has an ID
545     */
546    boolean hasID() {
547        return id != null;
548    }
549
550    /**
551     * ToString of DomainConfiguration class.
552     *
553     * @return a string with info about the instance of this class.
554     */
555    public String toString() {
556        return "Configuration '" + getName() + "' of domain '" + domainName + "'";
557    }
558
559    /**
560     * Set the crawlerltraps for this configuration.
561     *
562     * @param someCrawlertraps a list of crawlertraps
563     */
564    public void setCrawlertraps(List<String> someCrawlertraps) {
565        this.crawlertraps = someCrawlertraps;
566    }
567
568    /**
569     * @return the known crawlertraps for this configuration.
570     */
571    public List<String> getCrawlertraps() {
572        return this.crawlertraps;
573    }
574
575    /**
576     * @return the domainhistory for this configuration
577     */
578    public DomainHistory getDomainhistory() {
579        return domainhistory;
580    }
581
582    /**
583     * Set the domainHistory for this configuration.
584     *
585     * @param newDomainhistory the new domainHistory for this configuration( null is accepted for no History)
586     */
587    public void setDomainhistory(DomainHistory newDomainhistory) {
588        this.domainhistory = newDomainhistory;
589    }
590
591    /**
592     * Change the name of configuration to the given configName.
593     * @param configName a new name for this configuration.
594     */
595        public void setName(String configName) {
596                this.configName = configName;
597        }
598
599    /**
600     * Get this configurations EAV attributes and attribute types.
601     * @return this configurations EAV attributes and attribute types
602     */
603    public List<AttributeAndType> getAttributesAndTypes() {
604        return attributesAndTypes;
605    }
606
607    /**
608     * Set this configurations EAV attributes and attribute types.
609     * @param attributesAndTypes EAV attributes and attribute types
610     */
611    public void setAttributesAndTypes(List<AttributeAndType> attributesAndTypes) {
612        this.attributesAndTypes = attributesAndTypes;
613    }
614
615}