001/*
002 * #%L
003 * Netarchivesuite - harvester
004 * %%
005 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
006 *             the National Library of France and the Austrian National Library.
007 * %%
008 * This program is free software: you can redistribute it and/or modify
009 * it under the terms of the GNU Lesser General Public License as
010 * published by the Free Software Foundation, either version 2.1 of the
011 * License, or (at your option) any later version.
012 * 
013 * This program is distributed in the hope that it will be useful,
014 * but WITHOUT ANY WARRANTY; without even the implied warranty of
015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
016 * GNU General Lesser Public License for more details.
017 * 
018 * You should have received a copy of the GNU General Lesser Public
019 * License along with this program.  If not, see
020 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
021 * #L%
022 */
023package dk.netarkivet.harvester.datamodel;
024
025import java.util.ArrayList;
026import java.util.Arrays;
027import java.util.Collections;
028import java.util.Date;
029import java.util.HashMap;
030import java.util.Iterator;
031import java.util.List;
032import java.util.Locale;
033import java.util.Map;
034import java.util.regex.Pattern;
035import java.util.regex.PatternSyntaxException;
036
037import org.slf4j.Logger;
038import org.slf4j.LoggerFactory;
039
040import dk.netarkivet.common.Constants;
041import dk.netarkivet.common.exceptions.ArgumentNotValid;
042import dk.netarkivet.common.exceptions.IllegalState;
043import dk.netarkivet.common.exceptions.PermissionDenied;
044import dk.netarkivet.common.exceptions.UnknownID;
045import dk.netarkivet.common.utils.DomainUtils;
046import dk.netarkivet.common.utils.Named;
047import dk.netarkivet.common.utils.Settings;
048import dk.netarkivet.common.utils.StringUtils;
049import dk.netarkivet.common.utils.TLD;
050import dk.netarkivet.harvester.HarvesterSettings;
051import dk.netarkivet.harvester.datamodel.dao.DAOProviderFactory;
052import dk.netarkivet.harvester.datamodel.extendedfield.ExtendableEntity;
053import dk.netarkivet.harvester.datamodel.extendedfield.ExtendedFieldTypes;
054import dk.netarkivet.harvester.datamodel.extendedfield.ExtendedFieldValue;
055import dk.netarkivet.harvester.utils.CrawlertrapsUtils;
056
057/**
058 * Represents known information about a domain A domain is identified by a domain name (ex: kb.dk)
059 * <p>
060 * The following information is used to control how a domain is harvested: Seedlists, configurations and passwords. Each
061 * seedlist defines one or more URL's that the harvester should use as starting points. A configuration defines a
062 * specific combination of settings (seedlist, harvester settings, passwords) that should be used during harvest.
063 * Passwords define user names and passwords that might be used for the domain.
064 * <p>
065 * Information about previous harvests of this domain is available via the domainHistory.
066 * <p>
067 * Information from the domain registrant (DK-HOSTMASTER) about the domain registration is available in the
068 * registration. This includes the dates where the domain was known to exist (included in a domain list), together with
069 * domain owner information.
070 * <p>
071 * Notice that each configuration references one of the seedlists by name, and possibly one of the Passwords.
072 */
073@SuppressWarnings({"rawtypes"})
074public class Domain extends ExtendableEntity implements Named {
075
076    /** The logger for this class. */
077    protected static final Logger log = LoggerFactory.getLogger(Domain.class);
078
079    /** The identification used to lookup the domain. */
080
081    private String domainName;
082
083    /**
084     * Map<String, DomainConfiguration> the various harvest configurations that can be used to harvest this domain.
085     */
086    private Map<String, DomainConfiguration> domainConfigurations;
087
088    /** Use this configuration unless otherwise specified. */
089    private String defaultConfigName;
090
091    /**
092     * Map<String, SeedList> The different seedlists used as starting points by the harvesters.
093     */
094    private Map<String, SeedList> seedlists;
095
096    /** Map<String, Password> with an entry for each known password. */
097    private Map<String, Password> passwords;
098
099    /**
100     * List of crawler traps, that is regexps that should be ignored for this domain.
101     */
102    private List<String> crawlerTraps;
103
104    /** Records all historical information about the domain. */
105    private DomainHistory history;
106
107    /**
108     * List<DomainOwnerInfo> contains information about the known owners of this domain.
109     */
110    private List<DomainOwnerInfo> domainOwnerInfos;
111
112    /** Comments that the user has entered. */
113    private String comments;
114
115    /** Edition is used by the DAO to keep track of changes. */
116    long edition = -1;
117
118    /**
119     * If non-null, this domain is considered an alias of the domain named. The field must be either null or aliasInfo
120     * that defines an alias from this domain to another, and the time the alias field was last updated. This is used to
121     * allow operators to check the domains that have been aliases for a long time.
122     * <p>
123     * Note that we do not allow transitive aliases, so the domain named in this field is not allowed to become an alias
124     * itself.
125     */
126    private AliasInfo aliasInfo;
127
128    /** ID autogenerated by DB DAO. */
129    private Long id;
130
131    /**
132     * Create new instance of a domain. It is generally recommended that getDefaultDomain is used instead of this
133     * constructor.
134     *
135     * @param theDomainName Name used to reference the domain
136     * @throws ArgumentNotValid if either of the arguments are null or empty, or if the domain does not match the regex
137     * for valid domains
138     */
139    protected Domain(String theDomainName) {
140        super(DAOProviderFactory.getExtendedFieldDAOProvider());
141        ArgumentNotValid.checkNotNullOrEmpty(theDomainName, "theDomainName");
142        if (!DomainUtils.isValidDomainName(theDomainName)) {
143            throw new ArgumentNotValid("Domain '" + theDomainName + "' does not match the regexp "
144                    + "defining valid domains: " + TLD.getInstance().getValidDomainMatcher().pattern());
145        }
146        domainName = theDomainName;
147        comments = "";
148        domainConfigurations = new HashMap<String, DomainConfiguration>();
149        seedlists = new HashMap<String, SeedList>();
150        passwords = new HashMap<String, Password>();
151        crawlerTraps = Collections.emptyList();
152        history = new DomainHistory();
153        domainOwnerInfos = new ArrayList<DomainOwnerInfo>();
154    }
155
156    /**
157     * Get a new domain, initialised with default values.
158     *
159     * @param domainName The name of the domain
160     * @return a domain with the given name
161     * @throws ArgumentNotValid if name is null or empty
162     */
163    public static Domain getDefaultDomain(String domainName) {
164        Domain myDomain;
165        myDomain = new Domain(domainName);
166
167        // Create default seed list containing one seed: http://www.domain
168        // or http://1.2.3.4 for IP-named domains.
169        String defaultSeedListName = Settings.get(HarvesterSettings.DEFAULT_SEEDLIST);
170
171        SeedList seedlist;
172        if (Constants.IP_KEY_REGEXP.matcher(domainName).matches()) {
173            // IP domains should not get www
174            seedlist = new SeedList(defaultSeedListName, "http://" + domainName);
175        } else {
176            seedlist = new SeedList(defaultSeedListName, "http://www." + domainName);
177        }
178        myDomain.addSeedList(seedlist);
179
180        List<SeedList> seedlists = Arrays.asList(seedlist);
181
182        // Create default configuration using the default seedlist
183        String domainDefaultConfig = Settings.get(HarvesterSettings.DOMAIN_DEFAULT_CONFIG);
184
185        DomainConfiguration cfg = new DomainConfiguration(domainDefaultConfig, myDomain, seedlists,
186                new ArrayList<Password>());
187        cfg.setOrderXmlName(Settings.get(HarvesterSettings.DOMAIN_DEFAULT_ORDERXML));
188        cfg.setMaxRequestRate(Integer.parseInt(Settings.get(HarvesterSettings.DOMAIN_CONFIG_MAXRATE)));
189        myDomain.addConfiguration(cfg);
190
191        return myDomain;
192    }
193
194    /**
195     * Adds a new configuration to the domain. If this is the first configuration added, it becomes the default
196     * configuration. The seedlist referenced by the configuration must already be registered in this domain otherwise
197     * an UnknownID exception is thrown.
198     *
199     * @param cfg the configuration that is added
200     * @throws UnknownID if the name of the seedlist referenced by cfg is unknown
201     * @throws PermissionDenied if a configuration with the same name already exists
202     * @throws ArgumentNotValid if null supplied
203     */
204    public void addConfiguration(DomainConfiguration cfg) {
205        ArgumentNotValid.checkNotNull(cfg, "cfg");
206
207        if (domainConfigurations.containsKey(cfg.getName())) {
208            throw new PermissionDenied("A configuration already exists with the name:" + cfg.getName()
209                    + "; in the domain:" + getName() + ";");
210        }
211
212        putConfiguration(cfg);
213
214        if (domainConfigurations.size() == 1) {
215            defaultConfigName = cfg.getName();
216        }
217    }
218
219    /**
220     * Set a configuration in the domain. This checks that the seedlists and passwords are legal.
221     *
222     * @param cfg The configuration to add.
223     */
224    private void putConfiguration(DomainConfiguration cfg) {
225        checkListContainsNamed(cfg, cfg.getSeedLists(), "seedlist", seedlists);
226        checkListContainsNamed(cfg, cfg.getPasswords(), "passwords", passwords);
227
228        domainConfigurations.put(cfg.getName(), cfg);
229    }
230
231    /**
232     * Helper method used to verify that a configuration does not reference seedlists or passwords that do not exist in
233     * this domain.
234     *
235     * @param cfg the configuration being checked
236     * @param items an iterator to the references that are checked (seedlists or passwords)
237     * @param typename the name of the references being checked
238     * @param m the corresponding domain map that must contain entries matching the names in the items
239     * @param <T> The type contained in items iterator. The type extends Named
240     */
241    private <T extends Named> void checkListContainsNamed(DomainConfiguration cfg, final Iterator<T> items,
242            final String typename, final Map m) {
243        while (items.hasNext()) {
244            Named named = items.next();
245
246            if (!m.containsKey(named.getName())) {
247                throw new UnknownID("Configuration:" + cfg.getName() + "; uses unknown " + typename + ":"
248                        + named.getName() + "; in the domain:" + getName() + ";");
249            }
250        }
251    }
252
253    /**
254     * Helper method that adds or updates an entry in a map. Used to add/update entries in seedlists and passwords maps
255     *
256     * @param m the map to modify
257     * @param name the name of the element to add or update
258     * @param addAction when true an add action is performed and en entry with the name is not allowed to exist in the
259     * map before the operation, when false an update operation is performed and an entry must already exists with the
260     * name in the map.
261     * @param value the object to add to m
262     * @param <T> The type contained as values in the map m.
263     */
264    private <T extends Named> void put(Map<String, T> m, String name, boolean addAction, T value) {
265        boolean alreadyExist = m.containsKey(name);
266
267        if (addAction && alreadyExist) {
268            throw new PermissionDenied("An entry already exists with the name:" + name + "; in the domain:" + getName()
269                    + ";");
270        }
271
272        if ((!addAction) && (!alreadyExist)) {
273            throw new UnknownID("No entry exists with the name '" + name + "' in the domain '" + getName() + "'");
274        }
275
276        m.put(name, value);
277    }
278
279    /**
280     * Adds a seed list to the domain.
281     *
282     * @param seedlist the actual seedslist.
283     * @throws ArgumentNotValid if an argument is null
284     * @throws PermissionDenied if the seedName already exists
285     */
286    public void addSeedList(SeedList seedlist) {
287        ArgumentNotValid.checkNotNull(seedlist, "seedlist");
288        put(seedlists, seedlist.getName(), true, seedlist);
289    }
290
291    /**
292     * Update a seed list to the domain. Replaces an existing seedlist with the same name.
293     *
294     * @param seedlist the actual seedslist.
295     * @throws ArgumentNotValid if an argument is null
296     * @throws UnknownID if the seedlist.getName() does not exists
297     */
298    public void updateSeedList(SeedList seedlist) {
299        ArgumentNotValid.checkNotNull(seedlist, "seedlist");
300        put(seedlists, seedlist.getName(), false, seedlist);
301    }
302
303    /**
304     * Adds a password to the domain.
305     *
306     * @param password A password object to add.
307     * @throws ArgumentNotValid if the argument is null
308     * @throws PermissionDenied if a password already exists with this name
309     */
310    public void addPassword(Password password) {
311        ArgumentNotValid.checkNotNull(password, "password");
312        put(passwords, password.getName(), true, password);
313    }
314
315    /**
316     * Updates a password on the domain.
317     *
318     * @param password A password object to update.
319     * @throws ArgumentNotValid if the argument is null
320     * @throws PermissionDenied if no password exists with this name
321     */
322    public void updatePassword(Password password) {
323        ArgumentNotValid.checkNotNull(password, "password");
324        put(passwords, password.getName(), false, password);
325    }
326
327    /**
328     * Mark a configuration as the default configuration to use. The configuration name must match an already added
329     * configuration, otherwise an UnknownID exception is thrown.
330     *
331     * @param cfgName a name of a configuration
332     * @throws UnknownID when the cfgName does not match an added configuration
333     * @throws ArgumentNotValid if cfgName is null or empty
334     */
335    public void setDefaultConfiguration(String cfgName) {
336        ArgumentNotValid.checkNotNullOrEmpty(cfgName, "cfgName");
337
338        if (!domainConfigurations.containsKey(cfgName)) {
339            throw new UnknownID("Default configuration not registered:" + cfgName + "; in the domain:" + getName()
340                    + ";");
341        }
342
343        defaultConfigName = cfgName;
344    }
345
346    /**
347     * Returns an already registered configuration.
348     *
349     * @param cfgName the name of an registered configuration
350     * @return the configuration
351     * @throws UnknownID if the name is not a registered configuration
352     * @throws ArgumentNotValid if cfgName is null or empty
353     */
354    public DomainConfiguration getConfiguration(String cfgName) {
355        ArgumentNotValid.checkNotNullOrEmpty(cfgName, "cfgName");
356
357        if (!domainConfigurations.containsKey(cfgName)) {
358            throw new UnknownID("Configuration '" + cfgName + "' not registered in the domain '" + getName() + "'");
359        }
360        DomainConfiguration cfg = domainConfigurations.get(cfgName);
361        cfg.setDomainhistory(this.getHistory());
362        return cfg;
363    }
364
365    /**
366     * Gets the default configuration. If no configuration has been explicitly set the first configuration added to this
367     * domain is returned. If no configurations have been added at all a UnknownID exception is thrown.
368     *
369     * @return the default configuration (never null)
370     * @throws UnknownID if no configurations exists
371     */
372    public DomainConfiguration getDefaultConfiguration() {
373        if (domainConfigurations.size() == 0) {
374            throw new UnknownID("No configurations have been registered in the domain:" + getName() + ";");
375        }
376
377        return getConfiguration(defaultConfigName);
378    }
379
380    /**
381     * Gets the name of this domain.
382     *
383     * @return the name of this domain
384     */
385    public String getName() {
386        return domainName;
387    }
388
389    /**
390     * @return the domain comments.
391     */
392    public String getComments() {
393        return comments;
394    }
395
396    /**
397     * Get the domain history.
398     *
399     * @return the domain history
400     */
401    public DomainHistory getHistory() {
402        return history;
403    }
404
405    /**
406     * Get a specific seedlist previously added to this domain.
407     *
408     * @param name the name of the seedlist to return
409     * @return the specified seedlist
410     * @throws ArgumentNotValid if name is null or empty
411     * @throws UnknownID if no seedlist has been added with the supplied name
412     */
413    public SeedList getSeedList(String name) {
414        ArgumentNotValid.checkNotNullOrEmpty(name, "name");
415
416        if (!hasSeedList(name)) {
417            throw new UnknownID("Seedlist '" + name + " has not been registered in the domain '" + getName() + "'");
418        }
419
420        return seedlists.get(name);
421    }
422
423    /**
424     * Return true if the named seedlist exists in this domain.
425     *
426     * @param name String representing a possible seedlist for the domain.
427     * @return true, if the named seedlist exists in this domain
428     */
429    public boolean hasSeedList(String name) {
430        ArgumentNotValid.checkNotNullOrEmpty(name, "name");
431
432        return seedlists.containsKey(name);
433    }
434
435    /**
436     * Removes a seedlist from this Domain. The seedlist must not be in use by any of the configurations, otherwise a
437     * PermissionDenied exception is thrown.
438     *
439     * @param name the name of the seedlist to remove
440     * @throws PermissionDenied if the seedlist is in use by a configuration or this is the last seedlist in this Domain
441     * @throws UnknownID if the no seedlist exists with the name
442     * @throws ArgumentNotValid if a null argument is supplied
443     */
444    public void removeSeedList(String name) {
445        ArgumentNotValid.checkNotNullOrEmpty(name, "name");
446
447        if (!seedlists.containsKey(name)) {
448            throw new UnknownID("Seedlist has not been registered:" + name + "; in the domain:" + getName() + ";");
449        }
450
451        if (seedlists.size() <= 1) {
452            throw new PermissionDenied("Can not remove the last seedlist:" + name + ";");
453        }
454
455        for (String cfgname : domainConfigurations.keySet()) {
456            DomainConfiguration cfg = domainConfigurations.get(cfgname);
457
458            for (Iterator<SeedList> i = cfg.getSeedLists(); i.hasNext();) {
459                SeedList seedlist = i.next();
460
461                if (seedlist.getName().equals(name)) {
462                    throw new PermissionDenied("The seedlist:" + name + "; is used by the configuration:" + cfgname
463                            + ";");
464                }
465            }
466        }
467
468        // if we get here without an exception - the seedlist is not in use
469        seedlists.remove(name);
470    }
471
472    /**
473     * Removes a password from this Domain. The password must not be in use by any of the configurations, otherwise a
474     * PermissionDenied exception is thrown.
475     *
476     * @param name the name of the password to remove
477     * @throws PermissionDenied if the password is in use by a configuration or this is the last password in this Domain
478     * @throws UnknownID if the no password exists with the name
479     * @throws ArgumentNotValid if a null argument is supplied
480     */
481    public void removePassword(String name) {
482        ArgumentNotValid.checkNotNullOrEmpty(name, "name");
483
484        if (!passwords.containsKey(name)) {
485            throw new UnknownID("Password has not been registered:" + name + "; in the domain:" + getName() + ";");
486        }
487
488        for (String cfgname : domainConfigurations.keySet()) {
489            DomainConfiguration cfg = domainConfigurations.get(cfgname);
490
491            if (cfg.usesPassword(name)) {
492                throw new PermissionDenied("The password:" + name + "; is used by the configuration:" + cfgname + ";");
493            }
494        }
495
496        // if we get here without an exception - the password is not in use
497        passwords.remove(name);
498    }
499
500    /**
501     * Removes a configuration from this domain. The default configuration can not be removed, instead PermissionDenied
502     * is thrown. It is not possible to remove a configuration that is referenced by one or more HarvestDefinitions
503     *
504     * @param configName The name of a configuration to remove.
505     * @throws ArgumentNotValid if name is null or empty
506     * @throws PermissionDenied if the default configuration is attempted removed or if one or more HarvestDefinitions
507     * reference the configuration
508     */
509    public void removeConfiguration(String configName) {
510        ArgumentNotValid.checkNotNullOrEmpty(configName, "configName");
511
512        if (defaultConfigName.equals(configName)) {
513            throw new PermissionDenied("The default configuration can not be removed:" + configName + ";");
514        }
515
516        if (!domainConfigurations.containsKey(configName)) {
517            throw new UnknownID("Configuration not registered:" + configName + ";");
518        }
519
520        // Test that no harvest definition uses this configuration
521        final DomainDAO dao = DomainDAO.getInstance();
522        if (!dao.mayDelete(getConfiguration(configName))) {
523            // Since this is an error case, spend a little time getting better
524            // info. This could be done a lot faster by adding a function to
525            // the DomainDAO.
526            HarvestDefinitionDAO hddao = HarvestDefinitionDAO.getInstance();
527            Iterator<HarvestDefinition> hds = hddao.getAllHarvestDefinitions();
528            List<String> usages = new ArrayList<String>();
529            while (hds.hasNext()) {
530                HarvestDefinition hd = hds.next();
531                Iterator<DomainConfiguration> configs = hd.getDomainConfigurations();
532                while (configs.hasNext()) {
533                    DomainConfiguration dc = configs.next();
534                    if (dc.getName().equals(configName) && dc.getDomainName().equals(getName())) {
535                        usages.add(hd.getName());
536                    }
537                }
538            }
539            throw new PermissionDenied("Cannot delete domain configuration '" + configName + "', because it is used "
540                    + "by the following " + "harvest definitions: " + usages);
541        }
542
543        domainConfigurations.remove(configName);
544    }
545
546    /**
547     * Gets all configurations belonging to this domain.
548     *
549     * @return all configurations belonging to this domain.
550     */
551    public Iterator<DomainConfiguration> getAllConfigurations() {
552        return domainConfigurations.values().iterator();
553    }
554
555    /**
556     * Get all seedlists belonging to this domain.
557     *
558     * @return all seedlists belonging to this domain
559     */
560    public Iterator<SeedList> getAllSeedLists() {
561        return seedlists.values().iterator();
562    }
563
564    /**
565     * Return the passwords defined for this domain.
566     *
567     * @return Iterator<Password> of known passwords.
568     */
569    public Iterator<Password> getAllPasswords() {
570        return passwords.values().iterator();
571    }
572
573    /**
574     * Gets all configurations belonging to this domain. The returned list is sorted by name according to language given
575     * in the parameter.
576     *
577     * @param loc contains the language sorting must adhere to
578     * @return all configurations belonging to this domain sorted according to language
579     */
580    public List<DomainConfiguration> getAllConfigurationsAsSortedList(Locale loc) {
581        ArgumentNotValid.checkNotNull(loc, "loc");
582        List<DomainConfiguration> resultSet = new ArrayList<DomainConfiguration>(domainConfigurations.values());
583        NamedUtils.sortNamedObjectList(loc, resultSet);
584        return resultSet;
585    }
586
587    /**
588     * Gets all seedlists belonging to this domain. The returned list is sorted by name according to language given in
589     * the parameter.
590     *
591     * @param loc contains the language sorting must adhere to
592     * @return all seedlists belonging to this domain sorted according to language
593     */
594    public List<SeedList> getAllSeedListsAsSortedList(Locale loc) {
595        ArgumentNotValid.checkNotNull(loc, "loc");
596        List<SeedList> resultSet = new ArrayList<SeedList>(seedlists.values());
597        NamedUtils.sortNamedObjectList(loc, resultSet);
598        return resultSet;
599    }
600
601    /**
602     * Returns the passwords defined for this domain. The returned list is sorted by name according to language given in
603     * the parameter.
604     *
605     * @param loc contains the language sorting must adhere to
606     * @return a sorted list of known passwords according to language
607     */
608    public List<Password> getAllPasswordsAsSortedList(Locale loc) {
609        ArgumentNotValid.checkNotNull(loc, "loc");
610        List<Password> resultSet = new ArrayList<Password>(passwords.values());
611        NamedUtils.sortNamedObjectList(loc, resultSet);
612        return resultSet;
613    }
614
615    /**
616     * Add owner information.
617     *
618     * @param owner owner
619     */
620    public void addOwnerInfo(DomainOwnerInfo owner) {
621        ArgumentNotValid.checkNotNull(owner, "owner");
622        domainOwnerInfos.add(owner);
623    }
624
625    /**
626     * Get array of domain owner information.
627     *
628     * @return array containing information about the domain owner(s)
629     */
630    public DomainOwnerInfo[] getAllDomainOwnerInfo() {
631        return domainOwnerInfos.toArray(new DomainOwnerInfo[0]);
632    }
633
634    /**
635     * Get password information.
636     *
637     * @param name the id of the password settings to retrieve
638     * @return the password information
639     * @throws UnknownID if no password info exists with the id "name"
640     */
641    public Password getPassword(String name) {
642        ArgumentNotValid.checkNotNullOrEmpty(name, "name");
643
644        if (!passwords.containsKey(name)) {
645            throw new UnknownID("Password has not been registered:" + name + "; in the domain:" + getName() + ";");
646        }
647
648        return passwords.get(name);
649    }
650
651    /**
652     * Set the comments for this domain.
653     *
654     * @param comments The new comments (can be null)
655     */
656    public void setComments(String comments) {
657        this.comments = comments;
658    }
659
660    /**
661     * Replaces existing configuration with cfg, using cfg.getName() as the id for the configuration.
662     *
663     * @param cfg the configuration to update
664     * @throws UnknownID if no configuration exists with the id cfg.getName(). ArgumentNotValid if cfg is null.
665     */
666    public void updateConfiguration(DomainConfiguration cfg) {
667        ArgumentNotValid.checkNotNull(cfg, "cfg");
668
669        if (!domainConfigurations.containsKey(cfg.getName())) {
670            throw new UnknownID("No configuration exists with the name:" + cfg.getName() + "; in the domain:"
671                    + getName() + ";");
672        }
673
674        putConfiguration(cfg);
675    }
676
677    /**
678     * Returns true if this domain has the named password.
679     *
680     * @param passwordName the identifier of the password info
681     * @return true if this domain has password info with id passwordname
682     */
683    public boolean hasPassword(String passwordName) {
684        return passwords.containsKey(passwordName);
685    }
686
687    /**
688     * Returns true if this domain has the named configuration.
689     *
690     * @param configName the identifier of the configuration
691     * @return true if this domain has a configuration with id configNmae
692     */
693    public boolean hasConfiguration(String configName) {
694        return domainConfigurations.containsKey(configName);
695    }
696
697    /**
698     * Get the edition number.
699     *
700     * @return the edition number
701     */
702    public long getEdition() {
703        return edition;
704    }
705
706    /**
707     * Set the edition number.
708     *
709     * @param theNewEdition the new edition
710     */
711    public void setEdition(long theNewEdition) {
712        edition = theNewEdition;
713    }
714
715    /**
716     * Get the ID of this domain. Only for use by DBDAO
717     *
718     * @return Get the ID of this domain
719     */
720    public long getID() {
721        return id;
722    }
723
724    /**
725     * Set the ID of this domain. Only for use by DBDAO.
726     *
727     * @param newId The new ID for this domain.
728     */
729    void setID(long newId) {
730        this.id = newId;
731    }
732
733    /**
734     * Check if this harvestinfo has an ID set yet (doesn't happen until the DBDAO persists it).
735     *
736     * @return true, if this domain has an ID different from null
737     */
738    boolean hasID() {
739        return id != null;
740    }
741
742    /**
743     * Return a human-readable representation of this object.
744     *
745     * @return Some string identifying the object. Do not use this for machine processing.
746     */
747    public String toString() {
748        StringBuilder sb = new StringBuilder();
749        sb.append("Domain:").append(getName()).append(";\n");
750        sb.append("Comment:").append(getComments()).append(";\n");
751
752        sb.append("Configurations:\n");
753
754        for (String cfgName : domainConfigurations.keySet()) {
755            sb.append("\t").append(cfgName).append(";\n");
756        }
757
758        sb.append("Seedlists:\n");
759
760        for (String seedName : seedlists.keySet()) {
761            sb.append("\t").append(seedName).append(";\n");
762        }
763
764        sb.append("Passwords:\n");
765
766        for (String pwName : passwords.keySet()) {
767            sb.append("\t").append(pwName).append(";\n");
768        }
769
770        sb.append("Extended Fields:\n");
771
772        for (int i = 0; i < extendedFieldValues.size(); i++) {
773            ExtendedFieldValue efv = extendedFieldValues.get(i);
774            sb.append("\t").append(efv.getExtendedFieldID() + ": " + efv.getContent()).append(";\n");
775        }
776
777        sb.append("---------------\n");
778
779        return sb.toString();
780    }
781
782    /**
783     * Sets a list of regular expressions defining urls that should never be harvested from this domain. The list (after
784     * trimming the strings, and any empty strings have been removed) is copied to a list that is stored immutably.
785     *
786     * @param regExps The list defining urls never to be harvested.
787     * @param strictMode If true, we throw ArgumentNotValid exception if invalid regexps are found
788     * @throws ArgumentNotValid if regExps is null or regExps contains invalid regular expressions (unless strictMode is
789     * false).
790     */
791    public void setCrawlerTraps(List<String> regExps, boolean strictMode) {
792        ArgumentNotValid.checkNotNull(regExps, "List<String> regExps");
793        List<String> cleanedListOfCrawlerTraps = new ArrayList<String>();
794        for (String crawlerTrap : regExps) {
795            log.trace("original trap: '" + crawlerTrap + "'");
796            String trimmedString = crawlerTrap.trim();
797            log.trace("trimmed  trap: '" + trimmedString + "'");
798            if (!(trimmedString.length() == 0)) {
799                cleanedListOfCrawlerTraps.add(crawlerTrap);
800            } else {
801                log.trace("Removed empty string from list of crawlertraps");
802            }
803        }
804        // Validate regexps
805        List<String> errMsgs = new ArrayList<String>();
806        for (String regexp : cleanedListOfCrawlerTraps) {
807                
808                boolean wellformed = false;
809            try {
810                Pattern.compile(regexp);
811                wellformed = CrawlertrapsUtils.isCrawlertrapsWellformedXML(regexp);
812                if (!wellformed){
813                        errMsgs.add("The expression '" + regexp + "' is not wellformed XML" 
814                                + " . Please correct the expression.");
815                }
816            } catch (PatternSyntaxException e) {
817                errMsgs.add("The expression '" + regexp + "' is not a proper regular expression: " 
818                                + e.getDescription() + " . Please correct the expression.");
819            }
820        }
821        if (strictMode) 
822                if (errMsgs.size() > 0) {
823            throw new ArgumentNotValid(errMsgs.size() +  " errors were found: " + StringUtils.conjoin(",", errMsgs));
824        } else {
825            log.warn(errMsgs.size() +  " errors were found: " + StringUtils.conjoin(",", errMsgs));
826        }
827        crawlerTraps = Collections.unmodifiableList(cleanedListOfCrawlerTraps);
828        log.debug("Domain {} has {} crawlertraps", domainName, crawlerTraps.size());
829    }
830
831    /**
832     * Returns the list of regexps never to be harvested from this domain, or the empty list if none. The returned list
833     * should never be null.
834     *
835     * @return The list of regexps of url's never to be harvested when harvesting this domain. This list is immutable.
836     */
837    public List<String> getCrawlerTraps() {
838        return crawlerTraps;
839    }
840
841    /**
842     * Returns the alias info for this domain, or null if this domain is not an alias.
843     *
844     * @return A domain name.
845     */
846    public AliasInfo getAliasInfo() {
847        return aliasInfo;
848    }
849
850    /**
851     * Update which domain this domain is considered an alias of. Calling this function will a) cause some slightly
852     * expensive checks to be performed, and b) set the time of last update. For object construction and copying, use
853     * setAlias.
854     *
855     * @param alias The name (e.g. "netarkivet.dk") of the domain that this domain is an alias of.
856     * @throws UnknownID If the given domain does not exist
857     * @throws IllegalState If updating the alias info would violate constraints of alias: No transitivity, no
858     * reflection.
859     */
860    public void updateAlias(String alias) {
861        if (getName().equals(alias)) {
862            String message = "Cannot make domain '" + this.getName() + "' an alias of itself";
863            log.debug(message);
864            throw new IllegalState(message);
865        }
866
867        if (alias != null) {
868            DomainDAO dao = DomainDAO.getInstance();
869            Domain otherD = dao.read(alias);
870            if (otherD.aliasInfo != null) {
871                String message = "Cannot make domain '" + this.getName() + "' an alias of '" + otherD.getName() + "',"
872                        + " as that domain is already an alias of '" + otherD.aliasInfo.getAliasOf() + "'";
873                log.debug(message);
874                throw new IllegalState(message);
875            }
876            if (dao.getAliases(getName()).size() != 0) {
877                List<String> aliasesForThisDomain = new ArrayList<String>();
878                for (AliasInfo ai : dao.getAliases(getName())) {
879                    aliasesForThisDomain.add(ai.getDomain());
880                }
881                String message = "Cannot make domain '" + this.getName() + "' an alias of '" + otherD.getName() + "',"
882                        + " as the domains '" + StringUtils.conjoin(",", aliasesForThisDomain) + "' are "
883                        + "already aliases of '" + this.getName() + "'";
884                log.debug(message);
885                throw new IllegalState(message);
886            }
887            setAliasInfo(new AliasInfo(domainName, alias, new Date()));
888        } else {
889            setAliasInfo(null);
890        }
891    }
892
893    /**
894     * Set the alias field on this object. This function performs no checking of existence of transitivity of alias
895     * domains, but it does check that the alias info is for this domain
896     *
897     * @param aliasInfo Alias information
898     * @throws ArgumentNotValid if the alias info is not for this domain
899     */
900    void setAliasInfo(AliasInfo aliasInfo) {
901        if (aliasInfo != null && !aliasInfo.getDomain().equals(domainName)) {
902            throw new ArgumentNotValid("AliasInfo must be for this domain");
903        }
904        this.aliasInfo = aliasInfo;
905    }
906
907    /**
908     * Gets the harvest info giving best information for expectation or how many objects a harvest using a given
909     * configuration will retrieve, we will prioritise the most recently harvest, where we have a full harvest.
910     *
911     * @param configName The name of the configuration
912     * @return The Harvest Information for the harvest defining the best expectation, including the number retrieved and
913     * the stop reason.
914     */
915    public HarvestInfo getBestHarvestInfoExpectation(String configName) {
916        ArgumentNotValid.checkNotNullOrEmpty(configName, "String configName");
917        return DomainHistory.getBestHarvestInfoExpectation(configName, this.getHistory());
918    }
919
920    /**
921     * All derived classes allow ExtendedFields from Type ExtendedFieldTypes.DOMAIN
922     *
923     * @return ExtendedFieldTypes.DOMAIN
924     */
925    protected int getExtendedFieldType() {
926        return ExtendedFieldTypes.DOMAIN;
927    }
928
929}