001/* 002 * #%L 003 * Netarchivesuite - harvester 004 * %% 005 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library, 006 * the National Library of France and the Austrian National Library. 007 * %% 008 * This program is free software: you can redistribute it and/or modify 009 * it under the terms of the GNU Lesser General Public License as 010 * published by the Free Software Foundation, either version 2.1 of the 011 * License, or (at your option) any later version. 012 * 013 * This program is distributed in the hope that it will be useful, 014 * but WITHOUT ANY WARRANTY; without even the implied warranty of 015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 016 * GNU General Lesser Public License for more details. 017 * 018 * You should have received a copy of the GNU General Lesser Public 019 * License along with this program. If not, see 020 * <http://www.gnu.org/licenses/lgpl-2.1.html>. 021 * #L% 022 */ 023package dk.netarkivet.harvester.datamodel; 024 025import java.util.ArrayList; 026import java.util.Arrays; 027import java.util.Collections; 028import java.util.Date; 029import java.util.HashMap; 030import java.util.Iterator; 031import java.util.List; 032import java.util.Locale; 033import java.util.Map; 034import java.util.regex.Pattern; 035import java.util.regex.PatternSyntaxException; 036 037import org.slf4j.Logger; 038import org.slf4j.LoggerFactory; 039 040import dk.netarkivet.common.Constants; 041import dk.netarkivet.common.exceptions.ArgumentNotValid; 042import dk.netarkivet.common.exceptions.IllegalState; 043import dk.netarkivet.common.exceptions.PermissionDenied; 044import dk.netarkivet.common.exceptions.UnknownID; 045import dk.netarkivet.common.utils.DomainUtils; 046import dk.netarkivet.common.utils.Named; 047import dk.netarkivet.common.utils.Settings; 048import dk.netarkivet.common.utils.StringUtils; 049import dk.netarkivet.harvester.HarvesterSettings; 050import dk.netarkivet.harvester.datamodel.dao.DAOProviderFactory; 051import dk.netarkivet.harvester.datamodel.extendedfield.ExtendableEntity; 052import dk.netarkivet.harvester.datamodel.extendedfield.ExtendedFieldTypes; 053import dk.netarkivet.harvester.datamodel.extendedfield.ExtendedFieldValue; 054 055/** 056 * Represents known information about a domain A domain is identified by a domain name (ex: kb.dk) 057 * <p> 058 * The following information is used to control how a domain is harvested: Seedlists, configurations and passwords. Each 059 * seedlist defines one or more URL's that the harvester should use as starting points. A configuration defines a 060 * specific combination of settings (seedlist, harvester settings, passwords) that should be used during harvest. 061 * Passwords define user names and passwords that might be used for the domain. 062 * <p> 063 * Information about previous harvests of this domain is available via the domainHistory. 064 * <p> 065 * Information from the domain registrant (DK-HOSTMASTER) about the domain registration is available in the 066 * registration. This includes the dates where the domain was known to exist (included in a domain list), together with 067 * domain owner information. 068 * <p> 069 * Notice that each configuration references one of the seedlists by name, and possibly one of the Passwords. 070 */ 071@SuppressWarnings({"rawtypes"}) 072public class Domain extends ExtendableEntity implements Named { 073 074 /** Prefix all domain names with this string. */ 075 protected static final Logger log = LoggerFactory.getLogger(Domain.class); 076 077 /** The identification used to lookup the domain. */ 078 079 private String domainName; 080 081 /** 082 * Map<String, DomainConfiguration> the various harvest configurations that can be used to harvest this domain. 083 */ 084 private Map<String, DomainConfiguration> domainConfigurations; 085 086 /** Use this configuration unless otherwise specified. */ 087 private String defaultConfigName; 088 089 /** 090 * Map<String, SeedList> The different seedlists used as starting points by the harvesters. 091 */ 092 private Map<String, SeedList> seedlists; 093 094 /** Map<String, Password> with an entry for each known password. */ 095 private Map<String, Password> passwords; 096 097 /** 098 * List of crawler traps, that is regexps that should be ignored for this domain. 099 */ 100 private List<String> crawlerTraps; 101 102 /** Records all historical information about the domain. */ 103 private DomainHistory history; 104 105 /** 106 * List<DomainOwnerInfo> contains information about the known owners of this domain. 107 */ 108 private List<DomainOwnerInfo> domainOwnerInfos; 109 110 /** Comments that the user has entered. */ 111 private String comments; 112 113 /** Edition is used by the DAO to keep track of changes. */ 114 long edition = -1; 115 116 /** 117 * If non-null, this domain is considered an alias of the domain named. The field must be either null or aliasInfo 118 * that defines an alias from this domain to another, and the time the alias field was last updated. This is used to 119 * allow operators to check the domains that have been aliases for a long time. 120 * <p> 121 * Note that we do not allow transitive aliases, so the domain named in this field is not allowed to become an alias 122 * itself. 123 */ 124 private AliasInfo aliasInfo; 125 126 /** ID autogenerated by DB DAO. */ 127 private Long id; 128 129 /** 130 * Create new instance of a domain. It is generally recommended that getDefaultDomain is used instead of this 131 * constructor. 132 * 133 * @param theDomainName Name used to reference the domain 134 * @throws ArgumentNotValid if either of the arguments are null or empty, or if the domain does not match the regex 135 * for valid domains 136 */ 137 protected Domain(String theDomainName) { 138 super(DAOProviderFactory.getExtendedFieldDAOProvider()); 139 ArgumentNotValid.checkNotNullOrEmpty(theDomainName, "theDomainName"); 140 if (!DomainUtils.isValidDomainName(theDomainName)) { 141 throw new ArgumentNotValid("Domain '" + theDomainName + "' does not match the regexp " 142 + "defining valid domains: " + DomainUtils.VALID_DOMAIN_MATCHER.pattern()); 143 } 144 domainName = theDomainName; 145 comments = ""; 146 domainConfigurations = new HashMap<String, DomainConfiguration>(); 147 seedlists = new HashMap<String, SeedList>(); 148 passwords = new HashMap<String, Password>(); 149 crawlerTraps = Collections.emptyList(); 150 history = new DomainHistory(); 151 domainOwnerInfos = new ArrayList<DomainOwnerInfo>(); 152 } 153 154 /** 155 * Get a new domain, initialised with default values. 156 * 157 * @param domainName The name of the domain 158 * @return a domain with the given name 159 * @throws ArgumentNotValid if name is null or empty 160 */ 161 public static Domain getDefaultDomain(String domainName) { 162 Domain myDomain; 163 myDomain = new Domain(domainName); 164 165 // Create default seed list containing one seed: http://www.domain 166 // or http://1.2.3.4 for IP-named domains. 167 String defaultSeedListName = Settings.get(HarvesterSettings.DEFAULT_SEEDLIST); 168 169 SeedList seedlist; 170 if (Constants.IP_KEY_REGEXP.matcher(domainName).matches()) { 171 // IP domains should not get www 172 seedlist = new SeedList(defaultSeedListName, "http://" + domainName); 173 } else { 174 seedlist = new SeedList(defaultSeedListName, "http://www." + domainName); 175 } 176 myDomain.addSeedList(seedlist); 177 178 List<SeedList> seedlists = Arrays.asList(seedlist); 179 180 // Create default configuration using the default seedlist 181 String domainDefaultConfig = Settings.get(HarvesterSettings.DOMAIN_DEFAULT_CONFIG); 182 183 DomainConfiguration cfg = new DomainConfiguration(domainDefaultConfig, myDomain, seedlists, 184 new ArrayList<Password>()); 185 cfg.setOrderXmlName(Settings.get(HarvesterSettings.DOMAIN_DEFAULT_ORDERXML)); 186 cfg.setMaxRequestRate(Integer.parseInt(Settings.get(HarvesterSettings.DOMAIN_CONFIG_MAXRATE))); 187 myDomain.addConfiguration(cfg); 188 189 return myDomain; 190 } 191 192 /** 193 * Adds a new configuration to the domain. If this is the first configuration added, it becomes the default 194 * configuration. The seedlist referenced by the configuration must already be registered in this domain otherwise 195 * an UnknownID exception is thrown. 196 * 197 * @param cfg the configuration that is added 198 * @throws UnknownID if the name of the seedlist referenced by cfg is unknown 199 * @throws PermissionDenied if a configuration with the same name already exists 200 * @throws ArgumentNotValid if null supplied 201 */ 202 public void addConfiguration(DomainConfiguration cfg) { 203 ArgumentNotValid.checkNotNull(cfg, "cfg"); 204 205 if (domainConfigurations.containsKey(cfg.getName())) { 206 throw new PermissionDenied("A configuration already exists with the name:" + cfg.getName() 207 + "; in the domain:" + getName() + ";"); 208 } 209 210 putConfiguration(cfg); 211 212 if (domainConfigurations.size() == 1) { 213 defaultConfigName = cfg.getName(); 214 } 215 } 216 217 /** 218 * Set a configuration in the domain. This checks that the seedlists and passwords are legal. 219 * 220 * @param cfg The configuration to add. 221 */ 222 private void putConfiguration(DomainConfiguration cfg) { 223 checkListContainsNamed(cfg, cfg.getSeedLists(), "seedlist", seedlists); 224 checkListContainsNamed(cfg, cfg.getPasswords(), "passwords", passwords); 225 226 domainConfigurations.put(cfg.getName(), cfg); 227 } 228 229 /** 230 * Helper method used to verify that a configuration does not reference seedlists or passwords that do not exist in 231 * this domain. 232 * 233 * @param cfg the configuration being checked 234 * @param items an iterator to the references that are checked (seedlists or passwords) 235 * @param typename the name of the references being checked 236 * @param m the corresponding domain map that must contain entries matching the names in the items 237 * @param <T> The type contained in items iterator. The type extends Named 238 */ 239 private <T extends Named> void checkListContainsNamed(DomainConfiguration cfg, final Iterator<T> items, 240 final String typename, final Map m) { 241 while (items.hasNext()) { 242 Named named = items.next(); 243 244 if (!m.containsKey(named.getName())) { 245 throw new UnknownID("Configuration:" + cfg.getName() + "; uses unknown " + typename + ":" 246 + named.getName() + "; in the domain:" + getName() + ";"); 247 } 248 } 249 } 250 251 /** 252 * Helper method that adds or updates an entry in a map. Used to add/update entries in seedlists and passwords maps 253 * 254 * @param m the map to modify 255 * @param name the name of the element to add or update 256 * @param addAction when true an add action is performed and en entry with the name is not allowed to exist in the 257 * map before the operation, when false an update operation is performed and an entry must already exists with the 258 * name in the map. 259 * @param value the object to add to m 260 * @param <T> The type contained as values in the map m. 261 */ 262 private <T extends Named> void put(Map<String, T> m, String name, boolean addAction, T value) { 263 boolean alreadyExist = m.containsKey(name); 264 265 if (addAction && alreadyExist) { 266 throw new PermissionDenied("An entry already exists with the name:" + name + "; in the domain:" + getName() 267 + ";"); 268 } 269 270 if ((!addAction) && (!alreadyExist)) { 271 throw new UnknownID("No entry exists with the name '" + name + "' in the domain '" + getName() + "'"); 272 } 273 274 m.put(name, value); 275 } 276 277 /** 278 * Adds a seed list to the domain. 279 * 280 * @param seedlist the actual seedslist. 281 * @throws ArgumentNotValid if an argument is null 282 * @throws PermissionDenied if the seedName already exists 283 */ 284 public void addSeedList(SeedList seedlist) { 285 ArgumentNotValid.checkNotNull(seedlist, "seedlist"); 286 put(seedlists, seedlist.getName(), true, seedlist); 287 } 288 289 /** 290 * Update a seed list to the domain. Replaces an existing seedlist with the same name. 291 * 292 * @param seedlist the actual seedslist. 293 * @throws ArgumentNotValid if an argument is null 294 * @throws UnknownID if the seedlist.getName() does not exists 295 */ 296 public void updateSeedList(SeedList seedlist) { 297 ArgumentNotValid.checkNotNull(seedlist, "seedlist"); 298 put(seedlists, seedlist.getName(), false, seedlist); 299 } 300 301 /** 302 * Adds a password to the domain. 303 * 304 * @param password A password object to add. 305 * @throws ArgumentNotValid if the argument is null 306 * @throws PermissionDenied if a password already exists with this name 307 */ 308 public void addPassword(Password password) { 309 ArgumentNotValid.checkNotNull(password, "password"); 310 put(passwords, password.getName(), true, password); 311 } 312 313 /** 314 * Updates a password on the domain. 315 * 316 * @param password A password object to update. 317 * @throws ArgumentNotValid if the argument is null 318 * @throws PermissionDenied if no password exists with this name 319 */ 320 public void updatePassword(Password password) { 321 ArgumentNotValid.checkNotNull(password, "password"); 322 put(passwords, password.getName(), false, password); 323 } 324 325 /** 326 * Mark a configuration as the default configuration to use. The configuration name must match an already added 327 * configuration, otherwise an UnknownID exception is thrown. 328 * 329 * @param cfgName a name of a configuration 330 * @throws UnknownID when the cfgName does not match an added configuration 331 * @throws ArgumentNotValid if cfgName is null or empty 332 */ 333 public void setDefaultConfiguration(String cfgName) { 334 ArgumentNotValid.checkNotNullOrEmpty(cfgName, "cfgName"); 335 336 if (!domainConfigurations.containsKey(cfgName)) { 337 throw new UnknownID("Default configuration not registered:" + cfgName + "; in the domain:" + getName() 338 + ";"); 339 } 340 341 defaultConfigName = cfgName; 342 } 343 344 /** 345 * Returns an already registered configuration. 346 * 347 * @param cfgName the name of an registered configuration 348 * @return the configuration 349 * @throws UnknownID if the name is not a registered configuration 350 * @throws ArgumentNotValid if cfgName is null or empty 351 */ 352 public DomainConfiguration getConfiguration(String cfgName) { 353 ArgumentNotValid.checkNotNullOrEmpty(cfgName, "cfgName"); 354 355 if (!domainConfigurations.containsKey(cfgName)) { 356 throw new UnknownID("Configuration '" + cfgName + "' not registered in the domain '" + getName() + "'"); 357 } 358 DomainConfiguration cfg = domainConfigurations.get(cfgName); 359 cfg.setDomainhistory(this.getHistory()); 360 return cfg; 361 } 362 363 /** 364 * Gets the default configuration. If no configuration has been explicitly set the first configuration added to this 365 * domain is returned. If no configurations have been added at all a UnknownID exception is thrown. 366 * 367 * @return the default configuration (never null) 368 * @throws UnknownID if no configurations exists 369 */ 370 public DomainConfiguration getDefaultConfiguration() { 371 if (domainConfigurations.size() == 0) { 372 throw new UnknownID("No configurations have been registered in the domain:" + getName() + ";"); 373 } 374 375 return getConfiguration(defaultConfigName); 376 } 377 378 /** 379 * Gets the name of this domain. 380 * 381 * @return the name of this domain 382 */ 383 public String getName() { 384 return domainName; 385 } 386 387 /** 388 * @return the domain comments. 389 */ 390 public String getComments() { 391 return comments; 392 } 393 394 /** 395 * Get the domain history. 396 * 397 * @return the domain history 398 */ 399 public DomainHistory getHistory() { 400 return history; 401 } 402 403 /** 404 * Get a specific seedlist previously added to this domain. 405 * 406 * @param name the name of the seedlist to return 407 * @return the specified seedlist 408 * @throws ArgumentNotValid if name is null or empty 409 * @throws UnknownID if no seedlist has been added with the supplied name 410 */ 411 public SeedList getSeedList(String name) { 412 ArgumentNotValid.checkNotNullOrEmpty(name, "name"); 413 414 if (!hasSeedList(name)) { 415 throw new UnknownID("Seedlist '" + name + " has not been registered in the domain '" + getName() + "'"); 416 } 417 418 return seedlists.get(name); 419 } 420 421 /** 422 * Return true if the named seedlist exists in this domain. 423 * 424 * @param name String representing a possible seedlist for the domain. 425 * @return true, if the named seedlist exists in this domain 426 */ 427 public boolean hasSeedList(String name) { 428 ArgumentNotValid.checkNotNullOrEmpty(name, "name"); 429 430 return seedlists.containsKey(name); 431 } 432 433 /** 434 * Removes a seedlist from this Domain. The seedlist must not be in use by any of the configurations, otherwise a 435 * PermissionDenied exception is thrown. 436 * 437 * @param name the name of the seedlist to remove 438 * @throws PermissionDenied if the seedlist is in use by a configuration or this is the last seedlist in this Domain 439 * @throws UnknownID if the no seedlist exists with the name 440 * @throws ArgumentNotValid if a null argument is supplied 441 */ 442 public void removeSeedList(String name) { 443 ArgumentNotValid.checkNotNullOrEmpty(name, "name"); 444 445 if (!seedlists.containsKey(name)) { 446 throw new UnknownID("Seedlist has not been registered:" + name + "; in the domain:" + getName() + ";"); 447 } 448 449 if (seedlists.size() <= 1) { 450 throw new PermissionDenied("Can not remove the last seedlist:" + name + ";"); 451 } 452 453 for (String cfgname : domainConfigurations.keySet()) { 454 DomainConfiguration cfg = domainConfigurations.get(cfgname); 455 456 for (Iterator<SeedList> i = cfg.getSeedLists(); i.hasNext();) { 457 SeedList seedlist = i.next(); 458 459 if (seedlist.getName().equals(name)) { 460 throw new PermissionDenied("The seedlist:" + name + "; is used by the configuration:" + cfgname 461 + ";"); 462 } 463 } 464 } 465 466 // if we get here without an exception - the seedlist is not in use 467 seedlists.remove(name); 468 } 469 470 /** 471 * Removes a password from this Domain. The password must not be in use by any of the configurations, otherwise a 472 * PermissionDenied exception is thrown. 473 * 474 * @param name the name of the password to remove 475 * @throws PermissionDenied if the password is in use by a configuration or this is the last password in this Domain 476 * @throws UnknownID if the no password exists with the name 477 * @throws ArgumentNotValid if a null argument is supplied 478 */ 479 public void removePassword(String name) { 480 ArgumentNotValid.checkNotNullOrEmpty(name, "name"); 481 482 if (!passwords.containsKey(name)) { 483 throw new UnknownID("Password has not been registered:" + name + "; in the domain:" + getName() + ";"); 484 } 485 486 for (String cfgname : domainConfigurations.keySet()) { 487 DomainConfiguration cfg = domainConfigurations.get(cfgname); 488 489 if (cfg.usesPassword(name)) { 490 throw new PermissionDenied("The password:" + name + "; is used by the configuration:" + cfgname + ";"); 491 } 492 } 493 494 // if we get here without an exception - the password is not in use 495 passwords.remove(name); 496 } 497 498 /** 499 * Removes a configuration from this domain. The default configuration can not be removed, instead PermissionDenied 500 * is thrown. It is not possible to remove a configuration that is referenced by one or more HarvestDefinitions 501 * 502 * @param configName The name of a configuration to remove. 503 * @throws ArgumentNotValid if name is null or empty 504 * @throws PermissionDenied if the default configuration is attempted removed or if one or more HarvestDefinitions 505 * reference the configuration 506 */ 507 public void removeConfiguration(String configName) { 508 ArgumentNotValid.checkNotNullOrEmpty(configName, "configName"); 509 510 if (defaultConfigName.equals(configName)) { 511 throw new PermissionDenied("The default configuration can not be removed:" + configName + ";"); 512 } 513 514 if (!domainConfigurations.containsKey(configName)) { 515 throw new UnknownID("Configuration not registered:" + configName + ";"); 516 } 517 518 // Test that no harvest definition uses this configuration 519 final DomainDAO dao = DomainDAO.getInstance(); 520 if (!dao.mayDelete(getConfiguration(configName))) { 521 // Since this is an error case, spend a little time getting better 522 // info. This could be done a lot faster by adding a function to 523 // the DomainDAO. 524 HarvestDefinitionDAO hddao = HarvestDefinitionDAO.getInstance(); 525 Iterator<HarvestDefinition> hds = hddao.getAllHarvestDefinitions(); 526 List<String> usages = new ArrayList<String>(); 527 while (hds.hasNext()) { 528 HarvestDefinition hd = hds.next(); 529 Iterator<DomainConfiguration> configs = hd.getDomainConfigurations(); 530 while (configs.hasNext()) { 531 DomainConfiguration dc = configs.next(); 532 if (dc.getName().equals(configName) && dc.getDomainName().equals(getName())) { 533 usages.add(hd.getName()); 534 } 535 } 536 } 537 throw new PermissionDenied("Cannot delete domain configuration '" + configName + "', because it is used " 538 + "by the following " + "harvest definitions: " + usages); 539 } 540 541 domainConfigurations.remove(configName); 542 } 543 544 /** 545 * Gets all configurations belonging to this domain. 546 * 547 * @return all configurations belonging to this domain. 548 */ 549 public Iterator<DomainConfiguration> getAllConfigurations() { 550 return domainConfigurations.values().iterator(); 551 } 552 553 /** 554 * Get all seedlists belonging to this domain. 555 * 556 * @return all seedlists belonging to this domain 557 */ 558 public Iterator<SeedList> getAllSeedLists() { 559 return seedlists.values().iterator(); 560 } 561 562 /** 563 * Return the passwords defined for this domain. 564 * 565 * @return Iterator<Password> of known passwords. 566 */ 567 public Iterator<Password> getAllPasswords() { 568 return passwords.values().iterator(); 569 } 570 571 /** 572 * Gets all configurations belonging to this domain. The returned list is sorted by name according to language given 573 * in the parameter. 574 * 575 * @param loc contains the language sorting must adhere to 576 * @return all configurations belonging to this domain sorted according to language 577 */ 578 public List<DomainConfiguration> getAllConfigurationsAsSortedList(Locale loc) { 579 ArgumentNotValid.checkNotNull(loc, "loc"); 580 List<DomainConfiguration> resultSet = new ArrayList<DomainConfiguration>(domainConfigurations.values()); 581 NamedUtils.sortNamedObjectList(loc, resultSet); 582 return resultSet; 583 } 584 585 /** 586 * Gets all seedlists belonging to this domain. The returned list is sorted by name according to language given in 587 * the parameter. 588 * 589 * @param loc contains the language sorting must adhere to 590 * @return all seedlists belonging to this domain sorted according to language 591 */ 592 public List<SeedList> getAllSeedListsAsSortedList(Locale loc) { 593 ArgumentNotValid.checkNotNull(loc, "loc"); 594 List<SeedList> resultSet = new ArrayList<SeedList>(seedlists.values()); 595 NamedUtils.sortNamedObjectList(loc, resultSet); 596 return resultSet; 597 } 598 599 /** 600 * Returns the passwords defined for this domain. The returned list is sorted by name according to language given in 601 * the parameter. 602 * 603 * @param loc contains the language sorting must adhere to 604 * @return a sorted list of known passwords according to language 605 */ 606 public List<Password> getAllPasswordsAsSortedList(Locale loc) { 607 ArgumentNotValid.checkNotNull(loc, "loc"); 608 List<Password> resultSet = new ArrayList<Password>(passwords.values()); 609 NamedUtils.sortNamedObjectList(loc, resultSet); 610 return resultSet; 611 } 612 613 /** 614 * Add owner information. 615 * 616 * @param owner owner 617 */ 618 public void addOwnerInfo(DomainOwnerInfo owner) { 619 ArgumentNotValid.checkNotNull(owner, "owner"); 620 domainOwnerInfos.add(owner); 621 } 622 623 /** 624 * Get array of domain owner information. 625 * 626 * @return array containing information about the domain owner(s) 627 */ 628 public DomainOwnerInfo[] getAllDomainOwnerInfo() { 629 return domainOwnerInfos.toArray(new DomainOwnerInfo[0]); 630 } 631 632 /** 633 * Get password information. 634 * 635 * @param name the id of the password settings to retrieve 636 * @return the password information 637 * @throws UnknownID if no password info exists with the id "name" 638 */ 639 public Password getPassword(String name) { 640 ArgumentNotValid.checkNotNullOrEmpty(name, "name"); 641 642 if (!passwords.containsKey(name)) { 643 throw new UnknownID("Password has not been registered:" + name + "; in the domain:" + getName() + ";"); 644 } 645 646 return passwords.get(name); 647 } 648 649 /** 650 * Set the comments for this domain. 651 * 652 * @param comments The new comments (can be null) 653 */ 654 public void setComments(String comments) { 655 this.comments = comments; 656 } 657 658 /** 659 * Replaces existing configuration with cfg, using cfg.getName() as the id for the configuration. 660 * 661 * @param cfg the configuration to update 662 * @throws UnknownID if no configuration exists with the id cfg.getName(). ArgumentNotValid if cfg is null. 663 */ 664 public void updateConfiguration(DomainConfiguration cfg) { 665 ArgumentNotValid.checkNotNull(cfg, "cfg"); 666 667 if (!domainConfigurations.containsKey(cfg.getName())) { 668 throw new UnknownID("No configuration exists with the name:" + cfg.getName() + "; in the domain:" 669 + getName() + ";"); 670 } 671 672 putConfiguration(cfg); 673 } 674 675 /** 676 * Returns true if this domain has the named password. 677 * 678 * @param passwordName the identifier of the password info 679 * @return true if this domain has password info with id passwordname 680 */ 681 public boolean hasPassword(String passwordName) { 682 return passwords.containsKey(passwordName); 683 } 684 685 /** 686 * Returns true if this domain has the named configuration. 687 * 688 * @param configName the identifier of the configuration 689 * @return true if this domain has a configuration with id configNmae 690 */ 691 public boolean hasConfiguration(String configName) { 692 return domainConfigurations.containsKey(configName); 693 } 694 695 /** 696 * Get the edition number. 697 * 698 * @return the edition number 699 */ 700 public long getEdition() { 701 return edition; 702 } 703 704 /** 705 * Set the edition number. 706 * 707 * @param theNewEdition the new edition 708 */ 709 public void setEdition(long theNewEdition) { 710 edition = theNewEdition; 711 } 712 713 /** 714 * Get the ID of this domain. Only for use by DBDAO 715 * 716 * @return Get the ID of this domain 717 */ 718 public long getID() { 719 return id; 720 } 721 722 /** 723 * Set the ID of this domain. Only for use by DBDAO. 724 * 725 * @param newId The new ID for this domain. 726 */ 727 void setID(long newId) { 728 this.id = newId; 729 } 730 731 /** 732 * Check if this harvestinfo has an ID set yet (doesn't happen until the DBDAO persists it). 733 * 734 * @return true, if this domain has an ID different from null 735 */ 736 boolean hasID() { 737 return id != null; 738 } 739 740 /** 741 * Return a human-readable representation of this object. 742 * 743 * @return Some string identifying the object. Do not use this for machine processing. 744 */ 745 public String toString() { 746 StringBuilder sb = new StringBuilder(); 747 sb.append("Domain:").append(getName()).append(";\n"); 748 sb.append("Comment:").append(getComments()).append(";\n"); 749 750 sb.append("Configurations:\n"); 751 752 for (String cfgName : domainConfigurations.keySet()) { 753 sb.append("\t").append(cfgName).append(";\n"); 754 } 755 756 sb.append("Seedlists:\n"); 757 758 for (String seedName : seedlists.keySet()) { 759 sb.append("\t").append(seedName).append(";\n"); 760 } 761 762 sb.append("Passwords:\n"); 763 764 for (String pwName : passwords.keySet()) { 765 sb.append("\t").append(pwName).append(";\n"); 766 } 767 768 sb.append("Extended Fields:\n"); 769 770 for (int i = 0; i < extendedFieldValues.size(); i++) { 771 ExtendedFieldValue efv = extendedFieldValues.get(i); 772 sb.append("\t").append(efv.getExtendedFieldID() + ": " + efv.getContent()).append(";\n"); 773 } 774 775 sb.append("---------------\n"); 776 777 return sb.toString(); 778 } 779 780 /** 781 * Sets a list of regular expressions defining urls that should never be harvested from this domain. The list (after 782 * trimming the strings, and any empty strings have been removed) is copied to a list that is stored immutably. 783 * 784 * @param regExps The list defining urls never to be harvested. 785 * @param strictMode If true, we throw ArgumentNotValid exception if invalid regexps are found 786 * @throws ArgumentNotValid if regExps is null or regExps contains invalid regular expressions (unless strictMode is 787 * false). 788 */ 789 public void setCrawlerTraps(List<String> regExps, boolean strictMode) { 790 ArgumentNotValid.checkNotNull(regExps, "List<String> regExps"); 791 List<String> cleanedListOfCrawlerTraps = new ArrayList<String>(); 792 for (String crawlerTrap : regExps) { 793 log.trace("original trap: '" + crawlerTrap + "'"); 794 String trimmedString = crawlerTrap.trim(); 795 log.trace("trimmed trap: '" + trimmedString + "'"); 796 if (!(trimmedString.length() == 0)) { 797 cleanedListOfCrawlerTraps.add(crawlerTrap); 798 } else { 799 log.trace("Removed empty string from list of crawlertraps"); 800 } 801 } 802 // Validate regexps 803 for (String regexp : cleanedListOfCrawlerTraps) { 804 try { 805 Pattern.compile(regexp); 806 } catch (PatternSyntaxException e) { 807 final String errMsg = "The regular expression '" + regexp + "' is invalid. " 808 + "Please correct the expression."; 809 if (strictMode) { 810 throw new ArgumentNotValid(errMsg, e); 811 } else { 812 log.warn(errMsg, e); 813 } 814 } 815 } 816 crawlerTraps = Collections.unmodifiableList(cleanedListOfCrawlerTraps); 817 } 818 819 /** 820 * Returns the list of regexps never to be harvested from this domain, or the empty list if none. The returned list 821 * should never be null. 822 * 823 * @return The list of regexps of url's never to be harvested when harvesting this domain. This list is immutable. 824 */ 825 public List<String> getCrawlerTraps() { 826 return crawlerTraps; 827 } 828 829 /** 830 * Returns the alias info for this domain, or null if this domain is not an alias. 831 * 832 * @return A domain name. 833 */ 834 public AliasInfo getAliasInfo() { 835 return aliasInfo; 836 } 837 838 /** 839 * Update which domain this domain is considered an alias of. Calling this function will a) cause some slightly 840 * expensive checks to be performed, and b) set the time of last update. For object construction and copying, use 841 * setAlias. 842 * 843 * @param alias The name (e.g. "netarkivet.dk") of the domain that this domain is an alias of. 844 * @throws UnknownID If the given domain does not exist 845 * @throws IllegalState If updating the alias info would violate constraints of alias: No transitivity, no 846 * reflection. 847 */ 848 public void updateAlias(String alias) { 849 if (getName().equals(alias)) { 850 String message = "Cannot make domain '" + this.getName() + "' an alias of itself"; 851 log.debug(message); 852 throw new IllegalState(message); 853 } 854 855 if (alias != null) { 856 DomainDAO dao = DomainDAO.getInstance(); 857 Domain otherD = dao.read(alias); 858 if (otherD.aliasInfo != null) { 859 String message = "Cannot make domain '" + this.getName() + "' an alias of '" + otherD.getName() + "'," 860 + " as that domain is already an alias of '" + otherD.aliasInfo.getAliasOf() + "'"; 861 log.debug(message); 862 throw new IllegalState(message); 863 } 864 if (dao.getAliases(getName()).size() != 0) { 865 List<String> aliasesForThisDomain = new ArrayList<String>(); 866 for (AliasInfo ai : dao.getAliases(getName())) { 867 aliasesForThisDomain.add(ai.getDomain()); 868 } 869 String message = "Cannot make domain '" + this.getName() + "' an alias of '" + otherD.getName() + "'," 870 + " as the domains '" + StringUtils.conjoin(",", aliasesForThisDomain) + "' are " 871 + "already aliases of '" + this.getName() + "'"; 872 log.debug(message); 873 throw new IllegalState(message); 874 } 875 setAliasInfo(new AliasInfo(domainName, alias, new Date())); 876 } else { 877 setAliasInfo(null); 878 } 879 } 880 881 /** 882 * Set the alias field on this object. This function performs no checking of existence of transitivity of alias 883 * domains, but it does check that the alias info is for this domain 884 * 885 * @param aliasInfo Alias information 886 * @throws ArgumentNotValid if the alias info is not for this domain 887 */ 888 void setAliasInfo(AliasInfo aliasInfo) { 889 if (aliasInfo != null && !aliasInfo.getDomain().equals(domainName)) { 890 throw new ArgumentNotValid("AliasInfo must be for this domain"); 891 } 892 this.aliasInfo = aliasInfo; 893 } 894 895 /** 896 * Gets the harvest info giving best information for expectation or how many objects a harvest using a given 897 * configuration will retrieve, we will prioritise the most recently harvest, where we have a full harvest. 898 * 899 * @param configName The name of the configuration 900 * @return The Harvest Information for the harvest defining the best expectation, including the number retrieved and 901 * the stop reason. 902 */ 903 public HarvestInfo getBestHarvestInfoExpectation(String configName) { 904 ArgumentNotValid.checkNotNullOrEmpty(configName, "String configName"); 905 return DomainHistory.getBestHarvestInfoExpectation(configName, this.getHistory()); 906 } 907 908 /** 909 * All derived classes allow ExtendedFields from Type ExtendedFieldTypes.DOMAIN 910 * 911 * @return ExtendedFieldTypes.DOMAIN 912 */ 913 protected int getExtendedFieldType() { 914 return ExtendedFieldTypes.DOMAIN; 915 } 916 917}