/*
 * #%L
 * Netarchivesuite - harvester
 * %%
 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
 *             the National Library of France and the Austrian National Library.
 * %%
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation, either version 2.1 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Lesser Public License for more details.
 *
 * You should have received a copy of the GNU General Lesser Public
 * License along with this program.  If not, see
 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
 * #L%
 */
package dk.netarkivet.harvester.datamodel;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Date;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import dk.netarkivet.common.Constants;
import dk.netarkivet.common.exceptions.ArgumentNotValid;
import dk.netarkivet.common.exceptions.IllegalState;
import dk.netarkivet.common.exceptions.PermissionDenied;
import dk.netarkivet.common.exceptions.UnknownID;
import dk.netarkivet.common.utils.DomainUtils;
import dk.netarkivet.common.utils.Named;
import dk.netarkivet.common.utils.Settings;
import dk.netarkivet.common.utils.StringUtils;
import dk.netarkivet.common.utils.TLD;
import dk.netarkivet.harvester.HarvesterSettings;
051import dk.netarkivet.harvester.datamodel.dao.DAOProviderFactory; 052import dk.netarkivet.harvester.datamodel.extendedfield.ExtendableEntity; 053import dk.netarkivet.harvester.datamodel.extendedfield.ExtendedFieldTypes; 054import dk.netarkivet.harvester.datamodel.extendedfield.ExtendedFieldValue; 055import dk.netarkivet.harvester.utils.CrawlertrapsUtils; 056 057/** 058 * Represents known information about a domain A domain is identified by a domain name (ex: kb.dk) 059 * <p> 060 * The following information is used to control how a domain is harvested: Seedlists, configurations and passwords. Each 061 * seedlist defines one or more URL's that the harvester should use as starting points. A configuration defines a 062 * specific combination of settings (seedlist, harvester settings, passwords) that should be used during harvest. 063 * Passwords define user names and passwords that might be used for the domain. 064 * <p> 065 * Information about previous harvests of this domain is available via the domainHistory. 066 * <p> 067 * Information from the domain registrant (DK-HOSTMASTER) about the domain registration is available in the 068 * registration. This includes the dates where the domain was known to exist (included in a domain list), together with 069 * domain owner information. 070 * <p> 071 * Notice that each configuration references one of the seedlists by name, and possibly one of the Passwords. 072 */ 073@SuppressWarnings({"rawtypes"}) 074public class Domain extends ExtendableEntity implements Named { 075 076 /** The logger for this class. */ 077 protected static final Logger log = LoggerFactory.getLogger(Domain.class); 078 079 /** The identification used to lookup the domain. */ 080 081 private String domainName; 082 083 /** 084 * Map<String, DomainConfiguration> the various harvest configurations that can be used to harvest this domain. 
085 */ 086 private Map<String, DomainConfiguration> domainConfigurations; 087 088 /** Use this configuration unless otherwise specified. */ 089 private String defaultConfigName; 090 091 /** 092 * Map<String, SeedList> The different seedlists used as starting points by the harvesters. 093 */ 094 private Map<String, SeedList> seedlists; 095 096 /** Map<String, Password> with an entry for each known password. */ 097 private Map<String, Password> passwords; 098 099 /** 100 * List of crawler traps, that is regexps that should be ignored for this domain. 101 */ 102 private List<String> crawlerTraps; 103 104 /** Records all historical information about the domain. */ 105 private DomainHistory history; 106 107 /** 108 * List<DomainOwnerInfo> contains information about the known owners of this domain. 109 */ 110 private List<DomainOwnerInfo> domainOwnerInfos; 111 112 /** Comments that the user has entered. */ 113 private String comments; 114 115 /** Edition is used by the DAO to keep track of changes. */ 116 long edition = -1; 117 118 /** 119 * If non-null, this domain is considered an alias of the domain named. The field must be either null or aliasInfo 120 * that defines an alias from this domain to another, and the time the alias field was last updated. This is used to 121 * allow operators to check the domains that have been aliases for a long time. 122 * <p> 123 * Note that we do not allow transitive aliases, so the domain named in this field is not allowed to become an alias 124 * itself. 125 */ 126 private AliasInfo aliasInfo; 127 128 /** ID autogenerated by DB DAO. */ 129 private Long id; 130 131 /** 132 * Create new instance of a domain. It is generally recommended that getDefaultDomain is used instead of this 133 * constructor. 
134 * 135 * @param theDomainName Name used to reference the domain 136 * @throws ArgumentNotValid if either of the arguments are null or empty, or if the domain does not match the regex 137 * for valid domains 138 */ 139 protected Domain(String theDomainName) { 140 super(DAOProviderFactory.getExtendedFieldDAOProvider()); 141 ArgumentNotValid.checkNotNullOrEmpty(theDomainName, "theDomainName"); 142 if (!DomainUtils.isValidDomainName(theDomainName)) { 143 throw new ArgumentNotValid("Domain '" + theDomainName + "' does not match the regexp " 144 + "defining valid domains: " + TLD.getInstance().getValidDomainMatcher().pattern()); 145 } 146 domainName = theDomainName; 147 comments = ""; 148 domainConfigurations = new HashMap<String, DomainConfiguration>(); 149 seedlists = new HashMap<String, SeedList>(); 150 passwords = new HashMap<String, Password>(); 151 crawlerTraps = Collections.emptyList(); 152 history = new DomainHistory(); 153 domainOwnerInfos = new ArrayList<DomainOwnerInfo>(); 154 } 155 156 /** 157 * Get a new domain, initialised with default values. 158 * 159 * @param domainName The name of the domain 160 * @return a domain with the given name 161 * @throws ArgumentNotValid if name is null or empty 162 */ 163 public static Domain getDefaultDomain(String domainName) { 164 Domain myDomain; 165 myDomain = new Domain(domainName); 166 167 // Create default seed list containing one seed: http://www.domain 168 // or http://1.2.3.4 for IP-named domains. 169 String defaultSeedListName = Settings.get(HarvesterSettings.DEFAULT_SEEDLIST); 170 171 SeedList seedlist; 172 if (Constants.IP_KEY_REGEXP.matcher(domainName).matches()) { 173 // IP domains should not get www 174 seedlist = new SeedList(defaultSeedListName, "http://" + domainName); 175 } else { 176 seedlist = new SeedList(defaultSeedListName, "http://www." 
+ domainName); 177 } 178 myDomain.addSeedList(seedlist); 179 180 List<SeedList> seedlists = Arrays.asList(seedlist); 181 182 // Create default configuration using the default seedlist 183 String domainDefaultConfig = Settings.get(HarvesterSettings.DOMAIN_DEFAULT_CONFIG); 184 185 DomainConfiguration cfg = new DomainConfiguration(domainDefaultConfig, myDomain, seedlists, 186 new ArrayList<Password>()); 187 cfg.setOrderXmlName(Settings.get(HarvesterSettings.DOMAIN_DEFAULT_ORDERXML)); 188 cfg.setMaxRequestRate(Integer.parseInt(Settings.get(HarvesterSettings.DOMAIN_CONFIG_MAXRATE))); 189 myDomain.addConfiguration(cfg); 190 191 return myDomain; 192 } 193 194 /** 195 * Adds a new configuration to the domain. If this is the first configuration added, it becomes the default 196 * configuration. The seedlist referenced by the configuration must already be registered in this domain otherwise 197 * an UnknownID exception is thrown. 198 * 199 * @param cfg the configuration that is added 200 * @throws UnknownID if the name of the seedlist referenced by cfg is unknown 201 * @throws PermissionDenied if a configuration with the same name already exists 202 * @throws ArgumentNotValid if null supplied 203 */ 204 public void addConfiguration(DomainConfiguration cfg) { 205 ArgumentNotValid.checkNotNull(cfg, "cfg"); 206 207 if (domainConfigurations.containsKey(cfg.getName())) { 208 throw new PermissionDenied("A configuration already exists with the name:" + cfg.getName() 209 + "; in the domain:" + getName() + ";"); 210 } 211 212 putConfiguration(cfg); 213 214 if (domainConfigurations.size() == 1) { 215 defaultConfigName = cfg.getName(); 216 } 217 } 218 219 /** 220 * Set a configuration in the domain. This checks that the seedlists and passwords are legal. 221 * 222 * @param cfg The configuration to add. 
223 */ 224 private void putConfiguration(DomainConfiguration cfg) { 225 checkListContainsNamed(cfg, cfg.getSeedLists(), "seedlist", seedlists); 226 checkListContainsNamed(cfg, cfg.getPasswords(), "passwords", passwords); 227 228 domainConfigurations.put(cfg.getName(), cfg); 229 } 230 231 /** 232 * Helper method used to verify that a configuration does not reference seedlists or passwords that do not exist in 233 * this domain. 234 * 235 * @param cfg the configuration being checked 236 * @param items an iterator to the references that are checked (seedlists or passwords) 237 * @param typename the name of the references being checked 238 * @param m the corresponding domain map that must contain entries matching the names in the items 239 * @param <T> The type contained in items iterator. The type extends Named 240 */ 241 private <T extends Named> void checkListContainsNamed(DomainConfiguration cfg, final Iterator<T> items, 242 final String typename, final Map m) { 243 while (items.hasNext()) { 244 Named named = items.next(); 245 246 if (!m.containsKey(named.getName())) { 247 throw new UnknownID("Configuration:" + cfg.getName() + "; uses unknown " + typename + ":" 248 + named.getName() + "; in the domain:" + getName() + ";"); 249 } 250 } 251 } 252 253 /** 254 * Helper method that adds or updates an entry in a map. Used to add/update entries in seedlists and passwords maps 255 * 256 * @param m the map to modify 257 * @param name the name of the element to add or update 258 * @param addAction when true an add action is performed and en entry with the name is not allowed to exist in the 259 * map before the operation, when false an update operation is performed and an entry must already exists with the 260 * name in the map. 261 * @param value the object to add to m 262 * @param <T> The type contained as values in the map m. 
263 */ 264 private <T extends Named> void put(Map<String, T> m, String name, boolean addAction, T value) { 265 boolean alreadyExist = m.containsKey(name); 266 267 if (addAction && alreadyExist) { 268 throw new PermissionDenied("An entry already exists with the name:" + name + "; in the domain:" + getName() 269 + ";"); 270 } 271 272 if ((!addAction) && (!alreadyExist)) { 273 throw new UnknownID("No entry exists with the name '" + name + "' in the domain '" + getName() + "'"); 274 } 275 276 m.put(name, value); 277 } 278 279 /** 280 * Adds a seed list to the domain. 281 * 282 * @param seedlist the actual seedslist. 283 * @throws ArgumentNotValid if an argument is null 284 * @throws PermissionDenied if the seedName already exists 285 */ 286 public void addSeedList(SeedList seedlist) { 287 ArgumentNotValid.checkNotNull(seedlist, "seedlist"); 288 put(seedlists, seedlist.getName(), true, seedlist); 289 } 290 291 /** 292 * Update a seed list to the domain. Replaces an existing seedlist with the same name. 293 * 294 * @param seedlist the actual seedslist. 295 * @throws ArgumentNotValid if an argument is null 296 * @throws UnknownID if the seedlist.getName() does not exists 297 */ 298 public void updateSeedList(SeedList seedlist) { 299 ArgumentNotValid.checkNotNull(seedlist, "seedlist"); 300 put(seedlists, seedlist.getName(), false, seedlist); 301 } 302 303 /** 304 * Adds a password to the domain. 305 * 306 * @param password A password object to add. 307 * @throws ArgumentNotValid if the argument is null 308 * @throws PermissionDenied if a password already exists with this name 309 */ 310 public void addPassword(Password password) { 311 ArgumentNotValid.checkNotNull(password, "password"); 312 put(passwords, password.getName(), true, password); 313 } 314 315 /** 316 * Updates a password on the domain. 317 * 318 * @param password A password object to update. 
319 * @throws ArgumentNotValid if the argument is null 320 * @throws PermissionDenied if no password exists with this name 321 */ 322 public void updatePassword(Password password) { 323 ArgumentNotValid.checkNotNull(password, "password"); 324 put(passwords, password.getName(), false, password); 325 } 326 327 /** 328 * Mark a configuration as the default configuration to use. The configuration name must match an already added 329 * configuration, otherwise an UnknownID exception is thrown. 330 * 331 * @param cfgName a name of a configuration 332 * @throws UnknownID when the cfgName does not match an added configuration 333 * @throws ArgumentNotValid if cfgName is null or empty 334 */ 335 public void setDefaultConfiguration(String cfgName) { 336 ArgumentNotValid.checkNotNullOrEmpty(cfgName, "cfgName"); 337 338 if (!domainConfigurations.containsKey(cfgName)) { 339 throw new UnknownID("Default configuration not registered:" + cfgName + "; in the domain:" + getName() 340 + ";"); 341 } 342 343 defaultConfigName = cfgName; 344 } 345 346 /** 347 * Returns an already registered configuration. 348 * 349 * @param cfgName the name of an registered configuration 350 * @return the configuration 351 * @throws UnknownID if the name is not a registered configuration 352 * @throws ArgumentNotValid if cfgName is null or empty 353 */ 354 public DomainConfiguration getConfiguration(String cfgName) { 355 ArgumentNotValid.checkNotNullOrEmpty(cfgName, "cfgName"); 356 357 if (!domainConfigurations.containsKey(cfgName)) { 358 throw new UnknownID("Configuration '" + cfgName + "' not registered in the domain '" + getName() + "'"); 359 } 360 DomainConfiguration cfg = domainConfigurations.get(cfgName); 361 cfg.setDomainhistory(this.getHistory()); 362 return cfg; 363 } 364 365 /** 366 * Gets the default configuration. If no configuration has been explicitly set the first configuration added to this 367 * domain is returned. 
If no configurations have been added at all a UnknownID exception is thrown. 368 * 369 * @return the default configuration (never null) 370 * @throws UnknownID if no configurations exists 371 */ 372 public DomainConfiguration getDefaultConfiguration() { 373 if (domainConfigurations.size() == 0) { 374 throw new UnknownID("No configurations have been registered in the domain:" + getName() + ";"); 375 } 376 377 return getConfiguration(defaultConfigName); 378 } 379 380 /** 381 * Gets the name of this domain. 382 * 383 * @return the name of this domain 384 */ 385 public String getName() { 386 return domainName; 387 } 388 389 /** 390 * @return the domain comments. 391 */ 392 public String getComments() { 393 return comments; 394 } 395 396 /** 397 * Get the domain history. 398 * 399 * @return the domain history 400 */ 401 public DomainHistory getHistory() { 402 return history; 403 } 404 405 /** 406 * Get a specific seedlist previously added to this domain. 407 * 408 * @param name the name of the seedlist to return 409 * @return the specified seedlist 410 * @throws ArgumentNotValid if name is null or empty 411 * @throws UnknownID if no seedlist has been added with the supplied name 412 */ 413 public SeedList getSeedList(String name) { 414 ArgumentNotValid.checkNotNullOrEmpty(name, "name"); 415 416 if (!hasSeedList(name)) { 417 throw new UnknownID("Seedlist '" + name + " has not been registered in the domain '" + getName() + "'"); 418 } 419 420 return seedlists.get(name); 421 } 422 423 /** 424 * Return true if the named seedlist exists in this domain. 425 * 426 * @param name String representing a possible seedlist for the domain. 427 * @return true, if the named seedlist exists in this domain 428 */ 429 public boolean hasSeedList(String name) { 430 ArgumentNotValid.checkNotNullOrEmpty(name, "name"); 431 432 return seedlists.containsKey(name); 433 } 434 435 /** 436 * Removes a seedlist from this Domain. 
The seedlist must not be in use by any of the configurations, otherwise a 437 * PermissionDenied exception is thrown. 438 * 439 * @param name the name of the seedlist to remove 440 * @throws PermissionDenied if the seedlist is in use by a configuration or this is the last seedlist in this Domain 441 * @throws UnknownID if the no seedlist exists with the name 442 * @throws ArgumentNotValid if a null argument is supplied 443 */ 444 public void removeSeedList(String name) { 445 ArgumentNotValid.checkNotNullOrEmpty(name, "name"); 446 447 if (!seedlists.containsKey(name)) { 448 throw new UnknownID("Seedlist has not been registered:" + name + "; in the domain:" + getName() + ";"); 449 } 450 451 if (seedlists.size() <= 1) { 452 throw new PermissionDenied("Can not remove the last seedlist:" + name + ";"); 453 } 454 455 for (String cfgname : domainConfigurations.keySet()) { 456 DomainConfiguration cfg = domainConfigurations.get(cfgname); 457 458 for (Iterator<SeedList> i = cfg.getSeedLists(); i.hasNext();) { 459 SeedList seedlist = i.next(); 460 461 if (seedlist.getName().equals(name)) { 462 throw new PermissionDenied("The seedlist:" + name + "; is used by the configuration:" + cfgname 463 + ";"); 464 } 465 } 466 } 467 468 // if we get here without an exception - the seedlist is not in use 469 seedlists.remove(name); 470 } 471 472 /** 473 * Removes a password from this Domain. The password must not be in use by any of the configurations, otherwise a 474 * PermissionDenied exception is thrown. 
475 * 476 * @param name the name of the password to remove 477 * @throws PermissionDenied if the password is in use by a configuration or this is the last password in this Domain 478 * @throws UnknownID if the no password exists with the name 479 * @throws ArgumentNotValid if a null argument is supplied 480 */ 481 public void removePassword(String name) { 482 ArgumentNotValid.checkNotNullOrEmpty(name, "name"); 483 484 if (!passwords.containsKey(name)) { 485 throw new UnknownID("Password has not been registered:" + name + "; in the domain:" + getName() + ";"); 486 } 487 488 for (String cfgname : domainConfigurations.keySet()) { 489 DomainConfiguration cfg = domainConfigurations.get(cfgname); 490 491 if (cfg.usesPassword(name)) { 492 throw new PermissionDenied("The password:" + name + "; is used by the configuration:" + cfgname + ";"); 493 } 494 } 495 496 // if we get here without an exception - the password is not in use 497 passwords.remove(name); 498 } 499 500 /** 501 * Removes a configuration from this domain. The default configuration can not be removed, instead PermissionDenied 502 * is thrown. It is not possible to remove a configuration that is referenced by one or more HarvestDefinitions 503 * 504 * @param configName The name of a configuration to remove. 
505 * @throws ArgumentNotValid if name is null or empty 506 * @throws PermissionDenied if the default configuration is attempted removed or if one or more HarvestDefinitions 507 * reference the configuration 508 */ 509 public void removeConfiguration(String configName) { 510 ArgumentNotValid.checkNotNullOrEmpty(configName, "configName"); 511 512 if (defaultConfigName.equals(configName)) { 513 throw new PermissionDenied("The default configuration can not be removed:" + configName + ";"); 514 } 515 516 if (!domainConfigurations.containsKey(configName)) { 517 throw new UnknownID("Configuration not registered:" + configName + ";"); 518 } 519 520 // Test that no harvest definition uses this configuration 521 final DomainDAO dao = DomainDAO.getInstance(); 522 if (!dao.mayDelete(getConfiguration(configName))) { 523 // Since this is an error case, spend a little time getting better 524 // info. This could be done a lot faster by adding a function to 525 // the DomainDAO. 526 HarvestDefinitionDAO hddao = HarvestDefinitionDAO.getInstance(); 527 Iterator<HarvestDefinition> hds = hddao.getAllHarvestDefinitions(); 528 List<String> usages = new ArrayList<String>(); 529 while (hds.hasNext()) { 530 HarvestDefinition hd = hds.next(); 531 Iterator<DomainConfiguration> configs = hd.getDomainConfigurations(); 532 while (configs.hasNext()) { 533 DomainConfiguration dc = configs.next(); 534 if (dc.getName().equals(configName) && dc.getDomainName().equals(getName())) { 535 usages.add(hd.getName()); 536 } 537 } 538 } 539 throw new PermissionDenied("Cannot delete domain configuration '" + configName + "', because it is used " 540 + "by the following " + "harvest definitions: " + usages); 541 } 542 543 domainConfigurations.remove(configName); 544 } 545 546 /** 547 * Gets all configurations belonging to this domain. 548 * 549 * @return all configurations belonging to this domain. 
550 */ 551 public Iterator<DomainConfiguration> getAllConfigurations() { 552 return domainConfigurations.values().iterator(); 553 } 554 555 /** 556 * Get all seedlists belonging to this domain. 557 * 558 * @return all seedlists belonging to this domain 559 */ 560 public Iterator<SeedList> getAllSeedLists() { 561 return seedlists.values().iterator(); 562 } 563 564 /** 565 * Return the passwords defined for this domain. 566 * 567 * @return Iterator<Password> of known passwords. 568 */ 569 public Iterator<Password> getAllPasswords() { 570 return passwords.values().iterator(); 571 } 572 573 /** 574 * Gets all configurations belonging to this domain. The returned list is sorted by name according to language given 575 * in the parameter. 576 * 577 * @param loc contains the language sorting must adhere to 578 * @return all configurations belonging to this domain sorted according to language 579 */ 580 public List<DomainConfiguration> getAllConfigurationsAsSortedList(Locale loc) { 581 ArgumentNotValid.checkNotNull(loc, "loc"); 582 List<DomainConfiguration> resultSet = new ArrayList<DomainConfiguration>(domainConfigurations.values()); 583 NamedUtils.sortNamedObjectList(loc, resultSet); 584 return resultSet; 585 } 586 587 /** 588 * Gets all seedlists belonging to this domain. The returned list is sorted by name according to language given in 589 * the parameter. 590 * 591 * @param loc contains the language sorting must adhere to 592 * @return all seedlists belonging to this domain sorted according to language 593 */ 594 public List<SeedList> getAllSeedListsAsSortedList(Locale loc) { 595 ArgumentNotValid.checkNotNull(loc, "loc"); 596 List<SeedList> resultSet = new ArrayList<SeedList>(seedlists.values()); 597 NamedUtils.sortNamedObjectList(loc, resultSet); 598 return resultSet; 599 } 600 601 /** 602 * Returns the passwords defined for this domain. The returned list is sorted by name according to language given in 603 * the parameter. 
604 * 605 * @param loc contains the language sorting must adhere to 606 * @return a sorted list of known passwords according to language 607 */ 608 public List<Password> getAllPasswordsAsSortedList(Locale loc) { 609 ArgumentNotValid.checkNotNull(loc, "loc"); 610 List<Password> resultSet = new ArrayList<Password>(passwords.values()); 611 NamedUtils.sortNamedObjectList(loc, resultSet); 612 return resultSet; 613 } 614 615 /** 616 * Add owner information. 617 * 618 * @param owner owner 619 */ 620 public void addOwnerInfo(DomainOwnerInfo owner) { 621 ArgumentNotValid.checkNotNull(owner, "owner"); 622 domainOwnerInfos.add(owner); 623 } 624 625 /** 626 * Get array of domain owner information. 627 * 628 * @return array containing information about the domain owner(s) 629 */ 630 public DomainOwnerInfo[] getAllDomainOwnerInfo() { 631 return domainOwnerInfos.toArray(new DomainOwnerInfo[0]); 632 } 633 634 /** 635 * Get password information. 636 * 637 * @param name the id of the password settings to retrieve 638 * @return the password information 639 * @throws UnknownID if no password info exists with the id "name" 640 */ 641 public Password getPassword(String name) { 642 ArgumentNotValid.checkNotNullOrEmpty(name, "name"); 643 644 if (!passwords.containsKey(name)) { 645 throw new UnknownID("Password has not been registered:" + name + "; in the domain:" + getName() + ";"); 646 } 647 648 return passwords.get(name); 649 } 650 651 /** 652 * Set the comments for this domain. 653 * 654 * @param comments The new comments (can be null) 655 */ 656 public void setComments(String comments) { 657 this.comments = comments; 658 } 659 660 /** 661 * Replaces existing configuration with cfg, using cfg.getName() as the id for the configuration. 662 * 663 * @param cfg the configuration to update 664 * @throws UnknownID if no configuration exists with the id cfg.getName(). ArgumentNotValid if cfg is null. 
665 */ 666 public void updateConfiguration(DomainConfiguration cfg) { 667 ArgumentNotValid.checkNotNull(cfg, "cfg"); 668 669 if (!domainConfigurations.containsKey(cfg.getName())) { 670 throw new UnknownID("No configuration exists with the name:" + cfg.getName() + "; in the domain:" 671 + getName() + ";"); 672 } 673 674 putConfiguration(cfg); 675 } 676 677 /** 678 * Returns true if this domain has the named password. 679 * 680 * @param passwordName the identifier of the password info 681 * @return true if this domain has password info with id passwordname 682 */ 683 public boolean hasPassword(String passwordName) { 684 return passwords.containsKey(passwordName); 685 } 686 687 /** 688 * Returns true if this domain has the named configuration. 689 * 690 * @param configName the identifier of the configuration 691 * @return true if this domain has a configuration with id configNmae 692 */ 693 public boolean hasConfiguration(String configName) { 694 return domainConfigurations.containsKey(configName); 695 } 696 697 /** 698 * Get the edition number. 699 * 700 * @return the edition number 701 */ 702 public long getEdition() { 703 return edition; 704 } 705 706 /** 707 * Set the edition number. 708 * 709 * @param theNewEdition the new edition 710 */ 711 public void setEdition(long theNewEdition) { 712 edition = theNewEdition; 713 } 714 715 /** 716 * Get the ID of this domain. Only for use by DBDAO 717 * 718 * @return Get the ID of this domain 719 */ 720 public long getID() { 721 return id; 722 } 723 724 /** 725 * Set the ID of this domain. Only for use by DBDAO. 726 * 727 * @param newId The new ID for this domain. 728 */ 729 void setID(long newId) { 730 this.id = newId; 731 } 732 733 /** 734 * Check if this harvestinfo has an ID set yet (doesn't happen until the DBDAO persists it). 
735 * 736 * @return true, if this domain has an ID different from null 737 */ 738 boolean hasID() { 739 return id != null; 740 } 741 742 /** 743 * Return a human-readable representation of this object. 744 * 745 * @return Some string identifying the object. Do not use this for machine processing. 746 */ 747 public String toString() { 748 StringBuilder sb = new StringBuilder(); 749 sb.append("Domain:").append(getName()).append(";\n"); 750 sb.append("Comment:").append(getComments()).append(";\n"); 751 752 sb.append("Configurations:\n"); 753 754 for (String cfgName : domainConfigurations.keySet()) { 755 sb.append("\t").append(cfgName).append(";\n"); 756 } 757 758 sb.append("Seedlists:\n"); 759 760 for (String seedName : seedlists.keySet()) { 761 sb.append("\t").append(seedName).append(";\n"); 762 } 763 764 sb.append("Passwords:\n"); 765 766 for (String pwName : passwords.keySet()) { 767 sb.append("\t").append(pwName).append(";\n"); 768 } 769 770 sb.append("Extended Fields:\n"); 771 772 for (int i = 0; i < extendedFieldValues.size(); i++) { 773 ExtendedFieldValue efv = extendedFieldValues.get(i); 774 sb.append("\t").append(efv.getExtendedFieldID() + ": " + efv.getContent()).append(";\n"); 775 } 776 777 sb.append("---------------\n"); 778 779 return sb.toString(); 780 } 781 782 /** 783 * Sets a list of regular expressions defining urls that should never be harvested from this domain. The list (after 784 * trimming the strings, and any empty strings have been removed) is copied to a list that is stored immutably. 785 * 786 * @param regExps The list defining urls never to be harvested. 787 * @param strictMode If true, we throw ArgumentNotValid exception if invalid regexps are found 788 * @throws ArgumentNotValid if regExps is null or regExps contains invalid regular expressions (unless strictMode is 789 * false). 
790 */ 791 public void setCrawlerTraps(List<String> regExps, boolean strictMode) { 792 ArgumentNotValid.checkNotNull(regExps, "List<String> regExps"); 793 List<String> cleanedListOfCrawlerTraps = new ArrayList<String>(); 794 for (String crawlerTrap : regExps) { 795 log.trace("original trap: '" + crawlerTrap + "'"); 796 String trimmedString = crawlerTrap.trim(); 797 log.trace("trimmed trap: '" + trimmedString + "'"); 798 if (!(trimmedString.length() == 0)) { 799 cleanedListOfCrawlerTraps.add(crawlerTrap); 800 } else { 801 log.trace("Removed empty string from list of crawlertraps"); 802 } 803 } 804 // Validate regexps 805 List<String> errMsgs = new ArrayList<String>(); 806 for (String regexp : cleanedListOfCrawlerTraps) { 807 808 boolean wellformed = false; 809 try { 810 Pattern.compile(regexp); 811 wellformed = CrawlertrapsUtils.isCrawlertrapsWellformedXML(regexp); 812 if (!wellformed){ 813 errMsgs.add("The expression '" + regexp + "' is not wellformed XML" 814 + " . Please correct the expression."); 815 } 816 } catch (PatternSyntaxException e) { 817 errMsgs.add("The expression '" + regexp + "' is not a proper regular expression: " 818 + e.getDescription() + " . Please correct the expression."); 819 } 820 } 821 if (strictMode) 822 if (errMsgs.size() > 0) { 823 throw new ArgumentNotValid(errMsgs.size() + " errors were found: " + StringUtils.conjoin(",", errMsgs)); 824 } else { 825 log.warn(errMsgs.size() + " errors were found: " + StringUtils.conjoin(",", errMsgs)); 826 } 827 crawlerTraps = Collections.unmodifiableList(cleanedListOfCrawlerTraps); 828 log.debug("Domain {} has {} crawlertraps", domainName, crawlerTraps.size()); 829 } 830 831 /** 832 * Returns the list of regexps never to be harvested from this domain, or the empty list if none. The returned list 833 * should never be null. 834 * 835 * @return The list of regexps of url's never to be harvested when harvesting this domain. This list is immutable. 
836 */ 837 public List<String> getCrawlerTraps() { 838 return crawlerTraps; 839 } 840 841 /** 842 * Returns the alias info for this domain, or null if this domain is not an alias. 843 * 844 * @return A domain name. 845 */ 846 public AliasInfo getAliasInfo() { 847 return aliasInfo; 848 } 849 850 /** 851 * Update which domain this domain is considered an alias of. Calling this function will a) cause some slightly 852 * expensive checks to be performed, and b) set the time of last update. For object construction and copying, use 853 * setAlias. 854 * 855 * @param alias The name (e.g. "netarkivet.dk") of the domain that this domain is an alias of. 856 * @throws UnknownID If the given domain does not exist 857 * @throws IllegalState If updating the alias info would violate constraints of alias: No transitivity, no 858 * reflection. 859 */ 860 public void updateAlias(String alias) { 861 if (getName().equals(alias)) { 862 String message = "Cannot make domain '" + this.getName() + "' an alias of itself"; 863 log.debug(message); 864 throw new IllegalState(message); 865 } 866 867 if (alias != null) { 868 DomainDAO dao = DomainDAO.getInstance(); 869 Domain otherD = dao.read(alias); 870 if (otherD.aliasInfo != null) { 871 String message = "Cannot make domain '" + this.getName() + "' an alias of '" + otherD.getName() + "'," 872 + " as that domain is already an alias of '" + otherD.aliasInfo.getAliasOf() + "'"; 873 log.debug(message); 874 throw new IllegalState(message); 875 } 876 if (dao.getAliases(getName()).size() != 0) { 877 List<String> aliasesForThisDomain = new ArrayList<String>(); 878 for (AliasInfo ai : dao.getAliases(getName())) { 879 aliasesForThisDomain.add(ai.getDomain()); 880 } 881 String message = "Cannot make domain '" + this.getName() + "' an alias of '" + otherD.getName() + "'," 882 + " as the domains '" + StringUtils.conjoin(",", aliasesForThisDomain) + "' are " 883 + "already aliases of '" + this.getName() + "'"; 884 log.debug(message); 885 throw new 
IllegalState(message); 886 } 887 setAliasInfo(new AliasInfo(domainName, alias, new Date())); 888 } else { 889 setAliasInfo(null); 890 } 891 } 892 893 /** 894 * Set the alias field on this object. This function performs no checking of existence of transitivity of alias 895 * domains, but it does check that the alias info is for this domain 896 * 897 * @param aliasInfo Alias information 898 * @throws ArgumentNotValid if the alias info is not for this domain 899 */ 900 void setAliasInfo(AliasInfo aliasInfo) { 901 if (aliasInfo != null && !aliasInfo.getDomain().equals(domainName)) { 902 throw new ArgumentNotValid("AliasInfo must be for this domain"); 903 } 904 this.aliasInfo = aliasInfo; 905 } 906 907 /** 908 * Gets the harvest info giving best information for expectation or how many objects a harvest using a given 909 * configuration will retrieve, we will prioritise the most recently harvest, where we have a full harvest. 910 * 911 * @param configName The name of the configuration 912 * @return The Harvest Information for the harvest defining the best expectation, including the number retrieved and 913 * the stop reason. 914 */ 915 public HarvestInfo getBestHarvestInfoExpectation(String configName) { 916 ArgumentNotValid.checkNotNullOrEmpty(configName, "String configName"); 917 return DomainHistory.getBestHarvestInfoExpectation(configName, this.getHistory()); 918 } 919 920 /** 921 * All derived classes allow ExtendedFields from Type ExtendedFieldTypes.DOMAIN 922 * 923 * @return ExtendedFieldTypes.DOMAIN 924 */ 925 protected int getExtendedFieldType() { 926 return ExtendedFieldTypes.DOMAIN; 927 } 928 929}