001/* 002 * #%L 003 * Netarchivesuite - harvester 004 * %% 005 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library, 006 * the National Library of France and the Austrian National Library. 007 * %% 008 * This program is free software: you can redistribute it and/or modify 009 * it under the terms of the GNU Lesser General Public License as 010 * published by the Free Software Foundation, either version 2.1 of the 011 * License, or (at your option) any later version. 012 * 013 * This program is distributed in the hope that it will be useful, 014 * but WITHOUT ANY WARRANTY; without even the implied warranty of 015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 016 * GNU General Lesser Public License for more details. 017 * 018 * You should have received a copy of the GNU General Lesser Public 019 * License along with this program. If not, see 020 * <http://www.gnu.org/licenses/lgpl-2.1.html>. 021 * #L% 022 */ 023package dk.netarkivet.harvester.datamodel; 024 025import java.util.ArrayList; 026import java.util.Iterator; 027import java.util.List; 028 029import org.slf4j.Logger; 030import org.slf4j.LoggerFactory; 031 032import com.antiaction.raptor.dao.AttributeBase; 033 034import dk.netarkivet.common.exceptions.ArgumentNotValid; 035import dk.netarkivet.common.exceptions.PermissionDenied; 036import dk.netarkivet.common.exceptions.UnknownID; 037import dk.netarkivet.common.utils.Named; 038import dk.netarkivet.common.utils.Settings; 039import dk.netarkivet.harvester.HarvesterSettings; 040import dk.netarkivet.harvester.datamodel.eav.EAV.AttributeAndType; 041 042/** 043 * This class describes a configuration for harvesting a domain. It combines a number of seedlists, a number of 044 * passwords, an order template, and some specialised settings to define the way to harvest a domain. 045 */ 046public class DomainConfiguration implements Named { 047 048 /** The class logger. */ 049 private static final Logger log = LoggerFactory.getLogger(DomainConfiguration.class); 050 051 /** The name of the configuration. */ 052 private String configName; 053 /** The name of the order.xml (Heritrix template) used by this configuration. */ 054 private String orderXmlName = ""; 055 /** maximum number of objects harvested for this configuration in a snapshot harvest. */ 056 private long maxObjects; 057 /** The maximum request rate. */ 058 private int maxRequestRate; 059 /** Maximum number of bytes to download in a harvest. */ 060 private long maxBytes; 061 /** The domain associated with this configuration. */ 062 private String domainName; 063 064 /** The list of seedlists. */ 065 private List<SeedList> seedlists; 066 067 /** The list of passwords that apply in this configuration. */ 068 private List<Password> passwords; 069 /** The comments associated with this configuration. */ 070 private String comments; 071 072 /** ID autogenerated by DB. */ 073 private Long id; 074 075 /** The domainhistory associated with the domain. */ 076 private DomainHistory domainhistory; 077 078 /** The crawlertraps associated with the domain. */ 079 private List<String> crawlertraps; 080 081 /** This configurations EAV attributes and attribute types. */ 082 private List<AttributeAndType> attributesAndTypes; 083 084 /** 085 * How many objects should be harvested in a harvest to trust that our expected size of objects is less than the 086 * default number. 087 */ 088 private static final long MIN_OBJECTS_TO_TRUST_SMALL_EXPECTATION = 50L; 089 /** The smallest number of bytes we accept per object. */ 090 private static final int MIN_EXPECTATION = 1; 091 092 /** 093 * Create a new configuration for a domain. 094 * 095 * @param theConfigName The name of this configuration 096 * @param domain The domain that this configuration is for 097 * @param seedlists Seedlists to use in this configuration. 098 * @param passwords Passwords to use in this configuration. 099 */ 100 public DomainConfiguration(String theConfigName, Domain domain, List<SeedList> seedlists, List<Password> passwords) { 101 this(theConfigName, domain.getName(), domain.getHistory(), domain.getCrawlerTraps(), seedlists, passwords); 102 } 103 104 /** 105 * Alternate constructor. TODO Filter all history not relevant for this configuration 106 * 107 * @param theConfigName theConfigName The name of this configuration 108 * @param domainName The name of the domain that this configuration is for 109 * @param history The domainhistory of the given domain 110 * @param crawlertraps The crawlertraps of the given domain 111 * @param seedlists Seedlists to use in this configuration 112 * @param passwords Passwords to use in this configuration. 113 */ 114 public DomainConfiguration(String theConfigName, String domainName, DomainHistory history, 115 List<String> crawlertraps, List<SeedList> seedlists, List<Password> passwords) { 116 ArgumentNotValid.checkNotNullOrEmpty(theConfigName, "theConfigName"); 117 ArgumentNotValid.checkNotNullOrEmpty(domainName, "domainName"); 118 ArgumentNotValid.checkNotNull(passwords, "passwords"); 119 ArgumentNotValid.checkNotNullOrEmpty(seedlists, "seedlists"); 120 121 this.configName = theConfigName; 122 this.domainName = domainName; 123 this.domainhistory = history; // TODO Filter all history not relevant 124 // for this configuration 125 this.crawlertraps = crawlertraps; 126 this.seedlists = seedlists; 127 this.passwords = passwords; 128 this.comments = ""; 129 this.maxRequestRate = Constants.DEFAULT_MAX_REQUEST_RATE; 130 this.maxObjects = Constants.DEFAULT_MAX_OBJECTS; 131 this.maxBytes = Constants.DEFAULT_MAX_BYTES; 132 } 133 134 public static String cfgToString(DomainConfiguration cfg) { 135 if (cfg == null) { 136 return "cfg{null}"; 137 } 138 String result = "cfg{" + cfg.getDomainName() + "," + cfg.getName() + ","+cfg.getMaxBytes()+","+cfg.getMaxObjects()+","; 139 if (cfg.getAttributesAndTypes() != null) { 140 for (AttributeAndType aat : cfg.getAttributesAndTypes()) { 141 AttributeBase ab = aat.attribute; 142 if (ab != null) { 143 result += "(" + ab.id + "," + ab.entity_id + "," + ab.type_id + "," + ab.getInteger() + ")"; 144 } 145 } 146 } 147 result += "}"; 148 return result; 149 } 150 151 /** 152 * Specify the name of the order.xml template to use. 153 * 154 * @param ordername order.xml template name 155 * @throws ArgumentNotValid if filename null or empty 156 */ 157 public void setOrderXmlName(String ordername) { 158 ArgumentNotValid.checkNotNullOrEmpty(ordername, "ordername"); 159 orderXmlName = ordername; 160 } 161 162 /** 163 * Specify the maximum number of objects to retrieve from the domain. 164 * 165 * @param max maximum number of objects to retrieve 166 * @throws ArgumentNotValid if max<-1 167 */ 168 public void setMaxObjects(long max) { 169 if (max < -MIN_EXPECTATION) { 170 String msg = "maxObjects must be either -1 or positive, but was " + max; 171 log.debug(msg); 172 throw new ArgumentNotValid(msg); 173 } 174 175 maxObjects = max; 176 } 177 178 /** 179 * Specify the maximum request rate to use when harvesting data. 180 * 181 * @param maxrate the maximum request rate 182 * @throws ArgumentNotValid if maxrate<0 183 */ 184 public void setMaxRequestRate(int maxrate) { 185 ArgumentNotValid.checkNotNegative(maxrate, "maxrate"); 186 187 maxRequestRate = maxrate; 188 } 189 190 /** 191 * Specify the maximum number of bytes to download from a domain in a single harvest. 192 * 193 * @param maxBytes Maximum number of bytes to download, or -1 for no limit. 194 * @throws ArgumentNotValid if maxBytes < -1 195 */ 196 public void setMaxBytes(long maxBytes) { 197 if (maxBytes < -MIN_EXPECTATION) { 198 String msg = "DomainConfiguration.maxBytes must be -1 or positive."; 199 log.debug(msg); 200 throw new ArgumentNotValid(msg); 201 } 202 this.maxBytes = maxBytes; 203 } 204 205 /** 206 * Get the configuration name. 207 * 208 * @return the configuration name 209 */ 210 public String getName() { 211 return configName; 212 } 213 214 /** 215 * Returns comments. 216 * 217 * @return string containing comments 218 */ 219 public String getComments() { 220 return comments; 221 } 222 223 /** 224 * Returns the name of the order xml file used by the domain. 225 * 226 * @return name of the order.xml file that should be used when harvesting the domain 227 */ 228 public String getOrderXmlName() { 229 return orderXmlName; 230 } 231 232 /** 233 * Returns the maximum number of objects to harvest from the domain. 234 * 235 * @return maximum number of objects to harvest 236 */ 237 public long getMaxObjects() { 238 return maxObjects; 239 } 240 241 /** 242 * Returns the maximum request rate to use when harvesting the domain. 243 * 244 * @return maximum request rate 245 */ 246 public int getMaxRequestRate() { 247 return maxRequestRate; 248 } 249 250 /** 251 * Returns the maximum number of bytes to download during a single harvest of a domain. 252 * 253 * @return Maximum bytes limit, or -1 for no limit. 254 */ 255 public long getMaxBytes() { 256 return maxBytes; 257 } 258 259 /** 260 * Returns the name of the domain aggregating this configuration. 261 * 262 * @return the name of the domain aggregating this configuration. 263 */ 264 public String getDomainName() { 265 return domainName; 266 } 267 268 /** 269 * Get an iterator of seedlists used in this configuration. 270 * 271 * @return seedlists as iterator 272 */ 273 public Iterator<SeedList> getSeedLists() { 274 return seedlists.iterator(); 275 } 276 277 /** 278 * Add a new seedlist to the configuration. Must exist in the associated domain and the equal to that seedlist. 279 * 280 * @param seedlist the seedlist to add 281 * @param domain The domain to check if the seedlist exists 282 * @throws ArgumentNotValid if the seedlist is null 283 * @throws UnknownID if the seedlist is not defined on the domain 284 * @throws PermissionDenied if the seedlist is different from the one on the domain. 285 */ 286 public void addSeedList(Domain domain, SeedList seedlist) { 287 ArgumentNotValid.checkNotNull(seedlist, "seedlist"); 288 SeedList domainSeedlist = domain.getSeedList(seedlist.getName()); 289 if (domainSeedlist == null || !domainSeedlist.equals(seedlist)) { 290 String message = "Cannot add seedlist " + seedlist + " to " + this + " as it differs from the one defined " 291 + "for " + domain + ": " + domainSeedlist; 292 log.debug(message); 293 throw new PermissionDenied(message); 294 } 295 seedlists.add(domainSeedlist); 296 } 297 298 /** 299 * Sets the used seedlists to the given list. Note: list is copied. 300 * 301 * @param newSeedlists The seedlists to use. 302 * @param domain The domain where the seedlists should come from 303 * @throws ArgumentNotValid if the seedslists are null 304 */ 305 public void setSeedLists(Domain domain, List<SeedList> newSeedlists) { 306 ArgumentNotValid.checkNotNull(newSeedlists, "newSeedlists"); 307 this.seedlists = new ArrayList<SeedList>(newSeedlists.size()); 308 for (SeedList s : newSeedlists) { 309 addSeedList(domain, s); 310 } 311 } 312 313 /** 314 * Get an iterator of passwords used in this configuration. 315 * 316 * @return The passwords in an iterator 317 */ 318 public Iterator<Password> getPasswords() { 319 return passwords.iterator(); 320 } 321 322 /** 323 * Add password to the configuration. 324 * 325 * @param password to add (must exist in the domain) 326 * @param domain the domain where the password should come from. 327 */ 328 public void addPassword(Domain domain, Password password) { 329 ArgumentNotValid.checkNotNull(password, "password"); 330 Password domainPassword = domain.getPassword(password.getName()); 331 if (!domainPassword.equals(password)) { 332 String message = "Cannot add password " + password + " to " + this + " as it differs from the one defined " 333 + "for " + domain + ": " + domainPassword; 334 log.debug(message); 335 throw new PermissionDenied(message); 336 } 337 passwords.add(domainPassword); 338 } 339 340 /** 341 * Gets the best expectation for how many objects a harvest using this configuration will retrieve, given a job with 342 * a maximum limit pr. domain 343 * 344 * @param objectLimit The maximum limit, or Constants.HERITRIX_MAXOBJECTS_INFINITY for no limit. This limit 345 * overrides the limit set on the configuration, unless override is in effect. 346 * @param byteLimit The maximum number of bytes that will be used as limit in the harvest. This limit overrides the 347 * limit set on the configuration, unless override is in effect. 348 * @return The expected number of objects. 349 */ 350 public long getExpectedNumberOfObjects(long objectLimit, long byteLimit) { 351 long prevresultfactor = Settings.getLong(HarvesterSettings.ERRORFACTOR_PERMITTED_PREVRESULT); 352 HarvestInfo best = DomainHistory.getBestHarvestInfoExpectation(configName, this.domainhistory); 353 354 log.trace("Getting expectation, using domain info '{}' for configuration '{}'", best, cfgToString(this)); 355 356 long expectedObjectSize = getExpectedBytesPerObject(best); 357 // The maximum number of objects that the maxBytes or MAX_DOMAIN_SIZE 358 // setting gives. 359 long maximum; 360 if (objectLimit != Constants.HERITRIX_MAXOBJECTS_INFINITY || byteLimit != Constants.HERITRIX_MAXBYTES_INFINITY) { 361 maximum = minObjectsBytesLimit(objectLimit, byteLimit, expectedObjectSize); 362 } else if (maxObjects != Constants.HERITRIX_MAXOBJECTS_INFINITY 363 || maxBytes != Constants.HERITRIX_MAXBYTES_INFINITY) { 364 maximum = minObjectsBytesLimit(maxObjects, maxBytes, expectedObjectSize); 365 } else { 366 maximum = Settings.getLong(HarvesterSettings.MAX_DOMAIN_SIZE); 367 } 368 log.trace("Initial maximum: {}", maximum); 369 // get last number of objects harvested 370 long minimum; 371 if (best != null) { 372 minimum = best.getCountObjectRetrieved(); 373 } else { 374 minimum = NumberUtils.minInf(Constants.HERITRIX_MAXOBJECTS_INFINITY, maxObjects); 375 } 376 log.trace("Initial minimum: {}", minimum); 377 // Calculate the expected number of objects we will harvest. 378 long expectation; 379 if (best != null) { 380 if (best.getStopReason() == StopReason.DOWNLOAD_COMPLETE && maximum != -1) { 381 // We set the expectation, so our harvest will exceed the 382 // expectation at most <factor> times if the domain is a lot 383 // larger than our best guess. 384 expectation = minimum + ((maximum - minimum) / prevresultfactor); 385 } else { 386 // if stopped for different reason than DOWNLOAD_COMPLETE we 387 // add half the harvested size to expectation 388 expectation = minimum + ((maximum - minimum) / 2); 389 } 390 } else { 391 // Best guess: minimum of default max domain size and domain object 392 // limit 393 expectation = NumberUtils.minInf(Settings.getLong(HarvesterSettings.MAX_DOMAIN_SIZE), maxObjects); 394 } 395 log.trace("Initial expectation: {}", expectation); 396 // Always limit to domain specifics if set to do so. We always expect 397 // to actually hit this limit 398 if ((maxObjects > Constants.HERITRIX_MAXOBJECTS_INFINITY && maximum > maxObjects) 399 || (maxBytes > Constants.HERITRIX_MAXBYTES_INFINITY && maximum > maxBytes / expectedObjectSize)) { 400 log.trace("Using domain limits for {}", cfgToString(this)); 401 maximum = minObjectsBytesLimit(maxObjects, maxBytes, expectedObjectSize); 402 log.trace("New maximum: {}", maximum); 403 } 404 // Never return more than allowed maximum 405 expectation = Math.min(expectation, maximum); 406 407 log.trace("Expected number of objects for configuration '{}' is {}", cfgToString(this), expectation); 408 409 return expectation; 410 } 411 412 /** 413 * Return the lowest limit for the two values, or MAX_DOMAIN_SIZE if both are infinite, which is the max size we 414 * harvest from this domain. 415 * 416 * @param objectLimit A long value defining an object limit, or 0 for infinite 417 * @param byteLimit A long value defining a byte limit, or HarvesterSettings.MAX_DOMAIN_SIZE for infinite. 418 * @param expectedObjectSize The expected number of bytes per object 419 * @return The lowest of the two boundaries, or MAX_DOMAIN_SIZE if both are unlimited. 420 */ 421 public long minObjectsBytesLimit(long objectLimit, long byteLimit, long expectedObjectSize) { 422 long maxObjectsByBytes = byteLimit / expectedObjectSize; 423 if (objectLimit != Constants.HERITRIX_MAXOBJECTS_INFINITY) { 424 if (byteLimit != Constants.HERITRIX_MAXBYTES_INFINITY) { 425 return Math.min(objectLimit, maxObjectsByBytes); 426 } else { 427 return objectLimit; 428 } 429 } else { 430 if (byteLimit != Constants.HERITRIX_MAXBYTES_INFINITY) { 431 return maxObjectsByBytes; 432 } else { 433 return Settings.getLong(HarvesterSettings.MAX_DOMAIN_SIZE); 434 } 435 } 436 } 437 438 /** 439 * How many bytes we can expect the average object of a domain to be. If we have harvested no objects from this 440 * domain before, we use a setting EXPECTED_AVERAGE_BYTES_PER_OBJECT. If we have objects, we use the harvestinfo 441 * from previous harvests to calculate the harvest, but we only accept a low estimate if the number of harvested 442 * objects is greater than the setting MIN_OBJECTS_TO_TRUST_SMALL_EXPECTATION. 443 * 444 * @param bestInfo The best (newest complete or biggest, as per getBestHarvestInfoExpectation()) harvest info we 445 * have for the domain. 446 * @return How large we expect the average object to be. This number will be >= MIN_EXPECTATION (unless nothing is 447 * harvested and is EXPECTED_AVERAGE_BYTES_PER_OBJECT <= 0). 448 */ 449 private long getExpectedBytesPerObject(HarvestInfo bestInfo) { 450 long defaultExpectation = Settings.getLong(HarvesterSettings.EXPECTED_AVERAGE_BYTES_PER_OBJECT); 451 if (bestInfo != null && bestInfo.getCountObjectRetrieved() > 0) { 452 long expectation = Math.max(MIN_EXPECTATION, 453 bestInfo.getSizeDataRetrieved() / bestInfo.getCountObjectRetrieved()); 454 if (expectation < defaultExpectation 455 && bestInfo.getCountObjectRetrieved() < MIN_OBJECTS_TO_TRUST_SMALL_EXPECTATION) { 456 return defaultExpectation; 457 } 458 return expectation; 459 } else { 460 return defaultExpectation; 461 } 462 } 463 464 /** 465 * Set the comments field. 466 * 467 * @param comments User-entered free-form comments. 468 */ 469 public void setComments(String comments) { 470 ArgumentNotValid.checkNotNull(comments, "comments"); 471 this.comments = comments; 472 } 473 474 /** 475 * Remove a password from the list of passwords used in this domain. 476 * 477 * @param passwordName Password to Remove. 478 */ 479 public void removePassword(String passwordName) { 480 ArgumentNotValid.checkNotNullOrEmpty(passwordName, "passwordName"); 481 if (!usesPassword(passwordName)) { 482 throw new UnknownID("No password named '" + passwordName + "' found in '" + this + "'"); 483 } 484 for (Iterator<Password> i = passwords.iterator(); i.hasNext();) { 485 Password p = i.next(); 486 if (p.getName().equals(passwordName)) { 487 i.remove(); 488 } 489 } 490 } 491 492 /** 493 * Check whether this domain uses a given password. 494 * 495 * @param passwordName The given password 496 * @return whether the given password is used 497 */ 498 public boolean usesPassword(String passwordName) { 499 ArgumentNotValid.checkNotNullOrEmpty(passwordName, "passwordName"); 500 for (Password p : passwords) { 501 if (p.getName().equals(passwordName)) { 502 return true; 503 } 504 } 505 return false; 506 } 507 508 /** 509 * Sets the used passwords to the given list. Note: list is copied. 510 * 511 * @param newPasswords The passwords to use. 512 * @param domain The domain where the passwords should come from 513 * @throws ArgumentNotValid if the passwords are null 514 */ 515 public void setPasswords(Domain domain, List<Password> newPasswords) { 516 ArgumentNotValid.checkNotNull(newPasswords, "newPasswords"); 517 this.passwords = new ArrayList<Password>(newPasswords.size()); 518 for (Password p : newPasswords) { 519 addPassword(domain, p); 520 } 521 } 522 523 /** 524 * Get the ID of this configuration. 525 * 526 * @return the ID of this configuration 527 */ 528 public Long getID() { 529 return id; 530 } 531 532 /** 533 * Set the ID of this configuration. Only for use by DBDAO 534 * 535 * @param anId use this id for this configuration 536 */ 537 void setID(long anId) { 538 this.id = anId; 539 } 540 541 /** 542 * Check if this configuration has an ID set yet (doesn't happen until the DBDAO persists it). 543 * 544 * @return true, if the configuration has an ID 545 */ 546 boolean hasID() { 547 return id != null; 548 } 549 550 /** 551 * ToString of DomainConfiguration class. 552 * 553 * @return a string with info about the instance of this class. 554 */ 555 public String toString() { 556 return "Configuration '" + getName() + "' of domain '" + domainName + "'"; 557 } 558 559 /** 560 * Set the crawlerltraps for this configuration. 561 * 562 * @param someCrawlertraps a list of crawlertraps 563 */ 564 public void setCrawlertraps(List<String> someCrawlertraps) { 565 this.crawlertraps = someCrawlertraps; 566 } 567 568 /** 569 * @return the known crawlertraps for this configuration. 570 */ 571 public List<String> getCrawlertraps() { 572 return this.crawlertraps; 573 } 574 575 /** 576 * @return the domainhistory for this configuration 577 */ 578 public DomainHistory getDomainhistory() { 579 return domainhistory; 580 } 581 582 /** 583 * Set the domainHistory for this configuration. 584 * 585 * @param newDomainhistory the new domainHistory for this configuration( null is accepted for no History) 586 */ 587 public void setDomainhistory(DomainHistory newDomainhistory) { 588 this.domainhistory = newDomainhistory; 589 } 590 591 /** 592 * Change the name of configuration to the given configName. 593 * @param configName a new name for this configuration. 594 */ 595 public void setName(String configName) { 596 this.configName = configName; 597 } 598 599 /** 600 * Get this configurations EAV attributes and attribute types. 601 * @return this configurations EAV attributes and attribute types 602 */ 603 public List<AttributeAndType> getAttributesAndTypes() { 604 return attributesAndTypes; 605 } 606 607 /** 608 * Set this configurations EAV attributes and attribute types. 609 * @param attributesAndTypes EAV attributes and attribute types 610 */ 611 public void setAttributesAndTypes(List<AttributeAndType> attributesAndTypes) { 612 this.attributesAndTypes = attributesAndTypes; 613 } 614 615}