001/* 002 * #%L 003 * Netarchivesuite - harvester 004 * %% 005 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library, 006 * the National Library of France and the Austrian National Library. 007 * %% 008 * This program is free software: you can redistribute it and/or modify 009 * it under the terms of the GNU Lesser General Public License as 010 * published by the Free Software Foundation, either version 2.1 of the 011 * License, or (at your option) any later version. 012 * 013 * This program is distributed in the hope that it will be useful, 014 * but WITHOUT ANY WARRANTY; without even the implied warranty of 015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 016 * GNU General Lesser Public License for more details. 017 * 018 * You should have received a copy of the GNU General Lesser Public 019 * License along with this program. If not, see 020 * <http://www.gnu.org/licenses/lgpl-2.1.html>. 021 * #L% 022 */ 023package dk.netarkivet.harvester.datamodel; 024 025import java.util.ArrayList; 026import java.util.Iterator; 027import java.util.List; 028 029import org.slf4j.Logger; 030import org.slf4j.LoggerFactory; 031 032import dk.netarkivet.common.exceptions.ArgumentNotValid; 033import dk.netarkivet.common.exceptions.PermissionDenied; 034import dk.netarkivet.common.exceptions.UnknownID; 035import dk.netarkivet.common.utils.Named; 036import dk.netarkivet.common.utils.Settings; 037import dk.netarkivet.harvester.HarvesterSettings; 038 039/** 040 * This class describes a configuration for harvesting a domain. It combines a number of seedlists, a number of 041 * passwords, an order template, and some specialised settings to define the way to harvest a domain. 042 */ 043public class DomainConfiguration implements Named { 044 045 /** The class logger. */ 046 private static final Logger log = LoggerFactory.getLogger(DomainConfiguration.class); 047 048 /** The name of the configuration. */ 049 private String configName; 050 /** The name of the order.xml (Heritrix template) used by this configuration. */ 051 private String orderXmlName = ""; 052 /** maximum number of objects harvested for this configuration in a snapshot harvest. */ 053 private long maxObjects; 054 /** The maximum request rate. */ 055 private int maxRequestRate; 056 /** Maximum number of bytes to download in a harvest. */ 057 private long maxBytes; 058 /** The domain associated with this configuration. */ 059 private String domainName; 060 061 /** The list of seedlists. */ 062 private List<SeedList> seedlists; 063 064 /** The list of passwords that apply in this configuration. */ 065 private List<Password> passwords; 066 /** The comments associated with this configuration. */ 067 private String comments; 068 069 /** ID autogenerated by DB. */ 070 private Long id; 071 072 /** The domainhistory associated with the domain. */ 073 private DomainHistory domainhistory; 074 075 /** The crawlertraps associated with the domain. */ 076 private List<String> crawlertraps; 077 078 /** 079 * How many objects should be harvested in a harvest to trust that our expected size of objects is less than the 080 * default number. 081 */ 082 private static final long MIN_OBJECTS_TO_TRUST_SMALL_EXPECTATION = 50L; 083 /** The smallest number of bytes we accept per object. */ 084 private static final int MIN_EXPECTATION = 1; 085 086 /** 087 * Create a new configuration for a domain. 088 * 089 * @param theConfigName The name of this configuration 090 * @param domain The domain that this configuration is for 091 * @param seedlists Seedlists to use in this configuration. 092 * @param passwords Passwords to use in this configuration. 093 */ 094 public DomainConfiguration(String theConfigName, Domain domain, List<SeedList> seedlists, List<Password> passwords) { 095 this(theConfigName, domain.getName(), domain.getHistory(), domain.getCrawlerTraps(), seedlists, passwords); 096 } 097 098 /** 099 * Alternate constructor. TODO Filter all history not relevant for this configuration 100 * 101 * @param theConfigName theConfigName The name of this configuration 102 * @param domainName The name of the domain that this configuration is for 103 * @param history The domainhistory of the given domain 104 * @param crawlertraps The crawlertraps of the given domain 105 * @param seedlists Seedlists to use in this configuration 106 * @param passwords Passwords to use in this configuration. 107 */ 108 public DomainConfiguration(String theConfigName, String domainName, DomainHistory history, 109 List<String> crawlertraps, List<SeedList> seedlists, List<Password> passwords) { 110 ArgumentNotValid.checkNotNullOrEmpty(theConfigName, "theConfigName"); 111 ArgumentNotValid.checkNotNullOrEmpty(domainName, "domainName"); 112 ArgumentNotValid.checkNotNull(passwords, "passwords"); 113 ArgumentNotValid.checkNotNullOrEmpty(seedlists, "seedlists"); 114 115 this.configName = theConfigName; 116 this.domainName = domainName; 117 this.domainhistory = history; // TODO Filter all history not relevant 118 // for this configuration 119 this.crawlertraps = crawlertraps; 120 this.seedlists = seedlists; 121 this.passwords = passwords; 122 this.comments = ""; 123 this.maxRequestRate = Constants.DEFAULT_MAX_REQUEST_RATE; 124 this.maxObjects = Constants.DEFAULT_MAX_OBJECTS; 125 this.maxBytes = Constants.DEFAULT_MAX_BYTES; 126 } 127 128 /** 129 * Specify the name of the order.xml template to use. 130 * 131 * @param ordername order.xml template name 132 * @throws ArgumentNotValid if filename null or empty 133 */ 134 public void setOrderXmlName(String ordername) { 135 ArgumentNotValid.checkNotNullOrEmpty(ordername, "ordername"); 136 orderXmlName = ordername; 137 } 138 139 /** 140 * Specify the maximum number of objects to retrieve from the domain. 141 * 142 * @param max maximum number of objects to retrieve 143 * @throws ArgumentNotValid if max<-1 144 */ 145 public void setMaxObjects(long max) { 146 if (max < -MIN_EXPECTATION) { 147 String msg = "maxObjects must be either -1 or positive, but was " + max; 148 log.debug(msg); 149 throw new ArgumentNotValid(msg); 150 } 151 152 maxObjects = max; 153 } 154 155 /** 156 * Specify the maximum request rate to use when harvesting data. 157 * 158 * @param maxrate the maximum request rate 159 * @throws ArgumentNotValid if maxrate<0 160 */ 161 public void setMaxRequestRate(int maxrate) { 162 ArgumentNotValid.checkNotNegative(maxrate, "maxrate"); 163 164 maxRequestRate = maxrate; 165 } 166 167 /** 168 * Specify the maximum number of bytes to download from a domain in a single harvest. 169 * 170 * @param maxBytes Maximum number of bytes to download, or -1 for no limit. 171 * @throws ArgumentNotValid if maxBytes < -1 172 */ 173 public void setMaxBytes(long maxBytes) { 174 if (maxBytes < -MIN_EXPECTATION) { 175 String msg = "DomainConfiguration.maxBytes must be -1 or positive."; 176 log.debug(msg); 177 throw new ArgumentNotValid(msg); 178 } 179 this.maxBytes = maxBytes; 180 } 181 182 /** 183 * Get the configuration name. 184 * 185 * @return the configuration name 186 */ 187 public String getName() { 188 return configName; 189 } 190 191 /** 192 * Returns comments. 193 * 194 * @return string containing comments 195 */ 196 public String getComments() { 197 return comments; 198 } 199 200 /** 201 * Returns the name of the order xml file used by the domain. 202 * 203 * @return name of the order.xml file that should be used when harvesting the domain 204 */ 205 public String getOrderXmlName() { 206 return orderXmlName; 207 } 208 209 /** 210 * Returns the maximum number of objects to harvest from the domain. 211 * 212 * @return maximum number of objects to harvest 213 */ 214 public long getMaxObjects() { 215 return maxObjects; 216 } 217 218 /** 219 * Returns the maximum request rate to use when harvesting the domain. 220 * 221 * @return maximum request rate 222 */ 223 public int getMaxRequestRate() { 224 return maxRequestRate; 225 } 226 227 /** 228 * Returns the maximum number of bytes to download during a single harvest of a domain. 229 * 230 * @return Maximum bytes limit, or -1 for no limit. 231 */ 232 public long getMaxBytes() { 233 return maxBytes; 234 } 235 236 /** 237 * Returns the name of the domain aggregating this configuration. 238 * 239 * @return the name of the domain aggregating this configuration. 240 */ 241 public String getDomainName() { 242 return domainName; 243 } 244 245 /** 246 * Get an iterator of seedlists used in this configuration. 247 * 248 * @return seedlists as iterator 249 */ 250 public Iterator<SeedList> getSeedLists() { 251 return seedlists.iterator(); 252 } 253 254 /** 255 * Add a new seedlist to the configuration. Must exist in the associated domain and the equal to that seedlist. 256 * 257 * @param seedlist the seedlist to add 258 * @param domain The domain to check if the seedlist exists 259 * @throws ArgumentNotValid if the seedlist is null 260 * @throws UnknownID if the seedlist is not defined on the domain 261 * @throws PermissionDenied if the seedlist is different from the one on the domain. 262 */ 263 public void addSeedList(Domain domain, SeedList seedlist) { 264 ArgumentNotValid.checkNotNull(seedlist, "seedlist"); 265 SeedList domainSeedlist = domain.getSeedList(seedlist.getName()); 266 if (domainSeedlist == null || !domainSeedlist.equals(seedlist)) { 267 String message = "Cannot add seedlist " + seedlist + " to " + this + " as it differs from the one defined " 268 + "for " + domain + ": " + domainSeedlist; 269 log.debug(message); 270 throw new PermissionDenied(message); 271 } 272 seedlists.add(domainSeedlist); 273 } 274 275 /** 276 * Sets the used seedlists to the given list. Note: list is copied. 277 * 278 * @param newSeedlists The seedlists to use. 279 * @param domain The domain where the seedlists should come from 280 * @throws ArgumentNotValid if the seedslists are null 281 */ 282 public void setSeedLists(Domain domain, List<SeedList> newSeedlists) { 283 ArgumentNotValid.checkNotNull(newSeedlists, "newSeedlists"); 284 this.seedlists = new ArrayList<SeedList>(newSeedlists.size()); 285 for (SeedList s : newSeedlists) { 286 addSeedList(domain, s); 287 } 288 } 289 290 /** 291 * Get an iterator of passwords used in this configuration. 292 * 293 * @return The passwords in an iterator 294 */ 295 public Iterator<Password> getPasswords() { 296 return passwords.iterator(); 297 } 298 299 /** 300 * Add password to the configuration. 301 * 302 * @param password to add (must exist in the domain) 303 * @param domain the domain where the password should come from. 304 */ 305 public void addPassword(Domain domain, Password password) { 306 ArgumentNotValid.checkNotNull(password, "password"); 307 Password domainPassword = domain.getPassword(password.getName()); 308 if (!domainPassword.equals(password)) { 309 String message = "Cannot add password " + password + " to " + this + " as it differs from the one defined " 310 + "for " + domain + ": " + domainPassword; 311 log.debug(message); 312 throw new PermissionDenied(message); 313 } 314 passwords.add(domainPassword); 315 } 316 317 /** 318 * Gets the best expectation for how many objects a harvest using this configuration will retrieve, given a job with 319 * a maximum limit pr. domain 320 * 321 * @param objectLimit The maximum limit, or Constants.HERITRIX_MAXOBJECTS_INFINITY for no limit. This limit 322 * overrides the limit set on the configuration, unless override is in effect. 323 * @param byteLimit The maximum number of bytes that will be used as limit in the harvest. This limit overrides the 324 * limit set on the configuration, unless override is in effect. 325 * @return The expected number of objects. 326 */ 327 public long getExpectedNumberOfObjects(long objectLimit, long byteLimit) { 328 long prevresultfactor = Settings.getLong(HarvesterSettings.ERRORFACTOR_PERMITTED_PREVRESULT); 329 HarvestInfo best = DomainHistory.getBestHarvestInfoExpectation(configName, this.domainhistory); 330 331 log.trace("Using domain info '{}' for configuration '{}'", best, toString()); 332 333 long expectedObjectSize = getExpectedBytesPerObject(best); 334 // The maximum number of objects that the maxBytes or MAX_DOMAIN_SIZE 335 // setting gives. 336 long maximum; 337 if (objectLimit != Constants.HERITRIX_MAXOBJECTS_INFINITY || byteLimit != Constants.HERITRIX_MAXBYTES_INFINITY) { 338 maximum = minObjectsBytesLimit(objectLimit, byteLimit, expectedObjectSize); 339 } else if (maxObjects != Constants.HERITRIX_MAXOBJECTS_INFINITY 340 || maxBytes != Constants.HERITRIX_MAXBYTES_INFINITY) { 341 maximum = minObjectsBytesLimit(maxObjects, maxBytes, expectedObjectSize); 342 } else { 343 maximum = Settings.getLong(HarvesterSettings.MAX_DOMAIN_SIZE); 344 } 345 // get last number of objects harvested 346 long minimum; 347 if (best != null) { 348 minimum = best.getCountObjectRetrieved(); 349 } else { 350 minimum = NumberUtils.minInf(Constants.HERITRIX_MAXOBJECTS_INFINITY, maxObjects); 351 } 352 // Calculate the expected number of objects we will harvest. 353 long expectation; 354 if (best != null) { 355 if (best.getStopReason() == StopReason.DOWNLOAD_COMPLETE && maximum != -1) { 356 // We set the expectation, so our harvest will exceed the 357 // expectation at most <factor> times if the domain is a lot 358 // larger than our best guess. 359 expectation = minimum + ((maximum - minimum) / prevresultfactor); 360 } else { 361 // if stopped for different reason than DOWNLOAD_COMPLETE we 362 // add half the harvested size to expectation 363 expectation = minimum + ((maximum - minimum) / 2); 364 } 365 } else { 366 // Best guess: minimum of default max domain size and domain object 367 // limit 368 expectation = NumberUtils.minInf(Settings.getLong(HarvesterSettings.MAX_DOMAIN_SIZE), maxObjects); 369 } 370 // Always limit to domain specifics if set to do so. We always expect 371 // to actually hit this limit 372 if ((maxObjects > Constants.HERITRIX_MAXOBJECTS_INFINITY && maximum > maxObjects) 373 || (maxBytes > Constants.HERITRIX_MAXBYTES_INFINITY && maximum > maxBytes / expectedObjectSize)) { 374 maximum = minObjectsBytesLimit(maxObjects, maxBytes, expectedObjectSize); 375 } 376 // Never return more than allowed maximum 377 expectation = Math.min(expectation, maximum); 378 379 log.trace("Expected number of objects for configuration '{}' is {}", toString(), expectation); 380 381 return expectation; 382 } 383 384 /** 385 * Return the lowest limit for the two values, or MAX_DOMAIN_SIZE if both are infinite, which is the max size we 386 * harvest from this domain. 387 * 388 * @param objectLimit A long value defining an object limit, or 0 for infinite 389 * @param byteLimit A long value defining a byte limit, or HarvesterSettings.MAX_DOMAIN_SIZE for infinite. 390 * @param expectedObjectSize The expected number of bytes per object 391 * @return The lowest of the two boundaries, or MAX_DOMAIN_SIZE if both are unlimited. 392 */ 393 public long minObjectsBytesLimit(long objectLimit, long byteLimit, long expectedObjectSize) { 394 long maxObjectsByBytes = byteLimit / expectedObjectSize; 395 if (objectLimit != Constants.HERITRIX_MAXOBJECTS_INFINITY) { 396 if (byteLimit != Constants.HERITRIX_MAXBYTES_INFINITY) { 397 return Math.min(objectLimit, maxObjectsByBytes); 398 } else { 399 return objectLimit; 400 } 401 } else { 402 if (byteLimit != Constants.HERITRIX_MAXBYTES_INFINITY) { 403 return maxObjectsByBytes; 404 } else { 405 return Settings.getLong(HarvesterSettings.MAX_DOMAIN_SIZE); 406 } 407 } 408 } 409 410 /** 411 * How many bytes we can expect the average object of a domain to be. If we have harvested no objects from this 412 * domain before, we use a setting EXPECTED_AVERAGE_BYTES_PER_OBJECT. If we have objects, we use the harvestinfo 413 * from previous harvests to calculate the harvest, but we only accept a low estimate if the number of harvested 414 * objects is greater than the setting MIN_OBJECTS_TO_TRUST_SMALL_EXPECTATION. 415 * 416 * @param bestInfo The best (newest complete or biggest, as per getBestHarvestInfoExpectation()) harvest info we 417 * have for the domain. 418 * @return How large we expect the average object to be. This number will be >= MIN_EXPECTATION (unless nothing is 419 * harvested and is EXPECTED_AVERAGE_BYTES_PER_OBJECT <= 0). 420 */ 421 private long getExpectedBytesPerObject(HarvestInfo bestInfo) { 422 long defaultExpectation = Settings.getLong(HarvesterSettings.EXPECTED_AVERAGE_BYTES_PER_OBJECT); 423 if (bestInfo != null && bestInfo.getCountObjectRetrieved() > 0) { 424 long expectation = Math.max(MIN_EXPECTATION, 425 bestInfo.getSizeDataRetrieved() / bestInfo.getCountObjectRetrieved()); 426 if (expectation < defaultExpectation 427 && bestInfo.getCountObjectRetrieved() < MIN_OBJECTS_TO_TRUST_SMALL_EXPECTATION) { 428 return defaultExpectation; 429 } 430 return expectation; 431 } else { 432 return defaultExpectation; 433 } 434 } 435 436 /** 437 * Set the comments field. 438 * 439 * @param comments User-entered free-form comments. 440 */ 441 public void setComments(String comments) { 442 ArgumentNotValid.checkNotNull(comments, "comments"); 443 this.comments = comments; 444 } 445 446 /** 447 * Remove a password from the list of passwords used in this domain. 448 * 449 * @param passwordName Password to Remove. 450 */ 451 public void removePassword(String passwordName) { 452 ArgumentNotValid.checkNotNullOrEmpty(passwordName, "passwordName"); 453 if (!usesPassword(passwordName)) { 454 throw new UnknownID("No password named '" + passwordName + "' found in '" + this + "'"); 455 } 456 for (Iterator<Password> i = passwords.iterator(); i.hasNext();) { 457 Password p = i.next(); 458 if (p.getName().equals(passwordName)) { 459 i.remove(); 460 } 461 } 462 } 463 464 /** 465 * Check whether this domain uses a given password. 466 * 467 * @param passwordName The given password 468 * @return whether the given password is used 469 */ 470 public boolean usesPassword(String passwordName) { 471 ArgumentNotValid.checkNotNullOrEmpty(passwordName, "passwordName"); 472 for (Password p : passwords) { 473 if (p.getName().equals(passwordName)) { 474 return true; 475 } 476 } 477 return false; 478 } 479 480 /** 481 * Sets the used passwords to the given list. Note: list is copied. 482 * 483 * @param newPasswords The passwords to use. 484 * @param domain The domain where the passwords should come from 485 * @throws ArgumentNotValid if the passwords are null 486 */ 487 public void setPasswords(Domain domain, List<Password> newPasswords) { 488 ArgumentNotValid.checkNotNull(newPasswords, "newPasswords"); 489 this.passwords = new ArrayList<Password>(newPasswords.size()); 490 for (Password p : newPasswords) { 491 addPassword(domain, p); 492 } 493 } 494 495 /** 496 * Get the ID of this configuration. 497 * 498 * @return the ID of this configuration 499 */ 500 public long getID() { 501 return id; 502 } 503 504 /** 505 * Set the ID of this configuration. Only for use by DBDAO 506 * 507 * @param anId use this id for this configuration 508 */ 509 void setID(long anId) { 510 this.id = anId; 511 } 512 513 /** 514 * Check if this configuration has an ID set yet (doesn't happen until the DBDAO persists it). 515 * 516 * @return true, if the configuration has an ID 517 */ 518 boolean hasID() { 519 return id != null; 520 } 521 522 /** 523 * ToString of DomainConfiguration class. 524 * 525 * @return a string with info about the instance of this class. 526 */ 527 public String toString() { 528 return "Configuration '" + getName() + "' of domain '" + domainName + "'"; 529 } 530 531 /** 532 * Set the crawlerltraps for this configuration. 533 * 534 * @param someCrawlertraps a list of crawlertraps 535 */ 536 public void setCrawlertraps(List<String> someCrawlertraps) { 537 this.crawlertraps = someCrawlertraps; 538 } 539 540 /** 541 * @return the known crawlertraps for this configuration. 542 */ 543 public List<String> getCrawlertraps() { 544 return this.crawlertraps; 545 } 546 547 /** 548 * @return the domainhistory for this configuration 549 */ 550 public DomainHistory getDomainhistory() { 551 return domainhistory; 552 } 553 554 /** 555 * Set the domainHistory for this configuration. 556 * 557 * @param newDomainhistory the new domainHistory for this configuration( null is accepted for no History) 558 */ 559 public void setDomainhistory(DomainHistory newDomainhistory) { 560 this.domainhistory = newDomainhistory; 561 } 562 563}