001/* 002 * #%L 003 * Netarchivesuite - harvester 004 * %% 005 * Copyright (C) 2005 - 2018 The Royal Danish Library, 006 * the National Library of France and the Austrian National Library. 007 * %% 008 * This program is free software: you can redistribute it and/or modify 009 * it under the terms of the GNU Lesser General Public License as 010 * published by the Free Software Foundation, either version 2.1 of the 011 * License, or (at your option) any later version. 012 * 013 * This program is distributed in the hope that it will be useful, 014 * but WITHOUT ANY WARRANTY; without even the implied warranty of 015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 016 * GNU General Lesser Public License for more details. 017 * 018 * You should have received a copy of the GNU General Lesser Public 019 * License along with this program. If not, see 020 * <http://www.gnu.org/licenses/lgpl-2.1.html>. 021 * #L% 022 */ 023package dk.netarkivet.harvester.datamodel; 024 025import gnu.inet.encoding.IDNA; 026import gnu.inet.encoding.IDNAException; 027 028import java.io.BufferedReader; 029import java.io.File; 030import java.io.IOException; 031import java.io.Serializable; 032import java.io.StringReader; 033import java.net.MalformedURLException; 034import java.net.URL; 035import java.util.ArrayList; 036import java.util.Collections; 037import java.util.Date; 038import java.util.HashMap; 039import java.util.HashSet; 040import java.util.Iterator; 041import java.util.List; 042import java.util.Map; 043import java.util.Set; 044import java.util.TreeSet; 045import java.util.regex.Pattern; 046 047import org.apache.commons.io.IOUtils; 048import org.slf4j.Logger; 049import org.slf4j.LoggerFactory; 050 051import dk.netarkivet.common.exceptions.ArgumentNotValid; 052import dk.netarkivet.common.exceptions.IOFailure; 053import dk.netarkivet.common.exceptions.IllegalState; 054import dk.netarkivet.common.utils.DomainUtils; 055import dk.netarkivet.common.utils.Settings; 056import 
dk.netarkivet.common.utils.StringUtils; 057import dk.netarkivet.harvester.HarvesterSettings; 058import dk.netarkivet.harvester.datamodel.eav.EAV.AttributeAndType; 059import dk.netarkivet.harvester.harvesting.ArchiveFileNaming; 060import dk.netarkivet.harvester.harvesting.ArchiveFileNamingFactory; 061import dk.netarkivet.harvester.harvesting.JobInfo; 062 063/** 064 * This class represents one job to run by Heritrix. It's based on a number of configurations all based on the same 065 * order.xml and at most one configuration for each domain. Each job consists of configurations of the approximate same 066 * size; that is the difference in expectation from the smallest configuration to the largest configuration is within a 067 * factor of each other defined as limMaxRelSize (although differences smaller than limMinAbsSize are ignored) There is 068 * a limit limMaxTotalSize on the total size of the job in objects. 069 * <p> 070 * A job may also be limited on bytes or objects, defined either by the configurations in the job or the harvest 071 * definition the job is generated by. 072 * <p> 073 * The job contains the order file, the seedlist and the current status of the job, as well as the ID of the harvest 074 * definition that defined it and names of all the configurations it is based on. 075 */ 076@SuppressWarnings({"serial"}) 077public class Job implements Serializable, JobInfo { 078 private transient static final Logger log = LoggerFactory.getLogger(Job.class); 079 080 // Persistent fields stored in and read from DAO 081 /** The persistent ID of this job. */ 082 private Long jobID; 083 /** The Id of the harvestdefinition, that generated this job. */ 084 protected Long origHarvestDefinitionID; 085 /** The status of the job. See the JobStatus class for the possible states. */ 086 protected JobStatus status; 087 /** The name of the {@link HarvestChannel} on which this job will be posted. 
*/ 088 private String channel; 089 090 /** Whether the job belongs to a snapshot or partial harvest. */ 091 private boolean isSnapshot; 092 /** 093 * Overrides the individual configurations maximum setting for objects retrieved from a domain when set to a 094 * positive value. 095 */ 096 private long forceMaxObjectsPerDomain = Constants.HERITRIX_MAXOBJECTS_INFINITY; 097 /** 098 * Overrides the individual configurations maximum setting for bytes retrieved from a domain when set to other than 099 * -1. 100 */ 101 private long forceMaxBytesPerDomain = Constants.HERITRIX_MAXBYTES_INFINITY; 102 /** The name of the harvest template used by the job. */ 103 private String orderXMLname; 104 /** The harvest template used by the job. */ 105 private HeritrixTemplate orderXMLdoc; 106 /** The list of Heritrix settings files. */ 107 private File[] settingsXMLfiles; 108 109 /** The corresponding Dom4j Documents for these files. */ 110 //private Document[] settingsXMLdocs; 111 112 /** 113 * A set of seeds involved in this job. Outside the SetSeedList() method, the set of seeds is updated in the 114 * addConfiguration() method. 115 */ 116 private Set<String> seedListSet = new HashSet<String>(); 117 /** Which run of the harvest definition this is. */ 118 private int harvestNum; 119 /** Errors during harvesting. */ 120 private String harvestErrors; 121 /** Details about errors during harvesting. */ 122 private String harvestErrorDetails; 123 /** Errors during upload of the harvested data. */ 124 private String uploadErrors; 125 /** Details about errors during upload of the harvested data. */ 126 private String uploadErrorDetails; 127 /** The starting point of the job. */ 128 private Date actualStart; 129 /** The ending point of the job. */ 130 private Date actualStop; 131 /** The time when this job was submitted. */ 132 private Date submittedDate; 133 /** The time when this job was created. 
*/ 134 private Date creationDate; 135 136 /** Edition is used by the DAO to keep track of changes. */ 137 private long edition = -1; 138 139 /** Resubmitted as the Job with this ID. If null, this job has not been resubmitted. */ 140 private Long resubmittedAsJobWithID; 141 142 /** Continuation of this job. */ 143 private Long continuationOF; 144 145 /** 146 * A map (domainName, domainConfigurationName), must be accessible in order to update job information (see Ass. 147 * 2.4.3) 148 */ 149 private Map<String, String> domainConfigurationMap; 150 /** 151 * A hint to the DAO that configurations have changed. Since configurations are large, the DAO can use that this is 152 * false to avoid updating the config list. The DAO can set it to false after saving configurations. 153 */ 154 boolean configsChanged = false; 155 156 // Intermediate fields, non-persistent and only used while building objects 157 158 /** 159 * Whether the maxObjects field was defined by the harvest definition or the configuration limit. This is deciding 160 * for whether we accept smaller configurations or not when building jobs. True means the limit is defined by the 161 * configuration, false means that it is defined by the harvest definition. 162 */ 163 private boolean configurationSetsObjectLimit; 164 165 /** 166 * Whether the maxBytes field was defined by the harvest definition or the configuration limit. This is deciding for 167 * whether we accept smaller configurations or not when building jobs. True means the limit is defined by the 168 * configuration, false means by the harvest definition. 169 */ 170 private boolean configurationSetsByteLimit; 171 172 /** The lowest number of objects expected by a configuration. */ 173 private long minCountObjects; 174 175 /** The highest number of objects expected by a configuration. */ 176 private long maxCountObjects; 177 178 /** The total number of objects expected by all added configurations. 
*/ 179 private long totalCountObjects; 180 181 /** 182 * The max time in seconds given to the harvester for this job. 0 is unlimited. 183 */ 184 private long forceMaxRunningTime; 185 186 /** 187 * If true, this job object is still undergoing changes due to having more configurations added. When set to false, 188 * the object is no longer considered immutable except for updating status. 189 * <p> 190 * Jobs loaded from the DAO are never under construction anymore. 191 */ 192 private boolean underConstruction = true; 193 194 // Constants 195 196 // Note: The following constants are intentionally left non-static for easy 197 // unit testing 198 199 private boolean maxObjectsIsSetByQuotaEnforcer = Settings 200 .getBoolean(HarvesterSettings.OBJECT_LIMIT_SET_BY_QUOTA_ENFORCER); 201 202 /** 203 * The harvestname prefix used in the files generated by Heritrix. Is set using an ArchiveFileNaming class when the 204 * jobID is available. 205 */ 206 private String harvestnamePrefix; 207 208 /** This variable is right now the same as harvestdefinitions.audience field. */ 209 private String harvestAudience; 210 211 protected Job() { 212 this.status = JobStatus.NEW; 213 } 214 215 /** 216 * Package private constructor for common initialisation. 217 * 218 * @param harvestID the id of the harvestdefinition 219 * @param cfg the configuration to base the Job on 220 * @param orderXMLdoc 221 * @param channel the channel on which the job will be submitted. 222 * @param forceMaxObjectsPerDomain the maximum number of objects harvested from a domain, overrides individual 223 * configuration settings. -1 means no limit 224 * @param forceMaxBytesPerDomain The maximum number of objects harvested from a domain, or -1 for no limit. 
225 * @param forceMaxJobRunningTime The max time in seconds given to the harvester for this job 226 * @param harvestNum the run number of the harvest definition 227 * @throws ArgumentNotValid if cfg or priority is null or harvestID is invalid, or if any limit < -1 228 */ 229 public Job(Long harvestID, DomainConfiguration cfg, HeritrixTemplate orderXMLdoc, HarvestChannel channel, 230 long forceMaxObjectsPerDomain, 231 long forceMaxBytesPerDomain, long forceMaxJobRunningTime, int harvestNum) throws ArgumentNotValid { 232 ArgumentNotValid.checkNotNull(cfg, "cfg"); 233 ArgumentNotValid.checkNotNull(harvestID, "harvestID"); 234 ArgumentNotValid.checkNotNegative(harvestID, "harvestID"); 235 ArgumentNotValid.checkNotNull(channel, "channel"); 236 237 if (forceMaxObjectsPerDomain < -1) { 238 String msg = "forceMaxObjectsPerDomain must be either -1 or positive"; 239 log.debug(msg); 240 throw new ArgumentNotValid(msg); 241 } 242 if (forceMaxBytesPerDomain < -1) { 243 String msg = "forceMaxBytesPerDomain must be either -1 or positive"; 244 log.debug(msg); 245 throw new ArgumentNotValid(msg); 246 } 247 248 if (forceMaxBytesPerDomain == 0L) { 249 log.warn("forceMaxBytesPerDomain should probably not be 0.Means 0 bytes downloaded per domain"); 250 } 251 252 if (forceMaxObjectsPerDomain == 0L) { 253 log.warn("forceMaxObjectsPerDomain should probably not be 0.Means 0 objects downloaded per domain"); 254 } 255 256 // setup initial members 257 domainConfigurationMap = new HashMap<>(); 258 origHarvestDefinitionID = harvestID; 259 orderXMLname = cfg.getOrderXmlName(); 260 this.orderXMLdoc = orderXMLdoc; 261 262 setHarvestChannel(channel); 263 264 long maxObjects = NumberUtils.minInf(forceMaxObjectsPerDomain, cfg.getMaxObjects()); 265 setMaxObjectsPerDomain(maxObjects); 266 configurationSetsObjectLimit = (maxObjects != forceMaxObjectsPerDomain); 267 268 long maxBytes = NumberUtils.minInf(forceMaxBytesPerDomain, cfg.getMaxBytes()); 269 setMaxBytesPerDomain(maxBytes); 270 
configurationSetsByteLimit = (maxBytes != forceMaxBytesPerDomain); 271 272 long expectation = cfg.getExpectedNumberOfObjects(forceMaxObjectsPerDomain, forceMaxBytesPerDomain); 273 maxCountObjects = expectation; 274 minCountObjects = expectation; 275 this.harvestNum = harvestNum; 276 277 addConfiguration(cfg); 278 279 setMaxJobRunningTime(forceMaxJobRunningTime); 280 281 setArchiveFormatInTemplate(Settings.get(HarvesterSettings.HERITRIX_ARCHIVE_FORMAT)); 282 283 setAttributes(cfg.getAttributesAndTypes()); 284 285 orderXMLdoc.enableOrDisableDeduplication(Settings.getBoolean(HarvesterSettings.DEDUPLICATION_ENABLED)); 286 287 status = JobStatus.NEW; 288 } 289 290 public void setAttributes(List<AttributeAndType> attributesAndTypes) { 291 orderXMLdoc.insertAttributes(attributesAndTypes); 292 } 293 294 /** 295 * Update the order template according to the chosen archive format (arc/warc). 296 */ 297 private void setArchiveFormatInTemplate(String archiveFormat) { 298 if (!underConstruction) { 299 final String msg = "Cannot modify job " + this + " as it is no longer under construction"; 300 log.debug(msg); 301 throw new IllegalState(msg); 302 } 303 orderXMLdoc.setArchiveFormat(archiveFormat); 304 } 305 306 /** 307 * Create a new Job object from basic information stored in the DAO. 308 * 309 * @param harvestID the id of the harvestdefinition 310 * @param configurations the configurations to base the Job on 311 * @param channel the name of the channel on which the job will be submitted. 312 * @param snapshot whether the job belongs to a snapshot harvest 313 * @param forceMaxObjectsPerDomain the maximum number of objects harvested from a domain, overrides individual 314 * configuration settings. 0 means no limit. 315 * @param forceMaxBytesPerDomain The maximum number of objects harvested from a domain, or -1 for no limit. 316 * @param forceMaxJobRunningTime The max time in seconds given to the harvester for this job 317 * @param status the current status of the job. 
318 * @param orderXMLname the name of the order template used. 319 * @param orderXMLdoc the (possibly modified) template 320 * @param seedlist the combined seedlist from all configs. 321 * @param harvestNum the run number of the harvest definition 322 */ 323 Job(Long harvestID, Map<String, String> configurations, String channel, boolean snapshot, 324 long forceMaxObjectsPerDomain, long forceMaxBytesPerDomain, long forceMaxJobRunningTime, JobStatus status, 325 String orderXMLname, HeritrixTemplate orderXMLdoc, String seedlist, int harvestNum, Long continuationOf) { 326 origHarvestDefinitionID = harvestID; 327 domainConfigurationMap = configurations; 328 this.channel = channel; 329 this.isSnapshot = snapshot; 330 this.forceMaxBytesPerDomain = forceMaxBytesPerDomain; 331 this.forceMaxObjectsPerDomain = forceMaxObjectsPerDomain; 332 this.forceMaxRunningTime = forceMaxJobRunningTime; 333 this.status = status; 334 this.orderXMLname = orderXMLname; 335 this.orderXMLdoc = orderXMLdoc; 336 this.setSeedList(seedlist); 337 this.harvestNum = harvestNum; 338 this.continuationOF = continuationOf; 339 340 underConstruction = false; 341 } 342 343 344 /** 345 * Adds a configuration to this Job. Seedlists and settings are updated accordingly. 346 * 347 * @param cfg the configuration to add 348 * @throws ArgumentNotValid if cfg is null or cfg uses a different orderxml than this job or if this job already 349 * contains a configuration associated with domain of configuration cfg. 
350 */ 351 public void addConfiguration(DomainConfiguration cfg) { 352 ArgumentNotValid.checkNotNull(cfg, "cfg"); 353 if (domainConfigurationMap.containsKey(cfg.getDomainName())) { 354 throw new ArgumentNotValid("Job already has a configuration for Domain " + cfg.getDomainName()); 355 } 356 357 if (log.isTraceEnabled()) { 358 log.trace("Adding configuration '{}' to job '{}'", cfg, cfg.getName()); 359 } 360 361 if (!underConstruction) { 362 final String msg = "Cannot modify job " + this + " as it is no longer under construction"; 363 log.debug(msg); 364 throw new IllegalState(msg); 365 } 366 367 if (!cfg.getOrderXmlName().equals(getOrderXMLName())) { 368 throw new ArgumentNotValid("Job requires the orderxml file:'" + getOrderXMLName() + "' not:'" 369 + cfg.getOrderXmlName() + "' used by the configuration:'" + cfg.getName()); 370 } 371 372 domainConfigurationMap.put(cfg.getDomainName(), cfg.getName()); 373 374 // Add the seeds from the configuration to the Job seeds. 375 // Take care of duplicates. 
376 for (Iterator<SeedList> itt = cfg.getSeedLists(); itt.hasNext();) { 377 SeedList seed = itt.next(); 378 List<String> seeds = seed.getSeeds(); 379 for (String seedUrl : seeds) { 380 seedListSet.add(seedUrl); // duplicates is silently ignored 381 382 // TODO remove when heritrix implements this functionality 383 // try to convert a seed into a Internationalized Domain Name 384 try { 385 String seedASCII = seedUrl; 386 // It is rare to see these seeds, but they need to be 387 // correctly idnaized 388 if (seedUrl.contains(":") || seedUrl.contains("/")) { 389 String normalizedUrl = seedUrl; 390 if (!normalizedUrl.matches("^[a-zA-Z]+:.*")) { 391 // If no protocol is given, assume http 392 normalizedUrl = "http://" + normalizedUrl; 393 } 394 URL url = new URL(normalizedUrl); 395 String domainName = url.getHost(); 396 String domainNameASCII = IDNA.toASCII(domainName); 397 if (!domainName.equals(domainNameASCII)) { 398 // If the domain name changed, replace that in the 399 // seed. 400 seedASCII = seedUrl.replaceFirst(Pattern.quote(domainName), domainNameASCII); 401 } 402 } else { 403 seedASCII = IDNA.toASCII(seedUrl); 404 } 405 if (!seedASCII.equals(seedUrl)) { 406 log.trace("Converted {} to {}", seedUrl, seedASCII); 407 // Note that duplicates is silently ignored 408 seedListSet.add(seedASCII); 409 } 410 } catch (IDNAException e) { 411 log.trace("Cannot convert seed {} to ASCII", seedUrl, e); 412 } catch (MalformedURLException e) { 413 log.trace("Cannot convert seed {} to ASCII", seedUrl, e); 414 } 415 } 416 } 417 418 orderXMLdoc.editOrderXMLAddPerDomainCrawlerTraps(cfg); 419 420 // TODO update limits in settings files - see also bug 269 421 422 // Update estimates of job size 423 long expectation = cfg.getExpectedNumberOfObjects(forceMaxObjectsPerDomain, forceMaxBytesPerDomain); 424 maxCountObjects = Math.max(expectation, maxCountObjects); 425 minCountObjects = Math.min(expectation, minCountObjects); 426 totalCountObjects += expectation; 427 428 configsChanged = 
true; 429 430 assert (maxCountObjects >= minCountObjects) : "basic invariant"; 431 } 432 433 /** 434 * Get the name of the order XML file used by this Job. 435 * 436 * @return the name of the orderXML file 437 */ 438 public String getOrderXMLName() { 439 return orderXMLname; 440 } 441 442 /** 443 * Get the actual time when this job was stopped/completed. 444 * 445 * @return the time as Date 446 */ 447 public Date getActualStop() { 448 return actualStop; 449 } 450 451 /** 452 * Get the actual time when this job was started. 453 * 454 * @return the time as Date 455 */ 456 public Date getActualStart() { 457 return actualStart; 458 } 459 460 /** 461 * Get the time when this job was submitted. 462 * 463 * @return the time as Date 464 */ 465 public Date getSubmittedDate() { 466 return submittedDate; 467 } 468 469 /** 470 * Get the time when this job was created. 471 * 472 * @return the creation time as a <code>Date</code> 473 */ 474 public Date getCreationDate() { 475 return creationDate; 476 } 477 478 /** 479 * Get a list of Heritrix settings.xml files. Note that these files have nothing to do with NetarchiveSuite settings 480 * files. They are files that supplement the Heritrix order.xml files, and contain overrides for specific domains. 481 * 482 * @return the list of Files as an array 483 */ 484 public File[] getSettingsXMLfiles() { 485 return settingsXMLfiles; 486 } 487 488 /** 489 * Get the id of the HarvestDefinition from which this job originates. 490 * 491 * @return the id as a Long 492 */ 493 public Long getOrigHarvestDefinitionID() { 494 return origHarvestDefinitionID; 495 } 496 497 /** 498 * Get the id of this Job. 499 * 500 * @return the id as a Long 501 */ 502 public Long getJobID() { 503 return jobID; 504 } 505 506 /** 507 * Set the id of this Job. 508 * 509 * @param id The Id for this job. 510 */ 511 public void setJobID(Long id) { 512 jobID = id; 513 } 514 515 /** 516 * Get's the total number of different domains harvested by this job. 
517 * 518 * @return the number of configurations added to this domain 519 */ 520 public int getCountDomains() { 521 return domainConfigurationMap.size(); 522 } 523 524 /** 525 * Set the actual time when this job was started. 526 * <p> 527 * Sends a notification, if actualStart is set to a time after actualStop. 528 * 529 * @param actualStart A Date object representing the time when this job was started. 530 */ 531 public void setActualStart(Date actualStart) { 532 ArgumentNotValid.checkNotNull(actualStart, "actualStart"); 533 if (actualStop != null && actualStop.before(actualStart)) { 534 log.warn("Job(" + getJobID()+ "): Start time (" + actualStart + ") is after end time: " + actualStop); 535 } 536 this.actualStart = (Date) actualStart.clone(); 537 } 538 539 /** 540 * Set the actual time when this job was stopped/completed. Sends a notification, if actualStop is set to a time 541 * before actualStart. 542 * 543 * @param actualStop A Date object representing the time when this job was stopped. 544 * @throws ArgumentNotValid 545 */ 546 public void setActualStop(Date actualStop) throws ArgumentNotValid { 547 ArgumentNotValid.checkNotNull(actualStop, "actualStop"); 548 if (actualStart == null) { 549 log.warn("Job(" + getJobID()+ "): actualStart should be defined before setting actualStop"); 550 } else if (actualStop.before(actualStart)) { 551 log.warn("Job(" + getJobID()+ "): actualStop (" + actualStop + ") is before actualStart: " + actualStart); 552 } 553 this.actualStop = (Date) actualStop.clone(); 554 } 555 556 /** 557 * Set the orderxml for this job. 558 * 559 * @param doc A orderxml to be used by this job 560 */ 561 public void setOrderXMLDoc(HeritrixTemplate doc) { 562 ArgumentNotValid.checkNotNull(doc, "doc"); 563 this.orderXMLdoc = doc; 564 } 565 566 /** 567 * Gets a document representation of the order.xml associated with this Job. 
568 * 569 * @return the XML as a org.dom4j.Document 570 */ 571 public HeritrixTemplate getOrderXMLdoc() { 572 return orderXMLdoc; 573 } 574 575// /** 576// * Gets a list of document representations of the settings.xml's associated with this Job. 577// * 578// * @return the XML as an array of org.dom4j.Document 579// */ 580// public Document[] getSettingsXMLdocs() { 581// return settingsXMLdocs; 582// } 583 584 /** 585 * Set the seedlist of the job from the seedList argument. Individual seeds are separated by a '\n' character. 586 * Duplicate seeds are removed. 587 * 588 * @param seedList List of seeds as one String 589 */ 590 public void setSeedList(String seedList) { 591 ArgumentNotValid.checkNotNullOrEmpty(seedList, "seedList"); 592 seedListSet = new HashSet<>(); 593 BufferedReader reader = new BufferedReader(new StringReader(seedList)); 594 String seed; 595 try { 596 while ((seed = reader.readLine()) != null) { 597 seedListSet.add(seed); // add to seedlist if not already there 598 } 599 } catch (IOException e) { 600 // This never happens, as we're reading from a string! 601 throw new IOFailure("IOException reading from seed string", e); 602 } finally { 603 IOUtils.closeQuietly(reader); 604 } 605 } 606 607 /** 608 * Get the seedlist as a String. The individual seeds are separated by the character '\n'. The order of the seeds 609 * are unknown. 610 * 611 * @return the seedlist as a String 612 */ 613 public String getSeedListAsString() { 614 return StringUtils.conjoin("\n", seedListSet); 615 } 616 617 /** 618 * Get the current status of this Job. 619 * 620 * @return the status as an int in the range 0 to 4. 621 */ 622 public JobStatus getStatus() { 623 return status; 624 } 625 626 /** 627 * Sets status of this job. 
628 * 629 * @param newStatus Must be one of the values STATUS_NEW, ..., STATUS_FAILED 630 * @throws ArgumentNotValid in case of invalid status argument or invalid status change 631 */ 632 public void setStatus(JobStatus newStatus) { 633 ArgumentNotValid.checkNotNull(newStatus, "newStatus"); 634 if (!status.legalChange(newStatus)) { 635 final String message = "Status change from " + status + " to " + newStatus + " is not allowed"; 636 log.debug(message); 637 throw new ArgumentNotValid(message); 638 } 639 640 if ((this.status == JobStatus.NEW || this.status == JobStatus.RESUBMITTED) && newStatus == JobStatus.SUBMITTED) { 641 orderXMLdoc.configureQuotaEnforcer(maxObjectsIsSetByQuotaEnforcer, forceMaxBytesPerDomain, forceMaxObjectsPerDomain); 642 } 643 644 645 if (this.status == JobStatus.SUBMITTED && newStatus == JobStatus.STARTED) { 646 setActualStart(new Date()); 647 } 648 if (this.status == JobStatus.STARTED && (newStatus == JobStatus.DONE || newStatus == JobStatus.FAILED)) { 649 setActualStop(new Date()); 650 } 651 status = newStatus; 652 } 653 654 /** 655 * Returns a map of domain names and name of their corresponding configuration. 656 * <p> 657 * The returned Map cannot be changed. 658 * 659 * @return a read-only Map (<String>, <String>) 660 */ 661 public Map<String, String> getDomainConfigurationMap() { 662 return Collections.unmodifiableMap(domainConfigurationMap); 663 } 664 665 /** 666 * Gets the maximum number of objects harvested per domain. 667 * 668 * @return The maximum number of objects harvested per domain. 0 means no limit. 669 */ 670 public long getMaxObjectsPerDomain() { 671 return forceMaxObjectsPerDomain; 672 } 673 674 /** 675 * Gets the maximum number of bytes harvested per domain. 676 * 677 * @return The maximum number of bytes harvested per domain. -1 means no limit. 678 */ 679 public long getMaxBytesPerDomain() { 680 return forceMaxBytesPerDomain; 681 } 682 683 /** 684 * Get the edition number. 
685 * 686 * @return The edition number 687 */ 688 long getEdition() { 689 return edition; 690 } 691 692 /** 693 * Set the edition number. 694 * 695 * @param edition the new edition number 696 */ 697 void setEdition(long edition) { 698 this.edition = edition; 699 } 700 701 public void setHarvestChannel(HarvestChannel harvestChannel) { 702 this.channel = harvestChannel.getName(); 703 this.isSnapshot = harvestChannel.isSnapshot(); 704 } 705 706 /** 707 * @return the associated {@link HarvestChannel} name. 708 */ 709 public String getChannel() { 710 return channel; 711 } 712 713 /** 714 * Sets the associated {@link HarvestChannel} name. 715 * 716 * @param channel the channel name 717 */ 718 public void setChannel(String channel) { 719 this.channel = channel; 720 } 721 722 /** 723 * @return true if the job belongs to a snapshot harvest, false if it belongs to a focused harvest. 724 */ 725 public boolean isSnapshot() { 726 return isSnapshot; 727 } 728 729 /** 730 * Sets whether job belongs to a snapshot or focused harvest. 731 * 732 * @param isSnapshot true if the job belongs to a snapshot harvest, false if it belongs to a focused harvest. 733 */ 734 public void setSnapshot(boolean isSnapshot) { 735 this.isSnapshot = isSnapshot; 736 } 737 738 @Override 739 public String toString() { 740 return "Job " + getJobID() + " (state = " + getStatus() + ", HD = " + getOrigHarvestDefinitionID() 741 + ", channel = " + getChannel() + ", snapshot = " + isSnapshot() + ", forcemaxcount = " 742 + getForceMaxObjectsPerDomain() + ", forcemaxbytes = " + getMaxBytesPerDomain() 743 + ", forcemaxrunningtime = " + forceMaxRunningTime + ", orderxml = " + getOrderXMLName() 744 + ", numconfigs = " + getDomainConfigurationMap().size() + ", created = " + getCreationDate() 745 + (getSubmittedDate() != null ? ", submitted = " + getSubmittedDate() : "") 746 + (getActualStart() != null ? ", started = " + getActualStart() : "") 747 + (getActualStop() != null ? 
", stopped = " + getActualStop() : "") + ")"; 748 } 749 750 /** 751 * @return Returns the forceMaxObjectsPerDomain. 0 means no limit. 752 */ 753 public long getForceMaxObjectsPerDomain() { 754 return forceMaxObjectsPerDomain; 755 } 756 757 /** 758 * Sets the maxObjectsPerDomain value. 759 * 760 * @param maxObjectsPerDomain The forceMaxObjectsPerDomain to set. 0 means no limit. 761 * @throws IOFailure Thrown from auxiliary method editOrderXML_maxObjectsPerDomain. 762 */ 763 protected void setMaxObjectsPerDomain(long maxObjectsPerDomain) { 764 if (!underConstruction) { 765 final String msg = "Cannot modify job " + this + " as it is no longer under construction"; 766 log.debug(msg); 767 throw new IllegalState(msg); 768 } 769 770 this.forceMaxObjectsPerDomain = maxObjectsPerDomain; 771 orderXMLdoc.setMaxObjectsPerDomain(maxObjectsPerDomain); // FIXME? add argument to maxObjectsIsSetByQuotaEnforcer to method setMaxObjectsPerDomain 772 //orderXMLdoc.editOrderXML_maxObjectsPerDomain(orderXMLdoc, maxObjectsPerDomain, 773 // maxObjectsIsSetByQuotaEnforcer); 774 775 if (0L == maxObjectsPerDomain && 0L != forceMaxBytesPerDomain) { 776 setMaxBytesPerDomain(0L); 777 } 778 } 779 780 /** 781 * Set the maxbytes per domain value. 782 * 783 * @param maxBytesPerDomain The maxBytesPerDomain to set, or -1 for no limit. 784 */ 785 protected void setMaxBytesPerDomain(long maxBytesPerDomain) { 786 if (!underConstruction) { 787 final String msg = "Cannot modify job " + this + " as it is no longer under construction"; 788 log.debug(msg); 789 throw new IllegalState(msg); 790 } 791 this.forceMaxBytesPerDomain = maxBytesPerDomain; 792 orderXMLdoc.setMaxBytesPerDomain(maxBytesPerDomain); 793 794 if (0L == maxBytesPerDomain && 0L != forceMaxObjectsPerDomain) { 795 setMaxObjectsPerDomain(0L); 796 } 797 } 798 799 /** 800 * Set the maxJobRunningTime value. 801 * 802 * @param maxJobRunningTime The maxJobRunningTime in seconds to set, or 0 for no limit. 
803 */ 804 protected void setMaxJobRunningTime(long maxJobRunningTime) { 805 if (!underConstruction) { 806 final String msg = "Cannot modify job " + this + " as it is no longer under construction"; 807 log.debug(msg); 808 throw new IllegalState(msg); 809 } 810 this.forceMaxRunningTime = maxJobRunningTime; 811 orderXMLdoc.setMaxJobRunningTime(maxJobRunningTime); 812 } 813 814 /** 815 * @return Returns the MaxJobRunningTime. 0 means no limit. 816 */ 817 public long getMaxJobRunningTime() { 818 return forceMaxRunningTime; 819 } 820 821 /** 822 * Get the harvestNum for this job. The number reflects which run of the harvest definition this is. 823 * 824 * @return the harvestNum for this job. 825 */ 826 public int getHarvestNum() { 827 return harvestNum; 828 } 829 830 /** 831 * Set the harvestNum for this job. The number reflects which run of the harvest definition this is. ONLY TO BE USED 832 * IN THE CONSTRUCTION PHASE. 833 * 834 * @param harvestNum a given harvestNum 835 */ 836 public void setHarvestNum(int harvestNum) { 837 if (!underConstruction) { 838 final String msg = "Cannot modify job " + this + " as it is no longer under construction"; 839 log.debug(msg); 840 throw new IllegalState(msg); 841 } 842 this.harvestNum = harvestNum; 843 } 844 845 /** 846 * Get the list of harvest errors for this job. If no harvest errors, null is returned This value is not meaningful 847 * until the job is finished (FAILED,DONE, RESUBMITTED) 848 * 849 * @return the harvest errors for this job or null if no harvest errors. 850 */ 851 public String getHarvestErrors() { 852 return harvestErrors; 853 } 854 855 /** 856 * Append to the list of harvest errors for this job. Nothing happens, if argument harvestErrors is null. 
857 * 858 * @param harvestErrors a string containing harvest errors (may be null) 859 */ 860 public void appendHarvestErrors(String harvestErrors) { 861 if (harvestErrors != null) { 862 if (this.harvestErrors == null) { 863 this.harvestErrors = harvestErrors; 864 } else { 865 this.harvestErrors += "\n" + harvestErrors; 866 } 867 } 868 } 869 870 /** 871 * Get the list of harvest error details for this job. If no harvest error details, null is returned This value is 872 * not meaningful until the job is finished (FAILED,DONE, RESUBMITTED) 873 * 874 * @return the list of harvest error details for this job or null if no harvest error details. 875 */ 876 877 public String getHarvestErrorDetails() { 878 return harvestErrorDetails; 879 } 880 881 /** 882 * Append to the list of harvest error details for this job. Nothing happens, if argument harvestErrorDetails is 883 * null. 884 * 885 * @param harvestErrorDetails a string containing harvest error details. 886 */ 887 public void appendHarvestErrorDetails(String harvestErrorDetails) { 888 if (harvestErrorDetails != null) { 889 if (this.harvestErrorDetails == null) { 890 this.harvestErrorDetails = harvestErrorDetails; 891 } else { 892 this.harvestErrorDetails += "\n" + harvestErrorDetails; 893 } 894 } 895 } 896 897 /** 898 * Get the list of upload errors. If no upload errors, null is returned. This value is not meaningful until the job 899 * is finished (FAILED,DONE, RESUBMITTED) 900 * 901 * @return the list of upload errors as String, or null if no upload errors. 902 */ 903 public String getUploadErrors() { 904 return uploadErrors; 905 } 906 907 /** 908 * Append to the list of upload errors. Nothing happens, if argument uploadErrors is null. 909 * 910 * @param uploadErrors a string containing upload errors. 
911 */ 912 public void appendUploadErrors(String uploadErrors) { 913 if (uploadErrors != null) { 914 if (this.uploadErrors == null) { 915 this.uploadErrors = uploadErrors; 916 } else { 917 this.uploadErrors += "\n" + uploadErrors; 918 } 919 } 920 } 921 922 /** 923 * Get the list of upload error details. If no upload error details, null is returned. This value is not meaningful 924 * until the job is finished (FAILED,DONE, RESUBMITTED) 925 * 926 * @return the list of upload error details as String, or null if no upload error details 927 */ 928 public String getUploadErrorDetails() { 929 return uploadErrorDetails; 930 } 931 932 /** 933 * Append to the list of upload error details. Nothing happens, if argument uploadErrorDetails is null. 934 * 935 * @param uploadErrorDetails a string containing upload error details. 936 */ 937 public void appendUploadErrorDetails(String uploadErrorDetails) { 938 if (uploadErrorDetails != null) { 939 if (this.uploadErrorDetails == null) { 940 this.uploadErrorDetails = uploadErrorDetails; 941 } else { 942 this.uploadErrorDetails += "\n" + uploadErrorDetails; 943 } 944 } 945 } 946 947 /** 948 * Get the ID for the job which this job was resubmitted as. If null, this job has not been resubmitted. 949 * 950 * @return this ID. 951 */ 952 public Long getResubmittedAsJob() { 953 return resubmittedAsJobWithID; 954 } 955 956 /** 957 * Set the Date for when this job was submitted. If null, this job has not been submitted. 958 * 959 * @param submittedDate The date when this was submitted 960 */ 961 public void setSubmittedDate(Date submittedDate) { 962 this.submittedDate = submittedDate; 963 } 964 965 /** 966 * Set the Date for when this job was created. If null, this job has not been created. 967 * 968 * @param creationDate The date when this was created 969 */ 970 public void setCreationDate(Date creationDate) { 971 this.creationDate = creationDate; 972 } 973 974 /** 975 * Set the ID for the job which this job was resubmitted as. 
976 * 977 * @param resubmittedAsJob An Id for a new job. 978 */ 979 public void setResubmittedAsJob(Long resubmittedAsJob) { 980 this.resubmittedAsJobWithID = resubmittedAsJob; 981 } 982 983 /** 984 * @return id of the job that this job is supposed to continue using Heritrix recover-log or null if it starts from 985 * scratch. 986 */ 987 public Long getContinuationOf() { 988 return this.continuationOF; 989 } 990 991 @Override 992 public String getHarvestFilenamePrefix() { 993 if (this.harvestnamePrefix == null) { 994 log.warn("HarvestnamePrefix not yet set for job {}. Set it by using the naming scheme. " 995 + "This should only happen for old jobs being read", this.jobID); 996 setDefaultHarvestNamePrefix(); 997 } 998 return this.harvestnamePrefix; 999 } 1000 1001 /** 1002 * @param prefix 1003 */ 1004 public void setHarvestFilenamePrefix(String prefix) { 1005 this.harvestnamePrefix = prefix; 1006 } 1007 1008 /** 1009 * @return the forceMaxBytesPerDomain 1010 */ 1011 public long getForceMaxBytesPerDomain() { 1012 return forceMaxBytesPerDomain; 1013 } 1014 1015 /** 1016 * @return the configurationSetsObjectLimit 1017 */ 1018 public boolean isConfigurationSetsObjectLimit() { 1019 return configurationSetsObjectLimit; 1020 } 1021 1022 /** 1023 * @return the configurationSetsByteLimit 1024 */ 1025 public boolean isConfigurationSetsByteLimit() { 1026 return configurationSetsByteLimit; 1027 } 1028 1029 /** 1030 * @return the minCountObjects 1031 */ 1032 public long getMinCountObjects() { 1033 return minCountObjects; 1034 } 1035 1036 /** 1037 * @return the maxCountObjects 1038 */ 1039 public long getMaxCountObjects() { 1040 return maxCountObjects; 1041 } 1042 1043 /** 1044 * @return the totalCountObjects 1045 */ 1046 public long getTotalCountObjects() { 1047 return totalCountObjects; 1048 } 1049 1050 void setDefaultHarvestNamePrefix() { 1051 if (getJobID() != null) { 1052 ArchiveFileNaming naming = ArchiveFileNamingFactory.getInstance(); 1053 log.debug("Applying the default 
ArchiveFileNaming class '{}'.", naming.getClass().getName()); 1054 final String prefix = naming.getPrefix(this); 1055 setHarvestFilenamePrefix(prefix); 1056 log.debug("The harvestPrefix of this job is: {}", prefix); 1057 } else { 1058 log.warn("The harvestnamePrefix is not set now, as it depends on the JobID, which is not set yet"); 1059 } 1060 } 1061 1062 /** @return the harvest-audience. */ 1063 public String getHarvestAudience() { 1064 return harvestAudience; 1065 } 1066 1067 /** 1068 * Set the harvest audience for this job. Taken from the harvestdefinition that generated this job. 1069 * 1070 * @param theAudience the harvest-audience. 1071 */ 1072 public void setHarvestAudience(String theAudience) { 1073 this.harvestAudience = theAudience; 1074 } 1075 1076 ///////////// The following two methods are needed by harvestStatus-jobdetails.jsp //////////////////////////////////// 1077 /** 1078 * Returns a list of sorted seeds for this job. 1079 * The sorting is by domain, and inside each domain, 1080 * the list is sorted by url 1081 * @return a list of sorted seeds for this job. 
*/
    public List<String> getSortedSeedList() {
        // A sorted map keeps the domains in a deterministic, alphabetical order, which is what the
        // contract of this method promises. A plain HashMap would return the domain groups in
        // unspecified hash order. Fully qualified to avoid touching the file's import block.
        Map<String, Set<String>> seedsByDomain = new java.util.TreeMap<String, Set<String>>();
        // Compile the protocol pattern once instead of once per seed.
        Pattern protocolPattern = Pattern.compile(Constants.PROTOCOL_REGEXP);
        for (String seed : seedListSet) {
            // Assume the protocol is http://, if it is missing
            String url = protocolPattern.matcher(seed).matches() ? seed : "http://" + seed;
            String domain = getDomain(url);
            if (domain == null) {
                // No domain could be determined for this seed; leave it out and continue with the next one
                continue;
            }
            Set<String> seedsOfDomain = seedsByDomain.get(domain);
            if (seedsOfDomain == null) {
                // TreeSet keeps the seeds of one domain sorted
                seedsOfDomain = new TreeSet<String>();
                seedsByDomain.put(domain, seedsOfDomain);
            }
            // The seed is stored in its original form, i.e. without the assumed protocol
            seedsOfDomain.add(seed);
        }
        List<String> result = new ArrayList<String>();
        for (Set<String> seedsOfDomain : seedsByDomain.values()) {
            result.addAll(seedsOfDomain);
        }
        return result;
    }

    /**
     * Get the domain, that the given URL belongs to.
     *
     * @param url an URL
     * @return the domain, that the given URL belongs to, or
     * null if unable to do so.
     */
    private String getDomain(String url) {
        try {
            URL parsedUrl = new URL(url);
            return DomainUtils.domainNameFromHostname(parsedUrl.getHost());
        } catch (MalformedURLException e) {
            // Malformed seeds are reported and skipped by the caller rather than failing the whole listing
            log.warn("The string '{}' is not a valid URL", url);
            return null;
        }
    }

}