001/* 002 * #%L 003 * Netarchivesuite - harvester 004 * %% 005 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library, 006 * the National Library of France and the Austrian National Library. 007 * %% 008 * This program is free software: you can redistribute it and/or modify 009 * it under the terms of the GNU Lesser General Public License as 010 * published by the Free Software Foundation, either version 2.1 of the 011 * License, or (at your option) any later version. 012 * 013 * This program is distributed in the hope that it will be useful, 014 * but WITHOUT ANY WARRANTY; without even the implied warranty of 015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 016 * GNU General Lesser Public License for more details. 017 * 018 * You should have received a copy of the GNU General Lesser Public 019 * License along with this program. If not, see 020 * <http://www.gnu.org/licenses/lgpl-2.1.html>. 021 * #L% 022 */ 023package dk.netarkivet.harvester.datamodel; 024 025import gnu.inet.encoding.IDNA; 026import gnu.inet.encoding.IDNAException; 027 028import java.io.BufferedReader; 029import java.io.File; 030import java.io.IOException; 031import java.io.Serializable; 032import java.io.StringReader; 033import java.net.MalformedURLException; 034import java.net.URL; 035import java.util.ArrayList; 036import java.util.Collections; 037import java.util.Date; 038import java.util.HashMap; 039import java.util.HashSet; 040import java.util.Iterator; 041import java.util.List; 042import java.util.Map; 043import java.util.Set; 044import java.util.TreeSet; 045import java.util.regex.Pattern; 046 047import org.apache.commons.io.IOUtils; 048import org.slf4j.Logger; 049import org.slf4j.LoggerFactory; 050 051import dk.netarkivet.common.exceptions.ArgumentNotValid; 052import dk.netarkivet.common.exceptions.IOFailure; 053import dk.netarkivet.common.exceptions.IllegalState; 054import dk.netarkivet.common.utils.DomainUtils; 055import 
dk.netarkivet.common.utils.Settings; 056import dk.netarkivet.common.utils.StringUtils; 057import dk.netarkivet.harvester.HarvesterSettings; 058import dk.netarkivet.harvester.datamodel.eav.EAV.AttributeAndType; 059import dk.netarkivet.harvester.harvesting.ArchiveFileNaming; 060import dk.netarkivet.harvester.harvesting.ArchiveFileNamingFactory; 061import dk.netarkivet.harvester.harvesting.JobInfo; 062 063/** 064 * This class represents one job to run by Heritrix. It's based on a number of configurations all based on the same 065 * order.xml and at most one configuration for each domain. Each job consists of configurations of the approximate same 066 * size; that is the difference in expectation from the smallest configuration to the largest configuration is within a 067 * factor of each other defined as limMaxRelSize (although differences smaller than limMinAbsSize are ignored) There is 068 * a limit limMaxTotalSize on the total size of the job in objects. 069 * <p> 070 * A job may also be limited on bytes or objects, defined either by the configurations in the job or the harvest 071 * definition the job is generated by. 072 * <p> 073 * The job contains the order file, the seedlist and the current status of the job, as well as the ID of the harvest 074 * definition that defined it and names of all the configurations it is based on. 075 */ 076@SuppressWarnings({"serial"}) 077public class Job implements Serializable, JobInfo { 078 private transient static final Logger log = LoggerFactory.getLogger(Job.class); 079 080 // Persistent fields stored in and read from DAO 081 /** The persistent ID of this job. */ 082 private Long jobID; 083 /** The Id of the harvestdefinition, that generated this job. */ 084 protected Long origHarvestDefinitionID; 085 /** The status of the job. See the JobStatus class for the possible states. */ 086 protected JobStatus status; 087 /** The name of the {@link HarvestChannel} on which this job will be posted. 
*/ 088 private String channel; 089 090 /** Whether the job belongs to a snapshot or partial harvest. */ 091 private boolean isSnapshot; 092 /** 093 * Overrides the individual configurations maximum setting for objects retrieved from a domain when set to a 094 * positive value. 095 */ 096 private long forceMaxObjectsPerDomain = Constants.HERITRIX_MAXOBJECTS_INFINITY; 097 /** 098 * Overrides the individual configurations maximum setting for bytes retrieved from a domain when set to other than 099 * -1. 100 */ 101 private long forceMaxBytesPerDomain = Constants.HERITRIX_MAXBYTES_INFINITY; 102 /** The name of the harvest template used by the job. */ 103 private String orderXMLname; 104 /** The harvest template used by the job. */ 105 private HeritrixTemplate orderXMLdoc; 106 /** The list of Heritrix settings files. */ 107 private File[] settingsXMLfiles; 108 109 /** The corresponding Dom4j Documents for these files. */ 110 //private Document[] settingsXMLdocs; 111 112 /** 113 * A set of seeds involved in this job. Outside the SetSeedList() method, the set of seeds is updated in the 114 * addConfiguration() method. 115 */ 116 private Set<String> seedListSet = new HashSet<String>(); 117 /** Which run of the harvest definition this is. */ 118 private int harvestNum; 119 /** Errors during harvesting. */ 120 private String harvestErrors; 121 /** Details about errors during harvesting. */ 122 private String harvestErrorDetails; 123 /** Errors during upload of the harvested data. */ 124 private String uploadErrors; 125 /** Details about errors during upload of the harvested data. */ 126 private String uploadErrorDetails; 127 /** The starting point of the job. */ 128 private Date actualStart; 129 /** The ending point of the job. */ 130 private Date actualStop; 131 /** The time when this job was submitted. */ 132 private Date submittedDate; 133 /** The time when this job was created. 
*/ 134 private Date creationDate; 135 136 /** Edition is used by the DAO to keep track of changes. */ 137 private long edition = -1; 138 139 /** Resubmitted as the Job with this ID. If null, this job has not been resubmitted. */ 140 private Long resubmittedAsJobWithID; 141 142 /** Continuation of this job. */ 143 private Long continuationOF; 144 145 /** 146 * A map (domainName, domainConfigurationName), must be accessible in order to update job information (see Ass. 147 * 2.4.3) 148 */ 149 private Map<String, String> domainConfigurationMap; 150 /** 151 * A hint to the DAO that configurations have changed. Since configurations are large, the DAO can use that this is 152 * false to avoid updating the config list. The DAO can set it to false after saving configurations. 153 */ 154 boolean configsChanged = false; 155 156 // Intermediate fields, non-persistent and only used while building objects 157 158 /** 159 * Whether the maxObjects field was defined by the harvest definition or the configuration limit. This is deciding 160 * for whether we accept smaller configurations or not when building jobs. True means the limit is defined by the 161 * configuration, false means that it is defined by the harvest definition. 162 */ 163 private boolean configurationSetsObjectLimit; 164 165 /** 166 * Whether the maxBytes field was defined by the harvest definition or the configuration limit. This is deciding for 167 * whether we accept smaller configurations or not when building jobs. True means the limit is defined by the 168 * configuration, false means by the harvest definition. 169 */ 170 private boolean configurationSetsByteLimit; 171 172 /** The lowest number of objects expected by a configuration. */ 173 private long minCountObjects; 174 175 /** The highest number of objects expected by a configuration. */ 176 private long maxCountObjects; 177 178 /** The total number of objects expected by all added configurations. 
*/ 179 private long totalCountObjects; 180 181 /** 182 * The max time in seconds given to the harvester for this job. 0 is unlimited. 183 */ 184 private long forceMaxRunningTime; 185 186 /** 187 * If true, this job object is still undergoing changes due to having more configurations added. When set to false, 188 * the object is no longer considered immutable except for updating status. 189 * <p> 190 * Jobs loaded from the DAO are never under construction anymore. 191 */ 192 private boolean underConstruction = true; 193 194 // Constants 195 196 // Note: The following constants are intentionally left non-static for easy 197 // unit testing 198 199 private boolean maxObjectsIsSetByQuotaEnforcer = Settings 200 .getBoolean(HarvesterSettings.OBJECT_LIMIT_SET_BY_QUOTA_ENFORCER); 201 202 /** 203 * The harvestname prefix used in the files generated by Heritrix. Is set using an ArchiveFileNaming class when the 204 * jobID is available. 205 */ 206 private String harvestnamePrefix; 207 208 /** This variable is right now the same as harvestdefinitions.audience field. */ 209 private String harvestAudience; 210 211 protected Job() { 212 this.status = JobStatus.NEW; 213 } 214 215 /** 216 * Package private constructor for common initialisation. 217 * 218 * @param harvestID the id of the harvestdefinition 219 * @param cfg the configuration to base the Job on 220 * @param orderXMLdoc 221 * @param channel the channel on which the job will be submitted. 222 * @param forceMaxObjectsPerDomain the maximum number of objects harvested from a domain, overrides individual 223 * configuration settings. -1 means no limit 224 * @param forceMaxBytesPerDomain The maximum number of objects harvested from a domain, or -1 for no limit. 
225 * @param forceMaxJobRunningTime The max time in seconds given to the harvester for this job 226 * @param harvestNum the run number of the harvest definition 227 * @throws ArgumentNotValid if cfg or priority is null or harvestID is invalid, or if any limit < -1 228 */ 229 public Job(Long harvestID, DomainConfiguration cfg, HeritrixTemplate orderXMLdoc, HarvestChannel channel, 230 long forceMaxObjectsPerDomain, 231 long forceMaxBytesPerDomain, long forceMaxJobRunningTime, int harvestNum) throws ArgumentNotValid { 232 ArgumentNotValid.checkNotNull(cfg, "cfg"); 233 ArgumentNotValid.checkNotNull(harvestID, "harvestID"); 234 ArgumentNotValid.checkNotNegative(harvestID, "harvestID"); 235 ArgumentNotValid.checkNotNull(channel, "channel"); 236 237 if (forceMaxObjectsPerDomain < -1) { 238 String msg = "forceMaxObjectsPerDomain must be either -1 or positive"; 239 log.debug(msg); 240 throw new ArgumentNotValid(msg); 241 } 242 if (forceMaxBytesPerDomain < -1) { 243 String msg = "forceMaxBytesPerDomain must be either -1 or positive"; 244 log.debug(msg); 245 throw new ArgumentNotValid(msg); 246 } 247 248 if (forceMaxBytesPerDomain == 0L) { 249 log.warn("forceMaxBytesPerDomain should probably not be 0.Means 0 bytes downloaded per domain"); 250 } 251 252 if (forceMaxObjectsPerDomain == 0L) { 253 log.warn("forceMaxObjectsPerDomain should probably not be 0.Means 0 objects downloaded per domain"); 254 } 255 256 // setup initial members 257 domainConfigurationMap = new HashMap<>(); 258 origHarvestDefinitionID = harvestID; 259 orderXMLname = cfg.getOrderXmlName(); 260 this.orderXMLdoc = orderXMLdoc; 261 262 setHarvestChannel(channel); 263 264 long maxObjects = NumberUtils.minInf(forceMaxObjectsPerDomain, cfg.getMaxObjects()); 265 setMaxObjectsPerDomain(maxObjects); 266 configurationSetsObjectLimit = (maxObjects != forceMaxObjectsPerDomain); 267 268 long maxBytes = NumberUtils.minInf(forceMaxBytesPerDomain, cfg.getMaxBytes()); 269 setMaxBytesPerDomain(maxBytes); 270 
configurationSetsByteLimit = (maxBytes != forceMaxBytesPerDomain); 271 272 long expectation = cfg.getExpectedNumberOfObjects(forceMaxObjectsPerDomain, forceMaxBytesPerDomain); 273 maxCountObjects = expectation; 274 minCountObjects = expectation; 275 this.harvestNum = harvestNum; 276 277 addConfiguration(cfg); 278 279 setMaxJobRunningTime(forceMaxJobRunningTime); 280 281 setArchiveFormatInTemplate(Settings.get(HarvesterSettings.HERITRIX_ARCHIVE_FORMAT)); 282 283 setAttributes(cfg.getAttributesAndTypes()); 284 285 status = JobStatus.NEW; 286 } 287 288 public void setAttributes(List<AttributeAndType> attributesAndTypes) { 289 orderXMLdoc.insertAttributes(attributesAndTypes); 290 } 291 292 /** 293 * Update the order template according to the chosen archive format (arc/warc). 294 */ 295 private void setArchiveFormatInTemplate(String archiveFormat) { 296 if (!underConstruction) { 297 final String msg = "Cannot modify job " + this + " as it is no longer under construction"; 298 log.debug(msg); 299 throw new IllegalState(msg); 300 } 301 orderXMLdoc.setArchiveFormat(archiveFormat); 302 } 303 304 /** 305 * Create a new Job object from basic information stored in the DAO. 306 * 307 * @param harvestID the id of the harvestdefinition 308 * @param configurations the configurations to base the Job on 309 * @param channel the name of the channel on which the job will be submitted. 310 * @param snapshot whether the job belongs to a snapshot harvest 311 * @param forceMaxObjectsPerDomain the maximum number of objects harvested from a domain, overrides individual 312 * configuration settings. 0 means no limit. 313 * @param forceMaxBytesPerDomain The maximum number of objects harvested from a domain, or -1 for no limit. 314 * @param forceMaxJobRunningTime The max time in seconds given to the harvester for this job 315 * @param status the current status of the job. 316 * @param orderXMLname the name of the order template used. 
317 * @param orderXMLdoc the (possibly modified) template 318 * @param seedlist the combined seedlist from all configs. 319 * @param harvestNum the run number of the harvest definition 320 */ 321 Job(Long harvestID, Map<String, String> configurations, String channel, boolean snapshot, 322 long forceMaxObjectsPerDomain, long forceMaxBytesPerDomain, long forceMaxJobRunningTime, JobStatus status, 323 String orderXMLname, HeritrixTemplate orderXMLdoc, String seedlist, int harvestNum, Long continuationOf) { 324 origHarvestDefinitionID = harvestID; 325 domainConfigurationMap = configurations; 326 this.channel = channel; 327 this.isSnapshot = snapshot; 328 this.forceMaxBytesPerDomain = forceMaxBytesPerDomain; 329 this.forceMaxObjectsPerDomain = forceMaxObjectsPerDomain; 330 this.forceMaxRunningTime = forceMaxJobRunningTime; 331 this.status = status; 332 this.orderXMLname = orderXMLname; 333 this.orderXMLdoc = orderXMLdoc; 334 this.setSeedList(seedlist); 335 this.harvestNum = harvestNum; 336 this.continuationOF = continuationOf; 337 338 underConstruction = false; 339 } 340 341 342 /** 343 * Adds a configuration to this Job. Seedlists and settings are updated accordingly. 344 * 345 * @param cfg the configuration to add 346 * @throws ArgumentNotValid if cfg is null or cfg uses a different orderxml than this job or if this job already 347 * contains a configuration associated with domain of configuration cfg. 
348 */ 349 public void addConfiguration(DomainConfiguration cfg) { 350 ArgumentNotValid.checkNotNull(cfg, "cfg"); 351 if (domainConfigurationMap.containsKey(cfg.getDomainName())) { 352 throw new ArgumentNotValid("Job already has a configuration for Domain " + cfg.getDomainName()); 353 } 354 355 if (log.isTraceEnabled()) { 356 log.trace("Adding configuration '{}' to job '{}'", cfg, cfg.getName()); 357 } 358 359 if (!underConstruction) { 360 final String msg = "Cannot modify job " + this + " as it is no longer under construction"; 361 log.debug(msg); 362 throw new IllegalState(msg); 363 } 364 365 if (!cfg.getOrderXmlName().equals(getOrderXMLName())) { 366 throw new ArgumentNotValid("Job requires the orderxml file:'" + getOrderXMLName() + "' not:'" 367 + cfg.getOrderXmlName() + "' used by the configuration:'" + cfg.getName()); 368 } 369 370 domainConfigurationMap.put(cfg.getDomainName(), cfg.getName()); 371 372 // Add the seeds from the configuration to the Job seeds. 373 // Take care of duplicates. 
374 for (Iterator<SeedList> itt = cfg.getSeedLists(); itt.hasNext();) { 375 SeedList seed = itt.next(); 376 List<String> seeds = seed.getSeeds(); 377 for (String seedUrl : seeds) { 378 seedListSet.add(seedUrl); // duplicates is silently ignored 379 380 // TODO remove when heritrix implements this functionality 381 // try to convert a seed into a Internationalized Domain Name 382 try { 383 String seedASCII = seedUrl; 384 // It is rare to see these seeds, but they need to be 385 // correctly idnaized 386 if (seedUrl.contains(":") || seedUrl.contains("/")) { 387 String normalizedUrl = seedUrl; 388 if (!normalizedUrl.matches("^[a-zA-Z]+:.*")) { 389 // If no protocol is given, assume http 390 normalizedUrl = "http://" + normalizedUrl; 391 } 392 URL url = new URL(normalizedUrl); 393 String domainName = url.getHost(); 394 String domainNameASCII = IDNA.toASCII(domainName); 395 if (!domainName.equals(domainNameASCII)) { 396 // If the domain name changed, replace that in the 397 // seed. 398 seedASCII = seedUrl.replaceFirst(Pattern.quote(domainName), domainNameASCII); 399 } 400 } else { 401 seedASCII = IDNA.toASCII(seedUrl); 402 } 403 if (!seedASCII.equals(seedUrl)) { 404 log.trace("Converted {} to {}", seedUrl, seedASCII); 405 // Note that duplicates is silently ignored 406 seedListSet.add(seedASCII); 407 } 408 } catch (IDNAException e) { 409 log.trace("Cannot convert seed {} to ASCII", seedUrl, e); 410 } catch (MalformedURLException e) { 411 log.trace("Cannot convert seed {} to ASCII", seedUrl, e); 412 } 413 } 414 } 415 416 orderXMLdoc.editOrderXMLAddPerDomainCrawlerTraps(cfg); 417 418 // TODO update limits in settings files - see also bug 269 419 420 // Update estimates of job size 421 long expectation = cfg.getExpectedNumberOfObjects(forceMaxObjectsPerDomain, forceMaxBytesPerDomain); 422 maxCountObjects = Math.max(expectation, maxCountObjects); 423 minCountObjects = Math.min(expectation, minCountObjects); 424 totalCountObjects += expectation; 425 426 configsChanged = 
true; 427 428 assert (maxCountObjects >= minCountObjects) : "basic invariant"; 429 } 430 431 /** 432 * Get the name of the order XML file used by this Job. 433 * 434 * @return the name of the orderXML file 435 */ 436 public String getOrderXMLName() { 437 return orderXMLname; 438 } 439 440 /** 441 * Get the actual time when this job was stopped/completed. 442 * 443 * @return the time as Date 444 */ 445 public Date getActualStop() { 446 return actualStop; 447 } 448 449 /** 450 * Get the actual time when this job was started. 451 * 452 * @return the time as Date 453 */ 454 public Date getActualStart() { 455 return actualStart; 456 } 457 458 /** 459 * Get the time when this job was submitted. 460 * 461 * @return the time as Date 462 */ 463 public Date getSubmittedDate() { 464 return submittedDate; 465 } 466 467 /** 468 * Get the time when this job was created. 469 * 470 * @return the creation time as a <code>Date</code> 471 */ 472 public Date getCreationDate() { 473 return creationDate; 474 } 475 476 /** 477 * Get a list of Heritrix settings.xml files. Note that these files have nothing to do with NetarchiveSuite settings 478 * files. They are files that supplement the Heritrix order.xml files, and contain overrides for specific domains. 479 * 480 * @return the list of Files as an array 481 */ 482 public File[] getSettingsXMLfiles() { 483 return settingsXMLfiles; 484 } 485 486 /** 487 * Get the id of the HarvestDefinition from which this job originates. 488 * 489 * @return the id as a Long 490 */ 491 public Long getOrigHarvestDefinitionID() { 492 return origHarvestDefinitionID; 493 } 494 495 /** 496 * Get the id of this Job. 497 * 498 * @return the id as a Long 499 */ 500 public Long getJobID() { 501 return jobID; 502 } 503 504 /** 505 * Set the id of this Job. 506 * 507 * @param id The Id for this job. 508 */ 509 public void setJobID(Long id) { 510 jobID = id; 511 } 512 513 /** 514 * Get's the total number of different domains harvested by this job. 
515 * 516 * @return the number of configurations added to this domain 517 */ 518 public int getCountDomains() { 519 return domainConfigurationMap.size(); 520 } 521 522 /** 523 * Set the actual time when this job was started. 524 * <p> 525 * Sends a notification, if actualStart is set to a time after actualStop. 526 * 527 * @param actualStart A Date object representing the time when this job was started. 528 */ 529 public void setActualStart(Date actualStart) { 530 ArgumentNotValid.checkNotNull(actualStart, "actualStart"); 531 if (actualStop != null && actualStop.before(actualStart)) { 532 log.warn("Job(" + getJobID()+ "): Start time (" + actualStart + ") is after end time: " + actualStop); 533 } 534 this.actualStart = (Date) actualStart.clone(); 535 } 536 537 /** 538 * Set the actual time when this job was stopped/completed. Sends a notification, if actualStop is set to a time 539 * before actualStart. 540 * 541 * @param actualStop A Date object representing the time when this job was stopped. 542 * @throws ArgumentNotValid 543 */ 544 public void setActualStop(Date actualStop) throws ArgumentNotValid { 545 ArgumentNotValid.checkNotNull(actualStop, "actualStop"); 546 if (actualStart == null) { 547 log.warn("Job(" + getJobID()+ "): actualStart should be defined before setting actualStop"); 548 } else if (actualStop.before(actualStart)) { 549 log.warn("Job(" + getJobID()+ "): actualStop (" + actualStop + ") is before actualStart: " + actualStart); 550 } 551 this.actualStop = (Date) actualStop.clone(); 552 } 553 554 /** 555 * Set the orderxml for this job. 556 * 557 * @param doc A orderxml to be used by this job 558 */ 559 public void setOrderXMLDoc(HeritrixTemplate doc) { 560 ArgumentNotValid.checkNotNull(doc, "doc"); 561 this.orderXMLdoc = doc; 562 } 563 564 /** 565 * Gets a document representation of the order.xml associated with this Job. 
566 * 567 * @return the XML as a org.dom4j.Document 568 */ 569 public HeritrixTemplate getOrderXMLdoc() { 570 return orderXMLdoc; 571 } 572 573// /** 574// * Gets a list of document representations of the settings.xml's associated with this Job. 575// * 576// * @return the XML as an array of org.dom4j.Document 577// */ 578// public Document[] getSettingsXMLdocs() { 579// return settingsXMLdocs; 580// } 581 582 /** 583 * Set the seedlist of the job from the seedList argument. Individual seeds are separated by a '\n' character. 584 * Duplicate seeds are removed. 585 * 586 * @param seedList List of seeds as one String 587 */ 588 public void setSeedList(String seedList) { 589 ArgumentNotValid.checkNotNullOrEmpty(seedList, "seedList"); 590 seedListSet = new HashSet<>(); 591 BufferedReader reader = new BufferedReader(new StringReader(seedList)); 592 String seed; 593 try { 594 while ((seed = reader.readLine()) != null) { 595 seedListSet.add(seed); // add to seedlist if not already there 596 } 597 } catch (IOException e) { 598 // This never happens, as we're reading from a string! 599 throw new IOFailure("IOException reading from seed string", e); 600 } finally { 601 IOUtils.closeQuietly(reader); 602 } 603 } 604 605 /** 606 * Get the seedlist as a String. The individual seeds are separated by the character '\n'. The order of the seeds 607 * are unknown. 608 * 609 * @return the seedlist as a String 610 */ 611 public String getSeedListAsString() { 612 return StringUtils.conjoin("\n", seedListSet); 613 } 614 615 /** 616 * Get the current status of this Job. 617 * 618 * @return the status as an int in the range 0 to 4. 619 */ 620 public JobStatus getStatus() { 621 return status; 622 } 623 624 /** 625 * Sets status of this job. 
626 * 627 * @param newStatus Must be one of the values STATUS_NEW, ..., STATUS_FAILED 628 * @throws ArgumentNotValid in case of invalid status argument or invalid status change 629 */ 630 public void setStatus(JobStatus newStatus) { 631 ArgumentNotValid.checkNotNull(newStatus, "newStatus"); 632 if (!status.legalChange(newStatus)) { 633 final String message = "Status change from " + status + " to " + newStatus + " is not allowed"; 634 log.debug(message); 635 throw new ArgumentNotValid(message); 636 } 637 638 if ((this.status == JobStatus.NEW || this.status == JobStatus.RESUBMITTED) && newStatus == JobStatus.SUBMITTED) { 639 orderXMLdoc.configureQuotaEnforcer(maxObjectsIsSetByQuotaEnforcer, forceMaxBytesPerDomain, forceMaxObjectsPerDomain); 640 } 641 642 643 if (this.status == JobStatus.SUBMITTED && newStatus == JobStatus.STARTED) { 644 setActualStart(new Date()); 645 } 646 if (this.status == JobStatus.STARTED && (newStatus == JobStatus.DONE || newStatus == JobStatus.FAILED)) { 647 setActualStop(new Date()); 648 } 649 status = newStatus; 650 } 651 652 /** 653 * Returns a map of domain names and name of their corresponding configuration. 654 * <p> 655 * The returned Map cannot be changed. 656 * 657 * @return a read-only Map (<String>, <String>) 658 */ 659 public Map<String, String> getDomainConfigurationMap() { 660 return Collections.unmodifiableMap(domainConfigurationMap); 661 } 662 663 /** 664 * Gets the maximum number of objects harvested per domain. 665 * 666 * @return The maximum number of objects harvested per domain. 0 means no limit. 667 */ 668 public long getMaxObjectsPerDomain() { 669 return forceMaxObjectsPerDomain; 670 } 671 672 /** 673 * Gets the maximum number of bytes harvested per domain. 674 * 675 * @return The maximum number of bytes harvested per domain. -1 means no limit. 676 */ 677 public long getMaxBytesPerDomain() { 678 return forceMaxBytesPerDomain; 679 } 680 681 /** 682 * Get the edition number. 
683 * 684 * @return The edition number 685 */ 686 long getEdition() { 687 return edition; 688 } 689 690 /** 691 * Set the edition number. 692 * 693 * @param edition the new edition number 694 */ 695 void setEdition(long edition) { 696 this.edition = edition; 697 } 698 699 public void setHarvestChannel(HarvestChannel harvestChannel) { 700 this.channel = harvestChannel.getName(); 701 this.isSnapshot = harvestChannel.isSnapshot(); 702 } 703 704 /** 705 * @return the associated {@link HarvestChannel} name. 706 */ 707 public String getChannel() { 708 return channel; 709 } 710 711 /** 712 * Sets the associated {@link HarvestChannel} name. 713 * 714 * @param channel the channel name 715 */ 716 public void setChannel(String channel) { 717 this.channel = channel; 718 } 719 720 /** 721 * @return true if the job belongs to a snapshot harvest, false if it belongs to a focused harvest. 722 */ 723 public boolean isSnapshot() { 724 return isSnapshot; 725 } 726 727 /** 728 * Sets whether job belongs to a snapshot or focused harvest. 729 * 730 * @param isSnapshot true if the job belongs to a snapshot harvest, false if it belongs to a focused harvest. 731 */ 732 public void setSnapshot(boolean isSnapshot) { 733 this.isSnapshot = isSnapshot; 734 } 735 736 @Override 737 public String toString() { 738 return "Job " + getJobID() + " (state = " + getStatus() + ", HD = " + getOrigHarvestDefinitionID() 739 + ", channel = " + getChannel() + ", snapshot = " + isSnapshot() + ", forcemaxcount = " 740 + getForceMaxObjectsPerDomain() + ", forcemaxbytes = " + getMaxBytesPerDomain() 741 + ", forcemaxrunningtime = " + forceMaxRunningTime + ", orderxml = " + getOrderXMLName() 742 + ", numconfigs = " + getDomainConfigurationMap().size() + ", created = " + getCreationDate() 743 + (getSubmittedDate() != null ? ", submitted = " + getSubmittedDate() : "") 744 + (getActualStart() != null ? ", started = " + getActualStart() : "") 745 + (getActualStop() != null ? 
", stopped = " + getActualStop() : "") + ")"; 746 } 747 748 /** 749 * @return Returns the forceMaxObjectsPerDomain. 0 means no limit. 750 */ 751 public long getForceMaxObjectsPerDomain() { 752 return forceMaxObjectsPerDomain; 753 } 754 755 /** 756 * Sets the maxObjectsPerDomain value. 757 * 758 * @param maxObjectsPerDomain The forceMaxObjectsPerDomain to set. 0 means no limit. 759 * @throws IOFailure Thrown from auxiliary method editOrderXML_maxObjectsPerDomain. 760 */ 761 protected void setMaxObjectsPerDomain(long maxObjectsPerDomain) { 762 if (!underConstruction) { 763 final String msg = "Cannot modify job " + this + " as it is no longer under construction"; 764 log.debug(msg); 765 throw new IllegalState(msg); 766 } 767 768 this.forceMaxObjectsPerDomain = maxObjectsPerDomain; 769 orderXMLdoc.setMaxObjectsPerDomain(maxObjectsPerDomain); // FIXME? add argument to maxObjectsIsSetByQuotaEnforcer to method setMaxObjectsPerDomain 770 //orderXMLdoc.editOrderXML_maxObjectsPerDomain(orderXMLdoc, maxObjectsPerDomain, 771 // maxObjectsIsSetByQuotaEnforcer); 772 773 if (0L == maxObjectsPerDomain && 0L != forceMaxBytesPerDomain) { 774 setMaxBytesPerDomain(0L); 775 } 776 } 777 778 /** 779 * Set the maxbytes per domain value. 780 * 781 * @param maxBytesPerDomain The maxBytesPerDomain to set, or -1 for no limit. 782 */ 783 protected void setMaxBytesPerDomain(long maxBytesPerDomain) { 784 if (!underConstruction) { 785 final String msg = "Cannot modify job " + this + " as it is no longer under construction"; 786 log.debug(msg); 787 throw new IllegalState(msg); 788 } 789 this.forceMaxBytesPerDomain = maxBytesPerDomain; 790 orderXMLdoc.setMaxBytesPerDomain(maxBytesPerDomain); 791 792 if (0L == maxBytesPerDomain && 0L != forceMaxObjectsPerDomain) { 793 setMaxObjectsPerDomain(0L); 794 } 795 } 796 797 /** 798 * Set the maxJobRunningTime value. 799 * 800 * @param maxJobRunningTime The maxJobRunningTime in seconds to set, or 0 for no limit. 
801 */ 802 protected void setMaxJobRunningTime(long maxJobRunningTime) { 803 if (!underConstruction) { 804 final String msg = "Cannot modify job " + this + " as it is no longer under construction"; 805 log.debug(msg); 806 throw new IllegalState(msg); 807 } 808 this.forceMaxRunningTime = maxJobRunningTime; 809 orderXMLdoc.setMaxJobRunningTime(maxJobRunningTime); 810 } 811 812 /** 813 * @return Returns the MaxJobRunningTime. 0 means no limit. 814 */ 815 public long getMaxJobRunningTime() { 816 return forceMaxRunningTime; 817 } 818 819 /** 820 * Get the harvestNum for this job. The number reflects which run of the harvest definition this is. 821 * 822 * @return the harvestNum for this job. 823 */ 824 public int getHarvestNum() { 825 return harvestNum; 826 } 827 828 /** 829 * Set the harvestNum for this job. The number reflects which run of the harvest definition this is. ONLY TO BE USED 830 * IN THE CONSTRUCTION PHASE. 831 * 832 * @param harvestNum a given harvestNum 833 */ 834 public void setHarvestNum(int harvestNum) { 835 if (!underConstruction) { 836 final String msg = "Cannot modify job " + this + " as it is no longer under construction"; 837 log.debug(msg); 838 throw new IllegalState(msg); 839 } 840 this.harvestNum = harvestNum; 841 } 842 843 /** 844 * Get the list of harvest errors for this job. If no harvest errors, null is returned This value is not meaningful 845 * until the job is finished (FAILED,DONE, RESUBMITTED) 846 * 847 * @return the harvest errors for this job or null if no harvest errors. 848 */ 849 public String getHarvestErrors() { 850 return harvestErrors; 851 } 852 853 /** 854 * Append to the list of harvest errors for this job. Nothing happens, if argument harvestErrors is null. 
855 * 856 * @param harvestErrors a string containing harvest errors (may be null) 857 */ 858 public void appendHarvestErrors(String harvestErrors) { 859 if (harvestErrors != null) { 860 if (this.harvestErrors == null) { 861 this.harvestErrors = harvestErrors; 862 } else { 863 this.harvestErrors += "\n" + harvestErrors; 864 } 865 } 866 } 867 868 /** 869 * Get the list of harvest error details for this job. If no harvest error details, null is returned This value is 870 * not meaningful until the job is finished (FAILED,DONE, RESUBMITTED) 871 * 872 * @return the list of harvest error details for this job or null if no harvest error details. 873 */ 874 875 public String getHarvestErrorDetails() { 876 return harvestErrorDetails; 877 } 878 879 /** 880 * Append to the list of harvest error details for this job. Nothing happens, if argument harvestErrorDetails is 881 * null. 882 * 883 * @param harvestErrorDetails a string containing harvest error details. 884 */ 885 public void appendHarvestErrorDetails(String harvestErrorDetails) { 886 if (harvestErrorDetails != null) { 887 if (this.harvestErrorDetails == null) { 888 this.harvestErrorDetails = harvestErrorDetails; 889 } else { 890 this.harvestErrorDetails += "\n" + harvestErrorDetails; 891 } 892 } 893 } 894 895 /** 896 * Get the list of upload errors. If no upload errors, null is returned. This value is not meaningful until the job 897 * is finished (FAILED,DONE, RESUBMITTED) 898 * 899 * @return the list of upload errors as String, or null if no upload errors. 900 */ 901 public String getUploadErrors() { 902 return uploadErrors; 903 } 904 905 /** 906 * Append to the list of upload errors. Nothing happens, if argument uploadErrors is null. 907 * 908 * @param uploadErrors a string containing upload errors. 
909 */ 910 public void appendUploadErrors(String uploadErrors) { 911 if (uploadErrors != null) { 912 if (this.uploadErrors == null) { 913 this.uploadErrors = uploadErrors; 914 } else { 915 this.uploadErrors += "\n" + uploadErrors; 916 } 917 } 918 } 919 920 /** 921 * Get the list of upload error details. If no upload error details, null is returned. This value is not meaningful 922 * until the job is finished (FAILED,DONE, RESUBMITTED) 923 * 924 * @return the list of upload error details as String, or null if no upload error details 925 */ 926 public String getUploadErrorDetails() { 927 return uploadErrorDetails; 928 } 929 930 /** 931 * Append to the list of upload error details. Nothing happens, if argument uploadErrorDetails is null. 932 * 933 * @param uploadErrorDetails a string containing upload error details. 934 */ 935 public void appendUploadErrorDetails(String uploadErrorDetails) { 936 if (uploadErrorDetails != null) { 937 if (this.uploadErrorDetails == null) { 938 this.uploadErrorDetails = uploadErrorDetails; 939 } else { 940 this.uploadErrorDetails += "\n" + uploadErrorDetails; 941 } 942 } 943 } 944 945 /** 946 * Get the ID for the job which this job was resubmitted as. If null, this job has not been resubmitted. 947 * 948 * @return this ID. 949 */ 950 public Long getResubmittedAsJob() { 951 return resubmittedAsJobWithID; 952 } 953 954 /** 955 * Set the Date for when this job was submitted. If null, this job has not been submitted. 956 * 957 * @param submittedDate The date when this was submitted 958 */ 959 public void setSubmittedDate(Date submittedDate) { 960 this.submittedDate = submittedDate; 961 } 962 963 /** 964 * Set the Date for when this job was created. If null, this job has not been created. 965 * 966 * @param creationDate The date when this was created 967 */ 968 public void setCreationDate(Date creationDate) { 969 this.creationDate = creationDate; 970 } 971 972 /** 973 * Set the ID for the job which this job was resubmitted as. 
974 * 975 * @param resubmittedAsJob An Id for a new job. 976 */ 977 public void setResubmittedAsJob(Long resubmittedAsJob) { 978 this.resubmittedAsJobWithID = resubmittedAsJob; 979 } 980 981 /** 982 * @return id of the job that this job is supposed to continue using Heritrix recover-log or null if it starts from 983 * scratch. 984 */ 985 public Long getContinuationOf() { 986 return this.continuationOF; 987 } 988 989 @Override 990 public String getHarvestFilenamePrefix() { 991 if (this.harvestnamePrefix == null) { 992 log.warn("HarvestnamePrefix not yet set for job {}. Set it by using the naming scheme. " 993 + "This should only happen for old jobs being read", this.jobID); 994 setDefaultHarvestNamePrefix(); 995 } 996 return this.harvestnamePrefix; 997 } 998 999 /** 1000 * @param prefix 1001 */ 1002 public void setHarvestFilenamePrefix(String prefix) { 1003 this.harvestnamePrefix = prefix; 1004 } 1005 1006 /** 1007 * @return the forceMaxBytesPerDomain 1008 */ 1009 public long getForceMaxBytesPerDomain() { 1010 return forceMaxBytesPerDomain; 1011 } 1012 1013 /** 1014 * @return the configurationSetsObjectLimit 1015 */ 1016 public boolean isConfigurationSetsObjectLimit() { 1017 return configurationSetsObjectLimit; 1018 } 1019 1020 /** 1021 * @return the configurationSetsByteLimit 1022 */ 1023 public boolean isConfigurationSetsByteLimit() { 1024 return configurationSetsByteLimit; 1025 } 1026 1027 /** 1028 * @return the minCountObjects 1029 */ 1030 public long getMinCountObjects() { 1031 return minCountObjects; 1032 } 1033 1034 /** 1035 * @return the maxCountObjects 1036 */ 1037 public long getMaxCountObjects() { 1038 return maxCountObjects; 1039 } 1040 1041 /** 1042 * @return the totalCountObjects 1043 */ 1044 public long getTotalCountObjects() { 1045 return totalCountObjects; 1046 } 1047 1048 void setDefaultHarvestNamePrefix() { 1049 if (getJobID() != null) { 1050 ArchiveFileNaming naming = ArchiveFileNamingFactory.getInstance(); 1051 log.debug("Applying the default 
ArchiveFileNaming class '{}'.", naming.getClass().getName()); 1052 final String prefix = naming.getPrefix(this); 1053 setHarvestFilenamePrefix(prefix); 1054 log.debug("The harvestPrefix of this job is: {}", prefix); 1055 } else { 1056 log.warn("The harvestnamePrefix is not set now, as it depends on the JobID, which is not set yet"); 1057 } 1058 } 1059 1060 /** @return the harvest-audience. */ 1061 public String getHarvestAudience() { 1062 return harvestAudience; 1063 } 1064 1065 /** 1066 * Set the harvest audience for this job. Taken from the harvestdefinition that generated this job. 1067 * 1068 * @param theAudience the harvest-audience. 1069 */ 1070 public void setHarvestAudience(String theAudience) { 1071 this.harvestAudience = theAudience; 1072 } 1073 1074 ///////////// The following two methods are needed by harvestStatus-jobdetails.jsp //////////////////////////////////// 1075 /** 1076 * Returns a list of sorted seeds for this job. 1077 * The sorting is by domain, and inside each domain, 1078 * the list is sorted by url 1079 * @return a list of sorted seeds for this job. 
*/
    public List<String> getSortedSeedList() {
        // Compile the protocol check once instead of once per seed.
        final Pattern protocolPattern = Pattern.compile(Constants.PROTOCOL_REGEXP);
        // Seeds grouped per domain; each group is a TreeSet, so the seeds of one domain are kept sorted.
        final Map<String, Set<String>> seedsByDomain = new HashMap<String, Set<String>>();
        for (String seed : seedListSet) {
            // Assume the protocol is http://, if it is missing
            final String url = protocolPattern.matcher(seed).matches() ? seed : "http://" + seed;
            final String domain = getDomain(url);
            if (domain == null) {
                // No domain could be derived for this seed: leave it out and continue with the next one.
                continue;
            }
            Set<String> seedsOfDomain = seedsByDomain.get(domain);
            if (seedsOfDomain == null) {
                seedsOfDomain = new TreeSet<String>();
                seedsByDomain.put(domain, seedsOfDomain);
            }
            // The seed is stored as given, i.e. without the protocol that may have been added above.
            seedsOfDomain.add(seed);
        }
        // A HashMap gives no guarantee about its iteration order, so the domains are sorted explicitly
        // to make the result ordered by domain first and by seed within each domain.
        final List<String> sortedDomains = new ArrayList<String>(seedsByDomain.keySet());
        Collections.sort(sortedDomains);
        final List<String> result = new ArrayList<String>();
        for (String domain : sortedDomains) {
            result.addAll(seedsByDomain.get(domain));
        }
        return result;
    }

    /**
     * Get the domain, that the given URL belongs to.
     *
     * @param url an URL
     * @return the domain name that DomainUtils derives from the host part of the URL, or null if the string
     * cannot be parsed as an URL.
     */
    private String getDomain(String url) {
        try {
            final URL parsedUrl = new URL(url);
            return DomainUtils.domainNameFromHostname(parsedUrl.getHost());
        } catch (MalformedURLException e) {
            // Best effort: an unparsable seed is reported and signalled to the caller with null.
            log.warn("The string '{}' is not a valid URL", url);
            return null;
        }
    }

}