001/* 002 * #%L 003 * Netarchivesuite - harvester 004 * %% 005 * Copyright (C) 2005 - 2018 The Royal Danish Library, 006 * the National Library of France and the Austrian National Library. 007 * %% 008 * This program is free software: you can redistribute it and/or modify 009 * it under the terms of the GNU Lesser General Public License as 010 * published by the Free Software Foundation, either version 2.1 of the 011 * License, or (at your option) any later version. 012 * 013 * This program is distributed in the hope that it will be useful, 014 * but WITHOUT ANY WARRANTY; without even the implied warranty of 015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 016 * GNU General Lesser Public License for more details. 017 * 018 * You should have received a copy of the GNU General Lesser Public 019 * License along with this program. If not, see 020 * <http://www.gnu.org/licenses/lgpl-2.1.html>. 021 * #L% 022 */ 023package dk.netarkivet.harvester.harvesting.monitor; 024 025import java.net.MalformedURLException; 026import java.net.URL; 027import java.text.MessageFormat; 028import java.text.ParseException; 029import java.util.Date; 030 031import org.slf4j.Logger; 032import org.slf4j.LoggerFactory; 033 034import dk.netarkivet.common.exceptions.ArgumentNotValid; 035import dk.netarkivet.common.utils.StringUtils; 036import dk.netarkivet.harvester.datamodel.HarvestDefinitionDAO; 037import dk.netarkivet.harvester.harvesting.distribute.CrawlProgressMessage; 038import dk.netarkivet.harvester.harvesting.distribute.CrawlProgressMessage.CrawlServiceInfo; 039import dk.netarkivet.harvester.harvesting.distribute.CrawlProgressMessage.CrawlServiceJobInfo; 040import dk.netarkivet.harvester.harvesting.distribute.CrawlProgressMessage.CrawlStatus; 041 042/** 043 * This class is a simple bean storing information about a started job. 044 * <p> 045 * This class is a persistent entity as per Berkeley DB JE DPL API. 046 */ 047public class StartedJobInfo implements Comparable<StartedJobInfo> { 048 049 /** The class logger. */ 050 private static final Logger log = LoggerFactory.getLogger(StartedJobInfo.class); 051 052 /** list of the compare criteria. */ 053 public enum Criteria { 054 JOBID, HOST, PROGRESS, ELAPSED, QFILES, TOTALQ, ACTIVEQ, INACTIVEQ, EXHAUSTEDQ 055 } 056 057 ; 058 059 /** current compare criteria. */ 060 private StartedJobInfo.Criteria compareCriteria = StartedJobInfo.Criteria.JOBID; 061 062 private static final String NOT_AVAILABLE_STRING = ""; 063 private static final long NOT_AVAILABLE_NUM = -1L; 064 065 /** A text format used to parse Heritrix's short frontier report. */ 066 private static final MessageFormat FRONTIER_SHORT_FMT = new MessageFormat( 067 "{0} queues: {1} active ({2} in-process; {3} ready; {4} snoozed); {5} inactive; {6} retired; {7} exhausted"); 068 069 /** The job identifier. */ 070 private long jobId; 071 072 /** The name of the harvest this job belongs to. */ 073 private String harvestName; 074 075 /** Creation date of the record. */ 076 private Date timestamp; 077 078 /** URL to the Heritrix admin console. */ 079 private String hostUrl; 080 081 /** A percentage indicating the crawl progress. */ 082 private double progress; 083 084 /** The number of URIs queued, awaiting to be processed by Heritrix. */ 085 private long queuedFilesCount; 086 087 /** 088 * The number of URIS harvested by Heritrix since the beginning of the crawl. 089 */ 090 private long downloadedFilesCount; 091 092 /** 093 * The total number of queues in the frontier. Queues are per-domain. 094 */ 095 private long totalQueuesCount; 096 097 /** The number of queues in process. */ 098 private long activeQueuesCount; 099 100 /** The number of inactive queues in process. */ 101 private long inactiveQueuesCount; 102 103 /** The number of queues that have been retired when they hit their quota. */ 104 private long retiredQueuesCount; 105 /** Number of queues entirely processed. */ 106 private long exhaustedQueuesCount; 107 108 /** Time in seconds elapsed since Heritrix began crawling this job. */ 109 private long elapsedSeconds; 110 111 /** Current download rate in KB/sec. */ 112 private long currentProcessedKBPerSec; 113 114 /** Average download rate (over a period of 20 seconds) in KB/sec. */ 115 private long processedKBPerSec; 116 117 /** Current download rate in URI/sec. */ 118 private double currentProcessedDocsPerSec; 119 120 /** Average download rate (over a period of 20 seconds) in URI/sec. */ 121 private double processedDocsPerSec; 122 123 /** Number of active Heritrix worker threads. */ 124 private int activeToeCount; 125 126 /** Number of alerts raised by Heritrix since the crawl began. */ 127 private long alertsCount; 128 129 /** Current job status. */ 130 private CrawlStatus status; 131 132 /** 133 * Needed by BDB DPL. 134 */ 135 public StartedJobInfo() { 136 137 } 138 139 /** 140 * Instantiates all readable fields with default values. 141 * 142 * @param harvestName the name of the harvest 143 * @param jobId the ID of the job 144 */ 145 public StartedJobInfo(String harvestName, long jobId) { 146 this.timestamp = new Date(System.currentTimeMillis()); 147 this.jobId = jobId; 148 this.harvestName = harvestName; 149 this.hostUrl = NOT_AVAILABLE_STRING; 150 this.progress = NOT_AVAILABLE_NUM; 151 this.queuedFilesCount = NOT_AVAILABLE_NUM; 152 this.totalQueuesCount = NOT_AVAILABLE_NUM; 153 this.activeQueuesCount = NOT_AVAILABLE_NUM; 154 this.inactiveQueuesCount = NOT_AVAILABLE_NUM; 155 this.retiredQueuesCount = NOT_AVAILABLE_NUM; 156 this.exhaustedQueuesCount = NOT_AVAILABLE_NUM; 157 this.elapsedSeconds = NOT_AVAILABLE_NUM; 158 this.alertsCount = NOT_AVAILABLE_NUM; 159 this.downloadedFilesCount = NOT_AVAILABLE_NUM; 160 this.currentProcessedKBPerSec = NOT_AVAILABLE_NUM; 161 this.processedKBPerSec = NOT_AVAILABLE_NUM; 162 this.currentProcessedDocsPerSec = NOT_AVAILABLE_NUM; 163 this.processedDocsPerSec = NOT_AVAILABLE_NUM; 164 this.activeToeCount = (int) NOT_AVAILABLE_NUM; 165 this.status = CrawlStatus.PRE_CRAWL; 166 } 167 168 /** 169 * @return the job ID. 170 */ 171 public long getJobId() { 172 return jobId; 173 } 174 175 /** 176 * @return the harvest name. 177 */ 178 public String getHarvestName() { 179 return harvestName; 180 } 181 182 /** 183 * @return the timestamp 184 */ 185 public Date getTimestamp() { 186 return timestamp; 187 } 188 189 /** 190 * @return the name of the host on which Heritrix is crawling this job. 191 */ 192 public String getHostName() { 193 if (NOT_AVAILABLE_STRING.equals(hostUrl)) { 194 return NOT_AVAILABLE_STRING; 195 } 196 try { 197 return new URL(hostUrl).getHost(); 198 } catch (MalformedURLException e) { 199 return NOT_AVAILABLE_STRING; 200 } 201 } 202 203 /** 204 * @return the URL of the Heritrix admin console for the instance crawling this job. 205 */ 206 public String getHostUrl() { 207 return hostUrl; 208 } 209 210 /** 211 * @return the crawl progress as a numeric percentage. 212 */ 213 public double getProgress() { 214 return progress; 215 } 216 217 /** 218 * @return the number of queued files reported by Heritrix. 219 */ 220 public long getQueuedFilesCount() { 221 return queuedFilesCount; 222 } 223 224 /** 225 * @return the number of queues reported by Heritrix. 226 */ 227 public long getTotalQueuesCount() { 228 return totalQueuesCount; 229 } 230 231 /** 232 * @return the number of active queues reported by Heritrix. 233 */ 234 public long getActiveQueuesCount() { 235 return activeQueuesCount; 236 } 237 238 /** 239 * @return the number of retired heritrix queues. 240 */ 241 public long getRetiredQueuesCount() { 242 return retiredQueuesCount; 243 } 244 245 /** 246 * @return the number of exhausted queues reported by Heritrix. 247 */ 248 public long getExhaustedQueuesCount() { 249 return exhaustedQueuesCount; 250 } 251 252 /** 253 * @return the formatted duration of the crawl. 254 */ 255 public String getElapsedTime() { 256 return StringUtils.formatDuration(elapsedSeconds); 257 } 258 259 /** 260 * @return the duration of the crawl so far. 261 */ 262 public Long getElapsedSeconds() { 263 return elapsedSeconds; 264 } 265 266 /** 267 * @return the number of alerts raised by Heritrix. 268 */ 269 public long getAlertsCount() { 270 return alertsCount; 271 } 272 273 /** 274 * @return the number of downloaded URIs reported by Heritrix. 275 */ 276 public long getDownloadedFilesCount() { 277 return downloadedFilesCount; 278 } 279 280 /** 281 * @return the current download rate in KB/sec reported by Heritrix. 282 */ 283 public long getCurrentProcessedKBPerSec() { 284 return currentProcessedKBPerSec; 285 } 286 287 /** 288 * @return the average download rate in KB/sec reported by Heritrix. 289 */ 290 public long getProcessedKBPerSec() { 291 return processedKBPerSec; 292 } 293 294 /** 295 * @return the current download rate in URI/sec reported by Heritrix. 296 */ 297 public double getCurrentProcessedDocsPerSec() { 298 return currentProcessedDocsPerSec; 299 } 300 301 /** 302 * @return the average download rate in URI/sec reported by Heritrix. 303 */ 304 public double getProcessedDocsPerSec() { 305 return processedDocsPerSec; 306 } 307 308 /** 309 * @return the number of active processor threads reported by Heritrix. 310 */ 311 public int getActiveToeCount() { 312 return activeToeCount; 313 } 314 315 /** 316 * @return the job status 317 * @see CrawlStatus 318 */ 319 public CrawlStatus getStatus() { 320 return status; 321 } 322 323 @Override 324 public int compareTo(StartedJobInfo o) throws NullPointerException { 325 326 if (o == null) { 327 throw new NullPointerException("StartedJobInfo o can't be null"); 328 } 329 330 if (compareCriteria == StartedJobInfo.Criteria.HOST) { 331 return hostUrl.compareTo(o.hostUrl); 332 } 333 if (compareCriteria == StartedJobInfo.Criteria.PROGRESS) { 334 return new Double(progress).compareTo(new Double(o.progress)); 335 } 336 if (compareCriteria == StartedJobInfo.Criteria.ELAPSED) { 337 return Long.valueOf(elapsedSeconds).compareTo(Long.valueOf(o.elapsedSeconds)); 338 } 339 if (compareCriteria == StartedJobInfo.Criteria.QFILES) { 340 return Long.valueOf(queuedFilesCount).compareTo(Long.valueOf(o.queuedFilesCount)); 341 } 342 if (compareCriteria == StartedJobInfo.Criteria.TOTALQ) { 343 return Long.valueOf(totalQueuesCount).compareTo(Long.valueOf(o.totalQueuesCount)); 344 } 345 if (compareCriteria == StartedJobInfo.Criteria.ACTIVEQ) { 346 return Long.valueOf(activeQueuesCount).compareTo(Long.valueOf(o.activeQueuesCount)); 347 } 348 if (compareCriteria == StartedJobInfo.Criteria.INACTIVEQ) { 349 return Long.valueOf(inactiveQueuesCount).compareTo(Long.valueOf(o.inactiveQueuesCount)); 350 } 351 if (compareCriteria == StartedJobInfo.Criteria.EXHAUSTEDQ) { 352 return Long.valueOf(exhaustedQueuesCount).compareTo(Long.valueOf(o.exhaustedQueuesCount)); 353 } 354 return Long.valueOf(jobId).compareTo(Long.valueOf(o.jobId)); 355 } 356 357 /** 358 * set the criteria used in the compareTo method that way we can decide how to sort StartedJobInfo. 359 * 360 * @param criteria the criteria we want to use 361 */ 362 public void chooseCompareCriteria(StartedJobInfo.Criteria criteria) { 363 ArgumentNotValid.checkNotNull(criteria, "criteria can't be null"); 364 compareCriteria = criteria; 365 } 366 367 @Override 368 public String toString() { 369 return harvestName + " - " + jobId + " {" + "\n\tstatus=" + status.name() + "\n\telapsedSeconds=" 370 + elapsedSeconds + "\n\thostUrl=" + hostUrl + "\n\tprogress=" + progress + "\n\tactiveToeCount=" 371 + activeToeCount + "\n\talertsCount=" + alertsCount + "\n\tcurrentProcessedKBPerSec=" 372 + currentProcessedKBPerSec + "\n\tprocessedKBPerSec=" + processedKBPerSec 373 + "\n\tcurrentProcessedDocsPerSec=" + currentProcessedDocsPerSec + "\n\tprocessedDocsPerSec=" 374 + processedDocsPerSec + "\n\tdownloadedFilesCount=" + downloadedFilesCount + "\n\tqueuedFilesCount=" 375 + queuedFilesCount + "\n\tactiveQueuesCount=" + activeQueuesCount + "\n\tinactiveQueuesCount=" 376 + inactiveQueuesCount + "\n\texhaustedQueuesCount=" 377 + exhaustedQueuesCount + "\n\ttotalQueuesCount=" + totalQueuesCount + "\n}"; 378 } 379 380 /** 381 * Updates the members from a {@link CrawlProgressMessage} instance. 382 * 383 * @param msg the {@link CrawlProgressMessage} to process. 384 * @return jobinfo based on the contents of the message. 385 */ 386 public static StartedJobInfo build(CrawlProgressMessage msg) { 387 ArgumentNotValid.checkNotNull(msg, "CrawlProgressMessage msg"); 388 String harvestName = HarvestDefinitionDAO.getInstance().getHarvestName(msg.getHarvestID()); 389 390 StartedJobInfo sji = new StartedJobInfo(harvestName, msg.getJobID()); 391 392 CrawlServiceInfo heritrixInfo = msg.getHeritrixStatus(); 393 CrawlServiceJobInfo jobInfo = msg.getJobStatus(); 394 395 CrawlStatus newStatus = msg.getStatus(); 396 switch (newStatus) { 397 case PRE_CRAWL: 398 // Initialize statistics-variables before starting the crawl. 399 sji.activeQueuesCount = 0; 400 sji.inactiveQueuesCount = 0; 401 sji.activeToeCount = 0; 402 sji.alertsCount = 0; 403 sji.currentProcessedDocsPerSec = 0; 404 sji.currentProcessedKBPerSec = 0; 405 sji.downloadedFilesCount = 0; 406 sji.elapsedSeconds = 0; 407 sji.hostUrl = ""; 408 sji.processedDocsPerSec = 0; 409 sji.processedKBPerSec = 0; 410 sji.progress = 0; 411 sji.queuedFilesCount = 0; 412 sji.totalQueuesCount = 0; 413 break; 414 case CRAWLER_ACTIVE: 415 case CRAWLER_PAUSING: 416 case CRAWLER_PAUSED: 417 // Update statistics for the crawl 418 double discoveredCount = jobInfo.getDiscoveredFilesCount(); 419 double downloadedCount = jobInfo.getDownloadedFilesCount(); 420 sji.progress = 100 * downloadedCount / discoveredCount; 421 422 String frontierShortReport = jobInfo.getFrontierShortReport(); 423 if (frontierShortReport != null) { 424 try { 425 Object[] params = FRONTIER_SHORT_FMT.parse(frontierShortReport); 426 sji.totalQueuesCount = Long.parseLong((String) params[0]); 427 sji.activeQueuesCount = Long.parseLong((String) params[1]); 428 sji.inactiveQueuesCount = Long.parseLong((String) params[5]); 429 //FIXME : delete retiredQueuesCount to keep only inactiveQueuesCount 430 sji.retiredQueuesCount = Long.parseLong((String) params[5]); 431 sji.exhaustedQueuesCount = Long.parseLong((String) params[7]); 432 } catch (ParseException e) { 433 throw new ArgumentNotValid(frontierShortReport, e); 434 } 435 } 436 437 sji.activeToeCount = jobInfo.getActiveToeCount(); 438 sji.alertsCount = heritrixInfo.getAlertCount(); 439 sji.currentProcessedDocsPerSec = jobInfo.getCurrentProcessedDocsPerSec(); 440 sji.currentProcessedKBPerSec = jobInfo.getCurrentProcessedKBPerSec(); 441 sji.downloadedFilesCount = jobInfo.getDownloadedFilesCount(); 442 sji.elapsedSeconds = jobInfo.getElapsedSeconds(); 443 sji.hostUrl = msg.getHostUrl(); 444 sji.processedDocsPerSec = jobInfo.getProcessedDocsPerSec(); 445 sji.processedKBPerSec = jobInfo.getProcessedKBPerSec(); 446 sji.queuedFilesCount = jobInfo.getQueuedUriCount(); 447 break; 448 case CRAWLING_FINISHED: 449 // Set progress to 100 %, and reset the other values . 450 sji.progress = 100; 451 sji.hostUrl = ""; 452 sji.activeQueuesCount = 0; 453 sji.inactiveQueuesCount = 0; 454 sji.activeToeCount = 0; 455 sji.currentProcessedDocsPerSec = 0; 456 sji.currentProcessedKBPerSec = 0; 457 sji.processedDocsPerSec = 0; 458 sji.processedKBPerSec = 0; 459 sji.queuedFilesCount = 0; 460 sji.totalQueuesCount = 0; 461 break; 462 default: 463 log.debug("Nothing to do for state: {}", newStatus); 464 break; 465 } 466 sji.status = newStatus; 467 468 //FIXME : save inactiveQueuesCount in database 469 return sji; 470 } 471 472 /** 473 * @param hostUrl the hostUrl to set 474 */ 475 public void setHostUrl(String hostUrl) { 476 477 this.hostUrl = hostUrl; 478 } 479 480 /** 481 * @param progress the progress to set 482 */ 483 public void setProgress(double progress) { 484 this.progress = progress; 485 } 486 487 /** 488 * @param queuedFilesCount the queuedFilesCount to set 489 */ 490 public void setQueuedFilesCount(long queuedFilesCount) { 491 this.queuedFilesCount = queuedFilesCount; 492 } 493 494 /** 495 * @param downloadedFilesCount the downloadedFilesCount to set 496 */ 497 public void setDownloadedFilesCount(long downloadedFilesCount) { 498 this.downloadedFilesCount = downloadedFilesCount; 499 } 500 501 /** 502 * @param totalQueuesCount the totalQueuesCount to set 503 */ 504 public void setTotalQueuesCount(long totalQueuesCount) { 505 this.totalQueuesCount = totalQueuesCount; 506 } 507 508 /** 509 * @param activeQueuesCount the activeQueuesCount to set 510 */ 511 public void setActiveQueuesCount(long activeQueuesCount) { 512 this.activeQueuesCount = activeQueuesCount; 513 } 514 515 /** 516 * @param exhaustedQueuesCount the exhaustedQueuesCount to set 517 */ 518 public void setExhaustedQueuesCount(long exhaustedQueuesCount) { 519 this.exhaustedQueuesCount = exhaustedQueuesCount; 520 } 521 522 /** 523 * @param elapsedSeconds the elapsedSeconds to set 524 */ 525 public void setElapsedSeconds(long elapsedSeconds) { 526 this.elapsedSeconds = elapsedSeconds; 527 } 528 529 /** 530 * @param currentProcessedKBPerSec the currentProcessedKBPerSec to set 531 */ 532 public void setCurrentProcessedKBPerSec(long currentProcessedKBPerSec) { 533 this.currentProcessedKBPerSec = currentProcessedKBPerSec; 534 } 535 536 /** 537 * @param processedKBPerSec the processedKBPerSec to set 538 */ 539 public void setProcessedKBPerSec(long processedKBPerSec) { 540 this.processedKBPerSec = processedKBPerSec; 541 } 542 543 /** 544 * @param currentProcessedDocsPerSec the currentProcessedDocsPerSec to set 545 */ 546 public void setCurrentProcessedDocsPerSec(double currentProcessedDocsPerSec) { 547 this.currentProcessedDocsPerSec = currentProcessedDocsPerSec; 548 } 549 550 /** 551 * @param processedDocsPerSec the processedDocsPerSec to set 552 */ 553 public void setProcessedDocsPerSec(double processedDocsPerSec) { 554 this.processedDocsPerSec = processedDocsPerSec; 555 } 556 557 /** 558 * @param activeToeCount the activeToeCount to set 559 */ 560 public void setActiveToeCount(int activeToeCount) { 561 this.activeToeCount = activeToeCount; 562 } 563 564 /** 565 * @param alertsCount the alertsCount to set 566 */ 567 public void setAlertsCount(long alertsCount) { 568 this.alertsCount = alertsCount; 569 } 570 571 /** 572 * @param status the status to set 573 */ 574 public void setStatus(CrawlStatus status) { 575 this.status = status; 576 } 577 578 /** 579 * @param timestamp the timestamp to set 580 */ 581 public void setTimestamp(Date timestamp) { 582 this.timestamp = timestamp; 583 } 584 585 /** 586 * @param retiredQueuesCount the retiredQueuesCount to set 587 */ 588 public void setRetiredQueuesCount(long retiredQueuesCount) { 589 this.retiredQueuesCount = retiredQueuesCount; 590 } 591 592 public long getInactiveQueuesCount() { 593 //FIXME : delete retiredQueuesCount from code and database 594 //and use only inactiveQueuesCount 595 //return inactiveQueuesCount; 596 return retiredQueuesCount; 597 } 598 599 public void setInactiveQueuesCount(long inactiveQueuesCount) { 600 this.inactiveQueuesCount = inactiveQueuesCount; 601 } 602 603}