001/* 002 * #%L 003 * Netarchivesuite - harvester 004 * %% 005 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library, 006 * the National Library of France and the Austrian National Library. 007 * %% 008 * This program is free software: you can redistribute it and/or modify 009 * it under the terms of the GNU Lesser General Public License as 010 * published by the Free Software Foundation, either version 2.1 of the 011 * License, or (at your option) any later version. 012 * 013 * This program is distributed in the hope that it will be useful, 014 * but WITHOUT ANY WARRANTY; without even the implied warranty of 015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 016 * GNU General Lesser Public License for more details. 017 * 018 * You should have received a copy of the GNU General Lesser Public 019 * License along with this program. If not, see 020 * <http://www.gnu.org/licenses/lgpl-2.1.html>. 021 * #L% 022 */ 023package dk.netarkivet.harvester.harvesting.monitor; 024 025import java.net.MalformedURLException; 026import java.net.URL; 027import java.text.MessageFormat; 028import java.text.ParseException; 029import java.util.Date; 030 031import org.slf4j.Logger; 032import org.slf4j.LoggerFactory; 033 034import dk.netarkivet.common.exceptions.ArgumentNotValid; 035import dk.netarkivet.common.utils.StringUtils; 036import dk.netarkivet.harvester.datamodel.HarvestDefinitionDAO; 037import dk.netarkivet.harvester.harvesting.distribute.CrawlProgressMessage; 038import dk.netarkivet.harvester.harvesting.distribute.CrawlProgressMessage.CrawlServiceInfo; 039import dk.netarkivet.harvester.harvesting.distribute.CrawlProgressMessage.CrawlServiceJobInfo; 040import dk.netarkivet.harvester.harvesting.distribute.CrawlProgressMessage.CrawlStatus; 041 042/** 043 * This class is a simple bean storing information about a started job. 044 * <p> 045 * This class is a persistent entity as per Berkeley DB JE DPL API. 046 */ 047public class StartedJobInfo implements Comparable<StartedJobInfo> { 048 049 /** The class logger. */ 050 private static final Logger log = LoggerFactory.getLogger(StartedJobInfo.class); 051 052 /** list of the compare criteria. */ 053 public enum Criteria { 054 JOBID, HOST, PROGRESS, ELAPSED, QFILES, TOTALQ, ACTIVEQ, EXHAUSTEDQ 055 } 056 057 ; 058 059 /** current compare criteria. */ 060 private StartedJobInfo.Criteria compareCriteria = StartedJobInfo.Criteria.JOBID; 061 062 private static final String NOT_AVAILABLE_STRING = ""; 063 private static final long NOT_AVAILABLE_NUM = -1L; 064 065 /** A text format used to parse Heritrix's short frontier report. */ 066 private static final MessageFormat FRONTIER_SHORT_FMT = new MessageFormat( 067 "{0} queues: {1} active ({2} in-process; {3} ready; {4} snoozed); {5} inactive; {6} retired; {7} exhausted"); 068 069 /** The job identifier. */ 070 private long jobId; 071 072 /** The name of the harvest this job belongs to. */ 073 private String harvestName; 074 075 /** Creation date of the record. */ 076 private Date timestamp; 077 078 /** URL to the Heritrix admin console. */ 079 private String hostUrl; 080 081 /** A percentage indicating the crawl progress. */ 082 private double progress; 083 084 /** The number of URIs queued, awaiting to be processed by Heritrix. */ 085 private long queuedFilesCount; 086 087 /** 088 * The number of URIS harvested by Heritrix since the beginning of the crawl. 089 */ 090 private long downloadedFilesCount; 091 092 /** 093 * The total number of queues in the frontier. Queues are per-domain. 094 */ 095 private long totalQueuesCount; 096 097 /** The number of queues in process. */ 098 private long activeQueuesCount; 099 100 /** The number of queues that have been retired when they hit their quota. */ 101 private long retiredQueuesCount; 102 /** Number of queues entirely processed. */ 103 private long exhaustedQueuesCount; 104 105 /** Time in seconds elapsed since Heritrix began crawling this job. */ 106 private long elapsedSeconds; 107 108 /** Current download rate in KB/sec. */ 109 private long currentProcessedKBPerSec; 110 111 /** Average download rate (over a period of 20 seconds) in KB/sec. */ 112 private long processedKBPerSec; 113 114 /** Current download rate in URI/sec. */ 115 private double currentProcessedDocsPerSec; 116 117 /** Average download rate (over a period of 20 seconds) in URI/sec. */ 118 private double processedDocsPerSec; 119 120 /** Number of active Heritrix worker threads. */ 121 private int activeToeCount; 122 123 /** Number of alerts raised by Heritrix since the crawl began. */ 124 private long alertsCount; 125 126 /** Current job status. */ 127 private CrawlStatus status; 128 129 /** 130 * Needed by BDB DPL. 131 */ 132 public StartedJobInfo() { 133 134 } 135 136 /** 137 * Instantiates all readable fields with default values. 138 * 139 * @param harvestName the name of the harvest 140 * @param jobId the ID of the job 141 */ 142 public StartedJobInfo(String harvestName, long jobId) { 143 this.timestamp = new Date(System.currentTimeMillis()); 144 this.jobId = jobId; 145 this.harvestName = harvestName; 146 this.hostUrl = NOT_AVAILABLE_STRING; 147 this.progress = NOT_AVAILABLE_NUM; 148 this.queuedFilesCount = NOT_AVAILABLE_NUM; 149 this.totalQueuesCount = NOT_AVAILABLE_NUM; 150 this.activeQueuesCount = NOT_AVAILABLE_NUM; 151 this.retiredQueuesCount = NOT_AVAILABLE_NUM; 152 this.exhaustedQueuesCount = NOT_AVAILABLE_NUM; 153 this.elapsedSeconds = NOT_AVAILABLE_NUM; 154 this.alertsCount = NOT_AVAILABLE_NUM; 155 this.downloadedFilesCount = NOT_AVAILABLE_NUM; 156 this.currentProcessedKBPerSec = NOT_AVAILABLE_NUM; 157 this.processedKBPerSec = NOT_AVAILABLE_NUM; 158 this.currentProcessedDocsPerSec = NOT_AVAILABLE_NUM; 159 this.processedDocsPerSec = NOT_AVAILABLE_NUM; 160 this.activeToeCount = (int) NOT_AVAILABLE_NUM; 161 this.status = CrawlStatus.PRE_CRAWL; 162 } 163 164 /** 165 * @return the job ID. 166 */ 167 public long getJobId() { 168 return jobId; 169 } 170 171 /** 172 * @return the harvest name. 173 */ 174 public String getHarvestName() { 175 return harvestName; 176 } 177 178 /** 179 * @return the timestamp 180 */ 181 public Date getTimestamp() { 182 return timestamp; 183 } 184 185 /** 186 * @return the name of the host on which Heritrix is crawling this job. 187 */ 188 public String getHostName() { 189 if (NOT_AVAILABLE_STRING.equals(hostUrl)) { 190 return NOT_AVAILABLE_STRING; 191 } 192 try { 193 return new URL(hostUrl).getHost(); 194 } catch (MalformedURLException e) { 195 return NOT_AVAILABLE_STRING; 196 } 197 } 198 199 /** 200 * @return the URL of the Heritrix admin console for the instance crawling this job. 201 */ 202 public String getHostUrl() { 203 return hostUrl; 204 } 205 206 /** 207 * @return the crawl progress as a numeric percentage. 208 */ 209 public double getProgress() { 210 return progress; 211 } 212 213 /** 214 * @return the number of queued files reported by Heritrix. 215 */ 216 public long getQueuedFilesCount() { 217 return queuedFilesCount; 218 } 219 220 /** 221 * @return the number of queues reported by Heritrix. 222 */ 223 public long getTotalQueuesCount() { 224 return totalQueuesCount; 225 } 226 227 /** 228 * @return the number of active queues reported by Heritrix. 229 */ 230 public long getActiveQueuesCount() { 231 return activeQueuesCount; 232 } 233 234 /** 235 * @return the number of retired heritrix queues. 236 */ 237 public long getRetiredQueuesCount() { 238 return retiredQueuesCount; 239 } 240 241 /** 242 * @return the number of exhausted queues reported by Heritrix. 243 */ 244 public long getExhaustedQueuesCount() { 245 return exhaustedQueuesCount; 246 } 247 248 /** 249 * @return the formatted duration of the crawl. 250 */ 251 public String getElapsedTime() { 252 return StringUtils.formatDuration(elapsedSeconds); 253 } 254 255 /** 256 * @return the duration of the crawl so far. 257 */ 258 public Long getElapsedSeconds() { 259 return elapsedSeconds; 260 } 261 262 /** 263 * @return the number of alerts raised by Heritrix. 264 */ 265 public long getAlertsCount() { 266 return alertsCount; 267 } 268 269 /** 270 * @return the number of downloaded URIs reported by Heritrix. 271 */ 272 public long getDownloadedFilesCount() { 273 return downloadedFilesCount; 274 } 275 276 /** 277 * @return the current download rate in KB/sec reported by Heritrix. 278 */ 279 public long getCurrentProcessedKBPerSec() { 280 return currentProcessedKBPerSec; 281 } 282 283 /** 284 * @return the average download rate in KB/sec reported by Heritrix. 285 */ 286 public long getProcessedKBPerSec() { 287 return processedKBPerSec; 288 } 289 290 /** 291 * @return the current download rate in URI/sec reported by Heritrix. 292 */ 293 public double getCurrentProcessedDocsPerSec() { 294 return currentProcessedDocsPerSec; 295 } 296 297 /** 298 * @return the average download rate in URI/sec reported by Heritrix. 299 */ 300 public double getProcessedDocsPerSec() { 301 return processedDocsPerSec; 302 } 303 304 /** 305 * @return the number of active processor threads reported by Heritrix. 306 */ 307 public int getActiveToeCount() { 308 return activeToeCount; 309 } 310 311 /** 312 * @return the job status 313 * @see CrawlStatus 314 */ 315 public CrawlStatus getStatus() { 316 return status; 317 } 318 319 @Override 320 public int compareTo(StartedJobInfo o) throws NullPointerException { 321 322 if (o == null) { 323 throw new NullPointerException("StartedJobInfo o can't be null"); 324 } 325 326 if (compareCriteria == StartedJobInfo.Criteria.HOST) { 327 return hostUrl.compareTo(o.hostUrl); 328 } 329 if (compareCriteria == StartedJobInfo.Criteria.PROGRESS) { 330 return new Double(progress).compareTo(new Double(o.progress)); 331 } 332 if (compareCriteria == StartedJobInfo.Criteria.ELAPSED) { 333 return Long.valueOf(elapsedSeconds).compareTo(Long.valueOf(o.elapsedSeconds)); 334 } 335 if (compareCriteria == StartedJobInfo.Criteria.QFILES) { 336 return Long.valueOf(queuedFilesCount).compareTo(Long.valueOf(o.queuedFilesCount)); 337 } 338 if (compareCriteria == StartedJobInfo.Criteria.TOTALQ) { 339 return Long.valueOf(totalQueuesCount).compareTo(Long.valueOf(o.totalQueuesCount)); 340 } 341 if (compareCriteria == StartedJobInfo.Criteria.ACTIVEQ) { 342 return Long.valueOf(activeQueuesCount).compareTo(Long.valueOf(o.activeQueuesCount)); 343 } 344 if (compareCriteria == StartedJobInfo.Criteria.EXHAUSTEDQ) { 345 return Long.valueOf(exhaustedQueuesCount).compareTo(Long.valueOf(o.exhaustedQueuesCount)); 346 } 347 return Long.valueOf(jobId).compareTo(Long.valueOf(o.jobId)); 348 } 349 350 /** 351 * set the criteria used in the compareTo method that way we can decide how to sort StartedJobInfo. 352 * 353 * @param criteria the criteria we want to use 354 */ 355 public void chooseCompareCriteria(StartedJobInfo.Criteria criteria) { 356 ArgumentNotValid.checkNotNull(criteria, "criteria can't be null"); 357 compareCriteria = criteria; 358 } 359 360 @Override 361 public String toString() { 362 return harvestName + " - " + jobId + " {" + "\n\tstatus=" + status.name() + "\n\telapsedSeconds=" 363 + elapsedSeconds + "\n\thostUrl=" + hostUrl + "\n\tprogress=" + progress + "\n\tactiveToeCount=" 364 + activeToeCount + "\n\talertsCount=" + alertsCount + "\n\tcurrentProcessedKBPerSec=" 365 + currentProcessedKBPerSec + "\n\tprocessedKBPerSec=" + processedKBPerSec 366 + "\n\tcurrentProcessedDocsPerSec=" + currentProcessedDocsPerSec + "\n\tprocessedDocsPerSec=" 367 + processedDocsPerSec + "\n\tdownloadedFilesCount=" + downloadedFilesCount + "\n\tqueuedFilesCount=" 368 + queuedFilesCount + "\n\tactiveQueuesCount=" + activeQueuesCount + "\n\texhaustedQueuesCount=" 369 + exhaustedQueuesCount + "\n\ttotalQueuesCount=" + totalQueuesCount + "\n}"; 370 } 371 372 /** 373 * Updates the members from a {@link CrawlProgressMessage} instance. 374 * 375 * @param msg the {@link CrawlProgressMessage} to process. 376 * @return jobinfo based on the contents of the message. 377 */ 378 public static StartedJobInfo build(CrawlProgressMessage msg) { 379 ArgumentNotValid.checkNotNull(msg, "CrawlProgressMessage msg"); 380 String harvestName = HarvestDefinitionDAO.getInstance().getHarvestName(msg.getHarvestID()); 381 382 StartedJobInfo sji = new StartedJobInfo(harvestName, msg.getJobID()); 383 384 CrawlServiceInfo heritrixInfo = msg.getHeritrixStatus(); 385 CrawlServiceJobInfo jobInfo = msg.getJobStatus(); 386 387 CrawlStatus newStatus = msg.getStatus(); 388 switch (newStatus) { 389 case PRE_CRAWL: 390 // Initialize statistics-variables before starting the crawl. 391 sji.activeQueuesCount = 0; 392 sji.activeToeCount = 0; 393 sji.alertsCount = 0; 394 sji.currentProcessedDocsPerSec = 0; 395 sji.currentProcessedKBPerSec = 0; 396 sji.downloadedFilesCount = 0; 397 sji.elapsedSeconds = 0; 398 sji.hostUrl = ""; 399 sji.processedDocsPerSec = 0; 400 sji.processedKBPerSec = 0; 401 sji.progress = 0; 402 sji.queuedFilesCount = 0; 403 sji.totalQueuesCount = 0; 404 break; 405 case CRAWLER_ACTIVE: 406 case CRAWLER_PAUSING: 407 case CRAWLER_PAUSED: 408 // Update statistics for the crawl 409 double discoveredCount = jobInfo.getDiscoveredFilesCount(); 410 double downloadedCount = jobInfo.getDownloadedFilesCount(); 411 sji.progress = 100 * downloadedCount / discoveredCount; 412 413 String frontierShortReport = jobInfo.getFrontierShortReport(); 414 if (frontierShortReport != null) { 415 try { 416 Object[] params = FRONTIER_SHORT_FMT.parse(frontierShortReport); 417 sji.totalQueuesCount = Long.parseLong((String) params[0]); 418 sji.activeQueuesCount = Long.parseLong((String) params[1]); 419 sji.retiredQueuesCount = Long.parseLong((String) params[6]); 420 sji.exhaustedQueuesCount = Long.parseLong((String) params[7]); 421 } catch (ParseException e) { 422 throw new ArgumentNotValid(frontierShortReport, e); 423 } 424 } 425 426 sji.activeToeCount = jobInfo.getActiveToeCount(); 427 sji.alertsCount = heritrixInfo.getAlertCount(); 428 sji.currentProcessedDocsPerSec = jobInfo.getCurrentProcessedDocsPerSec(); 429 sji.currentProcessedKBPerSec = jobInfo.getCurrentProcessedKBPerSec(); 430 sji.downloadedFilesCount = jobInfo.getDownloadedFilesCount(); 431 sji.elapsedSeconds = jobInfo.getElapsedSeconds(); 432 sji.hostUrl = msg.getHostUrl(); 433 sji.processedDocsPerSec = jobInfo.getProcessedDocsPerSec(); 434 sji.processedKBPerSec = jobInfo.getProcessedKBPerSec(); 435 sji.queuedFilesCount = jobInfo.getQueuedUriCount(); 436 break; 437 case CRAWLING_FINISHED: 438 // Set progress to 100 %, and reset the other values . 439 sji.progress = 100; 440 sji.hostUrl = ""; 441 sji.activeQueuesCount = 0; 442 sji.activeToeCount = 0; 443 sji.currentProcessedDocsPerSec = 0; 444 sji.currentProcessedKBPerSec = 0; 445 sji.processedDocsPerSec = 0; 446 sji.processedKBPerSec = 0; 447 sji.queuedFilesCount = 0; 448 sji.totalQueuesCount = 0; 449 break; 450 default: 451 log.debug("Nothing to do for state: {}", newStatus); 452 break; 453 } 454 sji.status = newStatus; 455 456 return sji; 457 } 458 459 /** 460 * @param hostUrl the hostUrl to set 461 */ 462 public void setHostUrl(String hostUrl) { 463 this.hostUrl = hostUrl; 464 } 465 466 /** 467 * @param progress the progress to set 468 */ 469 public void setProgress(double progress) { 470 this.progress = progress; 471 } 472 473 /** 474 * @param queuedFilesCount the queuedFilesCount to set 475 */ 476 public void setQueuedFilesCount(long queuedFilesCount) { 477 this.queuedFilesCount = queuedFilesCount; 478 } 479 480 /** 481 * @param downloadedFilesCount the downloadedFilesCount to set 482 */ 483 public void setDownloadedFilesCount(long downloadedFilesCount) { 484 this.downloadedFilesCount = downloadedFilesCount; 485 } 486 487 /** 488 * @param totalQueuesCount the totalQueuesCount to set 489 */ 490 public void setTotalQueuesCount(long totalQueuesCount) { 491 this.totalQueuesCount = totalQueuesCount; 492 } 493 494 /** 495 * @param activeQueuesCount the activeQueuesCount to set 496 */ 497 public void setActiveQueuesCount(long activeQueuesCount) { 498 this.activeQueuesCount = activeQueuesCount; 499 } 500 501 /** 502 * @param exhaustedQueuesCount the exhaustedQueuesCount to set 503 */ 504 public void setExhaustedQueuesCount(long exhaustedQueuesCount) { 505 this.exhaustedQueuesCount = exhaustedQueuesCount; 506 } 507 508 /** 509 * @param elapsedSeconds the elapsedSeconds to set 510 */ 511 public void setElapsedSeconds(long elapsedSeconds) { 512 this.elapsedSeconds = elapsedSeconds; 513 } 514 515 /** 516 * @param currentProcessedKBPerSec the currentProcessedKBPerSec to set 517 */ 518 public void setCurrentProcessedKBPerSec(long currentProcessedKBPerSec) { 519 this.currentProcessedKBPerSec = currentProcessedKBPerSec; 520 } 521 522 /** 523 * @param processedKBPerSec the processedKBPerSec to set 524 */ 525 public void setProcessedKBPerSec(long processedKBPerSec) { 526 this.processedKBPerSec = processedKBPerSec; 527 } 528 529 /** 530 * @param currentProcessedDocsPerSec the currentProcessedDocsPerSec to set 531 */ 532 public void setCurrentProcessedDocsPerSec(double currentProcessedDocsPerSec) { 533 this.currentProcessedDocsPerSec = currentProcessedDocsPerSec; 534 } 535 536 /** 537 * @param processedDocsPerSec the processedDocsPerSec to set 538 */ 539 public void setProcessedDocsPerSec(double processedDocsPerSec) { 540 this.processedDocsPerSec = processedDocsPerSec; 541 } 542 543 /** 544 * @param activeToeCount the activeToeCount to set 545 */ 546 public void setActiveToeCount(int activeToeCount) { 547 this.activeToeCount = activeToeCount; 548 } 549 550 /** 551 * @param alertsCount the alertsCount to set 552 */ 553 public void setAlertsCount(long alertsCount) { 554 this.alertsCount = alertsCount; 555 } 556 557 /** 558 * @param status the status to set 559 */ 560 public void setStatus(CrawlStatus status) { 561 this.status = status; 562 } 563 564 /** 565 * @param timestamp the timestamp to set 566 */ 567 public void setTimestamp(Date timestamp) { 568 this.timestamp = timestamp; 569 } 570 571 /** 572 * @param retiredQueuesCount the retiredQueuesCount to set 573 */ 574 public void setRetiredQueuesCount(long retiredQueuesCount) { 575 this.retiredQueuesCount = retiredQueuesCount; 576 } 577 578}