001/* 002 * #%L 003 * Netarchivesuite - harvester 004 * %% 005 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library, 006 * the National Library of France and the Austrian National Library. 007 * %% 008 * This program is free software: you can redistribute it and/or modify 009 * it under the terms of the GNU Lesser General Public License as 010 * published by the Free Software Foundation, either version 2.1 of the 011 * License, or (at your option) any later version. 012 * 013 * This program is distributed in the hope that it will be useful, 014 * but WITHOUT ANY WARRANTY; without even the implied warranty of 015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 016 * GNU General Lesser Public License for more details. 017 * 018 * You should have received a copy of the GNU General Lesser Public 019 * License along with this program. If not, see 020 * <http://www.gnu.org/licenses/lgpl-2.1.html>. 021 * #L% 022 */ 023package dk.netarkivet.harvester.harvesting.distribute; 024 025import java.io.Serializable; 026 027import org.slf4j.Logger; 028import org.slf4j.LoggerFactory; 029 030import dk.netarkivet.common.distribute.Channels; 031import dk.netarkivet.harvester.distribute.HarvesterMessage; 032import dk.netarkivet.harvester.distribute.HarvesterMessageVisitor; 033import dk.netarkivet.harvester.harvesting.monitor.HarvestMonitor; 034import dk.netarkivet.harvester.harvesting.report.Heritrix1Constants; 035 036/** 037 * This class wraps information stored in the Heritrix MBeans, CrawlService and CrawlService.Job, and represents the 038 * crawl progress. 039 * <p> 040 * Additionally this object extends {@link HarvesterMessage} so that it can be sent on the JMS bus to be processed by 041 * {@link HarvestMonitor}. 042 */ 043@SuppressWarnings({"serial"}) 044public class CrawlProgressMessage extends HarvesterMessage implements Serializable { 045 046 /** The logger for this class. */ 047 048 private static final Logger log = LoggerFactory.getLogger(CrawlProgressMessage.class); 049 050 /** 051 * The general status of a job in NAS. 052 */ 053 public static enum CrawlStatus { 054 /** 055 * Initial status of a job: Heritrix has not yet started crawling. 056 */ 057 PRE_CRAWL, 058 /** 059 * Heritrix is actively crawling. 060 */ 061 CRAWLER_ACTIVE, 062 /** 063 * Heritrix is crawling but is currently pausing. 064 */ 065 CRAWLER_PAUSING, 066 /** 067 * Heritrix is crawling but has been paused by the user. 068 */ 069 CRAWLER_PAUSED, 070 /** 071 * Heritrix has finished crawling, post processing of metadata and ARC files remains to be done. 072 */ 073 CRAWLING_FINISHED 074 } 075 076 /** 077 * Wraps CrawlService MBean attributes. 078 */ 079 public class CrawlServiceInfo implements Serializable { 080 081 /** The number of alerts raised by Heritrix. */ 082 private int alertCount; 083 084 /** Flag is set to true when Heritrix is crawling or paused. */ 085 private boolean isCrawling; 086 087 /** Contains the UID of the current job. */ 088 private String currentJob; 089 090 public int getAlertCount() { 091 return alertCount; 092 } 093 094 public void setAlertCount(int alertCount) { 095 this.alertCount = alertCount; 096 } 097 098 public boolean isCrawling() { 099 return isCrawling; 100 } 101 102 public void setCrawling(boolean isCrawling) { 103 this.isCrawling = isCrawling; 104 } 105 106 public String getCurrentJob() { 107 return currentJob; 108 } 109 110 public void setCurrentJob(String currentJob) { 111 this.currentJob = currentJob; 112 } 113 } 114 115 /** 116 * Wraps CrawlService.Job MBean attributes. 117 */ 118 public class CrawlServiceJobInfo implements Serializable { 119 120 /** The number of URIs currently discovered. */ 121 private long discoveredFilesCount; 122 123 /** The number of URIs currently harvested. */ 124 private long downloadedFilesCount; 125 126 /** A summary of the frontier queues. */ 127 private String frontierShortReport; 128 129 /** The time in seconds elapsed since the crawl began. */ 130 private long elapsedSeconds; 131 132 /** The current download rate in KB/sec. */ 133 private long currentProcessedKBPerSec; 134 135 /** The average download rate in KB/sec. */ 136 private long processedKBPerSec; 137 138 /** The current download rate in URI/sec. */ 139 private double currentProcessedDocsPerSec; 140 141 /** The average download rate in URI/sec. */ 142 private double processedDocsPerSec; 143 144 /** The number of active toe threads for this job. */ 145 private int activeToeCount; 146 147 /** A textual summary of the crawler activity. */ 148 private String progressStatistics; 149 150 /** The job status. */ 151 private String status; 152 153 public long getDiscoveredFilesCount() { 154 return discoveredFilesCount; 155 } 156 157 public void setDiscoveredFilesCount(long discoveredFilesCount) { 158 this.discoveredFilesCount = discoveredFilesCount; 159 } 160 161 public long getDownloadedFilesCount() { 162 return downloadedFilesCount; 163 } 164 165 public void setDownloadedFilesCount(long downloadedFilesCount) { 166 this.downloadedFilesCount = downloadedFilesCount; 167 } 168 169 public String getFrontierShortReport() { 170 return frontierShortReport; 171 } 172 173 public void setFrontierShortReport(String frontierShortReport) { 174 this.frontierShortReport = frontierShortReport; 175 } 176 177 public long getElapsedSeconds() { 178 return elapsedSeconds; 179 } 180 181 public void setElapsedSeconds(long elapsedSeconds) { 182 this.elapsedSeconds = elapsedSeconds; 183 } 184 185 public long getCurrentProcessedKBPerSec() { 186 return currentProcessedKBPerSec; 187 } 188 189 public void setCurrentProcessedKBPerSec(long currentProcessedKBPerSec) { 190 this.currentProcessedKBPerSec = currentProcessedKBPerSec; 191 } 192 193 public long getProcessedKBPerSec() { 194 return processedKBPerSec; 195 } 196 197 public void setProcessedKBPerSec(long processedKBPerSec) { 198 this.processedKBPerSec = processedKBPerSec; 199 } 200 201 public double getCurrentProcessedDocsPerSec() { 202 return currentProcessedDocsPerSec; 203 } 204 205 public void setCurrentProcessedDocsPerSec(double currentProcessedDocsPerSec) { 206 this.currentProcessedDocsPerSec = currentProcessedDocsPerSec; 207 } 208 209 public double getProcessedDocsPerSec() { 210 return processedDocsPerSec; 211 } 212 213 public void setProcessedDocsPerSec(double processedDocsPerSec) { 214 this.processedDocsPerSec = processedDocsPerSec; 215 } 216 217 public int getActiveToeCount() { 218 return activeToeCount; 219 } 220 221 public void setActiveToeCount(int activeToeCount) { 222 this.activeToeCount = activeToeCount; 223 } 224 225 public String getProgressStatistics() { 226 return progressStatistics; 227 } 228 229 public void setProgressStatistics(String progressStatistics) { 230 this.progressStatistics = progressStatistics; 231 } 232 233 /** 234 * Helper method that approximates the number of queued URIs. 235 * 236 * @return the number of queued URIs 237 */ 238 public long getQueuedUriCount() { 239 return discoveredFilesCount - downloadedFilesCount; 240 } 241 242 public String getStatus() { 243 return status; 244 } 245 246 public void setStatus(String status) { 247 this.status = status; 248 } 249 250 } 251 252 /** The unique identifier of the job. */ 253 private final long jobID; 254 255 /** The unique identifier of the associated harvest definition. */ 256 private long harvestID; 257 258 /** The URL to the host Heritrix admin UI. */ 259 private String hostUrl; 260 261 /** The job's status. */ 262 private CrawlStatus status; 263 264 /** A legend, fetched only once, for the {@link CrawlServiceJobInfo#progressStatistics} property. */ 265 private final String progressStatisticsLegend; 266 267 /** The information provided by the CrawlService MBean. */ 268 private CrawlServiceInfo heritrixStatus = new CrawlServiceInfo(); 269 270 /** The information provided by the CrawlService.Job MBean. */ 271 private CrawlServiceJobInfo jobStatus = new CrawlServiceJobInfo(); 272 273 /** 274 * Builds an empty message. MBean wrapper values are not set and the appropriate getters should be used to do so. 275 * 276 * @param harvestID the harvest definition ID 277 * @param jobId the job ID 278 * @param progressStatisticsLegend the legend of the progress statistics summary string 279 * @see CrawlProgressMessage#progressStatisticsLegend 280 */ 281 public CrawlProgressMessage(long harvestID, long jobId, String progressStatisticsLegend) { 282 super(HarvestMonitor.HARVEST_MONITOR_CHANNEL_ID, Channels.getError()); 283 this.harvestID = harvestID; 284 this.jobID = jobId; 285 this.status = CrawlStatus.PRE_CRAWL; 286 this.progressStatisticsLegend = progressStatisticsLegend; 287 } 288 289 /** 290 * Builds an empty message. MBean wrapper values are not set and the appropriate getters should be used to do so. 291 * The progressStatisticsLegend is set to the empty string. 292 * 293 * @param harvestID the harvest definition ID 294 * @param jobId the job ID 295 */ 296 public CrawlProgressMessage(long harvestID, long jobId) { 297 this(harvestID, jobId, ""); 298 } 299 300 public long getHarvestID() { 301 return harvestID; 302 } 303 304 public String getHostUrl() { 305 return hostUrl; 306 } 307 308 public void setHostUrl(String hostUrl) { 309 this.hostUrl = hostUrl; 310 } 311 312 public CrawlStatus getStatus() { 313 return status; 314 } 315 316 public void setStatus(CrawlStatus status) { 317 this.status = status; 318 } 319 320 public long getJobID() { 321 return jobID; 322 } 323 324 public String getProgressStatisticsLegend() { 325 return progressStatisticsLegend; 326 } 327 328 public CrawlServiceInfo getHeritrixStatus() { 329 return heritrixStatus; 330 } 331 332 public CrawlServiceJobInfo getJobStatus() { 333 return jobStatus; 334 } 335 336 @Override 337 public void accept(HarvesterMessageVisitor v) { 338 v.visit(this); 339 } 340 341 /** 342 * Returns true if the crawler has been paused, and thus not supposed to fetch anything. Heritrix may still be 343 * fetching stuff, as it takes some time for it to go into full pause mode. This method can be used as an indicator 344 * that we should not be worried if Heritrix appears to be idle. 345 * 346 * @return True if the crawler has been paused, e.g. by using the Heritrix GUI. 347 */ 348 public boolean isPaused() { 349 return CrawlStatus.CRAWLER_PAUSED.equals(status); 350 } 351 352 /** 353 * Checks whether Heritrix has finished crawling the job. 354 * 355 * @return true if Heritrix has finished crawling the job, false otherwise. 356 */ 357 public boolean crawlIsFinished() { 358 // Evidently heritrixStatus.currentJob is set to "", if no job is crawling 359 boolean jobInProgress = heritrixStatus.isCrawling() && !heritrixStatus.getCurrentJob().isEmpty(); 360 361 if (!jobInProgress) { 362 // FIXME does this work for H3 as well (If not modify the above logic) 363 log.info("Job {} seems to be no longer in progress. ", jobID); 364 return true; 365 } 366 367 String statusAsString = getJobStatus().getStatus(); 368 369 if (statusAsString != null) { 370 // FIXME probably only works for H1 equals to the String "FINISHED" 371 log.info("StatusAsString = '{}'", statusAsString); 372 return statusAsString.equals(Heritrix1Constants.CRAWLCONTROLLER_FINISHED); 373 } 374 // statusAsString is null 375 log.info("statusAsString is null for job {}. Considering the crawl to be not finished", jobID); 376 377 return false; 378 } 379 380}