001/* 002 * #%L 003 * NetarchiveSuite - harvester - core 004 * %% 005 * Copyright (C) 2005 - 2018 The Royal Danish Library, the National Library of France and the Austrian National Library. 006 * %% 007 * This program is free software: you can redistribute it and/or modify 008 * it under the terms of the GNU Lesser General Public License as 009 * published by the Free Software Foundation, either version 2.1 of the 010 * License, or (at your option) any later version. 011 * 012 * This program is distributed in the hope that it will be useful, 013 * but WITHOUT ANY WARRANTY; without even the implied warranty of 014 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 015 * GNU General Lesser Public License for more details. 016 * 017 * You should have received a copy of the GNU General Lesser Public 018 * License along with this program. If not, see 019 * <http://www.gnu.org/licenses/lgpl-2.1.html>. 020 * #L% 021 */ 022package dk.netarkivet.harvester.harvesting.distribute; 023 024import java.io.Serializable; 025 026import org.slf4j.Logger; 027import org.slf4j.LoggerFactory; 028 029import dk.netarkivet.common.distribute.Channels; 030import dk.netarkivet.harvester.distribute.HarvesterMessage; 031import dk.netarkivet.harvester.distribute.HarvesterMessageVisitor; 032import dk.netarkivet.harvester.harvesting.monitor.HarvestMonitor; 033import dk.netarkivet.harvester.harvesting.report.Heritrix1Constants; 034 035/** 036 * This class wraps information stored in the Heritrix MBeans, CrawlService and CrawlService.Job, and represents the 037 * crawl progress. 038 * <p> 039 * Additionally this object extends {@link HarvesterMessage} so that it can be sent on the JMS bus to be processed by 040 * {@link HarvestMonitor}. 041 */ 042@SuppressWarnings({"serial"}) 043public class CrawlProgressMessage extends HarvesterMessage implements Serializable { 044 045 /** The logger for this class. */ 046 047 private static final Logger log = LoggerFactory.getLogger(CrawlProgressMessage.class); 048 049 /** 050 * The general status of a job in NAS. 051 */ 052 public static enum CrawlStatus { 053 /** 054 * Initial status of a job: Heritrix has not yet started crawling. 055 */ 056 PRE_CRAWL, 057 /** 058 * Heritrix is actively crawling. 059 */ 060 CRAWLER_ACTIVE, 061 /** 062 * Heritrix is active but is crawling nothing. 063 */ 064 CRAWLER_EMPTY, 065 /** 066 * Heritrix is crawling but is currently pausing. 067 */ 068 CRAWLER_PAUSING, 069 /** 070 * Heritrix is crawling but has been paused by the user. 071 */ 072 CRAWLER_PAUSED, 073 /** 074 * Heritrix has finished crawling, post processing of metadata and ARC files remains to be done. 075 */ 076 CRAWLING_FINISHED 077 } 078 079 /** 080 * Wraps CrawlService MBean attributes. 081 */ 082 public class CrawlServiceInfo implements Serializable { 083 084 /** The number of alerts raised by Heritrix. */ 085 private int alertCount; 086 087 /** Flag is set to true when Heritrix is crawling or paused. */ 088 private boolean isCrawling; 089 090 /** Contains the UID of the current job. */ 091 private String currentJob; 092 093 public int getAlertCount() { 094 return alertCount; 095 } 096 097 public void setAlertCount(int alertCount) { 098 this.alertCount = alertCount; 099 } 100 101 public boolean isCrawling() { 102 return isCrawling; 103 } 104 105 public void setCrawling(boolean isCrawling) { 106 this.isCrawling = isCrawling; 107 } 108 109 public String getCurrentJob() { 110 return currentJob; 111 } 112 113 public void setCurrentJob(String currentJob) { 114 this.currentJob = currentJob; 115 } 116 } 117 118 /** 119 * Wraps CrawlService.Job MBean attributes. 120 */ 121 public class CrawlServiceJobInfo implements Serializable { 122 123 /** The number of URIs currently discovered. */ 124 private long discoveredFilesCount; 125 126 /** The number of URIs currently harvested. */ 127 private long downloadedFilesCount; 128 129 /** A summary of the frontier queues. */ 130 private String frontierShortReport; 131 132 /** The time in seconds elapsed since the crawl began. */ 133 private long elapsedSeconds; 134 135 /** The current download rate in KB/sec. */ 136 private long currentProcessedKBPerSec; 137 138 /** The average download rate in KB/sec. */ 139 private long processedKBPerSec; 140 141 /** The current download rate in URI/sec. */ 142 private double currentProcessedDocsPerSec; 143 144 /** The average download rate in URI/sec. */ 145 private double processedDocsPerSec; 146 147 /** The number of active toe threads for this job. */ 148 private int activeToeCount; 149 150 /** A textual summary of the crawler activity. */ 151 private String progressStatistics; 152 153 /** The job status. */ 154 private String status; 155 156 public long getDiscoveredFilesCount() { 157 return discoveredFilesCount; 158 } 159 160 public void setDiscoveredFilesCount(long discoveredFilesCount) { 161 this.discoveredFilesCount = discoveredFilesCount; 162 } 163 164 public long getDownloadedFilesCount() { 165 return downloadedFilesCount; 166 } 167 168 public void setDownloadedFilesCount(long downloadedFilesCount) { 169 this.downloadedFilesCount = downloadedFilesCount; 170 } 171 172 public String getFrontierShortReport() { 173 return frontierShortReport; 174 } 175 176 public void setFrontierShortReport(String frontierShortReport) { 177 this.frontierShortReport = frontierShortReport; 178 } 179 180 public long getElapsedSeconds() { 181 return elapsedSeconds; 182 } 183 184 public void setElapsedSeconds(long elapsedSeconds) { 185 this.elapsedSeconds = elapsedSeconds; 186 } 187 188 public long getCurrentProcessedKBPerSec() { 189 return currentProcessedKBPerSec; 190 } 191 192 public void setCurrentProcessedKBPerSec(long currentProcessedKBPerSec) { 193 this.currentProcessedKBPerSec = currentProcessedKBPerSec; 194 } 195 196 public long getProcessedKBPerSec() { 197 return processedKBPerSec; 198 } 199 200 public void setProcessedKBPerSec(long processedKBPerSec) { 201 this.processedKBPerSec = processedKBPerSec; 202 } 203 204 public double getCurrentProcessedDocsPerSec() { 205 return currentProcessedDocsPerSec; 206 } 207 208 public void setCurrentProcessedDocsPerSec(double currentProcessedDocsPerSec) { 209 this.currentProcessedDocsPerSec = currentProcessedDocsPerSec; 210 } 211 212 public double getProcessedDocsPerSec() { 213 return processedDocsPerSec; 214 } 215 216 public void setProcessedDocsPerSec(double processedDocsPerSec) { 217 this.processedDocsPerSec = processedDocsPerSec; 218 } 219 220 public int getActiveToeCount() { 221 return activeToeCount; 222 } 223 224 public void setActiveToeCount(int activeToeCount) { 225 this.activeToeCount = activeToeCount; 226 } 227 228 public String getProgressStatistics() { 229 return progressStatistics; 230 } 231 232 public void setProgressStatistics(String progressStatistics) { 233 this.progressStatistics = progressStatistics; 234 } 235 236 /** 237 * Helper method that approximates the number of queued URIs. 238 * 239 * @return the number of queued URIs 240 */ 241 public long getQueuedUriCount() { 242 return discoveredFilesCount - downloadedFilesCount; 243 } 244 245 public String getStatus() { 246 return status; 247 } 248 249 public void setStatus(String status) { 250 this.status = status; 251 } 252 253 } 254 255 /** The unique identifier of the job. */ 256 private final long jobID; 257 258 /** The unique identifier of the associated harvest definition. */ 259 private long harvestID; 260 261 /** The URL to the host Heritrix admin UI. */ 262 private String hostUrl; 263 264 /** The job's status. */ 265 private CrawlStatus status; 266 267 /** A legend, fetched only once, for the {@link CrawlServiceJobInfo#progressStatistics} property. */ 268 private final String progressStatisticsLegend; 269 270 /** The information provided by the CrawlService MBean. */ 271 private CrawlServiceInfo heritrixStatus = new CrawlServiceInfo(); 272 273 /** The information provided by the CrawlService.Job MBean. */ 274 private CrawlServiceJobInfo jobStatus = new CrawlServiceJobInfo(); 275 276 /** 277 * Builds an empty message. MBean wrapper values are not set and the appropriate getters should be used to do so. 278 * 279 * @param harvestID the harvest definition ID 280 * @param jobId the job ID 281 * @param progressStatisticsLegend the legend of the progress statistics summary string 282 * @see CrawlProgressMessage#progressStatisticsLegend 283 */ 284 public CrawlProgressMessage(long harvestID, long jobId, String progressStatisticsLegend) { 285 super(HarvestMonitor.HARVEST_MONITOR_CHANNEL_ID, Channels.getError()); 286 this.harvestID = harvestID; 287 this.jobID = jobId; 288 this.status = CrawlStatus.PRE_CRAWL; 289 this.progressStatisticsLegend = progressStatisticsLegend; 290 } 291 292 /** 293 * Builds an empty message. MBean wrapper values are not set and the appropriate getters should be used to do so. 294 * The progressStatisticsLegend is set to the empty string. 295 * 296 * @param harvestID the harvest definition ID 297 * @param jobId the job ID 298 */ 299 public CrawlProgressMessage(long harvestID, long jobId) { 300 this(harvestID, jobId, ""); 301 } 302 303 public long getHarvestID() { 304 return harvestID; 305 } 306 307 public String getHostUrl() { 308 return hostUrl; 309 } 310 311 public void setHostUrl(String hostUrl) { 312 this.hostUrl = hostUrl; 313 } 314 315 public CrawlStatus getStatus() { 316 return status; 317 } 318 319 public void setStatus(CrawlStatus status) { 320 this.status = status; 321 } 322 323 public long getJobID() { 324 return jobID; 325 } 326 327 public String getProgressStatisticsLegend() { 328 return progressStatisticsLegend; 329 } 330 331 public CrawlServiceInfo getHeritrixStatus() { 332 return heritrixStatus; 333 } 334 335 public CrawlServiceJobInfo getJobStatus() { 336 return jobStatus; 337 } 338 339 @Override 340 public void accept(HarvesterMessageVisitor v) { 341 v.visit(this); 342 } 343 344 /** 345 * Returns true if the crawler has been paused, and thus not supposed to fetch anything. Heritrix may still be 346 * fetching stuff, as it takes some time for it to go into full pause mode. This method can be used as an indicator 347 * that we should not be worried if Heritrix appears to be idle. 348 * 349 * @return True if the crawler has been paused, e.g. by using the Heritrix GUI. 350 */ 351 public boolean isPaused() { 352 return CrawlStatus.CRAWLER_PAUSED.equals(status); 353 } 354 355 /** 356 * Checks whether Heritrix has finished crawling the job. 357 * 358 * @return true if Heritrix has finished crawling the job, false otherwise. 359 */ 360 public boolean crawlIsFinished() { 361 // Evidently heritrixStatus.currentJob is set to "", if no job is crawling 362 boolean jobInProgress = heritrixStatus.isCrawling() && !heritrixStatus.getCurrentJob().isEmpty(); 363 364 if (!jobInProgress) { 365 // FIXME does this work for H3 as well (If not modify the above logic) 366 log.info("Job {} seems to be no longer in progress. ", jobID); 367 return true; 368 } 369 370 String statusAsString = getJobStatus().getStatus(); 371 372 if (statusAsString != null) { 373 // FIXME probably only works for H1 equals to the String "FINISHED" 374 log.info("StatusAsString = '{}'", statusAsString); 375 return statusAsString.equals(Heritrix1Constants.CRAWLCONTROLLER_FINISHED); 376 } 377 // statusAsString is null 378 log.info("statusAsString is null for job {}. Considering the crawl to be not finished", jobID); 379 380 return false; 381 } 382 383}