001/* 002 * #%L 003 * Netarchivesuite - harvester 004 * %% 005 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library, 006 * the National Library of France and the Austrian National Library. 007 * %% 008 * This program is free software: you can redistribute it and/or modify 009 * it under the terms of the GNU Lesser General Public License as 010 * published by the Free Software Foundation, either version 2.1 of the 011 * License, or (at your option) any later version. 012 * 013 * This program is distributed in the hope that it will be useful, 014 * but WITHOUT ANY WARRANTY; without even the implied warranty of 015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 016 * GNU General Lesser Public License for more details. 017 * 018 * You should have received a copy of the GNU General Lesser Public 019 * License along with this program. If not, see 020 * <http://www.gnu.org/licenses/lgpl-2.1.html>. 021 * #L% 022 */ 023 024package dk.netarkivet.harvester.harvesting.controller; 025 026import java.util.Collection; 027 028import javax.management.openmbean.CompositeData; 029import javax.management.openmbean.TabularData; 030import javax.management.remote.JMXConnector; 031 032import org.archive.util.JmxUtils; 033import org.slf4j.Logger; 034import org.slf4j.LoggerFactory; 035 036import dk.netarkivet.common.exceptions.IOFailure; 037import dk.netarkivet.common.exceptions.IllegalState; 038import dk.netarkivet.common.utils.JMXUtils; 039import dk.netarkivet.common.utils.Settings; 040import dk.netarkivet.common.utils.SystemUtils; 041import dk.netarkivet.common.utils.TimeUtils; 042import dk.netarkivet.harvester.HarvesterSettings; 043import dk.netarkivet.harvester.harvesting.HeritrixFiles; 044 045/** 046 * This implementation of the HeritrixController interface starts Heritrix as a separate process and uses JMX to 047 * communicate with it. Each instance executes exactly one process that runs exactly one crawl job. 048 * 049 * @deprecated Use the {@link BnfHeritrixController} instead 050 */ 051@SuppressWarnings({"unused", "unchecked"}) 052public class JMXHeritrixController extends AbstractJMXHeritrixController { 053 054 /** The logger for this class. */ 055 private static final Logger log = LoggerFactory.getLogger(JMXHeritrixController.class); 056 057 /* 058 * The below commands and attributes are copied from org.archive.crawler.admin.CrawlJob. 059 * 060 * @see <A href="http://crawler.archive.org/xref/org/archive/crawler/admin/CrawlJob.html"> 061 * org.archive.crawler.admin.CrawlJob</A> 062 * 063 * These strings are currently not visible from outside the Heritrix class. See 064 * http://webteam.archive.org/jira/browse/HER-1285 065 */ 066 /** The command to submit a new crawljob to the Crawlcontroller. */ 067 private static final String ADD_JOB_COMMAND = "addJob"; 068 /** The command to retrieve progress statistics for the currently running job. */ 069 private static final String PROGRESS_STATISTICS_COMMAND = "progressStatistics"; 070 /** 071 * The command to retrieve a progress statistics legend for the currently running job. 072 */ 073 private static final String PROGRESS_STATISTICS_LEGEND_COMMAND = "progressStatisticsLegend"; 074 /** 075 * The attribute for the current download rate in kbytes for the currently running job. 076 */ 077 private static final String CURRENT_KB_RATE_ATTRIBUTE = "CurrentKbRate"; 078 /** The attribute for the number of currently running process-threads. */ 079 private static final String THREAD_COUNT_ATTRIBUTE = "ThreadCount"; 080 /** 081 * The attribute for the number of discovered URIs for the currently running job. 082 */ 083 private static final String DISCOVERED_COUNT_ATTRIBUTE = "DiscoveredCount"; 084 /** 085 * The attribute for the number of downloaded URIs for the currently running job. 086 */ 087 private static final String DOWNLOADED_COUNT_ATTRIBUTE = "DownloadedCount"; 088 /** The attribute for the status for the currently running job. */ 089 private static final String STATUS_ATTRIBUTE = "Status"; 090 091 /* 092 * The below commands and attributes are copied from org.archive.crawler.Heritrix 093 * 094 * @see <A href="http://crawler.archive.org/apidocs/org/archive/crawler/Heritrix.html"> 095 * org.archive.crawler.Heritrix</A> 096 * 097 * These strings are currently not visible from outside the Heritrix class. See 098 * http://webteam.archive.org/jira/browse/HER-1285 099 */ 100 /* 101 * Note: The Heritrix JMX interface has two apparent ways to stop crawling: stopCrawling and terminateCurrentJob. 102 * stopCrawling merely makes Heritrix not start any more jobs, but the old jobs continue. Note that if we start 103 * using more than one job at a time, terminateCurrentJob will only stop one job. 104 */ 105 /** Command to start crawling. */ 106 private static final String START_CRAWLING_COMMAND = "startCrawling"; 107 /** Make the currently active (selected?) job stop. */ 108 private static final String TERMINATE_CURRENT_JOB_COMMAND = "terminateCurrentJob"; 109 /** Command for returning list of pending jobs. */ 110 private static final String PENDING_JOBS_COMMAND = "pendingJobs"; 111 /** Command for returning list of completed jobs. */ 112 private static final String COMPLETED_JOBS_COMMAND = "completedJobs"; 113 /** Command for shutting down Heritrix. */ 114 private static final String SHUTDOWN_COMMAND = "shutdown"; 115 116 /** 117 * The part of the Job MBean name that designates the unique id. For some reason, this is not included in the normal 118 * Heritrix definitions in JmxUtils, otherwise we wouldn't have to define it. I have committed a feature request: 119 * http://webteam.archive.org/jira/browse/HER-1618 120 */ 121 private static final String UID_PROPERTY = "uid"; 122 123 /** 124 * The name that Heritrix gives to the job we ask it to create. This is part of the name of the MBean for that job, 125 * but we can only retrieve the name after the MBean has been created. 126 */ 127 private String jobName; 128 129 /** The header line (legend) for the statistics report. */ 130 private String progressStatisticsLegend; 131 132 /* 133 * The possible values of a request of the status attribute. Copied from private values in {@link 134 * org.archive.crawler.framework.CrawlController} 135 * 136 * These strings are currently not visible from outside the CrawlController class. See 137 * http://webteam.archive.org/jira/browse/HER-1285 138 */ 139 /** The 'NASCENT' status. */ 140 // private static final String NASCENT_STATUS = "NASCENT"; 141 /** The 'RUNNING' status. */ 142 // private static final String RUNNING_STATUS = "RUNNING"; 143 /** The 'PAUSED' status. */ 144 private static final String PAUSED_STATUS = "PAUSED"; 145 /** The 'PAUSING' status. */ 146 private static final String PAUSING_STATUS = "PAUSING"; 147 /** The 'CHECKPOINTING' status. */ 148 // private static final String CHECKPOINTING_STATUS = "CHECKPOINTING"; 149 /** The 'STOPPING' status. */ 150 // private static final String STOPPING_STATUS = "STOPPING"; 151 /** The 'FINISHED' status. */ 152 private static final String FINISHED_STATUS = "FINISHED"; 153 /** The 'STARTED status. */ 154 // private static final String STARTED_STATUS = "STARTED"; 155 /** The 'PREPARING' status. */ 156 // private static final String PREPARING_STATUS = "PREPARING"; 157 /** The 'Illegal State' status. */ 158 private static final String ILLEGAL_STATUS = "Illegal State"; 159 160 /** 161 * Create a JMXHeritrixController object. 162 * 163 * @param files Files that are used to set up Heritrix. 164 */ 165 public JMXHeritrixController(HeritrixFiles files) { 166 super(files); 167 } 168 169 /** 170 * @throws IOFailure If Heritrix dies before initialization, or we encounter any problems during the initialization. 171 * @see HeritrixController#initialize() 172 */ 173 public void initialize() { 174 if (processHasExited()) { 175 String errMsg = "Heritrix process of " + this + " died before initialization"; 176 log.warn(errMsg); 177 throw new IOFailure(errMsg); 178 } 179 // We want to be sure there are no jobs when starting, in case we got 180 // an old Heritrix or somebody added jobs behind our back. 181 TabularData doneJobs = (TabularData) executeHeritrixCommand(COMPLETED_JOBS_COMMAND); 182 TabularData pendingJobs = (TabularData) executeHeritrixCommand(PENDING_JOBS_COMMAND); 183 if (doneJobs != null && doneJobs.size() > 0 || pendingJobs != null && pendingJobs.size() > 0) { 184 throw new IllegalState("This Heritrix instance is in a illegalState! " 185 + "This instance has either old done jobs (" + doneJobs + "), or old pending jobs (" + pendingJobs 186 + ")."); 187 } 188 // From here on, we can assume there's only the one job we make. 189 // We'll use the arc file prefix to name the job, since the prefix 190 // already contains the harvest id and job id. 191 HeritrixFiles files = getHeritrixFiles(); 192 executeHeritrixCommand(ADD_JOB_COMMAND, files.getOrderXmlFile().getAbsolutePath(), 193 files.getArchiveFilePrefix(), getJobDescription(), files.getSeedsTxtFile().getAbsolutePath()); 194 jobName = getJobName(); 195 initializeProgressStatisticsLegend(); 196 } 197 198 /** 199 * @throws IOFailure if unable to communicate with Heritrix 200 * @see HeritrixController#requestCrawlStart() 201 */ 202 public void requestCrawlStart() { 203 executeHeritrixCommand(START_CRAWLING_COMMAND); 204 } 205 206 /** @see HeritrixController#atFinish() */ 207 public boolean atFinish() { 208 return crawlIsEnded(); 209 } 210 211 /** 212 * @throws IOFailure if unable to communicate with Heritrix 213 * @see HeritrixController#beginCrawlStop() 214 */ 215 public void beginCrawlStop() { 216 executeHeritrixCommand(TERMINATE_CURRENT_JOB_COMMAND); 217 } 218 219 /** @see HeritrixController#getActiveToeCount() */ 220 public int getActiveToeCount() { 221 Integer activeToeCount = (Integer) getCrawlJobAttribute(THREAD_COUNT_ATTRIBUTE); 222 if (activeToeCount == null) { 223 return 0; 224 } 225 return activeToeCount; 226 } 227 228 /** @see HeritrixController#requestCrawlStop(String) */ 229 public void requestCrawlStop(String reason) { 230 if (!atFinish()) { 231 beginCrawlStop(); 232 } 233 } 234 235 /** 236 * @see HeritrixController#getQueuedUriCount() 237 */ 238 public long getQueuedUriCount() { 239 /* 240 * Implementation note: This count is not as precise as what StatisticsTracker could provide, but it's presently 241 * only used in a warning in the HeritrixLauncher.doCrawlLoop() method. 242 */ 243 Long discoveredUris = (Long) getCrawlJobAttribute(DISCOVERED_COUNT_ATTRIBUTE); 244 Long downloadedUris = (Long) getCrawlJobAttribute(DOWNLOADED_COUNT_ATTRIBUTE); 245 if (discoveredUris == null) { 246 return 0; 247 } 248 if (downloadedUris == null) { 249 return discoveredUris; 250 } 251 return discoveredUris - downloadedUris; 252 } 253 254 /** @see HeritrixController#getCurrentProcessedKBPerSec() */ 255 public int getCurrentProcessedKBPerSec() { 256 Long currentDownloadRate = (Long) getCrawlJobAttribute(CURRENT_KB_RATE_ATTRIBUTE); 257 if (currentDownloadRate == null) { 258 return 0; 259 } 260 return currentDownloadRate.intValue(); 261 } 262 263 /** @see HeritrixController#getProgressStats() */ 264 public String getProgressStats() { 265 String status = (String) getCrawlJobAttribute(STATUS_ATTRIBUTE); 266 267 if (status == null) { 268 status = "NO STATUS"; 269 } 270 271 String progressStatistics = (String) executeCrawlJobCommand(PROGRESS_STATISTICS_COMMAND); 272 273 if (progressStatistics == null) { 274 progressStatistics = "No progress statistics available"; 275 } else { 276 // Since progressStatisticsLegend acts as a latch, we can check 277 // for non-null even though it gets assigned asynchronously. 278 if (progressStatisticsLegend != null) { 279 progressStatistics = progressStatisticsLegend + '\n' + progressStatistics; 280 } 281 } 282 return status + " " + progressStatistics; 283 } 284 285 /** Store the statistics legend line (asynchronously). */ 286 private void initializeProgressStatisticsLegend() { 287 new Thread() { 288 public void run() { 289 progressStatisticsLegend = (String) executeCrawlJobCommand(PROGRESS_STATISTICS_LEGEND_COMMAND); 290 } 291 }.start(); 292 } 293 294 /** @see HeritrixController#isPaused() */ 295 public boolean isPaused() { 296 String status = (String) getCrawlJobAttribute(STATUS_ATTRIBUTE); 297 log.debug("Heritrix state: '{}'", status); 298 // Either Pausing or Paused in case of not null 299 return status != null && (status.equals(PAUSED_STATUS) || status.equals(PAUSING_STATUS)); 300 } 301 302 /** 303 * Check if the crawl has ended, either because Heritrix finished of its own, or because we terminated it. 304 * 305 * @return True if the crawl has ended, either because Heritrix finished or because we terminated it. Otherwise we 306 * return false. 307 * @see HeritrixController#crawlIsEnded() 308 */ 309 public synchronized boolean crawlIsEnded() { 310 // End of crawl can be seen in one of three ways: 311 // 1) The Heritrix process has exited. 312 // 2) The job has been moved to the completed jobs list in Heritrix. 313 // 3) The job is in one of the FINISHED states. 314 if (processHasExited()) { 315 return true; 316 } 317 TabularData jobs = (TabularData) executeHeritrixCommand(COMPLETED_JOBS_COMMAND); 318 if (jobs != null && jobs.size() > 0) { 319 for (CompositeData value : (Collection<CompositeData>) jobs.values()) { 320 String thisJobID = value.get(JmxUtils.NAME) + "-" + value.get(UID_PROPERTY); 321 if (thisJobID.equals(jobName)) { 322 return true; 323 } 324 } 325 } 326 String status = (String) getCrawlJobAttribute(STATUS_ATTRIBUTE); 327 return status == null || status.equals(FINISHED_STATUS) || status.equals(ILLEGAL_STATUS); 328 } 329 330 /** 331 * Cleanup after an Heritrix process. This entails sending the shutdown command to the Heritrix process, and killing 332 * it forcefully, if it is still alive after waiting the period of time specified by the 333 * CommonSettings.PROCESS_TIMEOUT setting. 334 * 335 * @see HeritrixController#cleanup() 336 */ 337 public void cleanup() { 338 try { 339 executeHeritrixCommand(SHUTDOWN_COMMAND); 340 } catch (IOFailure e) { 341 log.error("JMX error while cleaning up Heritrix controller", e); 342 } 343 344 waitForHeritrixProcessExit(); 345 } 346 347 /** 348 * Return the URL for monitoring this instance. 349 * 350 * @return the URL for monitoring this instance. 351 */ 352 public String getHarvestInformation() { 353 return "http://" + SystemUtils.getLocalHostName() + ":" + getGUIPort(); 354 } 355 356 /** 357 * Get the name of the one job we let this Heritrix run. The handling of done jobs depends on Heritrix not being in 358 * crawl. This call may take several seconds to finish. 359 * 360 * @return The name of the one job that Heritrix has. 361 * @throws IOFailure if the job created failed to initialize or didn't appear in time. 362 * @throws IllegalState if more than one job in done list, or more than one pending job 363 */ 364 private String getJobName() { 365 /* 366 * This is called just after we've told Heritrix to create a job. It may take a while before the job is actually 367 * created, so we have to wait around a bit. 368 */ 369 TabularData pendingJobs = null; 370 TabularData doneJobs; 371 int retries = 0; 372 while (retries++ < JMXUtils.getMaxTries()) { 373 // If the job turns up in Heritrix' pending jobs list, it's ready 374 pendingJobs = (TabularData) executeHeritrixCommand(PENDING_JOBS_COMMAND); 375 if (pendingJobs != null && pendingJobs.size() > 0) { 376 break; // It's ready, we can move on. 377 } 378 379 // If there's an error in the job configuration, the job will be put 380 // in Heritrix' completed jobs list. 381 doneJobs = (TabularData) executeHeritrixCommand(COMPLETED_JOBS_COMMAND); 382 if (doneJobs != null && doneJobs.size() >= 1) { 383 // Since we haven't allowed Heritrix to start any crawls yet, 384 // the only way the job could have ended and then put into 385 // the list of completed jobs is by error. 386 if (doneJobs.size() > 1) { 387 throw new IllegalState("More than one job in done list: " + doneJobs); 388 } else { 389 CompositeData job = JMXUtils.getOneCompositeData(doneJobs); 390 throw new IOFailure("Job " + job + " failed: " + job.get(STATUS_ATTRIBUTE)); 391 } 392 } 393 if (retries < JMXUtils.getMaxTries()) { 394 TimeUtils.exponentialBackoffSleep(retries); 395 } 396 } 397 // If all went well, we now have exactly one job in the pending 398 // jobs list. 399 if (pendingJobs == null || pendingJobs.size() == 0) { 400 throw new IOFailure("Heritrix has not created a job after " + (Math.pow(2, JMXUtils.getMaxTries()) / 1000) 401 + " seconds, giving up."); 402 } else if (pendingJobs.size() > 1) { 403 throw new IllegalState("More than one pending job: " + pendingJobs); 404 } else { 405 // Note that we may actually get through to here even if the job 406 // is malformed. The job will then die as soon as we tell it to 407 // start crawling. 408 CompositeData job = JMXUtils.getOneCompositeData(pendingJobs); 409 String name = job.get(JmxUtils.NAME) + "-" + job.get(UID_PROPERTY); 410 log.info("Heritrix created a job with name {}", name); 411 return name; 412 } 413 } 414 415 /** 416 * Get the name to use for logging on to Heritrix' JMX with full control. The name cannot be set by the user. 417 * 418 * @return Name to use when connecting to Heritrix JMX 419 */ 420 private String getJMXAdminName() { 421 String jmxUsername = Settings.get(HarvesterSettings.HERITRIX_JMX_USERNAME); 422 log.debug("The JMX username used for connecting to the Heritrix GUI is: '{}'.", jmxUsername); 423 return jmxUsername; 424 } 425 426 /** 427 * Get the password to use to access the Heritrix JMX as the user returned by getJMXAdminName(). This password can 428 * be set in a file pointed to in settings.xml. 429 * 430 * @return Password for accessing Heritrix JMX 431 */ 432 private String getJMXAdminPassword() { 433 return Settings.get(HarvesterSettings.HERITRIX_JMX_PASSWORD); 434 } 435 436 /** 437 * Get the port to use for Heritrix JMX, as set in settings.xml. 438 * 439 * @return Port that Heritrix will expose its JMX interface on. 440 */ 441 private int getJMXPort() { 442 return Settings.getInt(HarvesterSettings.HERITRIX_JMX_PORT); 443 } 444 445 /** 446 * Get the port to use for Heritrix GUI, as set in settings.xml. 447 * 448 * @return Port that Heritrix will expose its web interface on. 449 */ 450 private int getGUIPort() { 451 return Settings.getInt(HarvesterSettings.HERITRIX_GUI_PORT); 452 } 453 454 /** 455 * Execute a command for the Heritrix process we're running. 456 * 457 * @param command The command to execute. 458 * @param arguments Any arguments to the command. These arguments can only be of String type. 459 * @return Whatever object was returned by the JMX invocation. 460 */ 461 private Object executeHeritrixCommand(String command, String... arguments) { 462 return JMXUtils.executeCommand(getHeritrixJMXConnector(), getHeritrixBeanName(), command, arguments); 463 } 464 465 /** 466 * Execute a command for the Heritrix job. This must only be called after initialize() has been run. 467 * 468 * @param command The command to execute. 469 * @param arguments Any arguments to the command. These arguments can only be of String type. 470 * @return Whatever object was returned by the JMX invocation. 471 */ 472 private Object executeCrawlJobCommand(String command, String... arguments) { 473 return JMXUtils.executeCommand(getHeritrixJMXConnector(), getCrawlJobBeanName(), command, arguments); 474 } 475 476 /** 477 * Get an attribute of the Heritrix process we're running. 478 * 479 * @param attribute The attribute to get. 480 * @return The value of the attribute. 481 */ 482 private Object getHeritrixAttribute(String attribute) { 483 return JMXUtils.getAttribute(getHeritrixJMXConnector(), getHeritrixBeanName(), attribute); 484 } 485 486 /** 487 * Get an attribute of the Heritrix job. This must only be called after initialize() has been run. 488 * 489 * @param attribute The attribute to get. 490 * @return The value of the attribute. 491 */ 492 private Object getCrawlJobAttribute(String attribute) { 493 return JMXUtils.getAttribute(getHeritrixJMXConnector(), getCrawlJobBeanName(), attribute); 494 } 495 496 /** 497 * Get the name for the main bean of the Heritrix instance. 498 * 499 * @return Bean name, to be passed into JMXUtils#getBeanName(String) 500 */ 501 private String getHeritrixBeanName() { 502 return "org.archive.crawler:" + JmxUtils.NAME + "=Heritrix," + JmxUtils.TYPE + "=CrawlService," 503 + JmxUtils.JMX_PORT + "=" + getJMXPort() + "," + JmxUtils.GUI_PORT + "=" + getGUIPort() + "," 504 + JmxUtils.HOST + "=" + getHostName(); 505 506 } 507 508 /** 509 * Get the name for the bean of a single job. This bean does not exist until after a job has been created using 510 * initialize(). 511 * 512 * @return Bean name, to be passed into JMXUtils#getBeanName(String) 513 */ 514 private String getCrawlJobBeanName() { 515 return "org.archive.crawler:" + JmxUtils.NAME + "=" + jobName + "," + JmxUtils.TYPE + "=CrawlService.Job," 516 + JmxUtils.JMX_PORT + "=" + getJMXPort() + "," + JmxUtils.MOTHER + "=Heritrix," + JmxUtils.HOST + "=" 517 + getHostName(); 518 } 519 520 /** 521 * Get the JMX connector to Heritrix. 522 * 523 * @return A connector that connects to a local Heritrix instance. 524 */ 525 private JMXConnector getHeritrixJMXConnector() { 526 return JMXUtils.getJMXConnector(SystemUtils.LOCALHOST, getJMXPort(), getJMXAdminName(), getJMXAdminPassword()); 527 } 528 529}