/*
 * #%L
 * Netarchivesuite - harvester
 * %%
 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
 *             the National Library of France and the Austrian National Library.
 * %%
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation, either version 2.1 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Lesser Public License for more details.
 *
 * You should have received a copy of the GNU General Lesser Public
 * License along with this program.  If not, see
 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
 * #L%
 */
package dk.netarkivet.harvester.harvesting.controller;

import java.io.File;
import java.io.FileWriter;
import java.io.FilenameFilter;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Set;

import org.archive.crawler.Heritrix;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import dk.netarkivet.common.CommonSettings;
import dk.netarkivet.common.exceptions.ArgumentNotValid;
import dk.netarkivet.common.exceptions.IOFailure;
import dk.netarkivet.common.utils.FileUtils;
import dk.netarkivet.common.utils.JMXUtils;
import dk.netarkivet.common.utils.NotificationType;
import dk.netarkivet.common.utils.NotificationsFactory;
import dk.netarkivet.common.utils.ProcessUtils;
import dk.netarkivet.common.utils.Settings;
import dk.netarkivet.common.utils.StringUtils;
import dk.netarkivet.common.utils.SystemUtils;
import dk.netarkivet.common.utils.TimeUtils;
import dk.netarkivet.harvester.HarvesterSettings;
import dk.netarkivet.harvester.harvesting.HeritrixFiles;

/**
 * Abstract base class for JMX-based Heritrix controllers.
 * <p>
 * The constructor launches a separate Heritrix JVM for a single crawl and registers a shutdown hook that kills that
 * process if this JVM goes down first.
 */
public abstract class AbstractJMXHeritrixController implements HeritrixController {

    /** The logger for this class. */
    private static final Logger log = LoggerFactory.getLogger(AbstractJMXHeritrixController.class);

    /**
     * File path separator. Used to separate the jar-files in the classpath. Taken from the platform so that it
     * agrees with the separator used when joining the settings files.
     */
    private static final String FILE_PATH_SEPARATOR = File.pathSeparator;

    /** How long we're willing to wait for Heritrix to shutdown in a shutdown hook. */
    private static final long SHUTDOWN_HOOK_MAX_WAIT = 1000L;

    /** Prefix of the Heritrix command line option carrying "name:password" for the web GUI. */
    private static final String ADMIN_OPTION = "--admin=";

    /** The various files used by Heritrix. */
    private final HeritrixFiles files;

    /** The threads used to collect process output. Only one thread used presently. */
    private final Set<Thread> collectionThreads = new HashSet<Thread>(1);

    /** The host name for this machine that matches what Heritrix uses in its MBean names. */
    private final String hostName;

    /** The port to use for Heritrix JMX, as set in settings.xml. */
    private final int jmxPort = Settings.getInt(HarvesterSettings.HERITRIX_JMX_PORT);

    /** The port to use for Heritrix GUI, as set in settings.xml. */
    private final int guiPort = Settings.getInt(HarvesterSettings.HERITRIX_GUI_PORT);

    /**
     * The shutdownHook that takes care of killing our process. This is removed in
     * {@link #waitForHeritrixProcessExit()} once the process is known to be dead.
     */
    private Thread processKillerHook;

    /**
     * The one-shot Heritrix process created in the constructor. It will only perform a single crawl before being shut
     * down.
     */
    private final Process heritrixProcess;

    /**
     * Create a controller and start the Heritrix process it controls.
     *
     * @param files Files that are used to set up Heritrix.
     * @throws IOFailure If a settings file or one of the JMX password/access files cannot be read, or if the Heritrix
     * process cannot be started.
     */
    public AbstractJMXHeritrixController(HeritrixFiles files) {
        ArgumentNotValid.checkNotNull(files, "HeritrixFile files");
        this.files = files;

        SystemUtils.checkPortNotUsed(guiPort);
        SystemUtils.checkPortNotUsed(jmxPort);

        hostName = SystemUtils.getLocalHostName();

        try {
            log.info("Starting Heritrix for {}", this);
            /*
             * To start Heritrix, we need to do the following (taken from the Heritrix startup shell script):
             * - set heritrix.home to base dir of Heritrix stuff
             * - set com.sun.management.jmxremote.port to JMX port
             * - set com.sun.management.jmxremote.ssl to false
             * - set com.sun.management.jmxremote.password.file to JMX password file
             * - set heritrix.out to heritrix_out.log
             * - set java.protocol.handler.pkgs=org.archive.net
             * - send processOutput & stderr into heritrix.out
             * - let the Heritrix GUI-webserver listen on all available network interfaces:
             *   This is done with argument "--bind /" (default is 127.0.0.1)
             * - listen on a specific port using the port argument: --port <GUI port>
             *
             * We also need to output something like the following to heritrix.out:
             * `date Starting heritrix uname -a java -version JAVA_OPTS ulimit -a
             */
            File heritrixOutputFile = files.getHeritrixOutput();
            List<String> command = buildCommandLine(heritrixOutputFile, readableSettingsFilesProperty());

            // The command line carries the GUI password; never write it to the log in clear text.
            log.info("Starting Heritrix process with args {}", maskAdminPassword(command));
            log.debug("The JMX timeout is set to {}", TimeUtils.readableTimeInterval(JMXUtils.getJmxTimeout()));

            ProcessBuilder builder = new ProcessBuilder(command);

            updateEnvironment(builder.environment());
            FileUtils.copyDirectory(new File("lib/heritrix"), files.getCrawlDir());
            builder.directory(files.getCrawlDir());
            builder.redirectErrorStream(true);
            writeSystemInfo(heritrixOutputFile, builder);
            FileUtils.appendToFile(heritrixOutputFile, "Working directory: " + files.getCrawlDir());
            // The hook is registered before the process exists so there is no window in which a started
            // process is unprotected; the hook itself copes with the process never having been started.
            addProcessKillerHook();
            heritrixProcess = builder.start();
            ProcessUtils.writeProcessOutput(heritrixProcess.getInputStream(), heritrixOutputFile, collectionThreads);
        } catch (IOException e) {
            throw new IOFailure("Error starting Heritrix process", e);
        }
    }

    /**
     * Build the value of the dk.netarkivet.settings.file property for the Heritrix JVM: the absolute paths of all
     * settings files, joined with the platform path separator.
     *
     * @return The joined settings file paths; empty if there are no settings files.
     * @throws IOFailure If one of the settings files is not readable.
     */
    private static String readableSettingsFilesProperty() {
        StringBuilder settingProperty = new StringBuilder();
        for (File file : Settings.getSettingsFiles()) {
            String absolutePath = file.getAbsolutePath();
            // Check that the settings files not only exist but are readable.
            if (!new File(absolutePath).canRead()) {
                log.warn("The file '{}' is missing.", absolutePath);
                throw new IOFailure("Failed to read file '" + absolutePath + "'");
            }
            // Separator goes between entries only, so nothing has to be trimmed afterwards.
            if (settingProperty.length() > 0) {
                settingProperty.append(File.pathSeparator);
            }
            settingProperty.append(absolutePath);
        }
        return settingProperty.toString();
    }

    /**
     * Assemble the complete command line (java executable, JVM options, main class and Heritrix arguments) used to
     * launch Heritrix.
     *
     * @param heritrixOutputFile The file that is passed to Heritrix as heritrix.out.
     * @param settingsProperty The value for the dk.netarkivet.settings.file system property.
     * @return The command line, one argument per list element.
     * @throws IOFailure If the JMX password file or the JMX access file is not readable.
     */
    private List<String> buildCommandLine(File heritrixOutputFile, String settingsProperty) {
        List<String> allOpts = new LinkedList<String>();
        allOpts.add(new File(new File(System.getProperty("java.home"), "bin"), "java").getAbsolutePath());
        allOpts.add("-Xmx" + Settings.get(HarvesterSettings.HERITRIX_HEAP_SIZE));
        allOpts.add("-Dheritrix.home=" + files.getCrawlDir().getAbsolutePath());

        String jvmOptsStr = Settings.get(HarvesterSettings.HERITRIX_JVM_OPTS);
        if (jvmOptsStr != null) {
            // Split on runs of whitespace: a plain split(" ") would yield empty arguments for doubled or
            // leading blanks, and the JVM would take such an empty argument as the main class name.
            String trimmedOpts = jvmOptsStr.trim();
            if (!trimmedOpts.isEmpty()) {
                allOpts.addAll(Arrays.asList(trimmedOpts.split("\\s+")));
            }
        }

        allOpts.add("-Dcom.sun.management.jmxremote.port=" + jmxPort);
        allOpts.add("-Dcom.sun.management.jmxremote.ssl=false");
        // Check that JMX password and access files are readable before handing them to the new JVM.
        String pwAbsolutePath = checkReadable(files.getJmxPasswordFile(), "password");
        String acAbsolutePath = checkReadable(files.getJmxAccessFile(), "access");
        allOpts.add("-Dcom.sun.management.jmxremote.password.file=" + pwAbsolutePath);
        allOpts.add("-Dcom.sun.management.jmxremote.access.file=" + acAbsolutePath);
        allOpts.add("-Dheritrix.out=" + heritrixOutputFile.getAbsolutePath());
        allOpts.add("-Djava.protocol.handler.pkgs=org.archive.net");
        allOpts.add("-Ddk.netarkivet.settings.file=" + settingsProperty);
        allOpts.add(Heritrix.class.getName());
        allOpts.add("--bind");
        allOpts.add("/");
        allOpts.add("--port=" + guiPort);
        allOpts.add(ADMIN_OPTION + getHeritrixAdminName() + ":" + getHeritrixAdminPassword());
        return allOpts;
    }

    /**
     * Verify that a JMX configuration file can be read.
     *
     * @param file The file to check.
     * @param kind Word describing the file in the error message ("password" or "access").
     * @return The absolute path of the file.
     * @throws IOFailure If the file is not readable.
     */
    private static String checkReadable(File file, String kind) {
        String absolutePath = file.getAbsolutePath();
        if (!file.canRead()) {
            final String errMsg = "Failed to read the " + kind + " file '" + absolutePath + "'. "
                    + "It is possibly missing.";
            log.warn(errMsg);
            throw new IOFailure(errMsg);
        }
        return absolutePath;
    }

    /**
     * Make a copy of a command line in which the password part of the admin option is replaced by asterisks, so
     * that the command line can be logged.
     *
     * @param command The real command line; it is not modified.
     * @return A copy of the command line that is safe to log.
     */
    private static List<String> maskAdminPassword(List<String> command) {
        List<String> masked = new ArrayList<String>(command.size());
        for (String arg : command) {
            int colon = arg.indexOf(':');
            if (arg.startsWith(ADMIN_OPTION) && colon >= 0) {
                masked.add(arg.substring(0, colon + 1) + "*****");
            } else {
                masked.add(arg);
            }
        }
        return masked;
    }

    /**
     * @return the JMX port for communicating with Heritrix.
     */
    protected int getJmxPort() {
        return jmxPort;
    }

    /**
     * @return the HTTP port used by the Heritrix GUI.
     */
    protected int getGuiPort() {
        return guiPort;
    }

    /**
     * @return the Heritrix files wrapper.
     */
    protected HeritrixFiles getHeritrixFiles() {
        return files;
    }

    /**
     * @return the host name
     */
    protected String getHostName() {
        return hostName;
    }

    /**
     * Get the login name for accessing the Heritrix GUI. This name can be set in the settings.xml file.
     *
     * @return Name to use for accessing Heritrix web GUI
     */
    private String getHeritrixAdminName() {
        return Settings.get(HarvesterSettings.HERITRIX_ADMIN_NAME);
    }

    /**
     * Get the login password for accessing the Heritrix GUI. This password can be set in the settings.xml file.
     *
     * @return Password to use for accessing the Heritrix GUI
     */
    private String getHeritrixAdminPassword() {
        return Settings.get(HarvesterSettings.HERITRIX_ADMIN_PASSWORD);
    }

    /**
     * Change an environment to be suitable for running Heritrix.
     * <p>
     * At the moment, this involves the following:
     * <p>
     * Prepend the Jar files from the lib/heritrix/lib dir to the classpath. Make sure the Heritrix jar file is at the
     * front.
     *
     * @param environment The environment from a process builder
     * @throws IOFailure If a Heritrix jarfile is not found.
     */
    private static void updateEnvironment(Map<String, String> environment) {
        List<String> classPathParts = SystemUtils.getCurrentClasspath();
        File heritrixLibDir = new File("lib/heritrix/lib");
        File[] jars = heritrixLibDir.listFiles(new FilenameFilter() {
            @Override
            public boolean accept(File dir, String name) {
                return name.endsWith(".jar");
            }
        });

        // FIXME: Dirty hack around Heritrix jars being elsewhere /tra
        // listFiles() returns null when the directory does not exist or cannot be listed.
        if (jars == null) {
            jars = new File[0];
        }

        // Reverse sort the file list: each jar is inserted at the head below, so the
        // jars end up in alphabetical order in front of the basic jars.
        Arrays.sort(jars, new Comparator<File>() {
            @Override
            public int compare(File first, File second) {
                return second.compareTo(first);
            }
        });
        String heritrixJar = null;
        for (File lib : jars) {
            final String jarPath = new File(heritrixLibDir, lib.getName()).getAbsolutePath();
            if (lib.getName().startsWith("heritrix-")) {
                // Heritrix should be at the very head, as it redefines some
                // of the functions in its dependencies (!). Thus, we have to
                // save it for later insertion at the head.
                heritrixJar = jarPath;
            } else {
                classPathParts.add(0, jarPath);
            }
        }
        if (heritrixJar == null) {
            throw new IOFailure("Heritrix jar file not found");
        }
        classPathParts.add(0, heritrixJar);
        environment.put("CLASSPATH", StringUtils.conjoin(FILE_PATH_SEPARATOR, classPathParts));
    }

    /**
     * Write various info on the system we're using into the given file. This info will later get put into metadata for
     * the crawl.
     * <p>
     * Any existing content of the file is overwritten. Failure to write is logged but otherwise ignored.
     *
     * @param outputFile A file to write to.
     * @param builder The ProcessBuilder being used to start the Heritrix process
     */
    private void writeSystemInfo(File outputFile, ProcessBuilder builder) {
        PrintWriter writer = null;
        try {
            writer = new PrintWriter(new FileWriter(outputFile));
            writer.println("The Heritrix process is started in the following environment\n"
                    + " (note that some entries will be changed by the starting JVM):");
            Map<String, String> env = builder.environment();
            List<String> envKeys = new ArrayList<String>(env.keySet());
            Collections.sort(envKeys);
            for (String key : envKeys) {
                writer.println(key + "=" + env.get(key));
            }
            writer.println("Process properties:");
            Properties properties = System.getProperties();
            // stringPropertyNames() gives a typed snapshot of the keys, so no raw cast is needed.
            List<String> propertyKeys = new ArrayList<String>(properties.stringPropertyNames());
            Collections.sort(propertyKeys);
            for (String key : propertyKeys) {
                writer.println(key + "=" + properties.getProperty(key));
            }
            // PrintWriter never throws on write; ask it explicitly whether something went wrong.
            if (writer.checkError()) {
                log.warn("Error writing basic properties to output file.");
            }
        } catch (IOException e) {
            log.warn("Error writing basic properties to output file.", e);
        } finally {
            if (writer != null) {
                writer.close();
            }
        }
    }

    /**
     * Add a shutdown hook that kills the process we've created. Since this hook will be run only in case of JVM
     * shutdown, it cannot expect that the standard logging framework is still usable, and therefore writes to stdout
     * instead.
     */
    private void addProcessKillerHook() {
        // Inside the anonymous Thread, a plain "this" would denote the thread, not the controller.
        final AbstractJMXHeritrixController controller = AbstractJMXHeritrixController.this;
        // Make sure that the process gets killed at the very end, at least
        processKillerHook = new Thread() {
            @Override
            public void run() {
                final Process process = heritrixProcess;
                if (process == null) {
                    // The hook is registered before the process is started; starting may have failed.
                    System.out.println("No Heritrix process was started for " + controller);
                    return;
                }
                try {
                    // Only non-blocking way to check for process liveness
                    int exitValue = process.exitValue();
                    System.out.println("Heritrix process of " + controller + " exited with exit code " + exitValue);
                } catch (IllegalThreadStateException e) {
                    // Process is still alive, kill it.
                    System.out.println("Killing process of " + controller);
                    process.destroy();
                    final Integer exitValue = ProcessUtils.waitFor(process, SHUTDOWN_HOOK_MAX_WAIT);
                    if (exitValue != null) {
                        System.out.println("Process of " + controller + " returned exit code " + exitValue);
                    } else {
                        System.out.println("Process of " + controller + " never exited!");
                    }
                }
            }
        };
        Runtime.getRuntime().addShutdownHook(processKillerHook);
    }

    /**
     * Get a string that describes the current controller in terms of job ID, harvest ID, and crawldir. The Heritrix
     * process is included once it has been started.
     *
     * @return A human-readable string describing this controller.
     */
    @Override
    public String toString() {
        String description = "job " + files.getJobID() + " of harvest " + files.getHarvestID() + " in "
                + files.getCrawlDir();
        // This method is also used while the constructor is still running, i.e. before the process exists.
        if (heritrixProcess != null) {
            description += " running process " + heritrixProcess;
        }
        return description;
    }

    /**
     * Return true if the Heritrix process has exited, logging the exit value if so.
     *
     * @return True if the process has exited.
     */
    protected boolean processHasExited() {
        try {
            int exitValue = heritrixProcess.exitValue();
            log.info("Process of {} returned exit code {}", this, exitValue);
            return true;
        } catch (IllegalThreadStateException e) {
            // exitValue() throws while the process is still running; that is the "not exited" answer.
            return false;
        }
    }

    /**
     * Waits for the Heritrix process to exit.
     * <p>
     * If the process has not exited within the configured process timeout it is destroyed. If it still does not exit,
     * an error notification is sent and this JVM is terminated with exit code 1. Otherwise the shutdown hook is
     * removed and the output collection threads are given time to finish.
     */
    protected void waitForHeritrixProcessExit() {
        final long maxWait = Settings.getLong(CommonSettings.PROCESS_TIMEOUT);
        final int maxJmxRetries = JMXUtils.getMaxTries();
        Integer exitValue = ProcessUtils.waitFor(heritrixProcess, maxWait);
        if (exitValue != null) {
            log.info("Heritrix process of {} exited with exit code {}", this, exitValue);
        } else {
            log.warn("Heritrix process of {} not dead after {} millis, killing it", this, maxWait);
            heritrixProcess.destroy();
            exitValue = ProcessUtils.waitFor(heritrixProcess, maxWait);
            if (exitValue != null) {
                log.info("Heritrix process of {} exited with exit code {}", this, exitValue);
            } else {
                // If it's not dead now, there's little we can do.
                log.error("Heritrix process of {} not dead after destroy. Exiting harvest controller. "
                        + "Make sure you kill the runaway Heritrix before you restart.", this);
                NotificationsFactory.getInstance().notify(
                        "Heritrix process of " + this + " not dead after destroy. "
                                + "Exiting harvest controller. Make sure you kill the runaway Heritrix "
                                + "before you restart.", NotificationType.ERROR);
                System.exit(1);
            }
        }
        Runtime.getRuntime().removeShutdownHook(processKillerHook);
        // Wait until all collection threads are dead or until the retry budget from JMXUtils is used up.
        int attempt = 0;
        do {
            if (!anyCollectionThreadAlive()) {
                break;
            }
            TimeUtils.exponentialBackoffSleep(attempt);
        } while (attempt++ < maxJmxRetries);
    }

    /**
     * @return true if at least one of the process output collection threads is still running.
     */
    private boolean anyCollectionThreadAlive() {
        for (Thread t : collectionThreads) {
            if (t.isAlive()) {
                return true;
            }
        }
        return false;
    }

    /**
     * Return a human-readable description of the job. This will only be visible in the Heritrix GUI.
     *
     * @return String containing various information grabbed from HeritrixFiles.
     */
    protected String getJobDescription() {
        File indexDir = files.getIndexDir();
        String dedupPart = (indexDir != null)
                ? "with the deduplication index stored in '" + indexDir.getAbsolutePath() + "'"
                : "with deduplication disabled";
        // The blank before dedupPart keeps the crawl dir path from running into the following text.
        return "Job " + files.getJobID() + " for harvest " + files.getHarvestID() + " performed in "
                + files.getCrawlDir() + " " + dedupPart + " and " + FileUtils.countLines(files.getSeedsTxtFile())
                + " seeds";
    }

    /**
     * @return the Heritrix files this controller was created with.
     */
    public HeritrixFiles getFiles() {
        return this.files;
    }

}