001/* 002 * #%L 003 * Netarchivesuite - harvester 004 * %% 005 * Copyright (C) 2005 - 2018 The Royal Danish Library, 006 * the National Library of France and the Austrian National Library. 007 * %% 008 * This program is free software: you can redistribute it and/or modify 009 * it under the terms of the GNU Lesser General Public License as 010 * published by the Free Software Foundation, either version 2.1 of the 011 * License, or (at your option) any later version. 012 * 013 * This program is distributed in the hope that it will be useful, 014 * but WITHOUT ANY WARRANTY; without even the implied warranty of 015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 016 * GNU General Lesser Public License for more details. 017 * 018 * You should have received a copy of the GNU General Lesser Public 019 * License along with this program. If not, see 020 * <http://www.gnu.org/licenses/lgpl-2.1.html>. 021 * #L% 022 */ 023package dk.netarkivet.harvester.heritrix3.controller; 024 025import java.io.File; 026import java.io.IOException; 027import java.io.PrintWriter; 028import java.util.concurrent.Semaphore; 029 030import org.netarchivesuite.heritrix3wrapper.CommandLauncher; 031import org.netarchivesuite.heritrix3wrapper.Heritrix3Wrapper; 032import org.netarchivesuite.heritrix3wrapper.LaunchResultHandlerAbstract; 033import org.netarchivesuite.heritrix3wrapper.unzip.UnzipUtils; 034import org.slf4j.Logger; 035import org.slf4j.LoggerFactory; 036 037import dk.netarkivet.common.exceptions.ArgumentNotValid; 038import dk.netarkivet.common.exceptions.IOFailure; 039import dk.netarkivet.common.utils.FileUtils; 040import dk.netarkivet.common.utils.Settings; 041import dk.netarkivet.common.utils.StringUtils; 042import dk.netarkivet.common.utils.SystemUtils; 043import dk.netarkivet.harvester.heritrix3.BlockingCommandLauncher; 044import dk.netarkivet.harvester.heritrix3.Heritrix3Files; 045import dk.netarkivet.harvester.heritrix3.Heritrix3Settings; 046 047/** 048 * Abstract base class for REST-based Heritrix controllers. 049 */ 050public abstract class AbstractRestHeritrixController implements IHeritrixController { 051 052 /** The logger for this class. */ 053 private static final Logger log = LoggerFactory.getLogger(AbstractRestHeritrixController.class); 054 055 /** The various files used by Heritrix. */ 056 protected final Heritrix3Files files; 057 058 protected Heritrix3Wrapper h3wrapper; 059 protected CommandLauncher h3launcher; 060 protected LaunchResultHandlerAbstract h3handler; 061 protected PrintWriter outputPrinter; 062 protected PrintWriter errorPrinter; 063 protected File heritrixBaseDir; 064 065 /** The host name for this machine that matches what Heritrix uses in its MBean names. */ 066 private final String hostName; 067 068 /** The port to use for Heritrix GUI, as set in settings.xml. */ 069 private final int guiPort = Settings.getInt(Heritrix3Settings.HERITRIX_GUI_PORT); 070 071 /** 072 * Create a AbstractRestHeritrixController object. 073 * 074 * @param files Files that are used to set up Heritrix. 075 */ 076 public AbstractRestHeritrixController(Heritrix3Files files) { 077 ArgumentNotValid.checkNotNull(files, "Heritrix3Files files"); 078 this.files = files; 079 SystemUtils.checkPortNotUsed(guiPort); 080 081 if (Settings.getBoolean(Heritrix3Settings.UMBRA_IS_ENABLED)) { 082 executeUmbraStartHook(); 083 } 084 085 hostName = SystemUtils.getLocalHostName(); 086 try { 087 log.info("Starting Heritrix for {} in crawldir {}", this, files.getCrawlDir()); 088 String zipFileStr = files.getHeritrixZip().getAbsolutePath(); 089 090 heritrixBaseDir = files.getHeritrixBaseDir(); 091 if (!heritrixBaseDir.isDirectory()) { 092 heritrixBaseDir.mkdirs(); 093 } 094 if (!heritrixBaseDir.isDirectory()) { 095 throw new IOFailure("Unable to create heritrixbasedir: " + heritrixBaseDir.getAbsolutePath() ); 096 } 097 098 log.debug("Unzipping heritrix into the crawldir"); 099 UnzipUtils.unzip(zipFileStr, 1, heritrixBaseDir.getAbsolutePath()); 100 101 if (files.getCertificateFile() != null) { 102 log.debug("Copying override keystore into heritrix dir"); 103 Heritrix3Wrapper.copyFileAs(files.getCertificateFile(), heritrixBaseDir, "h3server.jks"); 104 } 105 106 /** The bin/heritrix script should read the following environment-variables: 107 * 108 * JAVA_HOME Point at a JDK install to use 109 * 110 * HERITRIX_HOME Pointer to your heritrix install. If not present, we 111 * make an educated guess based of position relative to this 112 * script. 113 * 114 * HERITRIX_OUT Pathname to the Heritrix log file written when run in 115 * daemon mode. 116 * Default setting is $HERITRIX_HOME/heritrix_out.log 117 * 118 * JAVA_OPTS Java runtime options. Default setting is '-Xmx256m'. 119 * 120 * FOREGROUND 121 */ 122 String[] cmd = { 123 "./bin/heritrix", 124 "-b", 125 hostName, 126 "-p ", 127 Integer.toString(guiPort), 128 "-a ", 129 getHeritrixAdminName() + ":" + getHeritrixAdminPassword(), 130 "-s", 131 "h3server.jks,h3server,h3server" 132 }; 133 log.info("Starting Heritrix3 with the following arguments:{} ", 134 StringUtils.conjoin(" ", cmd)); 135 h3launcher = CommandLauncher.getInstance(); 136 h3launcher.init(heritrixBaseDir, cmd); 137 h3launcher.env.put("FOREGROUND", "true"); 138 log.info(".. and setting FOREGROUND to 'true'"); 139 String javaOpts = ""; 140 String jvmOptsStr = Settings.get(Heritrix3Settings.HERITRIX_JVM_OPTS); 141 if ((jvmOptsStr != null) && (!jvmOptsStr.isEmpty())) { 142 javaOpts = " " + jvmOptsStr; 143 } 144 String javaOptsValue = "-Xmx" + Settings.get(Heritrix3Settings.HERITRIX_HEAP_SIZE) + " " + javaOpts + " " + getSettingsProperty(); 145 h3launcher.env.put("JAVA_OPTS", javaOptsValue); 146 log.info(".. and setting JAVA_OPTS to '{}'", javaOptsValue); 147 String heritrixOutValue = files.getHeritrixOutput().getAbsolutePath(); 148 h3launcher.env.put("HERITRIX_OUT", heritrixOutValue); 149 log.info(".. and setting HERITRIX_OUT to '{}'", heritrixOutValue); 150 151 outputPrinter = new PrintWriter(files.getHeritrixStdoutLog(), "UTF-8"); 152 errorPrinter = new PrintWriter(files.getHeritrixStderrLog(), "UTF-8"); 153 log.info(".. and setting output from heritrix3 to '{}', and errors to '{}'", files.getHeritrixStdoutLog(),files.getHeritrixStderrLog() ); 154 h3handler = new LaunchResultHandler(outputPrinter, errorPrinter); 155 h3launcher.start(h3handler); 156 Runtime.getRuntime().addShutdownHook(new HeritrixKiller()); 157 // TODO HERE WE SHOULD DRAIN THE UMBRA QUEUE AFTER SHUTDOWN HOOK 158 log.info("Heritrix3 engine launched successfully"); 159 } catch( Throwable e) { 160 String errMsg = "Unexpected error while launching H3: "; 161 log.debug(errMsg, e); 162 throw new IOFailure(errMsg, e); 163 } 164 } 165 166 private void executeUmbraStartHook() { 167 String umbraScript = Settings.get(Heritrix3Settings.UMBRA_PRESTART_SCRIPT); 168 log.info("Executing umbra hook script {}", umbraScript); 169 BlockingCommandLauncher scriptLauncher = new BlockingCommandLauncher(new File(System.getProperty("user.dir")), umbraScript.trim().split("\\s+")); 170 try { 171 scriptLauncher.start(new LaunchResultHandlerAbstract() { 172 @Override public void exitValue(int exitValue) { 173 if (exitValue == 0) { 174 log.info("Umbra hook {} ended with exit value {}", umbraScript, exitValue); 175 } else { 176 log.error("Umbra hook {} ended with exit value {}", umbraScript, exitValue); 177 } 178 } 179 180 @Override public void output(String line) { 181 log.info("Output from {}: {}", umbraScript, line); 182 } 183 184 @Override public void closeOutput() { 185 log.info("Finished reading standard out from umbra hook {}", umbraScript); 186 } 187 188 @Override public void error(String line) { 189 log.warn("Error output from {}: {}", umbraScript, line); 190 } 191 192 @Override public void closeError() { 193 log.info("Finished reading standard err from umbra hook {}", umbraScript); 194 } 195 }); 196 } catch (IOException e) { 197 log.error("Exception executing umbra hook script {}.", umbraScript, e); 198 } 199 200 } 201 202 /** 203 * Implementation of a LaunchResultHandler for Heritrix3. 204 * 205 */ 206 public static class LaunchResultHandler implements LaunchResultHandlerAbstract { 207 protected Semaphore semaphore = new Semaphore(-2); 208 protected PrintWriter outputPrinter; 209 protected PrintWriter errorPrinter; 210 public LaunchResultHandler(PrintWriter outputPrinter, PrintWriter errorPrinter) { 211 this.outputPrinter = outputPrinter; 212 this.errorPrinter = errorPrinter; 213 } 214 @Override 215 public void exitValue(int exitValue) { 216 semaphore.release(); 217 if (exitValue != 0) { 218 log.error("Heritrix3 engine shutdown failed. ExitValue = {}", exitValue); 219 } else { 220 log.info("Heritrix3 engine shutdown was successful. ExitValue = {}", exitValue); 221 } 222 } 223 @Override 224 public void output(String line) { 225 outputPrinter.println(line); 226 } 227 @Override 228 public void closeOutput() { 229 outputPrinter.close(); 230 semaphore.release(); 231 } 232 @Override 233 public void error(String line) { 234 errorPrinter.println(line); 235 } 236 @Override 237 public void closeError() { 238 errorPrinter.close(); 239 semaphore.release(); 240 } 241 } 242 243 /** 244 * @return the Settingsproperty for heritrix3 245 */ 246 private static String getSettingsProperty() { 247 StringBuilder settingProperty = new StringBuilder(); 248 for (File file : Settings.getSettingsFiles()) { 249 settingProperty.append(File.pathSeparator); 250 String absolutePath = file.getAbsolutePath(); 251 // check that the settings files not only exist but 252 // are readable 253 boolean readable = new File(absolutePath).canRead(); 254 if (!readable) { 255 final String errMsg = "The file '" + absolutePath 256 + "' is missing. "; 257 log.warn(errMsg); 258 throw new IOFailure("Failed to read file '" + absolutePath 259 + "'"); 260 } 261 settingProperty.append(absolutePath); 262 } 263 if (settingProperty.length() > 0) { 264 // delete last path-separator 265 settingProperty.deleteCharAt(0); 266 } 267 return "-Ddk.netarkivet.settings.file=" + settingProperty; 268 } 269 270 /** 271 * @return the HTTP port used by the Heritrix3 GUI. 272 */ 273 protected int getGuiPort() { 274 return guiPort; 275 } 276 277 /** 278 * @return the Heritrix3 files wrapper. 279 */ 280 protected Heritrix3Files getHeritrixFiles() { 281 return files; 282 } 283 284 /** 285 * @return the host name 286 */ 287 protected String getHostName() { 288 return hostName; 289 } 290 291 /** 292 * Get the login name for accessing the Heritrix3 GUI. This name can be set in the settings.xml file. 293 * 294 * @return Name to use for accessing Heritrix3 web GUI 295 */ 296 protected String getHeritrixAdminName() { 297 return Settings.get(Heritrix3Settings.HERITRIX_ADMIN_NAME); 298 } 299 300 /** 301 * Get the login password for accessing the Heritrix3 GUI. This password can be set in the settings.xml file. 302 * 303 * @return Password to use for accessing the Heritrix3 GUI 304 */ 305 protected String getHeritrixAdminPassword() { 306 return Settings.get(Heritrix3Settings.HERITRIX_ADMIN_PASSWORD); 307 } 308 309 /** 310 * Get a string that describes the current controller in terms of job ID, harvest ID, and crawldir. 311 * 312 * @return A human-readable string describing this controller. 313 */ 314 @Override 315 public String toString() { 316 return "job " + files.getJobID() + " of harvest " + files.getHarvestID() 317 + " in " + files.getCrawlDir(); 318 } 319 320 /** 321 * Return a human-readable description of the job. This will only be visible in the Heritrix GUI. 322 * 323 * @return String containing various information grabbed from HeritrixFiles. 324 */ 325 protected String getJobDescription() { 326 String dedupPart = (files.getIndexDir() != null) ? "with the deduplication index stored in '" 327 + files.getIndexDir().getAbsolutePath() + "'" : "with deduplication disabled"; 328 return "Job " + files.getJobID() + " for harvest " + files.getHarvestID() + " performed in " 329 + files.getCrawlDir() + dedupPart + " and " + FileUtils.countLines(files.getSeedsTxtFile()) + " seeds"; 330 } 331 332 public Heritrix3Files getFiles() { 333 return this.files; 334 } 335 336 private class HeritrixKiller extends Thread { 337 @Override 338 public void run() { 339 stopHeritrix(); 340 } 341 } 342}