/*
 * #%L
 * Netarchivesuite - harvester
 * %%
 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
 *             the National Library of France and the Austrian National Library.
 * %%
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation, either version 2.1 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Lesser Public License for more details.
 *
 * You should have received a copy of the GNU General Lesser Public
 * License along with this program.  If not, see
 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
 * #L%
 */
package dk.netarkivet.harvester.heritrix3.controller;

import java.io.File;
import java.io.PrintWriter;
import java.util.concurrent.Semaphore;

import org.netarchivesuite.heritrix3wrapper.CommandLauncher;
import org.netarchivesuite.heritrix3wrapper.Heritrix3Wrapper;
import org.netarchivesuite.heritrix3wrapper.LaunchResultHandlerAbstract;
import org.netarchivesuite.heritrix3wrapper.unzip.UnzipUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import dk.netarkivet.common.exceptions.ArgumentNotValid;
import dk.netarkivet.common.exceptions.IOFailure;
import dk.netarkivet.common.utils.FileUtils;
import dk.netarkivet.common.utils.Settings;
import dk.netarkivet.common.utils.StringUtils;
import dk.netarkivet.common.utils.SystemUtils;
import dk.netarkivet.harvester.heritrix3.Heritrix3Files;
import dk.netarkivet.harvester.heritrix3.Heritrix3Settings;

/**
 * Abstract base class for REST-based Heritrix controllers.
 * <p>
 * Constructing an instance unzips the Heritrix3 distribution into the Heritrix base directory,
 * launches the {@code bin/heritrix} script through a {@link CommandLauncher} and registers a
 * JVM shutdown hook that calls {@code stopHeritrix()}.
 */
public abstract class AbstractRestHeritrixController implements IHeritrixController {

    /** The logger for this class. */
    private static final Logger log = LoggerFactory.getLogger(AbstractRestHeritrixController.class);

    /** Placeholder written to the log instead of the real admin password. */
    private static final String MASKED_PASSWORD = "*****";

    /** The various files used by Heritrix. */
    protected final Heritrix3Files files;

    /** Wrapper around the Heritrix3 REST API; not assigned in this class. */
    protected Heritrix3Wrapper h3wrapper;
    /** Launcher used to start the Heritrix3 process. */
    protected CommandLauncher h3launcher;
    /** Receives stdout/stderr lines and the exit value of the launched process. */
    protected LaunchResultHandlerAbstract h3handler;
    /** Writer for the standard output of the Heritrix3 process. */
    protected PrintWriter outputPrinter;
    /** Writer for the standard error of the Heritrix3 process. */
    protected PrintWriter errorPrinter;
    /** Directory the Heritrix3 distribution is unzipped into and launched from. */
    protected File heritrixBaseDir;

    /** The host name for this machine that matches what Heritrix uses in its MBean names. */
    private final String hostName;

    /** The port to use for Heritrix GUI, as set in settings.xml. */
    private final int guiPort = Settings.getInt(Heritrix3Settings.HERITRIX_GUI_PORT);

    /**
     * Create a AbstractRestHeritrixController object and launch Heritrix3.
     *
     * @param files Files that are used to set up Heritrix.
     * @throws ArgumentNotValid if {@code files} is null.
     * @throws IOFailure if anything goes wrong while unpacking or launching Heritrix3.
     */
    public AbstractRestHeritrixController(Heritrix3Files files) {
        ArgumentNotValid.checkNotNull(files, "Heritrix3Files files");
        this.files = files;
        SystemUtils.checkPortNotUsed(guiPort);

        hostName = SystemUtils.getLocalHostName();
        // Tracks whether the launcher has taken over the log writers; until then a failure
        // must close them here, otherwise the underlying file handles leak.
        boolean launchStarted = false;
        try {
            log.info("Starting Heritrix for {} in crawldir {}", this, files.getCrawlDir());
            String zipFileStr = files.getHeritrixZip().getAbsolutePath();

            heritrixBaseDir = files.getHeritrixBaseDir();
            if (!heritrixBaseDir.isDirectory()) {
                heritrixBaseDir.mkdirs();
            }
            // Re-check instead of trusting the mkdirs() return value.
            if (!heritrixBaseDir.isDirectory()) {
                throw new IOFailure("Unable to create heritrixbasedir: " + heritrixBaseDir.getAbsolutePath() );
            }

            log.debug("Unzipping heritrix into the crawldir");
            UnzipUtils.unzip(zipFileStr, 1, heritrixBaseDir.getAbsolutePath());

            if (files.getCertificateFile() != null) {
                log.debug("Copying override keystore into heritrix dir");
                Heritrix3Wrapper.copyFileAs(files.getCertificateFile(), heritrixBaseDir, "h3server.jks");
            }

            /*
             * The bin/heritrix script should read the following environment-variables:
             *
             * JAVA_HOME     Point at a JDK install to use
             *
             * HERITRIX_HOME Pointer to your heritrix install. If not present, we
             *               make an educated guess based of position relative to this
             *               script.
             *
             * HERITRIX_OUT  Pathname to the Heritrix log file written when run in
             *               daemon mode.
             *               Default setting is $HERITRIX_HOME/heritrix_out.log
             *
             * JAVA_OPTS     Java runtime options. Default setting is '-Xmx256m'.
             *
             * FOREGROUND
             */
            String adminName = getHeritrixAdminName();
            // NOTE(review): "-p " and "-a " carry a trailing space; left untouched here -
            // confirm that the launch script tolerates it.
            String[] cmd = {
                    "./bin/heritrix",
                    "-b",
                    hostName,
                    "-p ",
                    Integer.toString(guiPort),
                    "-a ",
                    adminName + ":" + getHeritrixAdminPassword(),
                    "-s",
                    "h3server.jks,h3server,h3server"
            };
            // Never write the admin password to the log: log a copy with the secret masked.
            final int credentialsIndex = 6;
            String[] loggableCmd = cmd.clone();
            loggableCmd[credentialsIndex] = adminName + ":" + MASKED_PASSWORD;
            log.info("Starting Heritrix3 with the following arguments:{} ",
                    StringUtils.conjoin(" ", loggableCmd));
            h3launcher = CommandLauncher.getInstance();
            h3launcher.init(heritrixBaseDir, cmd);
            h3launcher.env.put("FOREGROUND", "true");
            log.info(".. and setting FOREGROUND to 'true'");
            String javaOpts = "";
            String jvmOptsStr = Settings.get(Heritrix3Settings.HERITRIX_JVM_OPTS);
            if ((jvmOptsStr != null) && (!jvmOptsStr.isEmpty())) {
                javaOpts = " " + jvmOptsStr;
            }
            String javaOptsValue = "-Xmx" + Settings.get(Heritrix3Settings.HERITRIX_HEAP_SIZE) + " " + javaOpts + " "
                    + getSettingsProperty();
            h3launcher.env.put("JAVA_OPTS", javaOptsValue);
            log.info(".. and setting JAVA_OPTS to '{}'", javaOptsValue);
            String heritrixOutValue = files.getHeritrixOutput().getAbsolutePath();
            h3launcher.env.put("HERITRIX_OUT", heritrixOutValue);
            log.info(".. and setting HERITRIX_OUT to '{}'", heritrixOutValue);

            outputPrinter = new PrintWriter(files.getHeritrixStdoutLog(), "UTF-8");
            errorPrinter = new PrintWriter(files.getHeritrixStderrLog(), "UTF-8");
            log.info(".. and setting output from heritrix3 to '{}', and errors to '{}'",
                    files.getHeritrixStdoutLog(), files.getHeritrixStderrLog());
            h3handler = new LaunchResultHandler(outputPrinter, errorPrinter);
            h3launcher.start(h3handler);
            launchStarted = true;
            Runtime.getRuntime().addShutdownHook(new HeritrixKiller());
            log.info("Heritrix3 engine launched successfully");
        } catch (Throwable e) {
            if (!launchStarted) {
                closeLogPrinters();
            }
            String errMsg = "Unexpected error while launching H3: ";
            log.debug(errMsg, e);
            throw new IOFailure(errMsg, e);
        }
    }

    /**
     * Close the stdout/stderr log writers if they have been opened.
     * Used when the launch fails before the result handler takes over the writers.
     */
    private void closeLogPrinters() {
        if (outputPrinter != null) {
            outputPrinter.close();
        }
        if (errorPrinter != null) {
            errorPrinter.close();
        }
    }

    /**
     * Implementation of a LaunchResultHandler for Heritrix3.
     * <p>
     * Lines from the process are forwarded to the two supplied writers, and the exit value is logged.
     */
    public static class LaunchResultHandler implements LaunchResultHandlerAbstract {
        /**
         * Starts at -2 so that a permit only becomes available after all three callbacks
         * ({@link #exitValue(int)}, {@link #closeOutput()} and {@link #closeError()}) have released.
         */
        protected Semaphore semaphore = new Semaphore(-2);
        /** Destination for the standard output lines of the process. */
        protected PrintWriter outputPrinter;
        /** Destination for the standard error lines of the process. */
        protected PrintWriter errorPrinter;

        /**
         * @param outputPrinter writer receiving the standard output lines
         * @param errorPrinter writer receiving the standard error lines
         */
        public LaunchResultHandler(PrintWriter outputPrinter, PrintWriter errorPrinter) {
            this.outputPrinter = outputPrinter;
            this.errorPrinter = errorPrinter;
        }

        /**
         * Record that the process has exited and log its exit value.
         *
         * @param exitValue the exit value reported for the process
         */
        @Override
        public void exitValue(int exitValue) {
            semaphore.release();
            if (exitValue != 0) {
                log.error("Heritrix3 engine shutdown failed. ExitValue = {}", exitValue);
            } else {
                log.info("Heritrix3 engine shutdown was successful. ExitValue = {}", exitValue);
            }
        }

        /** Write one line of standard output to the output writer. */
        @Override
        public void output(String line) {
            outputPrinter.println(line);
        }

        /** Close the output writer and release one permit. */
        @Override
        public void closeOutput() {
            outputPrinter.close();
            semaphore.release();
        }

        /** Write one line of standard error to the error writer. */
        @Override
        public void error(String line) {
            errorPrinter.println(line);
        }

        /** Close the error writer and release one permit. */
        @Override
        public void closeError() {
            errorPrinter.close();
            semaphore.release();
        }
    }

    /**
     * Build the JVM system-property argument that hands the current settings files to Heritrix3.
     * The absolute paths of all settings files are joined with {@link File#pathSeparator}.
     *
     * @return the Settingsproperty for heritrix3
     * @throws IOFailure if one of the settings files cannot be read
     */
    private static String getSettingsProperty() {
        StringBuilder settingProperty = new StringBuilder();
        for (File file : Settings.getSettingsFiles()) {
            String absolutePath = file.getAbsolutePath();
            // check that the settings files not only exist but
            // are readable
            if (!new File(absolutePath).canRead()) {
                log.warn("The file '" + absolutePath + "' is missing or not readable. ");
                throw new IOFailure("Failed to read file '" + absolutePath + "'");
            }
            // separator goes between entries only, never in front of the first one
            if (settingProperty.length() > 0) {
                settingProperty.append(File.pathSeparator);
            }
            settingProperty.append(absolutePath);
        }
        return "-Ddk.netarkivet.settings.file=" + settingProperty;
    }

    /**
     * @return the HTTP port used by the Heritrix3 GUI.
     */
    protected int getGuiPort() {
        return guiPort;
    }

    /**
     * @return the Heritrix3 files wrapper.
     */
    protected Heritrix3Files getHeritrixFiles() {
        return files;
    }

    /**
     * @return the host name
     */
    protected String getHostName() {
        return hostName;
    }

    /**
     * Get the login name for accessing the Heritrix3 GUI. This name can be set in the settings.xml file.
     *
     * @return Name to use for accessing Heritrix3 web GUI
     */
    protected String getHeritrixAdminName() {
        return Settings.get(Heritrix3Settings.HERITRIX_ADMIN_NAME);
    }

    /**
     * Get the login password for accessing the Heritrix3 GUI. This password can be set in the settings.xml file.
     *
     * @return Password to use for accessing the Heritrix3 GUI
     */
    protected String getHeritrixAdminPassword() {
        return Settings.get(Heritrix3Settings.HERITRIX_ADMIN_PASSWORD);
    }

    /**
     * Get a string that describes the current controller in terms of job ID, harvest ID, and crawldir.
     *
     * @return A human-readable string describing this controller.
     */
    @Override
    public String toString() {
        return "job " + files.getJobID() + " of harvest " + files.getHarvestID()
                + " in " + files.getCrawlDir();
    }

    /**
     * Return a human-readable description of the job. This will only be visible in the Heritrix GUI.
     *
     * @return String containing various information grabbed from HeritrixFiles.
     */
    protected String getJobDescription() {
        String dedupPart = (files.getIndexDir() != null) ? "with the deduplication index stored in '"
                + files.getIndexDir().getAbsolutePath() + "'" : "with deduplication disabled";
        // The separating space after the crawl dir keeps the path and the dedup text apart.
        return "Job " + files.getJobID() + " for harvest " + files.getHarvestID() + " performed in "
                + files.getCrawlDir() + " " + dedupPart + " and " + FileUtils.countLines(files.getSeedsTxtFile())
                + " seeds";
    }

    /**
     * @return the Heritrix3 files wrapper (public counterpart of {@link #getHeritrixFiles()}).
     */
    public Heritrix3Files getFiles() {
        return this.files;
    }

    /**
     * Shutdown hook thread that stops Heritrix when the JVM exits.
     * Deliberately a non-static inner class: it calls {@code stopHeritrix()} on the enclosing controller
     * (not declared in this file; presumably inherited from {@code IHeritrixController}).
     */
    private class HeritrixKiller extends Thread {
        @Override
        public void run() {
            stopHeritrix();
        }
    }
}