001/* 002 * #%L 003 * Netarchivesuite - harvester 004 * %% 005 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library, 006 * the National Library of France and the Austrian National Library. 007 * %% 008 * This program is free software: you can redistribute it and/or modify 009 * it under the terms of the GNU Lesser General Public License as 010 * published by the Free Software Foundation, either version 2.1 of the 011 * License, or (at your option) any later version. 012 * 013 * This program is distributed in the hope that it will be useful, 014 * but WITHOUT ANY WARRANTY; without even the implied warranty of 015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 016 * GNU General Lesser Public License for more details. 017 * 018 * You should have received a copy of the GNU General Lesser Public 019 * License along with this program. If not, see 020 * <http://www.gnu.org/licenses/lgpl-2.1.html>. 021 * #L% 022 */ 023package dk.netarkivet.harvester.heritrix3.controller; 024 025import java.io.File; 026import java.io.PrintWriter; 027import java.util.concurrent.Semaphore; 028 029import org.netarchivesuite.heritrix3wrapper.CommandLauncher; 030import org.netarchivesuite.heritrix3wrapper.Heritrix3Wrapper; 031import org.netarchivesuite.heritrix3wrapper.LaunchResultHandlerAbstract; 032import org.netarchivesuite.heritrix3wrapper.unzip.UnzipUtils; 033import org.slf4j.Logger; 034import org.slf4j.LoggerFactory; 035 036import dk.netarkivet.common.exceptions.ArgumentNotValid; 037import dk.netarkivet.common.exceptions.IOFailure; 038import dk.netarkivet.common.utils.FileUtils; 039import dk.netarkivet.common.utils.Settings; 040import dk.netarkivet.common.utils.StringUtils; 041import dk.netarkivet.common.utils.SystemUtils; 042import dk.netarkivet.harvester.heritrix3.Heritrix3Files; 043import dk.netarkivet.harvester.heritrix3.Heritrix3Settings; 044 045/** 046 * Abstract base class for REST-based Heritrix controllers. 047 */ 048public abstract class AbstractRestHeritrixController implements IHeritrixController { 049 050 /** The logger for this class. */ 051 private static final Logger log = LoggerFactory.getLogger(AbstractRestHeritrixController.class); 052 053 /** The various files used by Heritrix. */ 054 protected final Heritrix3Files files; 055 056 protected Heritrix3Wrapper h3wrapper; 057 protected CommandLauncher h3launcher; 058 protected LaunchResultHandlerAbstract h3handler; 059 protected PrintWriter outputPrinter; 060 protected PrintWriter errorPrinter; 061 protected File heritrixBaseDir; 062 063 /** The host name for this machine that matches what Heritrix uses in its MBean names. */ 064 private final String hostName; 065 066 /** The port to use for Heritrix GUI, as set in settings.xml. */ 067 private final int guiPort = Settings.getInt(Heritrix3Settings.HERITRIX_GUI_PORT); 068 069 /** 070 * Create a AbstractRestHeritrixController object. 071 * 072 * @param files Files that are used to set up Heritrix. 073 */ 074 public AbstractRestHeritrixController(Heritrix3Files files) { 075 ArgumentNotValid.checkNotNull(files, "Heritrix3Files files"); 076 this.files = files; 077 SystemUtils.checkPortNotUsed(guiPort); 078 079 hostName = SystemUtils.getLocalHostName(); 080 try { 081 log.info("Starting Heritrix for {} in crawldir {}", this, files.getCrawlDir()); 082 String zipFileStr = files.getHeritrixZip().getAbsolutePath(); 083 084 heritrixBaseDir = files.getHeritrixBaseDir(); 085 if (!heritrixBaseDir.isDirectory()) { 086 heritrixBaseDir.mkdirs(); 087 } 088 if (!heritrixBaseDir.isDirectory()) { 089 throw new IOFailure("Unable to create heritrixbasedir: " + heritrixBaseDir.getAbsolutePath() ); 090 } 091 092 log.debug("Unzipping heritrix into the crawldir"); 093 UnzipUtils.unzip(zipFileStr, 1, heritrixBaseDir.getAbsolutePath()); 094 095 if (files.getCertificateFile() != null) { 096 log.debug("Copying override keystore into heritrix dir"); 097 Heritrix3Wrapper.copyFileAs(files.getCertificateFile(), heritrixBaseDir, "h3server.jks"); 098 } 099 100 /** The bin/heritrix script should read the following environment-variables: 101 * 102 * JAVA_HOME Point at a JDK install to use 103 * 104 * HERITRIX_HOME Pointer to your heritrix install. If not present, we 105 * make an educated guess based of position relative to this 106 * script. 107 * 108 * HERITRIX_OUT Pathname to the Heritrix log file written when run in 109 * daemon mode. 110 * Default setting is $HERITRIX_HOME/heritrix_out.log 111 * 112 * JAVA_OPTS Java runtime options. Default setting is '-Xmx256m'. 113 * 114 * FOREGROUND 115 */ 116 String[] cmd = { 117 "./bin/heritrix", 118 "-b", 119 hostName, 120 "-p ", 121 Integer.toString(guiPort), 122 "-a ", 123 getHeritrixAdminName() + ":" + getHeritrixAdminPassword(), 124 "-s", 125 "h3server.jks,h3server,h3server" 126 }; 127 log.info("Starting Heritrix3 with the following arguments:{} ", 128 StringUtils.conjoin(" ", cmd)); 129 h3launcher = CommandLauncher.getInstance(); 130 h3launcher.init(heritrixBaseDir, cmd); 131 h3launcher.env.put("FOREGROUND", "true"); 132 log.info(".. and setting FOREGROUND to 'true'"); 133 String javaOpts = ""; 134 String jvmOptsStr = Settings.get(Heritrix3Settings.HERITRIX_JVM_OPTS); 135 if ((jvmOptsStr != null) && (!jvmOptsStr.isEmpty())) { 136 javaOpts = " " + jvmOptsStr; 137 } 138 String javaOptsValue = "-Xmx" + Settings.get(Heritrix3Settings.HERITRIX_HEAP_SIZE) + " " + javaOpts + " " + getSettingsProperty(); 139 h3launcher.env.put("JAVA_OPTS", javaOptsValue); 140 log.info(".. and setting JAVA_OPTS to '{}'", javaOptsValue); 141 String heritrixOutValue = files.getHeritrixOutput().getAbsolutePath(); 142 h3launcher.env.put("HERITRIX_OUT", heritrixOutValue); 143 log.info(".. and setting HERITRIX_OUT to '{}'", heritrixOutValue); 144 145 outputPrinter = new PrintWriter(files.getHeritrixStdoutLog(), "UTF-8"); 146 errorPrinter = new PrintWriter(files.getHeritrixStderrLog(), "UTF-8"); 147 h3handler = new LaunchResultHandler(outputPrinter, errorPrinter); 148 h3launcher.start(h3handler); 149 Runtime.getRuntime().addShutdownHook(new HeritrixKiller()); 150 log.info("Heritrix3 launched successfully"); 151 } catch( Throwable e) { 152 String errMsg = "Unexpected error while launching H3: "; 153 log.debug(errMsg, e); 154 throw new IOFailure(errMsg, e); 155 } 156 } 157 158 public static class LaunchResultHandler implements LaunchResultHandlerAbstract { 159 protected Semaphore semaphore = new Semaphore(-2); 160 protected PrintWriter outputPrinter; 161 protected PrintWriter errorPrinter; 162 public LaunchResultHandler(PrintWriter outputPrinter, PrintWriter errorPrinter) { 163 this.outputPrinter = outputPrinter; 164 this.errorPrinter = errorPrinter; 165 } 166 @Override 167 public void exitValue(int exitValue) { 168 semaphore.release(); 169 log.info("Heritrix3 exitValue=: {}", exitValue); 170 } 171 @Override 172 public void output(String line) { 173 outputPrinter.println(line); 174 } 175 @Override 176 public void closeOutput() { 177 outputPrinter.close(); 178 semaphore.release(); 179 } 180 @Override 181 public void error(String line) { 182 errorPrinter.println(line); 183 } 184 @Override 185 public void closeError() { 186 errorPrinter.close(); 187 semaphore.release(); 188 } 189 } 190 191 /** 192 * @return the Settingsproperty for heritrix3 193 */ 194 private static String getSettingsProperty() { 195 StringBuilder settingProperty = new StringBuilder(); 196 for (File file : Settings.getSettingsFiles()) { 197 settingProperty.append(File.pathSeparator); 198 String absolutePath = file.getAbsolutePath(); 199 // check that the settings files not only exist but 200 // are readable 201 boolean readable = new File(absolutePath).canRead(); 202 if (!readable) { 203 final String errMsg = "The file '" + absolutePath 204 + "' is missing. "; 205 log.warn(errMsg); 206 throw new IOFailure("Failed to read file '" + absolutePath 207 + "'"); 208 } 209 settingProperty.append(absolutePath); 210 } 211 if (settingProperty.length() > 0) { 212 // delete last path-separator 213 settingProperty.deleteCharAt(0); 214 } 215 return "-Ddk.netarkivet.settings.file=" + settingProperty; 216 } 217 218 /** 219 * @return the HTTP port used by the Heritrix GUI. 220 */ 221 protected int getGuiPort() { 222 return guiPort; 223 } 224 225 /** 226 * @return the Heritrix files wrapper. 227 */ 228 protected Heritrix3Files getHeritrixFiles() { 229 return files; 230 } 231 232 /** 233 * @return the host name 234 */ 235 protected String getHostName() { 236 return hostName; 237 } 238 239 /** 240 * Get the login name for accessing the Heritrix GUI. This name can be set in the settings.xml file. 241 * 242 * @return Name to use for accessing Heritrix web GUI 243 */ 244 protected String getHeritrixAdminName() { 245 return Settings.get(Heritrix3Settings.HERITRIX_ADMIN_NAME); 246 } 247 248 /** 249 * Get the login password for accessing the Heritrix GUI. This password can be set in the settings.xml file. 250 * 251 * @return Password to use for accessing the Heritrix GUI 252 */ 253 protected String getHeritrixAdminPassword() { 254 return Settings.get(Heritrix3Settings.HERITRIX_ADMIN_PASSWORD); 255 } 256 257 /** 258 * Get a string that describes the current controller in terms of job ID, harvest ID, and crawldir. 259 * 260 * @return A human-readable string describing this controller. 261 */ 262 @Override 263 public String toString() { 264 return "job " + files.getJobID() + " of harvest " + files.getHarvestID() 265 + " in " + files.getCrawlDir(); 266 } 267 268 /** 269 * Return a human-readable description of the job. This will only be visible in the Heritrix GUI. 270 * 271 * @return String containing various information grabbed from HeritrixFiles. 272 */ 273 protected String getJobDescription() { 274 String dedupPart = (files.getIndexDir() != null) ? "with the deduplication index stored in '" 275 + files.getIndexDir().getAbsolutePath() + "'" : "with deduplication disabled"; 276 return "Job " + files.getJobID() + " for harvest " + files.getHarvestID() + " performed in " 277 + files.getCrawlDir() + dedupPart + " and " + FileUtils.countLines(files.getSeedsTxtFile()) + " seeds"; 278 } 279 280 public Heritrix3Files getFiles() { 281 return this.files; 282 } 283 284 private class HeritrixKiller extends Thread { 285 @Override 286 public void run() { 287 stopHeritrix(); 288 } 289 } 290}