001/*
002 * #%L
003 * Netarchivesuite - harvester
004 * %%
005 * Copyright (C) 2005 - 2018 The Royal Danish Library,
006 *             the National Library of France and the Austrian National Library.
007 * %%
008 * This program is free software: you can redistribute it and/or modify
009 * it under the terms of the GNU Lesser General Public License as
010 * published by the Free Software Foundation, either version 2.1 of the
011 * License, or (at your option) any later version.
012 *
013 * This program is distributed in the hope that it will be useful,
014 * but WITHOUT ANY WARRANTY; without even the implied warranty of
015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
016 * GNU General Lesser Public License for more details.
017 *
018 * You should have received a copy of the GNU General Lesser Public
019 * License along with this program.  If not, see
020 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
021 * #L%
022 */
023package dk.netarkivet.harvester.heritrix3.controller;
024
025import java.io.File;
026import java.io.IOException;
027import java.io.PrintWriter;
028import java.util.concurrent.Semaphore;
029
030import org.netarchivesuite.heritrix3wrapper.CommandLauncher;
031import org.netarchivesuite.heritrix3wrapper.Heritrix3Wrapper;
032import org.netarchivesuite.heritrix3wrapper.LaunchResultHandlerAbstract;
033import org.netarchivesuite.heritrix3wrapper.unzip.UnzipUtils;
034import org.slf4j.Logger;
035import org.slf4j.LoggerFactory;
036
037import dk.netarkivet.common.exceptions.ArgumentNotValid;
038import dk.netarkivet.common.exceptions.IOFailure;
039import dk.netarkivet.common.utils.FileUtils;
040import dk.netarkivet.common.utils.Settings;
041import dk.netarkivet.common.utils.StringUtils;
042import dk.netarkivet.common.utils.SystemUtils;
043import dk.netarkivet.harvester.heritrix3.BlockingCommandLauncher;
044import dk.netarkivet.harvester.heritrix3.Heritrix3Files;
045import dk.netarkivet.harvester.heritrix3.Heritrix3Settings;
046
047/**
048 * Abstract base class for REST-based Heritrix controllers.
049 */
050public abstract class AbstractRestHeritrixController implements IHeritrixController {
051
052    /** The logger for this class. */
053    private static final Logger log = LoggerFactory.getLogger(AbstractRestHeritrixController.class);
054
055    /** The various files used by Heritrix. */
056    protected final Heritrix3Files files;
057
058    protected Heritrix3Wrapper h3wrapper;
059    protected CommandLauncher h3launcher;
060    protected LaunchResultHandlerAbstract h3handler;
061    protected PrintWriter outputPrinter;
062    protected PrintWriter errorPrinter;
063    protected File heritrixBaseDir;
064
065    /** The host name for this machine that matches what Heritrix uses in its MBean names. */
066    private final String hostName;
067
068    /** The port to use for Heritrix GUI, as set in settings.xml. */
069    private final int guiPort = Settings.getInt(Heritrix3Settings.HERITRIX_GUI_PORT);
070
071    /**
072     * Create a AbstractRestHeritrixController  object.
073     *
074     * @param files Files that are used to set up Heritrix.
075     */
076    public AbstractRestHeritrixController(Heritrix3Files files) {
077        ArgumentNotValid.checkNotNull(files, "Heritrix3Files files");
078        this.files = files;
079        SystemUtils.checkPortNotUsed(guiPort);
080
081        if (Settings.getBoolean(Heritrix3Settings.UMBRA_IS_ENABLED)) {
082            executeUmbraStartHook();
083        }
084
085        hostName = SystemUtils.getLocalHostName();
086        try {
087            log.info("Starting Heritrix for {} in crawldir {}", this, files.getCrawlDir());
088            String zipFileStr = files.getHeritrixZip().getAbsolutePath();
089
090            heritrixBaseDir = files.getHeritrixBaseDir();
091            if (!heritrixBaseDir.isDirectory()) {
092                heritrixBaseDir.mkdirs();
093            }
094            if (!heritrixBaseDir.isDirectory()) {
095                throw new IOFailure("Unable to create heritrixbasedir: " + heritrixBaseDir.getAbsolutePath() );
096            }
097
098            log.debug("Unzipping heritrix into the crawldir");
099            UnzipUtils.unzip(zipFileStr, 1, heritrixBaseDir.getAbsolutePath());
100
101            if (files.getCertificateFile() != null) {
102                log.debug("Copying override keystore into heritrix dir");
103                Heritrix3Wrapper.copyFileAs(files.getCertificateFile(), heritrixBaseDir, "h3server.jks");
104            }
105
106            /** The bin/heritrix script should read the following environment-variables:
107             *
108             * JAVA_HOME Point at a JDK install to use  
109             *
110             * HERITRIX_HOME    Pointer to your heritrix install.  If not present, we 
111             *                  make an educated guess based of position relative to this
112             *                  script.
113             *
114             * HERITRIX_OUT     Pathname to the Heritrix log file written when run in
115             *                  daemon mode.
116             *                  Default setting is $HERITRIX_HOME/heritrix_out.log
117             *
118             * JAVA_OPTS        Java runtime options.  Default setting is '-Xmx256m'.
119             *
120             * FOREGROUND      
121             */
122            String[] cmd = {
123                    "./bin/heritrix",
124                    "-b",
125                    hostName,
126                    "-p ",
127                    Integer.toString(guiPort),
128                    "-a ",
129                    getHeritrixAdminName() + ":" + getHeritrixAdminPassword(),
130                    "-s",
131                    "h3server.jks,h3server,h3server"
132            };
133            log.info("Starting Heritrix3 with the following arguments:{} ",
134                    StringUtils.conjoin(" ", cmd));
135            h3launcher = CommandLauncher.getInstance();
136            h3launcher.init(heritrixBaseDir, cmd);
137            h3launcher.env.put("FOREGROUND", "true");
138            log.info(".. and setting FOREGROUND to 'true'");
139            String javaOpts = "";
140            String jvmOptsStr = Settings.get(Heritrix3Settings.HERITRIX_JVM_OPTS);
141            if ((jvmOptsStr != null) && (!jvmOptsStr.isEmpty())) {
142                javaOpts = " " + jvmOptsStr;
143            }
144            String javaOptsValue = "-Xmx" + Settings.get(Heritrix3Settings.HERITRIX_HEAP_SIZE) + " " + javaOpts + " " +  getSettingsProperty();
145            h3launcher.env.put("JAVA_OPTS", javaOptsValue);
146            log.info(".. and setting JAVA_OPTS to '{}'", javaOptsValue);
147            String heritrixOutValue = files.getHeritrixOutput().getAbsolutePath();
148            h3launcher.env.put("HERITRIX_OUT", heritrixOutValue);
149            log.info(".. and setting HERITRIX_OUT to '{}'", heritrixOutValue);
150
151            outputPrinter = new PrintWriter(files.getHeritrixStdoutLog(), "UTF-8");
152            errorPrinter = new PrintWriter(files.getHeritrixStderrLog(), "UTF-8");
153            log.info(".. and setting output from heritrix3 to '{}', and errors to '{}'", files.getHeritrixStdoutLog(),files.getHeritrixStderrLog() );
154            h3handler = new LaunchResultHandler(outputPrinter, errorPrinter);
155            h3launcher.start(h3handler);
156            Runtime.getRuntime().addShutdownHook(new HeritrixKiller());
157            //            TODO HERE WE SHOULD DRAIN THE UMBRA QUEUE AFTER SHUTDOWN HOOK
158            log.info("Heritrix3 engine launched successfully");
159        } catch( Throwable e) {
160            String errMsg = "Unexpected error while launching H3: ";
161            log.debug(errMsg, e);
162            throw new IOFailure(errMsg, e);
163        }
164    }
165
166    private void executeUmbraStartHook() {
167        String umbraScript = Settings.get(Heritrix3Settings.UMBRA_PRESTART_SCRIPT);
168        log.info("Executing umbra hook script {}", umbraScript);
169        BlockingCommandLauncher scriptLauncher = new BlockingCommandLauncher(new File(System.getProperty("user.dir")), umbraScript.trim().split("\\s+"));
170        try {
171            scriptLauncher.start(new LaunchResultHandlerAbstract() {
172                @Override public void exitValue(int exitValue) {
173                    if (exitValue == 0) {
174                        log.info("Umbra hook {} ended with exit value {}", umbraScript, exitValue);
175                    } else {
176                        log.error("Umbra hook {} ended with exit value {}", umbraScript, exitValue);
177                    }
178                }
179
180                @Override public void output(String line) {
181                     log.info("Output from {}: {}", umbraScript, line);
182                }
183
184                @Override public void closeOutput() {
185                     log.info("Finished reading standard out from umbra hook {}", umbraScript);
186                }
187
188                @Override public void error(String line) {
189                    log.warn("Error output from {}: {}", umbraScript, line);
190                }
191
192                @Override public void closeError() {
193                    log.info("Finished reading standard err from umbra hook {}", umbraScript);
194                }
195            });
196        } catch (IOException e) {
197            log.error("Exception executing umbra hook script {}.", umbraScript, e);
198        }
199
200    }
201
202    /**
203     * Implementation of a LaunchResultHandler for Heritrix3. 
204     *
205     */
206    public static class LaunchResultHandler implements LaunchResultHandlerAbstract {
207        protected Semaphore semaphore = new Semaphore(-2);
208        protected PrintWriter outputPrinter;
209        protected PrintWriter errorPrinter;
210        public LaunchResultHandler(PrintWriter outputPrinter, PrintWriter errorPrinter) {
211            this.outputPrinter = outputPrinter;
212            this.errorPrinter = errorPrinter;
213        }
214        @Override
215        public void exitValue(int exitValue) {
216            semaphore.release();
217            if (exitValue != 0) {
218                log.error("Heritrix3 engine shutdown failed. ExitValue =  {}", exitValue);
219            } else {
220                log.info("Heritrix3 engine shutdown was successful. ExitValue =  {}", exitValue);
221            }
222        }
223        @Override
224        public void output(String line) {
225            outputPrinter.println(line);
226        }
227        @Override
228        public void closeOutput() {
229            outputPrinter.close();
230            semaphore.release();
231        }
232        @Override
233        public void error(String line) {
234            errorPrinter.println(line);
235        }
236        @Override
237        public void closeError() {
238            errorPrinter.close();
239            semaphore.release();
240        }
241    }
242
243    /**
244     * @return the Settingsproperty for heritrix3
245     */
246    private static String getSettingsProperty() {
247        StringBuilder settingProperty = new StringBuilder();
248        for (File file : Settings.getSettingsFiles()) {
249            settingProperty.append(File.pathSeparator);
250            String absolutePath = file.getAbsolutePath();
251            // check that the settings files not only exist but
252            // are readable
253            boolean readable = new File(absolutePath).canRead();
254            if (!readable) {
255                final String errMsg = "The file '" + absolutePath
256                        + "' is missing. ";
257                log.warn(errMsg);
258                throw new IOFailure("Failed to read file '" + absolutePath
259                        + "'");
260            }
261            settingProperty.append(absolutePath);
262        }
263        if (settingProperty.length() > 0) {
264            // delete last path-separator
265            settingProperty.deleteCharAt(0);
266        }
267        return "-Ddk.netarkivet.settings.file=" + settingProperty;
268    }
269
270    /**
271     * @return the HTTP port used by the Heritrix3 GUI.
272     */
273    protected int getGuiPort() {
274        return guiPort;
275    }
276
277    /**
278     * @return the Heritrix3 files wrapper.
279     */
280    protected Heritrix3Files getHeritrixFiles() {
281        return files;
282    }
283
284    /**
285     * @return the host name
286     */
287    protected String getHostName() {
288        return hostName;
289    }
290
291    /**
292     * Get the login name for accessing the Heritrix3 GUI. This name can be set in the settings.xml file.
293     *
294     * @return Name to use for accessing Heritrix3 web GUI
295     */
296    protected String getHeritrixAdminName() {
297        return Settings.get(Heritrix3Settings.HERITRIX_ADMIN_NAME);
298    }
299
300    /**
301     * Get the login password for accessing the Heritrix3 GUI. This password can be set in the settings.xml file.
302     *
303     * @return Password to use for accessing the Heritrix3 GUI
304     */
305    protected String getHeritrixAdminPassword() {
306        return Settings.get(Heritrix3Settings.HERITRIX_ADMIN_PASSWORD);
307    }
308
309    /**
310     * Get a string that describes the current controller in terms of job ID, harvest ID, and crawldir.
311     *
312     * @return A human-readable string describing this controller.
313     */
314    @Override
315    public String toString() {
316        return "job " + files.getJobID() + " of harvest " + files.getHarvestID()
317                + " in " + files.getCrawlDir();
318    }
319
320    /**
321     * Return a human-readable description of the job. This will only be visible in the Heritrix GUI.
322     *
323     * @return String containing various information grabbed from HeritrixFiles.
324     */
325    protected String getJobDescription() {
326        String dedupPart = (files.getIndexDir() != null) ? "with the deduplication index stored in '"
327                + files.getIndexDir().getAbsolutePath() + "'" : "with deduplication disabled";
328        return "Job " + files.getJobID() + " for harvest " + files.getHarvestID() + " performed in "
329                + files.getCrawlDir() + dedupPart + " and " + FileUtils.countLines(files.getSeedsTxtFile()) + " seeds";
330    }
331
332    public Heritrix3Files getFiles() {
333        return this.files;
334    }
335
336    private class HeritrixKiller extends Thread {
337        @Override
338        public void run() {
339            stopHeritrix();
340        }
341    }
342}