001/*
002 * #%L
003 * Netarchivesuite - harvester
004 * %%
005 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
006 *             the National Library of France and the Austrian National Library.
007 * %%
008 * This program is free software: you can redistribute it and/or modify
009 * it under the terms of the GNU Lesser General Public License as
010 * published by the Free Software Foundation, either version 2.1 of the
011 * License, or (at your option) any later version.
012 * 
013 * This program is distributed in the hope that it will be useful,
014 * but WITHOUT ANY WARRANTY; without even the implied warranty of
015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
016 * GNU General Lesser Public License for more details.
017 * 
018 * You should have received a copy of the GNU General Lesser Public
019 * License along with this program.  If not, see
020 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
021 * #L%
022 */
023package dk.netarkivet.harvester.heritrix3.controller;
024
025import java.io.File;
026import java.io.PrintWriter;
027import java.util.concurrent.Semaphore;
028
029import org.netarchivesuite.heritrix3wrapper.CommandLauncher;
030import org.netarchivesuite.heritrix3wrapper.Heritrix3Wrapper;
031import org.netarchivesuite.heritrix3wrapper.LaunchResultHandlerAbstract;
032import org.netarchivesuite.heritrix3wrapper.unzip.UnzipUtils;
033import org.slf4j.Logger;
034import org.slf4j.LoggerFactory;
035
036import dk.netarkivet.common.exceptions.ArgumentNotValid;
037import dk.netarkivet.common.exceptions.IOFailure;
038import dk.netarkivet.common.utils.FileUtils;
039import dk.netarkivet.common.utils.Settings;
040import dk.netarkivet.common.utils.StringUtils;
041import dk.netarkivet.common.utils.SystemUtils;
042import dk.netarkivet.harvester.heritrix3.Heritrix3Files;
043import dk.netarkivet.harvester.heritrix3.Heritrix3Settings;
044
045/**
046 * Abstract base class for REST-based Heritrix controllers.
047 */
048public abstract class AbstractRestHeritrixController implements IHeritrixController {
049
    /** The logger for this class. */
    private static final Logger log = LoggerFactory.getLogger(AbstractRestHeritrixController.class);

    /** The various files used by Heritrix. */
    protected final Heritrix3Files files;

    /**
     * Handle to the Heritrix3 wrapper. It is not assigned anywhere in this class;
     * presumably subclasses initialise it - TODO confirm.
     */
    protected Heritrix3Wrapper h3wrapper;
    /** Launcher for the external Heritrix3 process; initialised and started by the constructor. */
    protected CommandLauncher h3launcher;
    /** Handler passed to the launcher when the process is started; created by the constructor. */
    protected LaunchResultHandlerAbstract h3handler;
    /** Writer for the file returned by {@code files.getHeritrixStdoutLog()}; opened by the constructor. */
    protected PrintWriter outputPrinter;
    /** Writer for the file returned by {@code files.getHeritrixStderrLog()}; opened by the constructor. */
    protected PrintWriter errorPrinter; 
    /** Directory the Heritrix3 zip is unpacked into and from which the launcher is initialised. */
    protected File heritrixBaseDir;
    
    /** The host name for this machine that matches what Heritrix uses in its MBean names. */
    private final String hostName;

    /** The port to use for Heritrix GUI, as set in settings.xml. */
    private final int guiPort = Settings.getInt(Heritrix3Settings.HERITRIX_GUI_PORT);
068
069   /**
070     * Create a AbstractRestHeritrixController  object.
071     *
072     * @param files Files that are used to set up Heritrix.
073     */
074    public AbstractRestHeritrixController(Heritrix3Files files) {
075        ArgumentNotValid.checkNotNull(files, "Heritrix3Files files");
076        this.files = files;
077        SystemUtils.checkPortNotUsed(guiPort);
078        
079        hostName = SystemUtils.getLocalHostName();
080        try {
081            log.info("Starting Heritrix for {} in crawldir {}", this, files.getCrawlDir());
082            String zipFileStr = files.getHeritrixZip().getAbsolutePath();
083
084            heritrixBaseDir = files.getHeritrixBaseDir();
085            if (!heritrixBaseDir.isDirectory()) {
086                heritrixBaseDir.mkdirs();
087            }
088            if (!heritrixBaseDir.isDirectory()) {
089                throw new IOFailure("Unable to create heritrixbasedir: " + heritrixBaseDir.getAbsolutePath() );
090            }
091
092            log.debug("Unzipping heritrix into the crawldir");
093            UnzipUtils.unzip(zipFileStr, 1, heritrixBaseDir.getAbsolutePath());
094
095            if (files.getCertificateFile() != null) {
096                log.debug("Copying override keystore into heritrix dir");
097                Heritrix3Wrapper.copyFileAs(files.getCertificateFile(), heritrixBaseDir, "h3server.jks");
098            }
099
100            /** The bin/heritrix script should read the following environment-variables:
101             * 
102             * JAVA_HOME Point at a JDK install to use  
103             * 
104             * HERITRIX_HOME    Pointer to your heritrix install.  If not present, we 
105             *                  make an educated guess based of position relative to this
106             *                  script.
107             *
108             * HERITRIX_OUT     Pathname to the Heritrix log file written when run in
109             *                  daemon mode.
110             *                  Default setting is $HERITRIX_HOME/heritrix_out.log
111             *
112             * JAVA_OPTS        Java runtime options.  Default setting is '-Xmx256m'.
113             *
114             * FOREGROUND      
115             */
116            String[] cmd = {
117                    "./bin/heritrix",
118                    "-b",
119                    hostName,
120                    "-p ",
121                    Integer.toString(guiPort),
122                    "-a ",
123                    getHeritrixAdminName() + ":" + getHeritrixAdminPassword(),
124                    "-s",
125                    "h3server.jks,h3server,h3server"
126            };
127            log.info("Starting Heritrix3 with the following arguments:{} ", 
128                        StringUtils.conjoin(" ", cmd));
129            h3launcher = CommandLauncher.getInstance();
130            h3launcher.init(heritrixBaseDir, cmd);
131            h3launcher.env.put("FOREGROUND", "true");
132            log.info(".. and setting FOREGROUND to 'true'");
133            String javaOpts = "";
134            String jvmOptsStr = Settings.get(Heritrix3Settings.HERITRIX_JVM_OPTS);
135            if ((jvmOptsStr != null) && (!jvmOptsStr.isEmpty())) {
136                javaOpts = " " + jvmOptsStr;
137            }
138            String javaOptsValue = "-Xmx" + Settings.get(Heritrix3Settings.HERITRIX_HEAP_SIZE) + " " + javaOpts + " " +  getSettingsProperty();
139            h3launcher.env.put("JAVA_OPTS", javaOptsValue);
140            log.info(".. and setting JAVA_OPTS to '{}'", javaOptsValue);
141            String heritrixOutValue = files.getHeritrixOutput().getAbsolutePath();
142            h3launcher.env.put("HERITRIX_OUT", heritrixOutValue);
143            log.info(".. and setting HERITRIX_OUT to '{}'", heritrixOutValue);
144            
145            outputPrinter = new PrintWriter(files.getHeritrixStdoutLog(), "UTF-8");
146            errorPrinter = new PrintWriter(files.getHeritrixStderrLog(), "UTF-8");
147            h3handler = new LaunchResultHandler(outputPrinter, errorPrinter);
148            h3launcher.start(h3handler);
149            Runtime.getRuntime().addShutdownHook(new HeritrixKiller());
150            log.info("Heritrix3 launched successfully");
151        } catch( Throwable e) {
152                String errMsg = "Unexpected error while launching H3: ";
153                log.debug(errMsg, e);
154                throw new IOFailure(errMsg, e);
155        }
156    }
157
158    public static class LaunchResultHandler implements LaunchResultHandlerAbstract {
159        protected Semaphore semaphore = new Semaphore(-2);
160        protected PrintWriter outputPrinter;
161        protected PrintWriter errorPrinter;
162        public LaunchResultHandler(PrintWriter outputPrinter, PrintWriter errorPrinter) {
163                this.outputPrinter = outputPrinter;
164                this.errorPrinter = errorPrinter;
165        }
166        @Override
167        public void exitValue(int exitValue) {
168                semaphore.release();
169                log.info("Heritrix3 exitValue=: {}", exitValue);
170            }
171        @Override
172        public void output(String line) {
173                outputPrinter.println(line);
174        }
175        @Override
176        public void closeOutput() {
177                outputPrinter.close();
178                semaphore.release();
179        }
180        @Override
181        public void error(String line) {
182                errorPrinter.println(line);
183        }
184        @Override
185        public void closeError() {
186                errorPrinter.close();
187                semaphore.release();
188        }
189    }
190
191    /**
192     * @return the Settingsproperty for heritrix3
193     */
194    private static String getSettingsProperty() {
195        StringBuilder settingProperty = new StringBuilder();
196        for (File file : Settings.getSettingsFiles()) {
197                settingProperty.append(File.pathSeparator);
198                String absolutePath = file.getAbsolutePath();
199                // check that the settings files not only exist but
200                // are readable
201                boolean readable = new File(absolutePath).canRead();
202                if (!readable) {
203                        final String errMsg = "The file '" + absolutePath
204                                        + "' is missing. ";
205                        log.warn(errMsg);
206                        throw new IOFailure("Failed to read file '" + absolutePath
207                                        + "'");
208                }
209                settingProperty.append(absolutePath);
210        }
211        if (settingProperty.length() > 0) {
212                // delete last path-separator
213                settingProperty.deleteCharAt(0);
214        }
215        return "-Ddk.netarkivet.settings.file=" + settingProperty;
216    }
217    
218    /**
219     * @return the HTTP port used by the Heritrix GUI.
220     */
221    protected int getGuiPort() {
222        return guiPort;
223    }
224
225    /**
226     * @return the Heritrix files wrapper.
227     */
228    protected Heritrix3Files getHeritrixFiles() {
229        return files;
230    }
231
232    /**
233     * @return the host name
234     */
235    protected String getHostName() {
236        return hostName;
237    }
238
239    /**
240     * Get the login name for accessing the Heritrix GUI. This name can be set in the settings.xml file.
241     *
242     * @return Name to use for accessing Heritrix web GUI
243     */
244    protected String getHeritrixAdminName() {
245        return Settings.get(Heritrix3Settings.HERITRIX_ADMIN_NAME);
246    }
247
248    /**
249     * Get the login password for accessing the Heritrix GUI. This password can be set in the settings.xml file.
250     *
251     * @return Password to use for accessing the Heritrix GUI
252     */
253    protected String getHeritrixAdminPassword() {
254        return Settings.get(Heritrix3Settings.HERITRIX_ADMIN_PASSWORD);
255    }
256
257    /**
258     * Get a string that describes the current controller in terms of job ID, harvest ID, and crawldir.
259     *
260     * @return A human-readable string describing this controller.
261     */
262    @Override
263    public String toString() {
264            return "job " + files.getJobID() + " of harvest " + files.getHarvestID() 
265                        + " in " + files.getCrawlDir();
266    }
267
268    /**
269     * Return a human-readable description of the job. This will only be visible in the Heritrix GUI.
270     *
271     * @return String containing various information grabbed from HeritrixFiles.
272     */
273    protected String getJobDescription() {
274        String dedupPart = (files.getIndexDir() != null) ? "with the deduplication index stored in '"
275                + files.getIndexDir().getAbsolutePath() + "'" : "with deduplication disabled";
276        return "Job " + files.getJobID() + " for harvest " + files.getHarvestID() + " performed in "
277                + files.getCrawlDir() + dedupPart + " and " + FileUtils.countLines(files.getSeedsTxtFile()) + " seeds";
278    }
279
280    public Heritrix3Files getFiles() {
281        return this.files;
282    }
283
284    private class HeritrixKiller extends Thread {
285        @Override
286        public void run() {
287            stopHeritrix();
288        }
289    }
290}