001/*
002 * #%L
003 * Netarchivesuite - harvester
004 * %%
005 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
006 *             the National Library of France and the Austrian National Library.
007 * %%
008 * This program is free software: you can redistribute it and/or modify
009 * it under the terms of the GNU Lesser General Public License as
010 * published by the Free Software Foundation, either version 2.1 of the
011 * License, or (at your option) any later version.
012 * 
013 * This program is distributed in the hope that it will be useful,
014 * but WITHOUT ANY WARRANTY; without even the implied warranty of
015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
016 * GNU General Lesser Public License for more details.
017 * 
018 * You should have received a copy of the GNU General Lesser Public
019 * License along with this program.  If not, see
020 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
021 * #L%
022 */
023package dk.netarkivet.harvester.harvesting.controller;
024
025import java.io.File;
026import java.io.FileWriter;
027import java.io.FilenameFilter;
028import java.io.IOException;
029import java.io.PrintWriter;
030import java.util.ArrayList;
031import java.util.Arrays;
032import java.util.Collections;
033import java.util.Comparator;
034import java.util.HashSet;
035import java.util.LinkedList;
036import java.util.List;
037import java.util.Map;
038import java.util.Properties;
039import java.util.Set;
040
041import org.archive.crawler.Heritrix;
042import org.slf4j.Logger;
043import org.slf4j.LoggerFactory;
044
045import dk.netarkivet.common.CommonSettings;
046import dk.netarkivet.common.exceptions.ArgumentNotValid;
047import dk.netarkivet.common.exceptions.IOFailure;
048import dk.netarkivet.common.utils.FileUtils;
049import dk.netarkivet.common.utils.JMXUtils;
050import dk.netarkivet.common.utils.NotificationType;
051import dk.netarkivet.common.utils.NotificationsFactory;
052import dk.netarkivet.common.utils.ProcessUtils;
053import dk.netarkivet.common.utils.Settings;
054import dk.netarkivet.common.utils.StringUtils;
055import dk.netarkivet.common.utils.SystemUtils;
056import dk.netarkivet.common.utils.TimeUtils;
057import dk.netarkivet.harvester.HarvesterSettings;
058import dk.netarkivet.harvester.harvesting.HeritrixFiles;
059
060/**
061 * Abstract base class for JMX-based Heritrix controllers.
062 */
063@SuppressWarnings({"rawtypes"})
064public abstract class AbstractJMXHeritrixController implements HeritrixController {
065
066    /** The logger for this class. */
067    private static final Logger log = LoggerFactory.getLogger(AbstractJMXHeritrixController.class);
068
069    /** File path Separator. Used to separate the jar-files in the classpath. */
070    private static final String FILE_PATH_SEPARATOR = ":";
071
072    /** How long we're willing to wait for Heritrix to shutdown in a shutdown hook. */
073    private static final long SHUTDOWN_HOOK_MAX_WAIT = 1000L;
074
075    /** The various files used by Heritrix. */
076    private final HeritrixFiles files;
077
078    /** The threads used to collect process output. Only one thread used presently. */
079    private Set<Thread> collectionThreads = new HashSet<Thread>(1);
080
081    /** The host name for this machine that matches what Heritrix uses in its MBean names. */
082    private final String hostName;
083
084    /** The port to use for Heritrix JMX, as set in settings.xml. */
085    private final int jmxPort = Settings.getInt(HarvesterSettings.HERITRIX_JMX_PORT);
086
087    /** The port to use for Heritrix GUI, as set in settings.xml. */
088    private final int guiPort = Settings.getInt(HarvesterSettings.HERITRIX_GUI_PORT);
089
090    /**
091     * The shutdownHook that takes care of killing our process. This is removed in cleanup() when the process is shut
092     * down.
093     */
094    private Thread processKillerHook;
095
096    /**
097     * The one-shot Heritrix process created in the constructor. It will only perform a single crawl before being shut
098     * down.
099     */
100    private final Process heritrixProcess;
101
102    /**
103     * Create a BnfHeritrixController object.
104     *
105     * @param files Files that are used to set up Heritrix.
106     */
107    public AbstractJMXHeritrixController(HeritrixFiles files) {
108        ArgumentNotValid.checkNotNull(files, "HeritrixFile files");
109        this.files = files;
110
111        SystemUtils.checkPortNotUsed(guiPort);
112        SystemUtils.checkPortNotUsed(jmxPort);
113
114        hostName = SystemUtils.getLocalHostName();
115
116        try {
117            log.info("Starting Heritrix for {}", this);
118            /*
119             * To start Heritrix, we need to do the following (taken from the Heritrix startup shell script): - set
120             * heritrix.home to base dir of Heritrix stuff - set com.sun.management.jmxremote.port to JMX port - set
121             * com.sun.management.jmxremote.ssl to false - set com.sun.management.jmxremote.password.file to JMX
122             * password file - set heritrix.out to heritrix_out.log - set java.protocol.handler.pkgs=org.archive.net -
123             * send processOutput & stderr into heritrix.out - let the Heritrix GUI-webserver listen on all available
124             * network interfaces: This is done with argument "--bind /" (default is 127.0.0.1) - listen on a specific
125             * port using the port argument: --port <GUI port>
126             * 
127             * We also need to output something like the following to heritrix.out: `date Starting heritrix uname -a
128             * java -version JAVA_OPTS ulimit -a
129             */
130            File heritrixOutputFile = files.getHeritrixOutput();
131            StringBuilder settingProperty = new StringBuilder();
132            for (File file : Settings.getSettingsFiles()) {
133                settingProperty.append(File.pathSeparator);
134
135                String absolutePath = file.getAbsolutePath();
136                // check that the settings files not only exist but
137                // are readable
138                boolean readable = new File(absolutePath).canRead();
139                if (!readable) {
140                    log.warn("The file '{}' is missing.", absolutePath);
141                    throw new IOFailure("Failed to read file '" + absolutePath + "'");
142                }
143                settingProperty.append(absolutePath);
144            }
145            if (settingProperty.length() > 0) {
146                // delete last path-separator
147                settingProperty.deleteCharAt(0);
148            }
149
150            List<String> allOpts = new LinkedList<String>();
151            allOpts.add(new File(new File(System.getProperty("java.home"), "bin"), "java").getAbsolutePath());
152            allOpts.add("-Xmx" + Settings.get(HarvesterSettings.HERITRIX_HEAP_SIZE));
153            allOpts.add("-Dheritrix.home=" + files.getCrawlDir().getAbsolutePath());
154
155            String jvmOptsStr = Settings.get(HarvesterSettings.HERITRIX_JVM_OPTS);
156            if ((jvmOptsStr != null) && (!jvmOptsStr.isEmpty())) {
157                String[] add = jvmOptsStr.split(" ");
158                allOpts.addAll(Arrays.asList(add));
159            }
160
161            allOpts.add("-Dcom.sun.management.jmxremote.port=" + jmxPort);
162            allOpts.add("-Dcom.sun.management.jmxremote.ssl=false");
163            // check that JMX password and access files are readable.
164            // TODO This should probably be extracted to a method?
165            File passwordFile = files.getJmxPasswordFile();
166            String pwAbsolutePath = passwordFile.getAbsolutePath();
167            if (!passwordFile.canRead()) {
168                final String errMsg = "Failed to read the password file '" + pwAbsolutePath + "'. "
169                        + "It is possibly missing.";
170                log.warn(errMsg);
171                throw new IOFailure(errMsg);
172            }
173            File accessFile = files.getJmxAccessFile();
174            String acAbsolutePath = accessFile.getAbsolutePath();
175            if (!accessFile.canRead()) {
176                final String errMsg = "Failed to read the access file '" + acAbsolutePath + "'. "
177                        + "It is possibly missing.";
178                log.warn(errMsg);
179                throw new IOFailure(errMsg);
180            }
181            allOpts.add("-Dcom.sun.management.jmxremote.password.file=" + new File(pwAbsolutePath));
182            allOpts.add("-Dcom.sun.management.jmxremote.access.file=" + new File(acAbsolutePath));
183            allOpts.add("-Dheritrix.out=" + heritrixOutputFile.getAbsolutePath());
184            allOpts.add("-Djava.protocol.handler.pkgs=org.archive.net");
185            allOpts.add("-Ddk.netarkivet.settings.file=" + settingProperty);
186            allOpts.add(Heritrix.class.getName());
187            allOpts.add("--bind");
188            allOpts.add("/");
189            allOpts.add("--port=" + guiPort);
190            allOpts.add("--admin=" + getHeritrixAdminName() + ":" + getHeritrixAdminPassword());
191
192            String[] args = allOpts.toArray(new String[allOpts.size()]);
193            log.info("Starting Heritrix process with args" + Arrays.toString(args));
194            log.debug("The JMX timeout is set to " + TimeUtils.readableTimeInterval(JMXUtils.getJmxTimeout()));
195
196            ProcessBuilder builder = new ProcessBuilder(args);
197
198            updateEnvironment(builder.environment());
199            FileUtils.copyDirectory(new File("lib/heritrix"), files.getCrawlDir());
200            builder.directory(files.getCrawlDir());
201            builder.redirectErrorStream(true);
202            writeSystemInfo(heritrixOutputFile, builder);
203            FileUtils.appendToFile(heritrixOutputFile, "Working directory: " + files.getCrawlDir());
204            addProcessKillerHook();
205            heritrixProcess = builder.start();
206            ProcessUtils.writeProcessOutput(heritrixProcess.getInputStream(), heritrixOutputFile, collectionThreads);
207        } catch (IOException e) {
208            throw new IOFailure("Error starting Heritrix process", e);
209        }
210    }
211
212    /**
213     * @return the JMX port for communicating with Heritrix.
214     */
215    protected int getJmxPort() {
216        return jmxPort;
217    }
218
219    /**
220     * @return the HTTP port used by the Heritrix GUI.
221     */
222    protected int getGuiPort() {
223        return guiPort;
224    }
225
226    /**
227     * @return the Heritrix files wrapper.
228     */
229    protected HeritrixFiles getHeritrixFiles() {
230        return files;
231    }
232
233    /**
234     * @return the host name
235     */
236    protected String getHostName() {
237        return hostName;
238    }
239
240    /**
241     * Get the login name for accessing the Heritrix GUI. This name can be set in the settings.xml file.
242     *
243     * @return Name to use for accessing Heritrix web GUI
244     */
245    private String getHeritrixAdminName() {
246        return Settings.get(HarvesterSettings.HERITRIX_ADMIN_NAME);
247    }
248
249    /**
250     * Get the login password for accessing the Heritrix GUI. This password can be set in the settings.xml file.
251     *
252     * @return Password to use for accessing the Heritrix GUI
253     */
254    private String getHeritrixAdminPassword() {
255        return Settings.get(HarvesterSettings.HERITRIX_ADMIN_PASSWORD);
256    }
257
258    /**
259     * Change an environment to be suitable for running Heritrix.
260     * <p>
261     * At the moment, this involves the following:
262     * <p>
263     * Prepend the Jar files from the lib/heritrix/lib dir to the classpath. Make sure the Heritrix jar file is at the
264     * front.
265     *
266     * @param environment The environment from a process builder
267     * @throws IOFailure If a Heritrix jarfile is not found.
268     */
269    private static void updateEnvironment(Map<String, String> environment) {
270        List<String> classPathParts = SystemUtils.getCurrentClasspath();
271        File heritrixLibDir = new File("lib/heritrix/lib");
272        File[] jars = heritrixLibDir.listFiles(new FilenameFilter() {
273            public boolean accept(File file, String string) {
274                return string.endsWith(".jar");
275            }
276        });
277
278        // FIXME: Dirty hack around heretrix jars being elsewhere /tra
279        if (jars == null) {
280            jars = new File[0];
281        }
282
283        // Reverse sort the file list in order to add in alphabetical order
284        // before the basic jars.
285        Arrays.sort(jars, new Comparator<File>() {
286            public int compare(File file, File file1) {
287                return file1.compareTo(file);
288            }
289        });
290        String heritixJar = null;
291        for (File lib : jars) {
292            final String jarPath = new File(heritrixLibDir, lib.getName()).getAbsolutePath();
293            if (lib.getName().startsWith("heritrix-")) {
294                // Heritrix should be at the very head, as it redefines some
295                // of the functions in its dependencies (!). Thus, we have to
296                // save it for later insertion at the head.
297                heritixJar = jarPath;
298            } else {
299                classPathParts.add(0, jarPath);
300            }
301        }
302        if (heritixJar != null) {
303            classPathParts.add(0, heritixJar);
304        } else {
305            throw new IOFailure("Heritrix jar file not found");
306        }
307        environment.put("CLASSPATH", StringUtils.conjoin(FILE_PATH_SEPARATOR, classPathParts));
308    }
309
310    /**
311     * Write various info on the system we're using into the given file. This info will later get put into metadata for
312     * the crawl.
313     *
314     * @param outputFile A file to write to.
315     * @param builder The ProcessBuilder being used to start the Heritrix process
316     */
317    @SuppressWarnings("unchecked")
318    private void writeSystemInfo(File outputFile, ProcessBuilder builder) {
319        PrintWriter writer = null;
320        try {
321            writer = new PrintWriter(new FileWriter(outputFile));
322            writer.println("The Heritrix process is started in the following"
323                    + " environment\n (note that some entries will be" + " changed by the starting JVM):");
324            Map<String, String> env = builder.environment();
325            List<String> keyList = new ArrayList<String>(env.keySet());
326            Collections.sort(keyList);
327            for (String key : keyList) {
328                writer.println(key + "=" + env.get(key));
329            }
330            writer.println("Process properties:");
331            Properties properties = System.getProperties();
332            keyList = new ArrayList<String>((Set) properties.keySet());
333            Collections.sort(keyList);
334            for (String key : keyList) {
335                writer.println(key + "=" + properties.get(key));
336            }
337        } catch (IOException e) {
338            log.warn("Error writing basic properties to output file.", e);
339        } finally {
340            if (writer != null) {
341                writer.close();
342            }
343        }
344    }
345
346    /**
347     * Add a shutdown hook that kills the process we've created. Since this hook will be run only in case of JVM
348     * shutdown, it cannot expect that the standard logging framework is still usable, and therefore writes to stdout
349     * instead.
350     */
351    private void addProcessKillerHook() {
352        // Make sure that the process gets killed at the very end, at least
353        processKillerHook = new Thread() {
354            public void run() {
355                try {
356                    // Only non-blocking way to check for process liveness
357                    int exitValue = heritrixProcess.exitValue();
358                    System.out.println("Heritrix process of " + this + " exited with exit code " + exitValue);
359                } catch (IllegalThreadStateException e) {
360                    // Process is still alive, kill it.
361                    System.out.println("Killing process of " + this);
362                    heritrixProcess.destroy();
363                    final Integer exitValue = ProcessUtils.waitFor(heritrixProcess, SHUTDOWN_HOOK_MAX_WAIT);
364                    if (exitValue != null) {
365                        System.out.println("Process of " + this + " returned exit code " + exitValue);
366                    } else {
367                        System.out.println("Process of " + this + " never exited!");
368                    }
369                }
370            }
371        };
372        Runtime.getRuntime().addShutdownHook(processKillerHook);
373    }
374
375    /**
376     * Get a string that describes the current controller in terms of job ID, harvest ID, and crawldir.
377     *
378     * @return A human-readable string describing this controller.
379     */
380    @Override
381    public String toString() {
382        if (heritrixProcess != null) {
383            return "job " + files.getJobID() + " of harvest " + files.getHarvestID() + " in " + files.getCrawlDir()
384                    + " running process " + heritrixProcess;
385        } else {
386            return "job " + files.getJobID() + " of harvest " + files.getHarvestID() + " in " + files.getCrawlDir();
387        }
388    }
389
390    /**
391     * Return true if the Heritrix process has exited, logging the exit value if so.
392     *
393     * @return True if the process has exited.
394     */
395    protected boolean processHasExited() {
396        // First check if the process has exited already
397        try {
398            int exitValue = heritrixProcess.exitValue();
399            log.info("Process of {} returned exit code {}", this, exitValue);
400            return true;
401        } catch (IllegalThreadStateException e) {
402            // Not exited yet, that's fine
403        }
404        return false;
405    }
406
407    /**
408     * Waits for the Heritrix process to exit.
409     */
410    protected void waitForHeritrixProcessExit() {
411        final long maxWait = Settings.getLong(CommonSettings.PROCESS_TIMEOUT);
412        final int maxJmxRetries = JMXUtils.getMaxTries();
413        Integer exitValue = ProcessUtils.waitFor(heritrixProcess, maxWait);
414        if (exitValue != null) {
415            log.info("Heritrix process of {} exited with exit code {}", this, exitValue);
416        } else {
417            log.warn("Heritrix process of {} not dead after {} millis, killing it", this, maxWait);
418            heritrixProcess.destroy();
419            exitValue = ProcessUtils.waitFor(heritrixProcess, maxWait);
420            if (exitValue != null) {
421                log.info("Heritrix process of {} exited with exit code {}", this, exitValue);
422            } else {
423                // If it's not dead now, there's little we can do.
424                log.error("Heritrix process of {} not dead after destroy. Exiting harvest controller. "
425                        + "Make sure you kill the runaway Heritrix before you restart.", this);
426                NotificationsFactory
427                        .getInstance()
428                        .notify("Heritrix process of "
429                                + this
430                                + " not dead after destroy. "
431                                + "Exiting harvest controller. Make sure you kill the runaway Heritrix before you restart.",
432                                NotificationType.ERROR);
433                System.exit(1);
434            }
435        }
436        Runtime.getRuntime().removeShutdownHook(processKillerHook);
437        // Wait until all collection threads are dead or until we have
438        // tried JMXUtils.MAX_TRIES times.
439        int attempt = 0;
440        do {
441            boolean anyAlive = false;
442            for (Thread t : collectionThreads) {
443                if (t.isAlive()) {
444                    anyAlive = true;
445                }
446            }
447            if (!anyAlive) {
448                break;
449            }
450            TimeUtils.exponentialBackoffSleep(attempt);
451        } while (attempt++ < maxJmxRetries);
452    }
453
454    /**
455     * Return a human-readable description of the job. This will only be visible in the Heritrix GUI.
456     *
457     * @return String containing various information grabbed from HeritrixFiles.
458     */
459    protected String getJobDescription() {
460        String dedupPart = (files.getIndexDir() != null) ? "with the deduplication index stored in '"
461                + files.getIndexDir().getAbsolutePath() + "'" : "with deduplication disabled";
462        return "Job " + files.getJobID() + " for harvest " + files.getHarvestID() + " performed in "
463                + files.getCrawlDir() + dedupPart + " and " + FileUtils.countLines(files.getSeedsTxtFile()) + " seeds";
464    }
465
466    public HeritrixFiles getFiles() {
467        return this.files;
468    }
469
470}