001/*
002 * #%L
003 * Netarchivesuite - harvester
004 * %%
005 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
006 *             the National Library of France and the Austrian National Library.
007 * %%
008 * This program is free software: you can redistribute it and/or modify
009 * it under the terms of the GNU Lesser General Public License as
010 * published by the Free Software Foundation, either version 2.1 of the
011 * License, or (at your option) any later version.
012 * 
013 * This program is distributed in the hope that it will be useful,
014 * but WITHOUT ANY WARRANTY; without even the implied warranty of
015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
016 * GNU General Lesser Public License for more details.
017 * 
018 * You should have received a copy of the GNU General Lesser Public
019 * License along with this program.  If not, see
020 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
021 * #L%
022 */
023
024package dk.netarkivet.harvester.harvesting.controller;
025
026import java.util.Collection;
027
028import javax.management.openmbean.CompositeData;
029import javax.management.openmbean.TabularData;
030import javax.management.remote.JMXConnector;
031
032import org.archive.util.JmxUtils;
033import org.slf4j.Logger;
034import org.slf4j.LoggerFactory;
035
036import dk.netarkivet.common.exceptions.IOFailure;
037import dk.netarkivet.common.exceptions.IllegalState;
038import dk.netarkivet.common.utils.JMXUtils;
039import dk.netarkivet.common.utils.Settings;
040import dk.netarkivet.common.utils.SystemUtils;
041import dk.netarkivet.common.utils.TimeUtils;
042import dk.netarkivet.harvester.HarvesterSettings;
043import dk.netarkivet.harvester.harvesting.HeritrixFiles;
044
045/**
046 * This implementation of the HeritrixController interface starts Heritrix as a separate process and uses JMX to
047 * communicate with it. Each instance executes exactly one process that runs exactly one crawl job.
048 *
049 * @deprecated Use the {@link BnfHeritrixController} instead
050 */
051@SuppressWarnings({"unused", "unchecked"})
052public class JMXHeritrixController extends AbstractJMXHeritrixController {
053
054    /** The logger for this class. */
055    private static final Logger log = LoggerFactory.getLogger(JMXHeritrixController.class);
056
057    /*
058     * The below commands and attributes are copied from org.archive.crawler.admin.CrawlJob.
059     * 
060     * @see <A href="http://crawler.archive.org/xref/org/archive/crawler/admin/CrawlJob.html">
061     * org.archive.crawler.admin.CrawlJob</A>
062     * 
063     * These strings are currently not visible from outside the Heritrix class. See
064     * http://webteam.archive.org/jira/browse/HER-1285
065     */
066    /** The command to submit a new crawljob to the Crawlcontroller. */
067    private static final String ADD_JOB_COMMAND = "addJob";
068    /** The command to retrieve progress statistics for the currently running job. */
069    private static final String PROGRESS_STATISTICS_COMMAND = "progressStatistics";
070    /**
071     * The command to retrieve a progress statistics legend for the currently running job.
072     */
073    private static final String PROGRESS_STATISTICS_LEGEND_COMMAND = "progressStatisticsLegend";
074    /**
075     * The attribute for the current download rate in kbytes for the currently running job.
076     */
077    private static final String CURRENT_KB_RATE_ATTRIBUTE = "CurrentKbRate";
078    /** The attribute for the number of currently running process-threads. */
079    private static final String THREAD_COUNT_ATTRIBUTE = "ThreadCount";
080    /**
081     * The attribute for the number of discovered URIs for the currently running job.
082     */
083    private static final String DISCOVERED_COUNT_ATTRIBUTE = "DiscoveredCount";
084    /**
085     * The attribute for the number of downloaded URIs for the currently running job.
086     */
087    private static final String DOWNLOADED_COUNT_ATTRIBUTE = "DownloadedCount";
088    /** The attribute for the status for the currently running job. */
089    private static final String STATUS_ATTRIBUTE = "Status";
090
091    /*
092     * The below commands and attributes are copied from org.archive.crawler.Heritrix
093     * 
094     * @see <A href="http://crawler.archive.org/apidocs/org/archive/crawler/Heritrix.html">
095     * org.archive.crawler.Heritrix</A>
096     * 
097     * These strings are currently not visible from outside the Heritrix class. See
098     * http://webteam.archive.org/jira/browse/HER-1285
099     */
100    /*
101     * Note: The Heritrix JMX interface has two apparent ways to stop crawling: stopCrawling and terminateCurrentJob.
102     * stopCrawling merely makes Heritrix not start any more jobs, but the old jobs continue. Note that if we start
103     * using more than one job at a time, terminateCurrentJob will only stop one job.
104     */
105    /** Command to start crawling. */
106    private static final String START_CRAWLING_COMMAND = "startCrawling";
107    /** Make the currently active (selected?) job stop. */
108    private static final String TERMINATE_CURRENT_JOB_COMMAND = "terminateCurrentJob";
109    /** Command for returning list of pending jobs. */
110    private static final String PENDING_JOBS_COMMAND = "pendingJobs";
111    /** Command for returning list of completed jobs. */
112    private static final String COMPLETED_JOBS_COMMAND = "completedJobs";
113    /** Command for shutting down Heritrix. */
114    private static final String SHUTDOWN_COMMAND = "shutdown";
115
116    /**
117     * The part of the Job MBean name that designates the unique id. For some reason, this is not included in the normal
118     * Heritrix definitions in JmxUtils, otherwise we wouldn't have to define it. I have committed a feature request:
119     * http://webteam.archive.org/jira/browse/HER-1618
120     */
121    private static final String UID_PROPERTY = "uid";
122
123    /**
124     * The name that Heritrix gives to the job we ask it to create. This is part of the name of the MBean for that job,
125     * but we can only retrieve the name after the MBean has been created.
126     */
127    private String jobName;
128
129    /** The header line (legend) for the statistics report. */
130    private String progressStatisticsLegend;
131
132    /*
133     * The possible values of a request of the status attribute. Copied from private values in {@link
134     * org.archive.crawler.framework.CrawlController}
135     * 
136     * These strings are currently not visible from outside the CrawlController class. See
137     * http://webteam.archive.org/jira/browse/HER-1285
138     */
139    /** The 'NASCENT' status. */
140    // private static final String NASCENT_STATUS = "NASCENT";
141    /** The 'RUNNING' status. */
142    // private static final String RUNNING_STATUS = "RUNNING";
143    /** The 'PAUSED' status. */
144    private static final String PAUSED_STATUS = "PAUSED";
145    /** The 'PAUSING' status. */
146    private static final String PAUSING_STATUS = "PAUSING";
147    /** The 'CHECKPOINTING' status. */
148    // private static final String CHECKPOINTING_STATUS = "CHECKPOINTING";
149    /** The 'STOPPING' status. */
150    // private static final String STOPPING_STATUS = "STOPPING";
151    /** The 'FINISHED' status. */
152    private static final String FINISHED_STATUS = "FINISHED";
153    /** The 'STARTED status. */
154    // private static final String STARTED_STATUS = "STARTED";
155    /** The 'PREPARING' status. */
156    // private static final String PREPARING_STATUS = "PREPARING";
157    /** The 'Illegal State' status. */
158    private static final String ILLEGAL_STATUS = "Illegal State";
159
160    /**
161     * Create a JMXHeritrixController object.
162     *
163     * @param files Files that are used to set up Heritrix.
164     */
165    public JMXHeritrixController(HeritrixFiles files) {
166        super(files);
167    }
168
169    /**
170     * @throws IOFailure If Heritrix dies before initialization, or we encounter any problems during the initialization.
171     * @see HeritrixController#initialize()
172     */
173    public void initialize() {
174        if (processHasExited()) {
175            String errMsg = "Heritrix process of " + this + " died before initialization";
176            log.warn(errMsg);
177            throw new IOFailure(errMsg);
178        }
179        // We want to be sure there are no jobs when starting, in case we got
180        // an old Heritrix or somebody added jobs behind our back.
181        TabularData doneJobs = (TabularData) executeHeritrixCommand(COMPLETED_JOBS_COMMAND);
182        TabularData pendingJobs = (TabularData) executeHeritrixCommand(PENDING_JOBS_COMMAND);
183        if (doneJobs != null && doneJobs.size() > 0 || pendingJobs != null && pendingJobs.size() > 0) {
184            throw new IllegalState("This Heritrix instance is in a illegalState! "
185                    + "This instance has either old done jobs (" + doneJobs + "), or old pending jobs (" + pendingJobs
186                    + ").");
187        }
188        // From here on, we can assume there's only the one job we make.
189        // We'll use the arc file prefix to name the job, since the prefix
190        // already contains the harvest id and job id.
191        HeritrixFiles files = getHeritrixFiles();
192        executeHeritrixCommand(ADD_JOB_COMMAND, files.getOrderXmlFile().getAbsolutePath(),
193                files.getArchiveFilePrefix(), getJobDescription(), files.getSeedsTxtFile().getAbsolutePath());
194        jobName = getJobName();
195        initializeProgressStatisticsLegend();
196    }
197
198    /**
199     * @throws IOFailure if unable to communicate with Heritrix
200     * @see HeritrixController#requestCrawlStart()
201     */
202    public void requestCrawlStart() {
203        executeHeritrixCommand(START_CRAWLING_COMMAND);
204    }
205
206    /** @see HeritrixController#atFinish() */
207    public boolean atFinish() {
208        return crawlIsEnded();
209    }
210
211    /**
212     * @throws IOFailure if unable to communicate with Heritrix
213     * @see HeritrixController#beginCrawlStop()
214     */
215    public void beginCrawlStop() {
216        executeHeritrixCommand(TERMINATE_CURRENT_JOB_COMMAND);
217    }
218
219    /** @see HeritrixController#getActiveToeCount() */
220    public int getActiveToeCount() {
221        Integer activeToeCount = (Integer) getCrawlJobAttribute(THREAD_COUNT_ATTRIBUTE);
222        if (activeToeCount == null) {
223            return 0;
224        }
225        return activeToeCount;
226    }
227
228    /** @see HeritrixController#requestCrawlStop(String) */
229    public void requestCrawlStop(String reason) {
230        if (!atFinish()) {
231            beginCrawlStop();
232        }
233    }
234
235    /**
236     * @see HeritrixController#getQueuedUriCount()
237     */
238    public long getQueuedUriCount() {
239        /*
240         * Implementation note: This count is not as precise as what StatisticsTracker could provide, but it's presently
241         * only used in a warning in the HeritrixLauncher.doCrawlLoop() method.
242         */
243        Long discoveredUris = (Long) getCrawlJobAttribute(DISCOVERED_COUNT_ATTRIBUTE);
244        Long downloadedUris = (Long) getCrawlJobAttribute(DOWNLOADED_COUNT_ATTRIBUTE);
245        if (discoveredUris == null) {
246            return 0;
247        }
248        if (downloadedUris == null) {
249            return discoveredUris;
250        }
251        return discoveredUris - downloadedUris;
252    }
253
254    /** @see HeritrixController#getCurrentProcessedKBPerSec() */
255    public int getCurrentProcessedKBPerSec() {
256        Long currentDownloadRate = (Long) getCrawlJobAttribute(CURRENT_KB_RATE_ATTRIBUTE);
257        if (currentDownloadRate == null) {
258            return 0;
259        }
260        return currentDownloadRate.intValue();
261    }
262
263    /** @see HeritrixController#getProgressStats() */
264    public String getProgressStats() {
265        String status = (String) getCrawlJobAttribute(STATUS_ATTRIBUTE);
266
267        if (status == null) {
268            status = "NO STATUS";
269        }
270
271        String progressStatistics = (String) executeCrawlJobCommand(PROGRESS_STATISTICS_COMMAND);
272
273        if (progressStatistics == null) {
274            progressStatistics = "No progress statistics available";
275        } else {
276            // Since progressStatisticsLegend acts as a latch, we can check
277            // for non-null even though it gets assigned asynchronously.
278            if (progressStatisticsLegend != null) {
279                progressStatistics = progressStatisticsLegend + '\n' + progressStatistics;
280            }
281        }
282        return status + " " + progressStatistics;
283    }
284
285    /** Store the statistics legend line (asynchronously). */
286    private void initializeProgressStatisticsLegend() {
287        new Thread() {
288            public void run() {
289                progressStatisticsLegend = (String) executeCrawlJobCommand(PROGRESS_STATISTICS_LEGEND_COMMAND);
290            }
291        }.start();
292    }
293
294    /** @see HeritrixController#isPaused() */
295    public boolean isPaused() {
296        String status = (String) getCrawlJobAttribute(STATUS_ATTRIBUTE);
297        log.debug("Heritrix state: '{}'", status);
298        // Either Pausing or Paused in case of not null
299        return status != null && (status.equals(PAUSED_STATUS) || status.equals(PAUSING_STATUS));
300    }
301
302    /**
303     * Check if the crawl has ended, either because Heritrix finished of its own, or because we terminated it.
304     *
305     * @return True if the crawl has ended, either because Heritrix finished or because we terminated it. Otherwise we
306     * return false.
307     * @see HeritrixController#crawlIsEnded()
308     */
309    public synchronized boolean crawlIsEnded() {
310        // End of crawl can be seen in one of three ways:
311        // 1) The Heritrix process has exited.
312        // 2) The job has been moved to the completed jobs list in Heritrix.
313        // 3) The job is in one of the FINISHED states.
314        if (processHasExited()) {
315            return true;
316        }
317        TabularData jobs = (TabularData) executeHeritrixCommand(COMPLETED_JOBS_COMMAND);
318        if (jobs != null && jobs.size() > 0) {
319            for (CompositeData value : (Collection<CompositeData>) jobs.values()) {
320                String thisJobID = value.get(JmxUtils.NAME) + "-" + value.get(UID_PROPERTY);
321                if (thisJobID.equals(jobName)) {
322                    return true;
323                }
324            }
325        }
326        String status = (String) getCrawlJobAttribute(STATUS_ATTRIBUTE);
327        return status == null || status.equals(FINISHED_STATUS) || status.equals(ILLEGAL_STATUS);
328    }
329
330    /**
331     * Cleanup after an Heritrix process. This entails sending the shutdown command to the Heritrix process, and killing
332     * it forcefully, if it is still alive after waiting the period of time specified by the
333     * CommonSettings.PROCESS_TIMEOUT setting.
334     *
335     * @see HeritrixController#cleanup()
336     */
337    public void cleanup() {
338        try {
339            executeHeritrixCommand(SHUTDOWN_COMMAND);
340        } catch (IOFailure e) {
341            log.error("JMX error while cleaning up Heritrix controller", e);
342        }
343
344        waitForHeritrixProcessExit();
345    }
346
347    /**
348     * Return the URL for monitoring this instance.
349     *
350     * @return the URL for monitoring this instance.
351     */
352    public String getHarvestInformation() {
353        return "http://" + SystemUtils.getLocalHostName() + ":" + getGUIPort();
354    }
355
356    /**
357     * Get the name of the one job we let this Heritrix run. The handling of done jobs depends on Heritrix not being in
358     * crawl. This call may take several seconds to finish.
359     *
360     * @return The name of the one job that Heritrix has.
361     * @throws IOFailure if the job created failed to initialize or didn't appear in time.
362     * @throws IllegalState if more than one job in done list, or more than one pending job
363     */
364    private String getJobName() {
365        /*
366         * This is called just after we've told Heritrix to create a job. It may take a while before the job is actually
367         * created, so we have to wait around a bit.
368         */
369        TabularData pendingJobs = null;
370        TabularData doneJobs;
371        int retries = 0;
372        while (retries++ < JMXUtils.getMaxTries()) {
373            // If the job turns up in Heritrix' pending jobs list, it's ready
374            pendingJobs = (TabularData) executeHeritrixCommand(PENDING_JOBS_COMMAND);
375            if (pendingJobs != null && pendingJobs.size() > 0) {
376                break; // It's ready, we can move on.
377            }
378
379            // If there's an error in the job configuration, the job will be put
380            // in Heritrix' completed jobs list.
381            doneJobs = (TabularData) executeHeritrixCommand(COMPLETED_JOBS_COMMAND);
382            if (doneJobs != null && doneJobs.size() >= 1) {
383                // Since we haven't allowed Heritrix to start any crawls yet,
384                // the only way the job could have ended and then put into
385                // the list of completed jobs is by error.
386                if (doneJobs.size() > 1) {
387                    throw new IllegalState("More than one job in done list: " + doneJobs);
388                } else {
389                    CompositeData job = JMXUtils.getOneCompositeData(doneJobs);
390                    throw new IOFailure("Job " + job + " failed: " + job.get(STATUS_ATTRIBUTE));
391                }
392            }
393            if (retries < JMXUtils.getMaxTries()) {
394                TimeUtils.exponentialBackoffSleep(retries);
395            }
396        }
397        // If all went well, we now have exactly one job in the pending
398        // jobs list.
399        if (pendingJobs == null || pendingJobs.size() == 0) {
400            throw new IOFailure("Heritrix has not created a job after " + (Math.pow(2, JMXUtils.getMaxTries()) / 1000)
401                    + " seconds, giving up.");
402        } else if (pendingJobs.size() > 1) {
403            throw new IllegalState("More than one pending job: " + pendingJobs);
404        } else {
405            // Note that we may actually get through to here even if the job
406            // is malformed. The job will then die as soon as we tell it to
407            // start crawling.
408            CompositeData job = JMXUtils.getOneCompositeData(pendingJobs);
409            String name = job.get(JmxUtils.NAME) + "-" + job.get(UID_PROPERTY);
410            log.info("Heritrix created a job with name {}", name);
411            return name;
412        }
413    }
414
415    /**
416     * Get the name to use for logging on to Heritrix' JMX with full control. The name cannot be set by the user.
417     *
418     * @return Name to use when connecting to Heritrix JMX
419     */
420    private String getJMXAdminName() {
421        String jmxUsername = Settings.get(HarvesterSettings.HERITRIX_JMX_USERNAME);
422        log.debug("The JMX username used for connecting to the Heritrix GUI is: '{}'.", jmxUsername);
423        return jmxUsername;
424    }
425
426    /**
427     * Get the password to use to access the Heritrix JMX as the user returned by getJMXAdminName(). This password can
428     * be set in a file pointed to in settings.xml.
429     *
430     * @return Password for accessing Heritrix JMX
431     */
432    private String getJMXAdminPassword() {
433        return Settings.get(HarvesterSettings.HERITRIX_JMX_PASSWORD);
434    }
435
436    /**
437     * Get the port to use for Heritrix JMX, as set in settings.xml.
438     *
439     * @return Port that Heritrix will expose its JMX interface on.
440     */
441    private int getJMXPort() {
442        return Settings.getInt(HarvesterSettings.HERITRIX_JMX_PORT);
443    }
444
445    /**
446     * Get the port to use for Heritrix GUI, as set in settings.xml.
447     *
448     * @return Port that Heritrix will expose its web interface on.
449     */
450    private int getGUIPort() {
451        return Settings.getInt(HarvesterSettings.HERITRIX_GUI_PORT);
452    }
453
454    /**
455     * Execute a command for the Heritrix process we're running.
456     *
457     * @param command The command to execute.
458     * @param arguments Any arguments to the command. These arguments can only be of String type.
459     * @return Whatever object was returned by the JMX invocation.
460     */
461    private Object executeHeritrixCommand(String command, String... arguments) {
462        return JMXUtils.executeCommand(getHeritrixJMXConnector(), getHeritrixBeanName(), command, arguments);
463    }
464
465    /**
466     * Execute a command for the Heritrix job. This must only be called after initialize() has been run.
467     *
468     * @param command The command to execute.
469     * @param arguments Any arguments to the command. These arguments can only be of String type.
470     * @return Whatever object was returned by the JMX invocation.
471     */
472    private Object executeCrawlJobCommand(String command, String... arguments) {
473        return JMXUtils.executeCommand(getHeritrixJMXConnector(), getCrawlJobBeanName(), command, arguments);
474    }
475
476    /**
477     * Get an attribute of the Heritrix process we're running.
478     *
479     * @param attribute The attribute to get.
480     * @return The value of the attribute.
481     */
482    private Object getHeritrixAttribute(String attribute) {
483        return JMXUtils.getAttribute(getHeritrixJMXConnector(), getHeritrixBeanName(), attribute);
484    }
485
486    /**
487     * Get an attribute of the Heritrix job. This must only be called after initialize() has been run.
488     *
489     * @param attribute The attribute to get.
490     * @return The value of the attribute.
491     */
492    private Object getCrawlJobAttribute(String attribute) {
493        return JMXUtils.getAttribute(getHeritrixJMXConnector(), getCrawlJobBeanName(), attribute);
494    }
495
496    /**
497     * Get the name for the main bean of the Heritrix instance.
498     *
499     * @return Bean name, to be passed into JMXUtils#getBeanName(String)
500     */
501    private String getHeritrixBeanName() {
502        return "org.archive.crawler:" + JmxUtils.NAME + "=Heritrix," + JmxUtils.TYPE + "=CrawlService,"
503                + JmxUtils.JMX_PORT + "=" + getJMXPort() + "," + JmxUtils.GUI_PORT + "=" + getGUIPort() + ","
504                + JmxUtils.HOST + "=" + getHostName();
505
506    }
507
508    /**
509     * Get the name for the bean of a single job. This bean does not exist until after a job has been created using
510     * initialize().
511     *
512     * @return Bean name, to be passed into JMXUtils#getBeanName(String)
513     */
514    private String getCrawlJobBeanName() {
515        return "org.archive.crawler:" + JmxUtils.NAME + "=" + jobName + "," + JmxUtils.TYPE + "=CrawlService.Job,"
516                + JmxUtils.JMX_PORT + "=" + getJMXPort() + "," + JmxUtils.MOTHER + "=Heritrix," + JmxUtils.HOST + "="
517                + getHostName();
518    }
519
520    /**
521     * Get the JMX connector to Heritrix.
522     *
523     * @return A connector that connects to a local Heritrix instance.
524     */
525    private JMXConnector getHeritrixJMXConnector() {
526        return JMXUtils.getJMXConnector(SystemUtils.LOCALHOST, getJMXPort(), getJMXAdminName(), getJMXAdminPassword());
527    }
528
529}