Source code

001/*
002 * #%L
003 * Netarchivesuite - harvester
004 * %%
005 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
006 *             the National Library of France and the Austrian National Library.
007 * %%
008 * This program is free software: you can redistribute it and/or modify
009 * it under the terms of the GNU Lesser General Public License as
010 * published by the Free Software Foundation, either version 2.1 of the
011 * License, or (at your option) any later version.
012 * 
013 * This program is distributed in the hope that it will be useful,
014 * but WITHOUT ANY WARRANTY; without even the implied warranty of
015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
016 * GNU General Lesser Public License for more details.
017 * 
018 * You should have received a copy of the GNU General Lesser Public
019 * License along with this program.  If not, see
020 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
021 * #L%
022 */
023
024package dk.netarkivet.harvester.heritrix3.controller;
025
026import dk.netarkivet.common.exceptions.IOFailure;
027
028/**
029 * This interface encapsulates the direct access to Heritrix, allowing for accessing in various ways (direct class
030 * access or JMX). Heritrix is expected to perform one crawl for each instance of an implementing class.
031 */
032public interface IHeritrixController {
033
034    /**
035     * Initialize a new CrawlController for executing a Heritrix crawl. This does not start the crawl.
036     */
037    void initialize();
038
039    /**
040     * Request that Heritrix start crawling. When this method returns, either Heritrix has failed in the early stages,
041     * or the crawljob has been successfully created. Actual crawling will commence at some point hereafter.
042     *
043     * @throws IOFailure If something goes wrong during startup.
044     */
045    void requestCrawlStart() throws IOFailure;
046
047    /**
048     * Tell Heritrix to stop crawling. Heritrix may take a while to actually stop, so you cannot assume that crawling is
049     * stopped when this method returns.
050     */
051    void beginCrawlStop();
052
053    /**
054     * Request that the crawler stops. This makes a call to beginCrawlStop(), unless the crawler is already shutting down.
055     * In that case it does nothing.
056     *
057     * @param reason A human-readable reason the crawl is being stopped.
058     */
059    void requestCrawlStop(String reason);
060
061    /**
062     * Query whether Heritrix is in a state where it can finish crawling. Returns true if no uris remain to be
063     * harvested, or it has met either the maxbytes limit, the document limit, or the time-limit for the current
064     * harvest.
065     *
066     * @return True if Heritrix thinks it is time to stop crawling.
067     */
068    boolean atFinish();
069
070    /**
071     * Returns true if the crawl has ended, either because Heritrix finished or because we terminated it.
072     *
073     * @return True if the CrawlEnded event has happened in Heritrix, indicating that all crawls have stopped.
074     */
075    boolean crawlIsEnded();
076
077    /**
078     * Get the number of currently active ToeThreads (crawler threads).
079     *
080     * @return Number of ToeThreads currently active within Heritrix.
081     */
082    int getActiveToeCount();
083
084    /**
085     * Get the number of URIs currently on the queue to be processed. This number may not be exact and should only be
086     * used in informal texts.
087     *
088     * @return How many URIs Heritrix have lined up for processing.
089     */
090    long getQueuedUriCount();
091
092    /**
093     * Get an estimate of the rate, in kb, at which documents are currently being processed by the crawler.
094     *
095     * @return Number of KB data downloaded by Heritrix over an undefined interval up to now.
096     * @see org.archive.crawler.framework.StatisticsTracking#currentProcessedKBPerSec()
097     */
098    int getCurrentProcessedKBPerSec();
099
100    /**
101     * Get a human-readable set of statistics on the progress of the crawl. The statistics is discovered uris, queued
102     * uris, downloaded uris, doc/s(avg), KB/s(avg), dl-failures, busy-thread, mem-use-KB, heap-size-KB, congestion,
103     * max-depth, avg-depth. If no statistics are available, the string "No statistics available" is returned. Note:
104     * this method may disappear in the future.
105     *
106     * @return Some ascii-formatted statistics on the progress of the crawl.
107     */
108    String getProgressStats();
109
110    /**
111     * Returns true if the crawler has been paused, and thus not supposed to fetch anything. Heritrix may still be
112     * fetching stuff, as it takes some time for it to go into full pause mode. This method can be used as an indicator
113     * that we should not be worried if Heritrix appears to be idle.
114     *
115     * @return True if the crawler has been paused, e.g. by using the Heritrix GUI.
116     */
117    boolean isPaused();
118
119    /**
120     * Release any resources kept by the class.
121     */
122    void cleanup();
123
124    /**
125     * Get harvest information. An example of this can be an URL pointing to the GUI of a running Heritrix process.
126     *
127     * @return information about the harvest process.
128     */
129    String getHarvestInformation();
130
131    /**
132     * Stop the heritrix process.
133     */
134    public void stopHeritrix();
135
136}