001/* 002 * #%L 003 * Netarchivesuite - harvester 004 * %% 005 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library, 006 * the National Library of France and the Austrian National Library. 007 * %% 008 * This program is free software: you can redistribute it and/or modify 009 * it under the terms of the GNU Lesser General Public License as 010 * published by the Free Software Foundation, either version 2.1 of the 011 * License, or (at your option) any later version. 012 * 013 * This program is distributed in the hope that it will be useful, 014 * but WITHOUT ANY WARRANTY; without even the implied warranty of 015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 016 * GNU General Lesser Public License for more details. 017 * 018 * You should have received a copy of the GNU General Lesser Public 019 * License along with this program. If not, see 020 * <http://www.gnu.org/licenses/lgpl-2.1.html>. 021 * #L% 022 */ 023 024package dk.netarkivet.harvester.heritrix3.controller; 025 026import dk.netarkivet.common.exceptions.IOFailure; 027 028/** 029 * This interface encapsulates the direct access to Heritrix, allowing for accessing in various ways (direct class 030 * access or JMX). Heritrix is expected to perform one crawl for each instance of an implementing class. 031 */ 032public interface IHeritrixController { 033 034 /** 035 * Initialize a new CrawlController for executing a Heritrix crawl. This does not start the crawl. 036 */ 037 void initialize(); 038 039 /** 040 * Request that Heritrix start crawling. When this method returns, either Heritrix has failed in the early stages, 041 * or the crawljob has been successfully created. Actual crawling will commence at some point hereafter. 042 * 043 * @throws IOFailure If something goes wrong during startup. 044 */ 045 void requestCrawlStart() throws IOFailure; 046 047 /** 048 * Tell Heritrix to stop crawling. Heritrix may take a while to actually stop, so you cannot assume that crawling is 049 * stopped when this method returns. 050 */ 051 void beginCrawlStop(); 052 053 /** 054 * Request that the crawler stops. This makes a call to beginCrawlStop(), unless the crawler is already shutting down. 055 * In that case it does nothing. 056 * 057 * @param reason A human-readable reason the crawl is being stopped. 058 */ 059 void requestCrawlStop(String reason); 060 061 /** 062 * Query whether Heritrix is in a state where it can finish crawling. Returns true if no uris remain to be 063 * harvested, or it has met either the maxbytes limit, the document limit, or the time-limit for the current 064 * harvest. 065 * 066 * @return True if Heritrix thinks it is time to stop crawling. 067 */ 068 boolean atFinish(); 069 070 /** 071 * Returns true if the crawl has ended, either because Heritrix finished or because we terminated it. 072 * 073 * @return True if the CrawlEnded event has happened in Heritrix, indicating that all crawls have stopped. 074 */ 075 boolean crawlIsEnded(); 076 077 /** 078 * Get the number of currently active ToeThreads (crawler threads). 079 * 080 * @return Number of ToeThreads currently active within Heritrix. 081 */ 082 int getActiveToeCount(); 083 084 /** 085 * Get the number of URIs currently on the queue to be processed. This number may not be exact and should only be 086 * used in informal texts. 087 * 088 * @return How many URIs Heritrix have lined up for processing. 089 */ 090 long getQueuedUriCount(); 091 092 /** 093 * Get an estimate of the rate, in kb, at which documents are currently being processed by the crawler. 094 * 095 * @return Number of KB data downloaded by Heritrix over an undefined interval up to now. 096 * @see org.archive.crawler.framework.StatisticsTracking#currentProcessedKBPerSec() 097 */ 098 int getCurrentProcessedKBPerSec(); 099 100 /** 101 * Get a human-readable set of statistics on the progress of the crawl. The statistics is discovered uris, queued 102 * uris, downloaded uris, doc/s(avg), KB/s(avg), dl-failures, busy-thread, mem-use-KB, heap-size-KB, congestion, 103 * max-depth, avg-depth. If no statistics are available, the string "No statistics available" is returned. Note: 104 * this method may disappear in the future. 105 * 106 * @return Some ascii-formatted statistics on the progress of the crawl. 107 */ 108 String getProgressStats(); 109 110 /** 111 * Returns true if the crawler has been paused, and thus not supposed to fetch anything. Heritrix may still be 112 * fetching stuff, as it takes some time for it to go into full pause mode. This method can be used as an indicator 113 * that we should not be worried if Heritrix appears to be idle. 114 * 115 * @return True if the crawler has been paused, e.g. by using the Heritrix GUI. 116 */ 117 boolean isPaused(); 118 119 /** 120 * Release any resources kept by the class. 121 */ 122 void cleanup(); 123 124 /** 125 * Get harvest information. An example of this can be an URL pointing to the GUI of a running Heritrix process. 126 * 127 * @return information about the harvest process. 128 */ 129 String getHarvestInformation(); 130 131 /** 132 * Stop the heritrix process. 133 */ 134 public void stopHeritrix(); 135 136}