001/* 002 * #%L 003 * Netarchivesuite - harvester 004 * %% 005 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library, 006 * the National Library of France and the Austrian National Library. 007 * %% 008 * This program is free software: you can redistribute it and/or modify 009 * it under the terms of the GNU Lesser General Public License as 010 * published by the Free Software Foundation, either version 2.1 of the 011 * License, or (at your option) any later version. 012 * 013 * This program is distributed in the hope that it will be useful, 014 * but WITHOUT ANY WARRANTY; without even the implied warranty of 015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 016 * GNU General Lesser Public License for more details. 017 * 018 * You should have received a copy of the GNU General Lesser Public 019 * License along with this program. If not, see 020 * <http://www.gnu.org/licenses/lgpl-2.1.html>. 021 * #L% 022 */ 023package dk.netarkivet.harvester.harvesting.controller; 024 025import org.slf4j.Logger; 026import org.slf4j.LoggerFactory; 027 028import dk.netarkivet.common.exceptions.ArgumentNotValid; 029import dk.netarkivet.common.exceptions.IOFailure; 030import dk.netarkivet.common.utils.Settings; 031import dk.netarkivet.harvester.HarvesterSettings; 032import dk.netarkivet.harvester.harvesting.HeritrixFiles; 033import dk.netarkivet.harvester.harvesting.HeritrixLauncher; 034 035/** 036 * Default implementation of the crawl control. 037 */ 038public class DefaultHeritrixLauncher extends HeritrixLauncher { 039 040 /** The class logger. */ 041 private static final Logger log = LoggerFactory.getLogger(DefaultHeritrixLauncher.class); 042 043 /** Number of milliseconds in a second. */ 044 private static final int MILLIS_PER_SECOND = 1000; 045 046 /** How long to wait before aborting a request from a webserver. */ 047 private static long timeOutInMillisReceivedData = Long.parseLong(Settings 048 .get(HarvesterSettings.CRAWLER_TIMEOUT_NON_RESPONDING)) * MILLIS_PER_SECOND; 049 050 /** How long to wait without any activity before aborting the harvest. */ 051 private static long timeOutInMillis = Long.parseLong(Settings.get(HarvesterSettings.INACTIVITY_TIMEOUT_IN_SECS)) 052 * MILLIS_PER_SECOND; 053 /** The HeritrixController instance used by the HeritrixLauncher. */ 054 private HeritrixController heritrixController; 055 056 /** 057 * Constructor for the DefaultHeritrixLauncher. 058 * 059 * @param files the Heritrix configuration. 060 * @throws ArgumentNotValid 061 */ 062 private DefaultHeritrixLauncher(HeritrixFiles files) throws ArgumentNotValid { 063 super(files); 064 } 065 066 /** 067 * Get instance of this class. 068 * 069 * @param files Object encapsulating location of Heritrix crawldir and configuration files 070 * @return {@link DefaultHeritrixLauncher} object 071 * @throws ArgumentNotValid If either order.xml or seeds.txt does not exist, or argument files is null. 072 */ 073 public static DefaultHeritrixLauncher getInstance(HeritrixFiles files) throws ArgumentNotValid { 074 ArgumentNotValid.checkNotNull(files, "HeritrixFiles files"); 075 return new DefaultHeritrixLauncher(files); 076 } 077 078 /** 079 * This method launches heritrix in the following way:</br> 1. copies the orderfile and the seedsfile to current 080 * working directory. </br> 2. sets up the newly created copy of the orderfile </br> 3. starts the crawler </br> 4. 081 * stops the crawler (Either when heritrix has finished crawling, or when heritrix is forcefully stopped due to 082 * inactivity). </p> The exit from the while-loop depends on Heritrix calling the crawlEnded() method, when the 083 * crawling is finished. This method is called from the HarvestControllerServer.onDoOneCrawl() method. 084 * 085 * @throws IOFailure - if the order.xml is invalid if unable to initialize Heritrix CrawlController if Heritrix 086 * process interrupted 087 */ 088 public void doCrawl() throws IOFailure { 089 setupOrderfile(getHeritrixFiles()); 090 heritrixController = HeritrixControllerFactory.getDefaultHeritrixController(getControllerArguments()); 091 try { 092 // Initialize Heritrix settings according to the order.xml 093 heritrixController.initialize(); 094 log.debug("Starting crawl.."); 095 heritrixController.requestCrawlStart(); 096 if (heritrixController.atFinish()) { 097 heritrixController.beginCrawlStop(); 098 } else { 099 doCrawlLoop(); 100 } 101 } catch (IOFailure e) { 102 log.warn("Error during initialisation of crawl", e); 103 throw (e); 104 } catch (Exception e) { 105 log.warn("Exception during crawl", e); 106 throw new RuntimeException("Exception during crawl", e); 107 } finally { 108 if (heritrixController != null) { 109 heritrixController.cleanup(); 110 } 111 } 112 log.debug("Heritrix is finished crawling..."); 113 } 114 115 /** 116 * Monitors the crawling performed by Heritrix. Regularly checks whether any progress is made. If no progress has 117 * been made for too long, the crawl is ended. 118 * 119 * @throws IOFailure if the call to HeritrixController.requestCrawlStop() fails. Other failures in calls to the 120 * controller are caught and logged. 121 */ 122 private void doCrawlLoop() throws IOFailure { 123 String errorMessage = "Non-fatal I/O error while communicating with Heritrix during crawl"; 124 long lastNonZeroActiveQueuesTime = System.currentTimeMillis(); 125 long lastTimeReceivedData = System.currentTimeMillis(); 126 boolean crawlIsEnded = false; 127 try { 128 crawlIsEnded = heritrixController.crawlIsEnded(); 129 } catch (IOFailure e) { 130 log.debug(errorMessage, e); 131 } 132 while (!crawlIsEnded) { 133 String harvestInformation = null; 134 String progressStats = null; 135 try { 136 harvestInformation = heritrixController.getHarvestInformation(); 137 progressStats = heritrixController.getProgressStats(); 138 } catch (IOFailure e) { 139 log.debug(errorMessage, e); 140 } 141 142 HeritrixFiles files = getHeritrixFiles(); 143 log.info("Job ID: {}, Harvest ID: {}, {}\n{}", files.getJobID(), files.getHarvestID(), harvestInformation, 144 ((progressStats == null) ? "" : progressStats)); 145 // Note that we don't check for timeout while paused. 146 int processedKBPerSec = 0; 147 boolean paused = false; 148 try { 149 processedKBPerSec = heritrixController.getCurrentProcessedKBPerSec(); 150 paused = heritrixController.isPaused(); 151 } catch (IOFailure e) { 152 log.debug(errorMessage, e); 153 } 154 if (processedKBPerSec > 0 || paused) { 155 lastTimeReceivedData = System.currentTimeMillis(); 156 } 157 int activeToeCount = 0; 158 paused = false; 159 try { 160 activeToeCount = heritrixController.getActiveToeCount(); 161 paused = heritrixController.isPaused(); 162 } catch (IOFailure e) { 163 log.debug(errorMessage, e); 164 } 165 if (activeToeCount > 0 || paused) { 166 lastNonZeroActiveQueuesTime = System.currentTimeMillis(); 167 } 168 if ((lastNonZeroActiveQueuesTime + timeOutInMillis < System.currentTimeMillis()) 169 || (lastTimeReceivedData + timeOutInMillisReceivedData < System.currentTimeMillis())) { 170 final double noActiveQueuesTimeoutInSeconds = timeOutInMillis / 1000.0; 171 final double noDataReceivedTimeoutInSeconds = timeOutInMillisReceivedData / 1000.0; 172 long queuedUriCount = 0; 173 try { 174 queuedUriCount = heritrixController.getQueuedUriCount(); 175 } catch (IOFailure e) { 176 log.debug(errorMessage, e); 177 } 178 long ctm = System.currentTimeMillis(); 179 log.warn( 180 "Aborting crawl because of inactivity. No active queues for the last {} seconds " 181 + "(timeout is {} seconds).No traffic for the last {} seconds (timeout is {} seconds). URLs in queue:{}", 182 ((ctm - lastNonZeroActiveQueuesTime) / 1000.0), noActiveQueuesTimeoutInSeconds, 183 ((ctm - lastTimeReceivedData) / 1000.0), noDataReceivedTimeoutInSeconds, queuedUriCount); 184 // The following is the only controller command exception we 185 // don't catch here. Otherwise we might loop forever. 186 heritrixController.requestCrawlStop("Aborting because of inactivity"); 187 } 188 189 // Optimization: don't wait if ended since beginning of the loop 190 try { 191 crawlIsEnded = heritrixController.crawlIsEnded(); 192 } catch (IOFailure e) { 193 log.debug(errorMessage, e); 194 } 195 if (!crawlIsEnded) { 196 try { 197 /* 198 * Wait for heritrix to do something. WAIT_PERIOD is the interval between checks of whether we have 199 * passed timeouts. Note that timeouts are defined in the settings, while WAIT_PERIOD (being less 200 * relevant to the user) is defined in this class. 201 */ 202 synchronized (this) { 203 wait(1000 * CRAWL_CONTROL_WAIT_PERIOD); 204 } 205 } catch (InterruptedException e) { 206 log.trace("Waiting thread awoken: {}", e.getMessage(), e); 207 } 208 } 209 } // end of while (!crawlIsEnded) 210 } 211 212}