001/* 002 * #%L 003 * Netarchivesuite - harvester 004 * %% 005 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library, 006 * the National Library of France and the Austrian National Library. 007 * %% 008 * This program is free software: you can redistribute it and/or modify 009 * it under the terms of the GNU Lesser General Public License as 010 * published by the Free Software Foundation, either version 2.1 of the 011 * License, or (at your option) any later version. 012 * 013 * This program is distributed in the hope that it will be useful, 014 * but WITHOUT ANY WARRANTY; without even the implied warranty of 015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 016 * GNU General Lesser Public License for more details. 017 * 018 * You should have received a copy of the GNU General Lesser Public 019 * License along with this program. If not, see 020 * <http://www.gnu.org/licenses/lgpl-2.1.html>. 021 * #L% 022 */ 023package dk.netarkivet.harvester.harvesting.controller; 024 025import org.slf4j.Logger; 026import org.slf4j.LoggerFactory; 027 028import dk.netarkivet.common.distribute.JMSConnectionFactory; 029import dk.netarkivet.common.exceptions.ArgumentNotValid; 030import dk.netarkivet.common.exceptions.HarvestingAbort; 031import dk.netarkivet.common.exceptions.IOFailure; 032import dk.netarkivet.common.lifecycle.PeriodicTaskExecutor; 033import dk.netarkivet.common.lifecycle.PeriodicTaskExecutor.PeriodicTask; 034import dk.netarkivet.common.utils.Settings; 035import dk.netarkivet.harvester.HarvesterSettings; 036import dk.netarkivet.harvester.harvesting.HeritrixFiles; 037import dk.netarkivet.harvester.harvesting.HeritrixLauncher; 038import dk.netarkivet.harvester.harvesting.distribute.CrawlProgressMessage; 039import dk.netarkivet.harvester.harvesting.monitor.HarvestMonitor; 040 041/** 042 * BnF specific Heritrix launcher, that forces the use of {@link BnfHeritrixController}. Every turn of the crawl control 043 * loop, asks the Heritrix controller to generate a progress report as a {@link CrawlProgressMessage} and then send this 044 * message on the JMS bus to be consumed by the {@link HarvestMonitor} instance. 045 */ 046public class BnfHeritrixLauncher extends HeritrixLauncher { 047 048 /** The class logger. */ 049 private static final Logger log = LoggerFactory.getLogger(BnfHeritrixLauncher.class); 050 051 /** 052 * This class executes a crawl control task, e.g. queries the crawler for progress summary, sends the adequate JMS 053 * message to the monitor, and checks whether the crawl is finished, in which case crawl control will be ended. 054 * <p> 055 * These tasks are scheduled by a {@link CrawlControlExecutor}. 056 */ 057 private class CrawlControl implements Runnable { 058 059 @Override 060 public void run() { 061 if (crawlIsOver) { // Don't check again; we are already done 062 return; 063 } 064 CrawlProgressMessage cpm = null; 065 try { 066 cpm = heritrixController.getCrawlProgress(); 067 } catch (IOFailure e) { 068 // Log a warning and retry 069 log.warn("IOFailure while getting crawl progress", e); 070 return; 071 } catch (HarvestingAbort e) { 072 log.warn("Got HarvestingAbort exception while getting crawl progress. Means crawl is over", e); 073 crawlIsOver = true; 074 return; 075 } 076 077 JMSConnectionFactory.getInstance().send(cpm); 078 079 HeritrixFiles files = getHeritrixFiles(); 080 if (cpm.crawlIsFinished()) { 081 log.info("Job ID: {}: crawl is finished.", files.getJobID()); 082 crawlIsOver = true; 083 return; 084 } 085 086 log.info("Job ID: " + files.getJobID() + ", Harvest ID: " + files.getHarvestID() + ", " + cpm.getHostUrl() 087 + "\n" + cpm.getProgressStatisticsLegend() + "\n" + cpm.getJobStatus().getStatus() + " " 088 + cpm.getJobStatus().getProgressStatistics()); 089 } 090 } 091 092 /** Wait time in milliseconds (10s). */ 093 private static final int SLEEP_TIME_MS = 10 * 60 * 1000; 094 095 /** 096 * Frequency in seconds for generating the full harvest report. Also serves as delay before the first generation 097 * occurs. 098 */ 099 static final long FRONTIER_REPORT_GEN_FREQUENCY = Settings.getLong(HarvesterSettings.FRONTIER_REPORT_WAIT_TIME); 100 101 /** The CrawlController used. */ 102 private BnfHeritrixController heritrixController; 103 /** Is the heritrix crawl finished. */ 104 private boolean crawlIsOver = false; 105 106 /** 107 * Private constructor for this class. 108 * 109 * @param files the files needed by Heritrix to launch a job. 110 * @throws ArgumentNotValid 111 */ 112 private BnfHeritrixLauncher(HeritrixFiles files) throws ArgumentNotValid { 113 super(files); 114 } 115 116 /** 117 * Get instance of this class. 118 * 119 * @param files Object encapsulating location of Heritrix crawldir and configuration files 120 * @return {@link BnfHeritrixLauncher} object 121 * @throws ArgumentNotValid If either order.xml or seeds.txt does not exist, or argument files is null. 122 */ 123 public static BnfHeritrixLauncher getInstance(HeritrixFiles files) throws ArgumentNotValid { 124 ArgumentNotValid.checkNotNull(files, "HeritrixFiles files"); 125 return new BnfHeritrixLauncher(files); 126 } 127 128 /** 129 * Initializes an Heritrix controller, then launches the Heritrix instance. Then starts the crawl control loop: 130 * <ol> 131 * <li>Waits the amount of time configured in {@link HarvesterSettings#CRAWL_LOOP_WAIT_TIME}.</li> 132 * <li>Obtains crawl progress information as a {@link CrawlProgressMessage} from the Heritrix controller</li> 133 * <li>Sends the progress message via JMS</li> 134 * <li>If the crawl if reported as finished, end loop.</li> 135 * </ol> 136 */ 137 public void doCrawl() throws IOFailure { 138 setupOrderfile(getHeritrixFiles()); 139 heritrixController = new BnfHeritrixController(getHeritrixFiles()); 140 141 PeriodicTaskExecutor exec = null; 142 try { 143 // Initialize Heritrix settings according to the order.xml 144 heritrixController.initialize(); 145 log.debug("Starting crawl.."); 146 heritrixController.requestCrawlStart(); 147 148 // Schedule full frontier report generation 149 exec = new PeriodicTaskExecutor(new PeriodicTask("CrawlControl", new CrawlControl(), 150 CRAWL_CONTROL_WAIT_PERIOD, CRAWL_CONTROL_WAIT_PERIOD), new PeriodicTask("FrontierReportAnalyzer", 151 new FrontierReportAnalyzer(heritrixController), FRONTIER_REPORT_GEN_FREQUENCY, 152 FRONTIER_REPORT_GEN_FREQUENCY)); 153 154 while (!crawlIsOver) { 155 // Wait a bit 156 try { 157 synchronized (this) { 158 wait(SLEEP_TIME_MS); 159 } 160 } catch (InterruptedException e) { 161 log.trace("Waiting thread awoken: {}", e.getMessage(), e); 162 } 163 } 164 165 } catch (IOFailure e) { 166 log.warn("Error during initialisation of crawl", e); 167 throw (e); 168 } catch (Exception e) { 169 log.warn("Exception during crawl", e); 170 throw new RuntimeException("Exception during crawl", e); 171 } finally { 172 // Stop the crawl control & frontier report analyzer 173 if (exec != null) { 174 exec.shutdown(); 175 } 176 177 if (heritrixController != null) { 178 heritrixController.cleanup(getHeritrixFiles().getCrawlDir()); 179 } 180 } 181 log.debug("Heritrix has finished crawling..."); 182 183 } 184 185}