001/* 002 * #%L 003 * Netarchivesuite - harvester 004 * %% 005 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library, 006 * the National Library of France and the Austrian National Library. 007 * %% 008 * This program is free software: you can redistribute it and/or modify 009 * it under the terms of the GNU Lesser General Public License as 010 * published by the Free Software Foundation, either version 2.1 of the 011 * License, or (at your option) any later version. 012 * 013 * This program is distributed in the hope that it will be useful, 014 * but WITHOUT ANY WARRANTY; without even the implied warranty of 015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 016 * GNU General Lesser Public License for more details. 017 * 018 * You should have received a copy of the GNU General Lesser Public 019 * License along with this program. If not, see 020 * <http://www.gnu.org/licenses/lgpl-2.1.html>. 021 * #L% 022 */ 023package dk.netarkivet.harvester.heritrix3.controller; 024 025import org.slf4j.Logger; 026import org.slf4j.LoggerFactory; 027 028import dk.netarkivet.common.distribute.JMSConnectionFactory; 029import dk.netarkivet.common.exceptions.ArgumentNotValid; 030import dk.netarkivet.common.exceptions.HarvestingAbort; 031import dk.netarkivet.common.exceptions.IOFailure; 032import dk.netarkivet.common.lifecycle.PeriodicTaskExecutor; 033import dk.netarkivet.common.lifecycle.PeriodicTaskExecutor.PeriodicTask; 034import dk.netarkivet.common.utils.Settings; 035import dk.netarkivet.harvester.HarvesterSettings; 036import dk.netarkivet.harvester.harvesting.distribute.CrawlProgressMessage; 037import dk.netarkivet.harvester.harvesting.monitor.HarvestMonitor; 038import dk.netarkivet.harvester.heritrix3.Heritrix3Files; 039import dk.netarkivet.harvester.heritrix3.Heritrix3Settings; 040import dk.netarkivet.harvester.heritrix3.HeritrixLauncherAbstract; 041 042/** 043 * BnF specific Heritrix launcher, that forces the use of {@link HeritrixController}. Every turn of the crawl control 044 * loop, asks the Heritrix controller to generate a progress report as a {@link CrawlProgressMessage} and then send this 045 * message on the JMS bus to be consumed by the {@link HarvestMonitor} instance. 046 */ 047public class HeritrixLauncher extends HeritrixLauncherAbstract { 048 049 /** The class logger. */ 050 private static final Logger log = LoggerFactory.getLogger(HeritrixLauncher.class); 051 052 /** Wait time in milliseconds (10s). */ 053 private static final int SLEEP_TIME_MS = 10 * 60 * 1000; 054 055 /** Frequency in seconds for generating the full harvest report. Also serves as delay before the first generation 056 * occurs. */ 057 static final long FRONTIER_REPORT_GEN_FREQUENCY = Settings.getLong(Heritrix3Settings.FRONTIER_REPORT_WAIT_TIME); 058 059 /** The CrawlController used. */ 060 private HeritrixController heritrixController; 061 062 private String jobName; 063 064 /** Is the heritrix crawl finished. */ 065 private boolean crawlIsOver = false; 066 067 /** 068 * Private constructor for this class. 069 * 070 * @param files the files needed by Heritrix to launch a job. 071 * @throws ArgumentNotValid 072 */ 073 private HeritrixLauncher(Heritrix3Files files, String jobName) throws ArgumentNotValid { 074 super(files); 075 this.jobName = jobName; 076 } 077 078 /** 079 * Get instance of this class. 080 * 081 * @param files Object encapsulating location of Heritrix crawldir and configuration files 082 * @return {@link HeritrixLauncher} object 083 * @throws ArgumentNotValid If either order.xml or seeds.txt does not exist, or argument files is null. 084 */ 085 public static HeritrixLauncher getInstance(Heritrix3Files files, String jobName) throws ArgumentNotValid { 086 ArgumentNotValid.checkNotNull(files, "Heritrix3Files files"); 087 return new HeritrixLauncher(files, jobName); // The launching takes place here 088 } 089 090 /** 091 * Initializes an Heritrix3controller, then launches the Heritrix3 instance. Then starts the crawl control loop: 092 * <ol> 093 * <li>Waits the amount of time configured in {@link HarvesterSettings#CRAWL_LOOP_WAIT_TIME}.</li> 094 * <li>Obtains crawl progress information as a {@link CrawlProgressMessage} from the Heritrix controller</li> 095 * <li>Sends a progress message via JMS</li> 096 * <li>If the crawl is reported as finished, end loop.</li> 097 * </ol> 098 */ 099 public void doCrawl() throws IOFailure { 100 setupOrderfile(getHeritrixFiles()); 101 heritrixController = new HeritrixController(getHeritrixFiles(), jobName); 102 103 PeriodicTaskExecutor exec = null; 104 try { 105 // Initialize Heritrix settings according to the crawler-beans.cxml file. 106 heritrixController.initialize(); 107 log.debug("Starting crawl.."); 108 heritrixController.requestCrawlStart(); 109 110 // Schedule full frontier report generation 111 112 log.info("Starting CrawlControl PeriodicTaskExecutor that repeatedly fetches a fullfrontierreport"); 113 exec = new PeriodicTaskExecutor(new PeriodicTask("CrawlControl", new CrawlControl(), 114 CRAWL_CONTROL_WAIT_PERIOD, CRAWL_CONTROL_WAIT_PERIOD) 115 //FIXME disabled until further notice 116 /* ,new PeriodicTask("FrontierReportAnalyzer", 117 new FrontierReportAnalyzer(heritrixController), FRONTIER_REPORT_GEN_FREQUENCY, 118 FRONTIER_REPORT_GEN_FREQUENCY) 119 */ 120 ); 121 122 while (!crawlIsOver) { 123 // Wait a bit 124 try { 125 synchronized (this) { 126 wait(SLEEP_TIME_MS); 127 } 128 } catch (InterruptedException e) { 129 log.trace("Waiting thread awoken: {}", e.getMessage(), e); 130 } 131 } 132 } catch (IOFailure e) { 133 log.warn("Error during initialisation of crawl", e); 134 throw (e); 135 } catch (Exception e) { 136 log.warn("Exception during crawl", e); 137 throw new RuntimeException("Exception during crawl", e); 138 } finally { 139 // Stop the crawl control & frontier report analyzer 140 if (exec != null) { 141 exec.shutdown(); 142 } 143 144 if (heritrixController != null) { 145 heritrixController.cleanup(getHeritrixFiles().getCrawlDir()); 146 } 147 } 148 log.debug("Heritrix has finished crawling..."); 149 } 150 151 /** 152 * This class executes a crawl control task, e.g. queries the crawler for progress summary, sends the adequate JMS 153 * message to the monitor, and checks whether the crawl is finished, in which case crawl control will be ended. 154 * <p> 155 * These tasks are scheduled by a {@link CrawlControlExecutor}. 156 */ 157 private class CrawlControl implements Runnable { 158 159 @Override 160 public void run() { 161 if (crawlIsOver) { // Don't check again; we are already done 162 return; 163 } 164 CrawlProgressMessage cpm = null; 165 try { 166 cpm = heritrixController.getCrawlProgress(); 167 } catch (IOFailure e) { 168 // Log a warning and retry 169 log.warn("IOFailure while getting crawl progress", e); 170 return; 171 } catch (HarvestingAbort e) { 172 log.warn("Got HarvestingAbort exception while getting crawl progress. Means crawl is over", e); 173 crawlIsOver = true; 174 return; 175 } 176 177 JMSConnectionFactory.getInstance().send(cpm); 178 179 Heritrix3Files files = getHeritrixFiles(); 180 if (cpm.crawlIsFinished()) { 181 log.info("Job ID: {}: crawl is finished.", files.getJobID()); 182 crawlIsOver = true; 183 return; 184 } 185 /* 186 log.info("Job ID: " + files.getJobID() + ", Harvest ID: " + files.getHarvestID() + ", " + cpm.getHostUrl() 187 + "\n" + cpm.getProgressStatisticsLegend() + "\n" + cpm.getJobStatus().getStatus() + " " 188 + cpm.getJobStatus().getProgressStatistics()); 189 */ 190 } 191 192 } 193 194}