001/* 002 * #%L 003 * Netarchivesuite - harvester 004 * %% 005 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library, 006 * the National Library of France and the Austrian National Library. 007 * %% 008 * This program is free software: you can redistribute it and/or modify 009 * it under the terms of the GNU Lesser General Public License as 010 * published by the Free Software Foundation, either version 2.1 of the 011 * License, or (at your option) any later version. 012 * 013 * This program is distributed in the hope that it will be useful, 014 * but WITHOUT ANY WARRANTY; without even the implied warranty of 015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 016 * GNU General Lesser Public License for more details. 017 * 018 * You should have received a copy of the GNU General Lesser Public 019 * License along with this program. If not, see 020 * <http://www.gnu.org/licenses/lgpl-2.1.html>. 021 * #L% 022 */ 023package dk.netarkivet.harvester.heritrix3.controller; 024 025import org.slf4j.Logger; 026import org.slf4j.LoggerFactory; 027 028import dk.netarkivet.common.distribute.JMSConnectionFactory; 029import dk.netarkivet.common.exceptions.ArgumentNotValid; 030import dk.netarkivet.common.exceptions.HarvestingAbort; 031import dk.netarkivet.common.exceptions.IOFailure; 032import dk.netarkivet.common.utils.Settings; 033import dk.netarkivet.harvester.HarvesterSettings; 034import dk.netarkivet.harvester.harvesting.distribute.CrawlProgressMessage; 035import dk.netarkivet.harvester.harvesting.monitor.HarvestMonitor; 036import dk.netarkivet.harvester.heritrix3.Heritrix3Files; 037import dk.netarkivet.harvester.heritrix3.Heritrix3Settings; 038import dk.netarkivet.harvester.heritrix3.HeritrixLauncherAbstract; 039 040/** 041 * BnF specific Heritrix3 launcher, that forces the use of {@link HeritrixController}. Every turn of the crawl control 042 * loop, asks the Heritrix3 controller to generate a progress report as a {@link CrawlProgressMessage} and then send this 043 * message on the JMS bus to be consumed by the {@link HarvestMonitor} instance. 044 */ 045public class HeritrixLauncher extends HeritrixLauncherAbstract { 046 047 /** The class logger. */ 048 private static final Logger log = LoggerFactory.getLogger(HeritrixLauncher.class); 049 050 /** Frequency in seconds for generating the full harvest report. Also serves as delay before the first generation 051 * occurs. */ 052 static final long FRONTIER_REPORT_GEN_FREQUENCY = Settings.getLong(Heritrix3Settings.FRONTIER_REPORT_WAIT_TIME); 053 054 /** The CrawlController used. */ 055 private HeritrixController heritrixController; 056 057 private String jobName; 058 059 /** Is the heritrix3 crawl finished. */ 060 private boolean crawlIsOver = false; 061 062 /** 063 * Private constructor for this class. 064 * 065 * @param files the files needed by Heritrix to launch a job. 066 * @throws ArgumentNotValid 067 */ 068 private HeritrixLauncher(Heritrix3Files files, String jobName) throws ArgumentNotValid { 069 super(files); 070 this.jobName = jobName; 071 } 072 073 /** 074 * Get instance of this class. 075 * 076 * @param files Object encapsulating location of Heritrix crawldir and configuration files 077 * @return {@link HeritrixLauncher} object 078 * @throws ArgumentNotValid If either order.xml or seeds.txt does not exist, or argument files is null. 079 */ 080 public static HeritrixLauncher getInstance(Heritrix3Files files, String jobName) throws ArgumentNotValid { 081 ArgumentNotValid.checkNotNull(files, "Heritrix3Files files"); 082 return new HeritrixLauncher(files, jobName); // The launching takes place here 083 } 084 085 /** 086 * Initializes an Heritrix3controller, then launches the Heritrix3 instance. Then starts the crawl control loop: 087 * <ol> 088 * <li>Waits the amount of time configured in {@link HarvesterSettings#CRAWL_LOOP_WAIT_TIME}.</li> 089 * <li>Obtains crawl progress information as a {@link CrawlProgressMessage} from the Heritrix controller</li> 090 * <li>Sends a progress message via JMS</li> 091 * <li>If the crawl is reported as finished, end loop.</li> 092 * </ol> 093 */ 094 public void doCrawl() throws IOFailure { 095 setupOrderfile(getHeritrixFiles()); 096 heritrixController = new HeritrixController(getHeritrixFiles(), jobName); 097 098 try { 099 // Initialize Heritrix settings according to the crawler-beans.cxml file. 100 heritrixController.initialize(); 101 log.debug("Setup and start new h3 crawl"); 102 heritrixController.requestCrawlStart(); 103 104 log.info("Starting periodic CrawlControl with CRAWL_CONTROL_WAIT_PERIOD={} seconds", CRAWL_CONTROL_WAIT_PERIOD); 105 106 while (!crawlIsOver) { 107 CrawlControl cc = new CrawlControl(); 108 cc.run(); 109 FrontierReportAnalyzer fra = new FrontierReportAnalyzer(heritrixController); 110 fra.run(); 111 if (!crawlIsOver) { 112 try { 113 Thread.sleep(CRAWL_CONTROL_WAIT_PERIOD*1000L); 114 } catch (InterruptedException e) { 115 log.warn("Wait interrupted: " + e); 116 } 117 } 118 } 119 log.info("CrawlJob is now over"); 120 } catch (IOFailure e) { 121 log.warn("Error during initialisation of crawl", e); 122 throw (e); 123 } catch (Exception e) { 124 log.warn("Exception during crawl", e); 125 throw new RuntimeException("Exception during crawl", e); 126 } finally { 127 if (heritrixController != null) { 128 heritrixController.cleanup(getHeritrixFiles().getCrawlDir()); 129 } 130 } 131 log.debug("Heritrix3 has finished crawling..."); 132 } 133 134 /** 135 * This class executes a crawl control task, e.g. queries the crawler for progress summary, sends the adequate JMS 136 * message to the monitor, and checks whether the crawl is finished, in which case crawl control will be ended. 137 * <p> 138 */ 139 private class CrawlControl implements Runnable { 140 141 @Override 142 public void run() { 143 CrawlProgressMessage cpm = null; 144 try { 145 cpm = heritrixController.getCrawlProgress(); 146 } catch (IOFailure e) { 147 // Log a warning and retry 148 log.warn("IOFailure while getting crawl progress", e); 149 return; 150 } catch (HarvestingAbort e) { 151 log.warn("Got HarvestingAbort exception while getting crawl progress. Means crawl is over", e); 152 crawlIsOver = true; 153 return; 154 } 155 JMSConnectionFactory.getInstance().send(cpm); 156 157 Heritrix3Files files = getHeritrixFiles(); 158 if (cpm.crawlIsFinished()) { 159 log.info("Job ID {}: crawl is finished.", files.getJobID()); 160 crawlIsOver = true; 161 return; 162 } 163 164 log.info("Job ID: " + files.getJobID() + ", Harvest ID: " + files.getHarvestID() + ", " + cpm.getHostUrl() 165 + "\n" + cpm.getProgressStatisticsLegend() + "\n" + cpm.getJobStatus().getStatus() + " " 166 + cpm.getJobStatus().getProgressStatistics()); 167 } 168 169 } 170 171}