001/*
002 * #%L
003 * Netarchivesuite - harvester
004 * %%
005 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
006 *             the National Library of France and the Austrian National Library.
007 * %%
008 * This program is free software: you can redistribute it and/or modify
009 * it under the terms of the GNU Lesser General Public License as
010 * published by the Free Software Foundation, either version 2.1 of the
011 * License, or (at your option) any later version.
012 * 
013 * This program is distributed in the hope that it will be useful,
014 * but WITHOUT ANY WARRANTY; without even the implied warranty of
015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
016 * GNU General Lesser Public License for more details.
017 * 
018 * You should have received a copy of the GNU General Lesser Public
019 * License along with this program.  If not, see
020 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
021 * #L%
022 */
023package dk.netarkivet.harvester.heritrix3.controller;
024
025import org.slf4j.Logger;
026import org.slf4j.LoggerFactory;
027
028import dk.netarkivet.common.distribute.JMSConnectionFactory;
029import dk.netarkivet.common.exceptions.ArgumentNotValid;
030import dk.netarkivet.common.exceptions.HarvestingAbort;
031import dk.netarkivet.common.exceptions.IOFailure;
032import dk.netarkivet.common.utils.Settings;
033import dk.netarkivet.harvester.HarvesterSettings;
034import dk.netarkivet.harvester.harvesting.distribute.CrawlProgressMessage;
035import dk.netarkivet.harvester.harvesting.monitor.HarvestMonitor;
036import dk.netarkivet.harvester.heritrix3.Heritrix3Files;
037import dk.netarkivet.harvester.heritrix3.Heritrix3Settings;
038import dk.netarkivet.harvester.heritrix3.HeritrixLauncherAbstract;
039
040/**
041 * BnF specific Heritrix3 launcher, that forces the use of {@link HeritrixController}. Every turn of the crawl control
042 * loop, asks the Heritrix3 controller to generate a progress report as a {@link CrawlProgressMessage} and then send this
043 * message on the JMS bus to be consumed by the {@link HarvestMonitor} instance.
044 */
045public class HeritrixLauncher extends HeritrixLauncherAbstract {
046
047    /** The class logger. */
048    private static final Logger log = LoggerFactory.getLogger(HeritrixLauncher.class);
049
050    /** Frequency in seconds for generating the full harvest report. Also serves as delay before the first generation
051     *  occurs. */
052    static final long FRONTIER_REPORT_GEN_FREQUENCY = Settings.getLong(Heritrix3Settings.FRONTIER_REPORT_WAIT_TIME);
053
054    /** The CrawlController used. */
055    private HeritrixController heritrixController;
056
057    private String jobName;
058
059    /** Is the heritrix3 crawl finished. */
060    private boolean crawlIsOver = false;
061
062    /**
063     * Private constructor for this class.
064     *
065     * @param files the files needed by Heritrix to launch a job.
066     * @throws ArgumentNotValid
067     */
068    private HeritrixLauncher(Heritrix3Files files, String jobName) throws ArgumentNotValid {
069        super(files);
070        this.jobName = jobName;
071    }
072
073    /**
074     * Get instance of this class.
075     *
076     * @param files Object encapsulating location of Heritrix crawldir and configuration files
077     * @return {@link HeritrixLauncher} object
078     * @throws ArgumentNotValid If either order.xml or seeds.txt does not exist, or argument files is null.
079     */
080    public static HeritrixLauncher getInstance(Heritrix3Files files, String jobName) throws ArgumentNotValid {
081        ArgumentNotValid.checkNotNull(files, "Heritrix3Files files");
082        return new HeritrixLauncher(files, jobName); // The launching takes place here
083    } 
084
085    /**
086     * Initializes an Heritrix3controller, then launches the Heritrix3 instance. Then starts the crawl control loop:
087     * <ol>
088     * <li>Waits the amount of time configured in {@link HarvesterSettings#CRAWL_LOOP_WAIT_TIME}.</li>
089     * <li>Obtains crawl progress information as a {@link CrawlProgressMessage} from the Heritrix controller</li>
090     * <li>Sends a progress message via JMS</li>
091     * <li>If the crawl is reported as finished, end loop.</li>
092     * </ol>
093     */
094    public void doCrawl() throws IOFailure {
095        setupOrderfile(getHeritrixFiles());
096        heritrixController = new HeritrixController(getHeritrixFiles(), jobName);
097        
098        try {
099            // Initialize Heritrix settings according to the crawler-beans.cxml file.
100            heritrixController.initialize();
101            log.debug("Setup and start new h3 crawl");
102            heritrixController.requestCrawlStart();
103                
104            log.info("Starting periodic CrawlControl with CRAWL_CONTROL_WAIT_PERIOD={} seconds", CRAWL_CONTROL_WAIT_PERIOD);            
105          
106            while (!crawlIsOver) {
107                CrawlControl cc = new CrawlControl();
108                cc.run();
109                FrontierReportAnalyzer fra = new FrontierReportAnalyzer(heritrixController);
110                fra.run();
111                if (!crawlIsOver) {
112                    try {
113                    Thread.sleep(CRAWL_CONTROL_WAIT_PERIOD*1000L);
114                    } catch (InterruptedException e) {
115                        log.warn("Wait interrupted: " + e);
116                    }
117                }
118            }
119            log.info("CrawlJob is now over");
120        } catch (IOFailure e) {
121            log.warn("Error during initialisation of crawl", e);
122            throw (e);
123        } catch (Exception e) {
124            log.warn("Exception during crawl", e);
125            throw new RuntimeException("Exception during crawl", e);
126        } finally {
127            if (heritrixController != null) {
128                heritrixController.cleanup(getHeritrixFiles().getCrawlDir());
129            }
130        }
131        log.debug("Heritrix3 has finished crawling...");
132    }
133
134    /**
135     * This class executes a crawl control task, e.g. queries the crawler for progress summary, sends the adequate JMS
136     * message to the monitor, and checks whether the crawl is finished, in which case crawl control will be ended.
137     * <p>
138     */
139    private class CrawlControl implements Runnable {
140       
141        @Override
142        public void run() {
143            CrawlProgressMessage cpm = null;
144            try {
145                cpm = heritrixController.getCrawlProgress();
146            } catch (IOFailure e) {
147                // Log a warning and retry
148                log.warn("IOFailure while getting crawl progress", e);
149                return;
150            } catch (HarvestingAbort e) {
151                log.warn("Got HarvestingAbort exception while getting crawl progress. Means crawl is over", e);
152                crawlIsOver = true;
153                return;
154            }
155            JMSConnectionFactory.getInstance().send(cpm);
156
157            Heritrix3Files files = getHeritrixFiles();
158            if (cpm.crawlIsFinished()) {
159                log.info("Job ID {}: crawl is finished.", files.getJobID());
160                crawlIsOver = true;
161                return;
162            }
163            
164            log.info("Job ID: " + files.getJobID() + ", Harvest ID: " + files.getHarvestID() + ", " + cpm.getHostUrl()
165                    + "\n" + cpm.getProgressStatisticsLegend() + "\n" + cpm.getJobStatus().getStatus() + " "
166                    + cpm.getJobStatus().getProgressStatistics());
167        }
168
169    }
170
171}