001/*
002 * #%L
003 * Netarchivesuite - harvester
004 * %%
005 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
006 *             the National Library of France and the Austrian National Library.
007 * %%
008 * This program is free software: you can redistribute it and/or modify
009 * it under the terms of the GNU Lesser General Public License as
010 * published by the Free Software Foundation, either version 2.1 of the
011 * License, or (at your option) any later version.
012 * 
013 * This program is distributed in the hope that it will be useful,
014 * but WITHOUT ANY WARRANTY; without even the implied warranty of
015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
016 * GNU General Lesser Public License for more details.
017 * 
018 * You should have received a copy of the GNU General Lesser Public
019 * License along with this program.  If not, see
020 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
021 * #L%
022 */
023package dk.netarkivet.harvester.harvesting.controller;
024
025import org.slf4j.Logger;
026import org.slf4j.LoggerFactory;
027
028import dk.netarkivet.common.distribute.JMSConnectionFactory;
029import dk.netarkivet.common.exceptions.ArgumentNotValid;
030import dk.netarkivet.common.exceptions.HarvestingAbort;
031import dk.netarkivet.common.exceptions.IOFailure;
032import dk.netarkivet.common.lifecycle.PeriodicTaskExecutor;
033import dk.netarkivet.common.lifecycle.PeriodicTaskExecutor.PeriodicTask;
034import dk.netarkivet.common.utils.Settings;
035import dk.netarkivet.harvester.HarvesterSettings;
036import dk.netarkivet.harvester.harvesting.HeritrixFiles;
037import dk.netarkivet.harvester.harvesting.HeritrixLauncher;
038import dk.netarkivet.harvester.harvesting.distribute.CrawlProgressMessage;
039import dk.netarkivet.harvester.harvesting.monitor.HarvestMonitor;
040
041/**
042 * BnF specific Heritrix launcher, that forces the use of {@link BnfHeritrixController}. Every turn of the crawl control
043 * loop, asks the Heritrix controller to generate a progress report as a {@link CrawlProgressMessage} and then send this
044 * message on the JMS bus to be consumed by the {@link HarvestMonitor} instance.
045 */
046public class BnfHeritrixLauncher extends HeritrixLauncher {
047
048    /** The class logger. */
049    private static final Logger log = LoggerFactory.getLogger(BnfHeritrixLauncher.class);
050
051    /**
052     * This class executes a crawl control task, e.g. queries the crawler for progress summary, sends the adequate JMS
053     * message to the monitor, and checks whether the crawl is finished, in which case crawl control will be ended.
054     * <p>
055     * These tasks are scheduled by a {@link CrawlControlExecutor}.
056     */
057    private class CrawlControl implements Runnable {
058
059        @Override
060        public void run() {
061            if (crawlIsOver) { // Don't check again; we are already done
062                return;
063            }
064            CrawlProgressMessage cpm = null;
065            try {
066                cpm = heritrixController.getCrawlProgress();
067            } catch (IOFailure e) {
068                // Log a warning and retry
069                log.warn("IOFailure while getting crawl progress", e);
070                return;
071            } catch (HarvestingAbort e) {
072                log.warn("Got HarvestingAbort exception while getting crawl progress. Means crawl is over", e);
073                crawlIsOver = true;
074                return;
075            }
076
077            JMSConnectionFactory.getInstance().send(cpm);
078
079            HeritrixFiles files = getHeritrixFiles();
080            if (cpm.crawlIsFinished()) {
081                log.info("Job ID: {}: crawl is finished.", files.getJobID());
082                crawlIsOver = true;
083                return;
084            }
085
086            log.info("Job ID: " + files.getJobID() + ", Harvest ID: " + files.getHarvestID() + ", " + cpm.getHostUrl()
087                    + "\n" + cpm.getProgressStatisticsLegend() + "\n" + cpm.getJobStatus().getStatus() + " "
088                    + cpm.getJobStatus().getProgressStatistics());
089        }
090    }
091
092    /** Wait time in milliseconds (10s). */
093    private static final int SLEEP_TIME_MS = 10 * 60 * 1000;
094
095    /**
096     * Frequency in seconds for generating the full harvest report. Also serves as delay before the first generation
097     * occurs.
098     */
099    static final long FRONTIER_REPORT_GEN_FREQUENCY = Settings.getLong(HarvesterSettings.FRONTIER_REPORT_WAIT_TIME);
100
101    /** The CrawlController used. */
102    private BnfHeritrixController heritrixController;
103    /** Is the heritrix crawl finished. */
104    private boolean crawlIsOver = false;
105
106    /**
107     * Private constructor for this class.
108     *
109     * @param files the files needed by Heritrix to launch a job.
110     * @throws ArgumentNotValid
111     */
112    private BnfHeritrixLauncher(HeritrixFiles files) throws ArgumentNotValid {
113        super(files);
114    }
115
116    /**
117     * Get instance of this class.
118     *
119     * @param files Object encapsulating location of Heritrix crawldir and configuration files
120     * @return {@link BnfHeritrixLauncher} object
121     * @throws ArgumentNotValid If either order.xml or seeds.txt does not exist, or argument files is null.
122     */
123    public static BnfHeritrixLauncher getInstance(HeritrixFiles files) throws ArgumentNotValid {
124        ArgumentNotValid.checkNotNull(files, "HeritrixFiles files");
125        return new BnfHeritrixLauncher(files);
126    }
127
128    /**
129     * Initializes an Heritrix controller, then launches the Heritrix instance. Then starts the crawl control loop:
130     * <ol>
131     * <li>Waits the amount of time configured in {@link HarvesterSettings#CRAWL_LOOP_WAIT_TIME}.</li>
132     * <li>Obtains crawl progress information as a {@link CrawlProgressMessage} from the Heritrix controller</li>
133     * <li>Sends the progress message via JMS</li>
134     * <li>If the crawl if reported as finished, end loop.</li>
135     * </ol>
136     */
137    public void doCrawl() throws IOFailure {
138        setupOrderfile(getHeritrixFiles());
139        heritrixController = new BnfHeritrixController(getHeritrixFiles());
140
141        PeriodicTaskExecutor exec = null;
142        try {
143            // Initialize Heritrix settings according to the order.xml
144            heritrixController.initialize();
145            log.debug("Starting crawl..");
146            heritrixController.requestCrawlStart();
147
148            // Schedule full frontier report generation
149            exec = new PeriodicTaskExecutor(new PeriodicTask("CrawlControl", new CrawlControl(),
150                    CRAWL_CONTROL_WAIT_PERIOD, CRAWL_CONTROL_WAIT_PERIOD), new PeriodicTask("FrontierReportAnalyzer",
151                    new FrontierReportAnalyzer(heritrixController), FRONTIER_REPORT_GEN_FREQUENCY,
152                    FRONTIER_REPORT_GEN_FREQUENCY));
153
154            while (!crawlIsOver) {
155                // Wait a bit
156                try {
157                    synchronized (this) {
158                        wait(SLEEP_TIME_MS);
159                    }
160                } catch (InterruptedException e) {
161                    log.trace("Waiting thread awoken: {}", e.getMessage(), e);
162                }
163            }
164
165        } catch (IOFailure e) {
166            log.warn("Error during initialisation of crawl", e);
167            throw (e);
168        } catch (Exception e) {
169            log.warn("Exception during crawl", e);
170            throw new RuntimeException("Exception during crawl", e);
171        } finally {
172            // Stop the crawl control & frontier report analyzer
173            if (exec != null) {
174                exec.shutdown();
175            }
176
177            if (heritrixController != null) {
178                heritrixController.cleanup(getHeritrixFiles().getCrawlDir());
179            }
180        }
181        log.debug("Heritrix has finished crawling...");
182
183    }
184
185}