001/*
002 * #%L
003 * Netarchivesuite - harvester
004 * %%
005 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
006 *             the National Library of France and the Austrian National Library.
007 * %%
008 * This program is free software: you can redistribute it and/or modify
009 * it under the terms of the GNU Lesser General Public License as
010 * published by the Free Software Foundation, either version 2.1 of the
011 * License, or (at your option) any later version.
012 * 
013 * This program is distributed in the hope that it will be useful,
014 * but WITHOUT ANY WARRANTY; without even the implied warranty of
015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
017 * 
018 * You should have received a copy of the GNU General Lesser Public
019 * License along with this program.  If not, see
020 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
021 * #L%
022 */
023package dk.netarkivet.harvester.heritrix3.controller;
024
025import org.slf4j.Logger;
026import org.slf4j.LoggerFactory;
027
028import dk.netarkivet.common.distribute.JMSConnectionFactory;
029import dk.netarkivet.common.exceptions.ArgumentNotValid;
030import dk.netarkivet.common.exceptions.HarvestingAbort;
031import dk.netarkivet.common.exceptions.IOFailure;
032import dk.netarkivet.common.lifecycle.PeriodicTaskExecutor;
033import dk.netarkivet.common.lifecycle.PeriodicTaskExecutor.PeriodicTask;
034import dk.netarkivet.common.utils.Settings;
035import dk.netarkivet.harvester.HarvesterSettings;
036import dk.netarkivet.harvester.harvesting.distribute.CrawlProgressMessage;
037import dk.netarkivet.harvester.harvesting.monitor.HarvestMonitor;
038import dk.netarkivet.harvester.heritrix3.Heritrix3Files;
039import dk.netarkivet.harvester.heritrix3.Heritrix3Settings;
040import dk.netarkivet.harvester.heritrix3.HeritrixLauncherAbstract;
041
042/**
043 * BnF specific Heritrix launcher, that forces the use of {@link HeritrixController}. Every turn of the crawl control
044 * loop, asks the Heritrix controller to generate a progress report as a {@link CrawlProgressMessage} and then send this
045 * message on the JMS bus to be consumed by the {@link HarvestMonitor} instance.
046 */
047public class HeritrixLauncher extends HeritrixLauncherAbstract {
048
049    /** The class logger. */
050    private static final Logger log = LoggerFactory.getLogger(HeritrixLauncher.class);
051
052    /** Wait time in milliseconds (10s). */
053    private static final int SLEEP_TIME_MS = 10 * 60 * 1000;
054
055    /** Frequency in seconds for generating the full harvest report. Also serves as delay before the first generation
056     *  occurs. */
057    static final long FRONTIER_REPORT_GEN_FREQUENCY = Settings.getLong(Heritrix3Settings.FRONTIER_REPORT_WAIT_TIME);
058
059    /** The CrawlController used. */
060    private HeritrixController heritrixController;
061
062    private String jobName;
063
064    /** Is the heritrix crawl finished. */
065    private boolean crawlIsOver = false;
066
067    /**
068     * Private constructor for this class.
069     *
070     * @param files the files needed by Heritrix to launch a job.
071     * @throws ArgumentNotValid
072     */
073    private HeritrixLauncher(Heritrix3Files files, String jobName) throws ArgumentNotValid {
074        super(files);
075        this.jobName = jobName;
076    }
077
078    /**
079     * Get instance of this class.
080     *
081     * @param files Object encapsulating location of Heritrix crawldir and configuration files
082     * @return {@link HeritrixLauncher} object
083     * @throws ArgumentNotValid If either order.xml or seeds.txt does not exist, or argument files is null.
084     */
085    public static HeritrixLauncher getInstance(Heritrix3Files files, String jobName) throws ArgumentNotValid {
086        ArgumentNotValid.checkNotNull(files, "Heritrix3Files files");
087        return new HeritrixLauncher(files, jobName); // The launching takes place here
088    } 
089
090    /**
091     * Initializes an Heritrix3controller, then launches the Heritrix3 instance. Then starts the crawl control loop:
092     * <ol>
093     * <li>Waits the amount of time configured in {@link HarvesterSettings#CRAWL_LOOP_WAIT_TIME}.</li>
094     * <li>Obtains crawl progress information as a {@link CrawlProgressMessage} from the Heritrix controller</li>
095     * <li>Sends a progress message via JMS</li>
096     * <li>If the crawl is reported as finished, end loop.</li>
097     * </ol>
098     */
099    public void doCrawl() throws IOFailure {
100        setupOrderfile(getHeritrixFiles());
101        heritrixController = new HeritrixController(getHeritrixFiles(), jobName);
102
103        PeriodicTaskExecutor exec = null;
104        try {
105            // Initialize Heritrix settings according to the crawler-beans.cxml file.
106            heritrixController.initialize();
107            log.debug("Starting crawl..");
108            heritrixController.requestCrawlStart();
109
110            // Schedule full frontier report generation
111            
112            log.info("Starting CrawlControl PeriodicTaskExecutor that repeatedly fetches a fullfrontierreport");
113            exec = new PeriodicTaskExecutor(new PeriodicTask("CrawlControl", new CrawlControl(),
114                    CRAWL_CONTROL_WAIT_PERIOD, CRAWL_CONTROL_WAIT_PERIOD) 
115                        //FIXME disabled until further notice
116                    /*                                          ,new PeriodicTask("FrontierReportAnalyzer",
117                    new FrontierReportAnalyzer(heritrixController), FRONTIER_REPORT_GEN_FREQUENCY,
118                    FRONTIER_REPORT_GEN_FREQUENCY)
119                    */
120                    );
121
122            while (!crawlIsOver) {
123                // Wait a bit
124                try {
125                    synchronized (this) {
126                        wait(SLEEP_TIME_MS);
127                    }
128                } catch (InterruptedException e) {
129                    log.trace("Waiting thread awoken: {}", e.getMessage(), e);
130                }
131            }
132        } catch (IOFailure e) {
133            log.warn("Error during initialisation of crawl", e);
134            throw (e);
135        } catch (Exception e) {
136            log.warn("Exception during crawl", e);
137            throw new RuntimeException("Exception during crawl", e);
138        } finally {
139            // Stop the crawl control & frontier report analyzer
140            if (exec != null) {
141                exec.shutdown();
142            }
143
144            if (heritrixController != null) {
145                heritrixController.cleanup(getHeritrixFiles().getCrawlDir());
146            }
147        }
148        log.debug("Heritrix has finished crawling...");
149    }
150
151    /**
152     * This class executes a crawl control task, e.g. queries the crawler for progress summary, sends the adequate JMS
153     * message to the monitor, and checks whether the crawl is finished, in which case crawl control will be ended.
154     * <p>
155     * These tasks are scheduled by a {@link CrawlControlExecutor}.
156     */
157    private class CrawlControl implements Runnable {
158
159        @Override
160        public void run() {
161            if (crawlIsOver) { // Don't check again; we are already done
162                return;
163            }
164            CrawlProgressMessage cpm = null;
165            try {
166                cpm = heritrixController.getCrawlProgress();
167            } catch (IOFailure e) {
168                // Log a warning and retry
169                log.warn("IOFailure while getting crawl progress", e);
170                return;
171            } catch (HarvestingAbort e) {
172                log.warn("Got HarvestingAbort exception while getting crawl progress. Means crawl is over", e);
173                crawlIsOver = true;
174                return;
175            }
176
177            JMSConnectionFactory.getInstance().send(cpm);
178
179            Heritrix3Files files = getHeritrixFiles();
180            if (cpm.crawlIsFinished()) {
181                log.info("Job ID: {}: crawl is finished.", files.getJobID());
182                crawlIsOver = true;
183                return;
184            }
185            /*
186            log.info("Job ID: " + files.getJobID() + ", Harvest ID: " + files.getHarvestID() + ", " + cpm.getHostUrl()
187                    + "\n" + cpm.getProgressStatisticsLegend() + "\n" + cpm.getJobStatus().getStatus() + " "
188                    + cpm.getJobStatus().getProgressStatistics());
189                    */
190        }
191
192    }
193
194}