001/*
002 * #%L
003 * Netarchivesuite - harvester
004 * %%
005 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
006 *             the National Library of France and the Austrian National Library.
007 * %%
008 * This program is free software: you can redistribute it and/or modify
009 * it under the terms of the GNU Lesser General Public License as
010 * published by the Free Software Foundation, either version 2.1 of the
011 * License, or (at your option) any later version.
012 * 
013 * This program is distributed in the hope that it will be useful,
014 * but WITHOUT ANY WARRANTY; without even the implied warranty of
015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
016 * GNU General Lesser Public License for more details.
017 * 
018 * You should have received a copy of the GNU General Lesser Public
019 * License along with this program.  If not, see
020 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
021 * #L%
022 */
023package dk.netarkivet.harvester.harvesting.controller;
024
025import org.slf4j.Logger;
026import org.slf4j.LoggerFactory;
027
028import dk.netarkivet.common.exceptions.ArgumentNotValid;
029import dk.netarkivet.common.exceptions.IOFailure;
030import dk.netarkivet.common.utils.Settings;
031import dk.netarkivet.harvester.HarvesterSettings;
032import dk.netarkivet.harvester.harvesting.HeritrixFiles;
033import dk.netarkivet.harvester.harvesting.HeritrixLauncher;
034
035/**
036 * Default implementation of the crawl control.
037 */
038public class DefaultHeritrixLauncher extends HeritrixLauncher {
039
040    /** The class logger. */
041    private static final Logger log = LoggerFactory.getLogger(DefaultHeritrixLauncher.class);
042
043    /** Number of milliseconds in a second. */
044    private static final int MILLIS_PER_SECOND = 1000;
045
046    /** How long to wait before aborting a request from a webserver. */
047    private static long timeOutInMillisReceivedData = Long.parseLong(Settings
048            .get(HarvesterSettings.CRAWLER_TIMEOUT_NON_RESPONDING)) * MILLIS_PER_SECOND;
049
050    /** How long to wait without any activity before aborting the harvest. */
051    private static long timeOutInMillis = Long.parseLong(Settings.get(HarvesterSettings.INACTIVITY_TIMEOUT_IN_SECS))
052            * MILLIS_PER_SECOND;
053    /** The HeritrixController instance used by the HeritrixLauncher. */
054    private HeritrixController heritrixController;
055
056    /**
057     * Constructor for the DefaultHeritrixLauncher.
058     *
059     * @param files the Heritrix configuration.
060     * @throws ArgumentNotValid
061     */
062    private DefaultHeritrixLauncher(HeritrixFiles files) throws ArgumentNotValid {
063        super(files);
064    }
065
066    /**
067     * Get instance of this class.
068     *
069     * @param files Object encapsulating location of Heritrix crawldir and configuration files
070     * @return {@link DefaultHeritrixLauncher} object
071     * @throws ArgumentNotValid If either order.xml or seeds.txt does not exist, or argument files is null.
072     */
073    public static DefaultHeritrixLauncher getInstance(HeritrixFiles files) throws ArgumentNotValid {
074        ArgumentNotValid.checkNotNull(files, "HeritrixFiles files");
075        return new DefaultHeritrixLauncher(files);
076    }
077
078    /**
079     * This method launches heritrix in the following way:</br> 1. copies the orderfile and the seedsfile to current
080     * working directory. </br> 2. sets up the newly created copy of the orderfile </br> 3. starts the crawler </br> 4.
081     * stops the crawler (Either when heritrix has finished crawling, or when heritrix is forcefully stopped due to
082     * inactivity). </p> The exit from the while-loop depends on Heritrix calling the crawlEnded() method, when the
083     * crawling is finished. This method is called from the HarvestControllerServer.onDoOneCrawl() method.
084     *
085     * @throws IOFailure - if the order.xml is invalid if unable to initialize Heritrix CrawlController if Heritrix
086     * process interrupted
087     */
088    public void doCrawl() throws IOFailure {
089        setupOrderfile(getHeritrixFiles());
090        heritrixController = HeritrixControllerFactory.getDefaultHeritrixController(getControllerArguments());
091        try {
092            // Initialize Heritrix settings according to the order.xml
093            heritrixController.initialize();
094            log.debug("Starting crawl..");
095            heritrixController.requestCrawlStart();
096            if (heritrixController.atFinish()) {
097                heritrixController.beginCrawlStop();
098            } else {
099                doCrawlLoop();
100            }
101        } catch (IOFailure e) {
102            log.warn("Error during initialisation of crawl", e);
103            throw (e);
104        } catch (Exception e) {
105            log.warn("Exception during crawl", e);
106            throw new RuntimeException("Exception during crawl", e);
107        } finally {
108            if (heritrixController != null) {
109                heritrixController.cleanup();
110            }
111        }
112        log.debug("Heritrix is finished crawling...");
113    }
114
115    /**
116     * Monitors the crawling performed by Heritrix. Regularly checks whether any progress is made. If no progress has
117     * been made for too long, the crawl is ended.
118     *
119     * @throws IOFailure if the call to HeritrixController.requestCrawlStop() fails. Other failures in calls to the
120     * controller are caught and logged.
121     */
122    private void doCrawlLoop() throws IOFailure {
123        String errorMessage = "Non-fatal I/O error while communicating with Heritrix during crawl";
124        long lastNonZeroActiveQueuesTime = System.currentTimeMillis();
125        long lastTimeReceivedData = System.currentTimeMillis();
126        boolean crawlIsEnded = false;
127        try {
128            crawlIsEnded = heritrixController.crawlIsEnded();
129        } catch (IOFailure e) {
130            log.debug(errorMessage, e);
131        }
132        while (!crawlIsEnded) {
133            String harvestInformation = null;
134            String progressStats = null;
135            try {
136                harvestInformation = heritrixController.getHarvestInformation();
137                progressStats = heritrixController.getProgressStats();
138            } catch (IOFailure e) {
139                log.debug(errorMessage, e);
140            }
141
142            HeritrixFiles files = getHeritrixFiles();
143            log.info("Job ID: {}, Harvest ID: {}, {}\n{}", files.getJobID(), files.getHarvestID(), harvestInformation,
144                    ((progressStats == null) ? "" : progressStats));
145            // Note that we don't check for timeout while paused.
146            int processedKBPerSec = 0;
147            boolean paused = false;
148            try {
149                processedKBPerSec = heritrixController.getCurrentProcessedKBPerSec();
150                paused = heritrixController.isPaused();
151            } catch (IOFailure e) {
152                log.debug(errorMessage, e);
153            }
154            if (processedKBPerSec > 0 || paused) {
155                lastTimeReceivedData = System.currentTimeMillis();
156            }
157            int activeToeCount = 0;
158            paused = false;
159            try {
160                activeToeCount = heritrixController.getActiveToeCount();
161                paused = heritrixController.isPaused();
162            } catch (IOFailure e) {
163                log.debug(errorMessage, e);
164            }
165            if (activeToeCount > 0 || paused) {
166                lastNonZeroActiveQueuesTime = System.currentTimeMillis();
167            }
168            if ((lastNonZeroActiveQueuesTime + timeOutInMillis < System.currentTimeMillis())
169                    || (lastTimeReceivedData + timeOutInMillisReceivedData < System.currentTimeMillis())) {
170                final double noActiveQueuesTimeoutInSeconds = timeOutInMillis / 1000.0;
171                final double noDataReceivedTimeoutInSeconds = timeOutInMillisReceivedData / 1000.0;
172                long queuedUriCount = 0;
173                try {
174                    queuedUriCount = heritrixController.getQueuedUriCount();
175                } catch (IOFailure e) {
176                    log.debug(errorMessage, e);
177                }
178                long ctm = System.currentTimeMillis();
179                log.warn(
180                        "Aborting crawl because of inactivity. No active queues for the last {} seconds "
181                                + "(timeout is {} seconds).No traffic for the last {} seconds (timeout is {} seconds). URLs in queue:{}",
182                        ((ctm - lastNonZeroActiveQueuesTime) / 1000.0), noActiveQueuesTimeoutInSeconds,
183                        ((ctm - lastTimeReceivedData) / 1000.0), noDataReceivedTimeoutInSeconds, queuedUriCount);
184                // The following is the only controller command exception we
185                // don't catch here. Otherwise we might loop forever.
186                heritrixController.requestCrawlStop("Aborting because of inactivity");
187            }
188
189            // Optimization: don't wait if ended since beginning of the loop
190            try {
191                crawlIsEnded = heritrixController.crawlIsEnded();
192            } catch (IOFailure e) {
193                log.debug(errorMessage, e);
194            }
195            if (!crawlIsEnded) {
196                try {
197                    /*
198                     * Wait for heritrix to do something. WAIT_PERIOD is the interval between checks of whether we have
199                     * passed timeouts. Note that timeouts are defined in the settings, while WAIT_PERIOD (being less
200                     * relevant to the user) is defined in this class.
201                     */
202                    synchronized (this) {
203                        wait(1000 * CRAWL_CONTROL_WAIT_PERIOD);
204                    }
205                } catch (InterruptedException e) {
206                    log.trace("Waiting thread awoken: {}", e.getMessage(), e);
207                }
208            }
209        } // end of while (!crawlIsEnded)
210    }
211
212}