001/*
002 * #%L
003 * Netarchivesuite - harvester
004 * %%
005 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
006 *             the National Library of France and the Austrian National Library.
007 * %%
008 * This program is free software: you can redistribute it and/or modify
009 * it under the terms of the GNU Lesser General Public License as
010 * published by the Free Software Foundation, either version 2.1 of the
011 * License, or (at your option) any later version.
012 * 
013 * This program is distributed in the hope that it will be useful,
014 * but WITHOUT ANY WARRANTY; without even the implied warranty of
015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
016 * GNU General Lesser Public License for more details.
017 * 
018 * You should have received a copy of the GNU General Lesser Public
019 * License along with this program.  If not, see
020 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
021 * #L%
022 */
023package dk.netarkivet.harvester.harvesting.controller;
024
025import java.util.LinkedList;
026import java.util.List;
027
028import org.slf4j.Logger;
029import org.slf4j.LoggerFactory;
030
031import dk.netarkivet.common.distribute.JMSConnectionFactory;
032import dk.netarkivet.common.exceptions.HarvestingAbort;
033import dk.netarkivet.common.utils.Settings;
034import dk.netarkivet.common.utils.StringUtils;
035import dk.netarkivet.common.utils.TimeUtils;
036import dk.netarkivet.harvester.HarvesterSettings;
037import dk.netarkivet.harvester.harvesting.distribute.FrontierReportMessage;
038import dk.netarkivet.harvester.harvesting.frontier.FrontierReportFilter;
039import dk.netarkivet.harvester.harvesting.frontier.FullFrontierReport;
040import dk.netarkivet.harvester.harvesting.frontier.InMemoryFrontierReport;
041import dk.netarkivet.harvester.harvesting.monitor.HarvestMonitor;
042
043/**
044 * Implements the analysis of a full frontier report obtained from Heritrix, as the execution of a sequence of
045 * user-defined filters, that each generate a smaller, in-memory frontier report that are sent in a JMS message to the
046 * {@link HarvestMonitor}.
047 */
048public class FrontierReportAnalyzer implements Runnable {
049
050    /** The logger to use. */
051    static final Logger LOG = LoggerFactory.getLogger(FrontierReportAnalyzer.class);
052    /** The controller used to communicate with the Heritrix instance. */
053    private final BnfHeritrixController heritrixController;
054    /** The last time this Analyzer was executed. */
055    private long lastExecTime = System.currentTimeMillis();
056
057    /**
058     * Builds an analyzer, given an Heritrix controller instance.
059     *
060     * @param heritrixController the controller allowing communication with the Heritrix crawler instance.
061     */
062    public FrontierReportAnalyzer(BnfHeritrixController heritrixController) {
063        super();
064        this.heritrixController = heritrixController;
065
066        // Build list of filters from the settings.
067
068        String[] filterClasses = Settings.getAll(HarvesterSettings.FRONTIER_REPORT_FILTER_CLASS);
069        String[] filterArgs = Settings.getAll(HarvesterSettings.FRONTIER_REPORT_FILTER_ARGS);
070
071        for (int i = 0; i < filterClasses.length; i++) {
072            String fClass = filterClasses[i];
073            String[] fArgs = filterArgs[i].split(";");
074
075            try {
076                FrontierReportFilter filter = (FrontierReportFilter) Class.forName(fClass).newInstance();
077                filter.init(fArgs);
078                filters.add(filter);
079            } catch (InstantiationException e) {
080                LOG.error("Failed to instantiate filter of class " + fClass, e);
081            } catch (IllegalAccessException e) {
082                LOG.error("Failed to instantiate filter of class " + fClass, e);
083            } catch (ClassNotFoundException e) {
084                LOG.error("Failed to instantiate filter of class " + fClass, e);
085            }
086        }
087    }
088
089    /**
090     * The filters to apply to the full report, as defined in the settings.
091     *
092     * @see HarvesterSettings#FRONTIER_REPORT_FILTER_CLASS
093     * @see HarvesterSettings#FRONTIER_REPORT_FILTER_ARGS
094     */
095    private List<FrontierReportFilter> filters = new LinkedList<FrontierReportFilter>();
096
097    @Override
098    public void run() {
099        long startTime = System.currentTimeMillis();
100        long elapsed = startTime - lastExecTime;
101        LOG.info("Will generate full Heritrix frontier report, "
102                + StringUtils.formatDuration(elapsed / TimeUtils.SECOND_IN_MILLIS)
103                + " elapsed since last generation started.");
104        FullFrontierReport ffr = null;
105        try {
106            ffr = heritrixController.getFullFrontierReport();
107        } catch (HarvestingAbort e) {
108            LOG.debug("Unable to retrieve full frontier-reports from Heritrix", e);
109            return;
110        }
111        long endTime = System.currentTimeMillis();
112        elapsed = endTime - startTime;
113        LOG.info("Generated full Heritrix frontier report in "
114                + (elapsed < TimeUtils.SECOND_IN_MILLIS ? elapsed + " ms" : StringUtils.formatDuration(elapsed
115                        / TimeUtils.SECOND_IN_MILLIS)) + ".");
116
117        lastExecTime = endTime;
118
119        for (FrontierReportFilter filter : filters) {
120            startTime = System.currentTimeMillis();
121            InMemoryFrontierReport filtered = filter.process(ffr);
122            endTime = System.currentTimeMillis();
123            elapsed = endTime - startTime;
124            LOG.info("Applied filter "
125                    + filter.getClass().getName()
126                    + " to full frontier report, this took "
127                    + (elapsed < TimeUtils.SECOND_IN_MILLIS ? elapsed + " ms" : StringUtils.formatDuration(elapsed
128                            / TimeUtils.SECOND_IN_MILLIS)) + ".");
129            Long jobId = heritrixController.getFiles().getJobID();
130            JMSConnectionFactory.getInstance().send(new FrontierReportMessage(filter, filtered, jobId));
131        }
132
133        ffr.dispose();
134    }
135
136}