001/*
002 * #%L
003 * Netarchivesuite - harvester
004 * %%
005 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
006 *             the National Library of France and the Austrian National Library.
007 * %%
008 * This program is free software: you can redistribute it and/or modify
009 * it under the terms of the GNU Lesser General Public License as
010 * published by the Free Software Foundation, either version 2.1 of the
011 * License, or (at your option) any later version.
012 * 
013 * This program is distributed in the hope that it will be useful,
014 * but WITHOUT ANY WARRANTY; without even the implied warranty of
015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
016 * GNU General Lesser Public License for more details.
017 * 
018 * You should have received a copy of the GNU General Lesser Public
019 * License along with this program.  If not, see
020 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
021 * #L%
022 */
023package dk.netarkivet.harvester.heritrix3.controller;
024
025import java.util.LinkedList;
026import java.util.List;
027
028import org.slf4j.Logger;
029import org.slf4j.LoggerFactory;
030
031import dk.netarkivet.common.distribute.JMSConnectionFactory;
032import dk.netarkivet.common.exceptions.HarvestingAbort;
033import dk.netarkivet.common.utils.Settings;
034import dk.netarkivet.common.utils.StringUtils;
035import dk.netarkivet.common.utils.TimeUtils;
036import dk.netarkivet.harvester.HarvesterSettings;
037import dk.netarkivet.harvester.harvesting.distribute.FrontierReportMessage;
038import dk.netarkivet.harvester.harvesting.frontier.FrontierReportFilter;
039import dk.netarkivet.harvester.harvesting.frontier.FullFrontierReport;
040import dk.netarkivet.harvester.harvesting.frontier.InMemoryFrontierReport;
041import dk.netarkivet.harvester.harvesting.monitor.HarvestMonitor;
042
043/**
044 * Implements the analysis of a full frontier report obtained from Heritrix3, as the execution of a sequence of
045 * user-defined filters, that each generate a smaller, in-memory frontier report that are sent in a JMS message to the
046 * {@link HarvestMonitor}.
047 */
048public class FrontierReportAnalyzer implements Runnable {
049
050    /** The logger to use. */
051        private static final Logger LOG = LoggerFactory.getLogger(FrontierReportAnalyzer.class);
052    /** The controller used to communicate with the Heritrix3 instance. */
053    private final HeritrixController heritrixController;
054    /** The last time this Analyzer was executed. */
055    private long lastExecTime = System.currentTimeMillis();
056
057    /**
058     * Builds an analyzer, given an Heritrix3 controller instance.
059     * calls heritrixController.getFullFrontierReport().
060     *
061     * @param heritrixController the controller allowing communication with the Heritrix3 crawler instance.
062     */
063    public FrontierReportAnalyzer(HeritrixController heritrixController) {
064        super();
065        this.heritrixController = heritrixController;
066
067        // Build list of filters from the settings.
068
069        String[] filterClasses = Settings.getAll(HarvesterSettings.FRONTIER_REPORT_FILTER_CLASS);
070        String[] filterArgs = Settings.getAll(HarvesterSettings.FRONTIER_REPORT_FILTER_ARGS);
071
072        for (int i = 0; i < filterClasses.length; i++) {
073            String fClass = filterClasses[i];
074            String[] fArgs = filterArgs[i].split(";");
075
076            try {
077                FrontierReportFilter filter = (FrontierReportFilter) Class.forName(fClass).newInstance();
078                filter.init(fArgs);
079                filters.add(filter);
080            } catch (InstantiationException e) {
081                LOG.error("Failed to instantiate filter of class " + fClass, e);
082            } catch (IllegalAccessException e) {
083                LOG.error("Failed to instantiate filter of class " + fClass, e);
084            } catch (ClassNotFoundException e) {
085                LOG.error("Failed to instantiate filter of class " + fClass, e);
086            }
087        }
088    }
089
090    /**
091     * The filters to apply to the full report, as defined in the settings.
092     *
093     * @see HarvesterSettings#FRONTIER_REPORT_FILTER_CLASS
094     * @see HarvesterSettings#FRONTIER_REPORT_FILTER_ARGS
095     */
096    private List<FrontierReportFilter> filters = new LinkedList<FrontierReportFilter>();
097
098    @Override
099    public void run() {
100        long startTime = System.currentTimeMillis();
101        long elapsed = startTime - lastExecTime;
102        LOG.info("Will generate full Heritrix frontier report, "
103                + StringUtils.formatDuration(elapsed / TimeUtils.SECOND_IN_MILLIS)
104                + " elapsed since last generation started.");
105        FullFrontierReport ffr = null;
106        LOG.debug("Trying to retrieve full frontier-reports from Heritrix3");
107        try {
108            ffr = heritrixController.getFullFrontierReport();
109        } catch (HarvestingAbort e) {
110            LOG.debug("Unable to retrieve full frontier-reports from Heritrix3", e);
111            return;
112        }
113        long endTime = System.currentTimeMillis();
114        elapsed = endTime - startTime;
115        LOG.info("Generated full Heritrix frontier report in "
116                + (elapsed < TimeUtils.SECOND_IN_MILLIS ? elapsed + " ms" : StringUtils.formatDuration(elapsed
117                        / TimeUtils.SECOND_IN_MILLIS)) + ".");
118
119        lastExecTime = endTime;
120
121        for (FrontierReportFilter filter : filters) {
122            startTime = System.currentTimeMillis();
123            InMemoryFrontierReport filtered = filter.process(ffr);
124            endTime = System.currentTimeMillis();
125            elapsed = endTime - startTime;
126            LOG.info("Applied filter "
127                    + filter.getClass().getName()
128                    + " to full frontier report, this took "
129                    + (elapsed < TimeUtils.SECOND_IN_MILLIS ? elapsed + " ms" : StringUtils.formatDuration(elapsed
130                            / TimeUtils.SECOND_IN_MILLIS)) + ".");
131            Long jobId = heritrixController.getFiles().getJobID();
132            JMSConnectionFactory.getInstance().send(new FrontierReportMessage(filter, filtered, jobId));
133        }
134
135        ffr.dispose();
136    }
137
138}