/*
 * #%L
 * Netarchivesuite - harvester
 * %%
 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
 *             the National Library of France and the Austrian National Library.
 * %%
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation, either version 2.1 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Lesser Public License for more details.
 *
 * You should have received a copy of the GNU General Lesser Public
 * License along with this program.  If not, see
 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
 * #L%
 */
package dk.netarkivet.harvester.harvesting.controller;

import java.util.LinkedList;
import java.util.List;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import dk.netarkivet.common.distribute.JMSConnectionFactory;
import dk.netarkivet.common.exceptions.HarvestingAbort;
import dk.netarkivet.common.utils.Settings;
import dk.netarkivet.common.utils.StringUtils;
import dk.netarkivet.common.utils.TimeUtils;
import dk.netarkivet.harvester.HarvesterSettings;
import dk.netarkivet.harvester.harvesting.distribute.FrontierReportMessage;
import dk.netarkivet.harvester.harvesting.frontier.FrontierReportFilter;
import dk.netarkivet.harvester.harvesting.frontier.FullFrontierReport;
import dk.netarkivet.harvester.harvesting.frontier.InMemoryFrontierReport;
import dk.netarkivet.harvester.harvesting.monitor.HarvestMonitor;

/**
 * Implements the analysis of a full frontier report obtained from Heritrix, as the execution of a sequence of
 * user-defined filters. Each filter generates a smaller, in-memory frontier report, which is sent in a JMS message
 * to the {@link HarvestMonitor}.
 */
public class FrontierReportAnalyzer implements Runnable {

    /** The logger to use. */
    static final Logger LOG = LoggerFactory.getLogger(FrontierReportAnalyzer.class);

    /** Separator between the individual arguments inside one filter-arguments setting value. */
    private static final String FILTER_ARGS_SEPARATOR = ";";

    /** The controller used to communicate with the Heritrix instance. */
    private final BnfHeritrixController heritrixController;

    /**
     * The filters to apply to the full report, as defined in the settings.
     *
     * @see HarvesterSettings#FRONTIER_REPORT_FILTER_CLASS
     * @see HarvesterSettings#FRONTIER_REPORT_FILTER_ARGS
     */
    private final List<FrontierReportFilter> filters = new LinkedList<FrontierReportFilter>();

    /** The last time this Analyzer was executed. */
    private long lastExecTime = System.currentTimeMillis();

    /**
     * Builds an analyzer, given an Heritrix controller instance.
     * <p>
     * The list of filters is built from the settings: the i-th filter class is initialized with the i-th
     * filter-arguments value, split on {@code ;}. A filter class that cannot be loaded or instantiated, or that is
     * not a {@link FrontierReportFilter}, is logged and skipped. A filter class without a matching arguments value
     * is initialized as if its arguments value were the empty string.
     *
     * @param heritrixController the controller allowing communication with the Heritrix crawler instance.
     */
    public FrontierReportAnalyzer(BnfHeritrixController heritrixController) {
        super();
        this.heritrixController = heritrixController;

        // Build list of filters from the settings.
        String[] filterClasses = Settings.getAll(HarvesterSettings.FRONTIER_REPORT_FILTER_CLASS);
        String[] filterArgs = Settings.getAll(HarvesterSettings.FRONTIER_REPORT_FILTER_ARGS);

        for (int i = 0; i < filterClasses.length; i++) {
            String fClass = filterClasses[i];

            // The two settings arrays are paired by index; do not assume they have the same length.
            String rawArgs = "";
            if (i < filterArgs.length && filterArgs[i] != null) {
                rawArgs = filterArgs[i];
            } else {
                LOG.warn("No arguments configured for filter class {}, using empty arguments.", fClass);
            }
            String[] fArgs = rawArgs.split(FILTER_ARGS_SEPARATOR);

            FrontierReportFilter filter = instantiateFilter(fClass);
            if (filter != null) {
                filter.init(fArgs);
                filters.add(filter);
            }
        }
    }

    /**
     * Generates a full frontier report through the Heritrix controller, applies every configured filter to it and
     * sends each filtered report in a {@link FrontierReportMessage}. If the full report cannot be retrieved, the
     * failure is logged at debug level and nothing is sent. Once obtained, the full report is always disposed, even
     * if a filter or the message sending fails.
     */
    @Override
    public void run() {
        long startTime = System.currentTimeMillis();
        long elapsed = startTime - lastExecTime;
        // NOTE(review): the message says "started", but lastExecTime holds the time the previous generation
        // completed (see assignment below) - confirm which one is intended.
        LOG.info("Will generate full Heritrix frontier report, "
                + StringUtils.formatDuration(elapsed / TimeUtils.SECOND_IN_MILLIS)
                + " elapsed since last generation started.");

        FullFrontierReport ffr;
        try {
            ffr = heritrixController.getFullFrontierReport();
        } catch (HarvestingAbort e) {
            LOG.debug("Unable to retrieve full frontier-reports from Heritrix", e);
            return;
        }

        long endTime = System.currentTimeMillis();
        LOG.info("Generated full Heritrix frontier report in " + formatElapsed(endTime - startTime) + ".");

        lastExecTime = endTime;

        try {
            for (FrontierReportFilter filter : filters) {
                long filterStart = System.currentTimeMillis();
                InMemoryFrontierReport filtered = filter.process(ffr);
                long filterElapsed = System.currentTimeMillis() - filterStart;
                LOG.info("Applied filter "
                        + filter.getClass().getName()
                        + " to full frontier report, this took "
                        + formatElapsed(filterElapsed) + ".");
                Long jobId = heritrixController.getFiles().getJobID();
                JMSConnectionFactory.getInstance().send(new FrontierReportMessage(filter, filtered, jobId));
            }
        } finally {
            // Dispose the full report on every exit path, including a failing filter or JMS send.
            ffr.dispose();
        }
    }

    /**
     * Loads and instantiates one filter class through its no-argument constructor.
     *
     * @param fClass the fully qualified name of the filter class.
     * @return the new filter instance, or {@code null} if the class could not be loaded or instantiated, or is not
     * a {@link FrontierReportFilter}; the reason is logged at error level.
     */
    private static FrontierReportFilter instantiateFilter(String fClass) {
        try {
            Class<?> candidate = Class.forName(fClass);
            if (!FrontierReportFilter.class.isAssignableFrom(candidate)) {
                // Check explicitly rather than letting the cast below throw a ClassCastException.
                LOG.error("Failed to instantiate filter of class {}: not a {}", fClass,
                        FrontierReportFilter.class.getName());
                return null;
            }
            return (FrontierReportFilter) candidate.newInstance();
        } catch (InstantiationException e) {
            LOG.error("Failed to instantiate filter of class " + fClass, e);
        } catch (IllegalAccessException e) {
            LOG.error("Failed to instantiate filter of class " + fClass, e);
        } catch (ClassNotFoundException e) {
            LOG.error("Failed to instantiate filter of class " + fClass, e);
        }
        return null;
    }

    /**
     * Formats an elapsed time for logging: as milliseconds when shorter than one second, otherwise as a duration
     * formatted by {@link StringUtils#formatDuration}.
     *
     * @param elapsedMillis the elapsed time in milliseconds.
     * @return the text to insert in the log message.
     */
    private static String formatElapsed(long elapsedMillis) {
        if (elapsedMillis < TimeUtils.SECOND_IN_MILLIS) {
            return elapsedMillis + " ms";
        }
        return StringUtils.formatDuration(elapsedMillis / TimeUtils.SECOND_IN_MILLIS);
    }

}