001/* 002 * #%L 003 * Netarchivesuite - harvester 004 * %% 005 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library, 006 * the National Library of France and the Austrian National Library. 007 * %% 008 * This program is free software: you can redistribute it and/or modify 009 * it under the terms of the GNU Lesser General Public License as 010 * published by the Free Software Foundation, either version 2.1 of the 011 * License, or (at your option) any later version. 012 * 013 * This program is distributed in the hope that it will be useful, 014 * but WITHOUT ANY WARRANTY; without even the implied warranty of 015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 016 * GNU General Lesser Public License for more details. 017 * 018 * You should have received a copy of the GNU General Lesser Public 019 * License along with this program. If not, see 020 * <http://www.gnu.org/licenses/lgpl-2.1.html>. 021 * #L% 022 */ 023package dk.netarkivet.harvester.heritrix3.controller; 024 025import java.util.LinkedList; 026import java.util.List; 027 028import org.slf4j.Logger; 029import org.slf4j.LoggerFactory; 030 031import dk.netarkivet.common.distribute.JMSConnectionFactory; 032import dk.netarkivet.common.exceptions.HarvestingAbort; 033import dk.netarkivet.common.utils.Settings; 034import dk.netarkivet.common.utils.StringUtils; 035import dk.netarkivet.common.utils.TimeUtils; 036import dk.netarkivet.harvester.HarvesterSettings; 037import dk.netarkivet.harvester.harvesting.distribute.FrontierReportMessage; 038import dk.netarkivet.harvester.harvesting.frontier.FrontierReportFilter; 039import dk.netarkivet.harvester.harvesting.frontier.FullFrontierReport; 040import dk.netarkivet.harvester.harvesting.frontier.InMemoryFrontierReport; 041import dk.netarkivet.harvester.harvesting.monitor.HarvestMonitor; 042 043/** 044 * Implements the analysis of a full frontier report obtained from Heritrix3, as the execution of a sequence of 045 * user-defined filters, that each generate a smaller, in-memory frontier report that are sent in a JMS message to the 046 * {@link HarvestMonitor}. 047 */ 048public class FrontierReportAnalyzer implements Runnable { 049 050 /** The logger to use. */ 051 private static final Logger LOG = LoggerFactory.getLogger(FrontierReportAnalyzer.class); 052 /** The controller used to communicate with the Heritrix3 instance. */ 053 private final HeritrixController heritrixController; 054 /** The last time this Analyzer was executed. */ 055 private long lastExecTime = System.currentTimeMillis(); 056 057 /** 058 * Builds an analyzer, given an Heritrix3 controller instance. 059 * calls heritrixController.getFullFrontierReport(). 060 * 061 * @param heritrixController the controller allowing communication with the Heritrix3 crawler instance. 062 */ 063 public FrontierReportAnalyzer(HeritrixController heritrixController) { 064 super(); 065 this.heritrixController = heritrixController; 066 067 // Build list of filters from the settings. 068 069 String[] filterClasses = Settings.getAll(HarvesterSettings.FRONTIER_REPORT_FILTER_CLASS); 070 String[] filterArgs = Settings.getAll(HarvesterSettings.FRONTIER_REPORT_FILTER_ARGS); 071 072 for (int i = 0; i < filterClasses.length; i++) { 073 String fClass = filterClasses[i]; 074 String[] fArgs = filterArgs[i].split(";"); 075 076 try { 077 FrontierReportFilter filter = (FrontierReportFilter) Class.forName(fClass).newInstance(); 078 filter.init(fArgs); 079 filters.add(filter); 080 } catch (InstantiationException e) { 081 LOG.error("Failed to instantiate filter of class " + fClass, e); 082 } catch (IllegalAccessException e) { 083 LOG.error("Failed to instantiate filter of class " + fClass, e); 084 } catch (ClassNotFoundException e) { 085 LOG.error("Failed to instantiate filter of class " + fClass, e); 086 } 087 } 088 } 089 090 /** 091 * The filters to apply to the full report, as defined in the settings. 092 * 093 * @see HarvesterSettings#FRONTIER_REPORT_FILTER_CLASS 094 * @see HarvesterSettings#FRONTIER_REPORT_FILTER_ARGS 095 */ 096 private List<FrontierReportFilter> filters = new LinkedList<FrontierReportFilter>(); 097 098 @Override 099 public void run() { 100 long startTime = System.currentTimeMillis(); 101 long elapsed = startTime - lastExecTime; 102 LOG.info("Will generate full Heritrix frontier report, " 103 + StringUtils.formatDuration(elapsed / TimeUtils.SECOND_IN_MILLIS) 104 + " elapsed since last generation started."); 105 FullFrontierReport ffr = null; 106 LOG.debug("Trying to retrieve full frontier-reports from Heritrix3"); 107 try { 108 ffr = heritrixController.getFullFrontierReport(); 109 } catch (HarvestingAbort e) { 110 LOG.debug("Unable to retrieve full frontier-reports from Heritrix3", e); 111 return; 112 } 113 long endTime = System.currentTimeMillis(); 114 elapsed = endTime - startTime; 115 LOG.info("Generated full Heritrix frontier report in " 116 + (elapsed < TimeUtils.SECOND_IN_MILLIS ? elapsed + " ms" : StringUtils.formatDuration(elapsed 117 / TimeUtils.SECOND_IN_MILLIS)) + "."); 118 119 lastExecTime = endTime; 120 121 for (FrontierReportFilter filter : filters) { 122 startTime = System.currentTimeMillis(); 123 InMemoryFrontierReport filtered = filter.process(ffr); 124 endTime = System.currentTimeMillis(); 125 elapsed = endTime - startTime; 126 LOG.info("Applied filter " 127 + filter.getClass().getName() 128 + " to full frontier report, this took " 129 + (elapsed < TimeUtils.SECOND_IN_MILLIS ? elapsed + " ms" : StringUtils.formatDuration(elapsed 130 / TimeUtils.SECOND_IN_MILLIS)) + "."); 131 Long jobId = heritrixController.getFiles().getJobID(); 132 JMSConnectionFactory.getInstance().send(new FrontierReportMessage(filter, filtered, jobId)); 133 } 134 135 ffr.dispose(); 136 } 137 138}