001/* 002 * #%L 003 * Netarchivesuite - harvester 004 * %% 005 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library, 006 * the National Library of France and the Austrian National Library. 007 * %% 008 * This program is free software: you can redistribute it and/or modify 009 * it under the terms of the GNU Lesser General Public License as 010 * published by the Free Software Foundation, either version 2.1 of the 011 * License, or (at your option) any later version. 012 * 013 * This program is distributed in the hope that it will be useful, 014 * but WITHOUT ANY WARRANTY; without even the implied warranty of 015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 016 * GNU General Lesser Public License for more details. 017 * 018 * You should have received a copy of the GNU General Lesser Public 019 * License along with this program. If not, see 020 * <http://www.gnu.org/licenses/lgpl-2.1.html>. 021 * #L% 022 */ 023package dk.netarkivet.harvester.heritrix3; 024 025import org.slf4j.Logger; 026import org.slf4j.LoggerFactory; 027 028import dk.netarkivet.common.exceptions.ArgumentNotValid; 029import dk.netarkivet.common.exceptions.IOFailure; 030import dk.netarkivet.common.exceptions.IllegalState; 031import dk.netarkivet.common.utils.Settings; 032import dk.netarkivet.harvester.datamodel.H3HeritrixTemplate; 033import dk.netarkivet.harvester.datamodel.HeritrixTemplate; 034 035/** 036 * A HeritrixLauncher object wraps around an instance of the web crawler Heritrix3. The object is constructed with the 037 * necessary information to do a crawl. The crawl is performed when doOneCrawl() is called. doOneCrawl() monitors 038 * progress and returns when the crawl is finished or must be stopped because it has stalled. 039 */ 040public abstract class HeritrixLauncherAbstract { 041 042 /** The logger for this class. */ 043 private static final Logger log = LoggerFactory.getLogger(HeritrixLauncherAbstract.class); 044 045 /** Class encapsulating placement of various files. */ 046 private Heritrix3Files files; 047 048 /** the arguments passed to the HeritrixController constructor. */ 049 private Object[] args; 050 051 /** The period to wait in seconds before checking if Heritrix3 has done anything. */ 052 protected static final int CRAWL_CONTROL_WAIT_PERIOD = Settings.getInt(Heritrix3Settings.CRAWL_LOOP_WAIT_TIME); 053 054 /** 055 * Private HeritrixLauncher constructor. Sets up the HeritrixLauncher from the given order file and seedsfile. 056 * 057 * @param files Object encapsulating location of Heritrix3 crawldir and configuration files. 058 * @throws ArgumentNotValid If either seedsfile or orderfile does not exist. 059 */ 060 protected HeritrixLauncherAbstract(Heritrix3Files files) throws ArgumentNotValid { 061 if (!files.getOrderFile().isFile()) { 062 throw new ArgumentNotValid("File '" + files.getOrderFile().getName() + "' must exist in order for " 063 + "Heritrix to run. This filepath does not refer to existing file: " 064 + files.getOrderFile().getAbsolutePath()); 065 } 066 if (!files.getSeedsFile().isFile()) { 067 throw new ArgumentNotValid("File '" + files.getSeedsFile().getName() + "' must exist in order for " 068 + "Heritrix to run. This filepath does not refer to existing file: " 069 + files.getSeedsFile().getAbsolutePath()); 070 } 071 this.files = files; 072 this.args = new Object[] {files}; 073 } 074 075 /** 076 * Generic constructor to allow HeritrixLauncher to use any implementation of HeritrixController. 077 * 078 * @param args the arguments to be passed to the constructor or non-static factory method of the HeritrixController 079 * class specified in settings 080 */ 081 public HeritrixLauncherAbstract(Object... args) { 082 this.args = args; 083 } 084 085 /** 086 * Launches the crawl and monitors its progress. 087 * 088 * @throws IOFailure 089 */ 090 public abstract void doCrawl() throws IOFailure; 091 092 /** 093 * @return an instance of the wrapper class for Heritrix files. 094 */ 095 protected Heritrix3Files getHeritrixFiles() { 096 return files; 097 } 098 099 /** 100 * @return the optional arguments used to initialize the chosen Heritrix controller implementation. 101 */ 102 protected Object[] getControllerArguments() { 103 return args; 104 } 105 106 public void setupOrderfile(Heritrix3Files files) { 107 // Here the last changes of the template is performed 108 log.info("Make the template ready for Heritrix3"); 109 makeTemplateReadyForHeritrix3(files); 110 } 111 112 /** 113 * Updates the archivefile_prefix, and location of the deduplication index if needed. 114 * @param files a set of files associated with a Heritrix3 job 115 * @throws IOFailure 116 */ 117 /** 118 * This method prepares the crawler-beans.cxml file used by the Heritrix3 crawler. </p> 1. alters the crawler-beans.cxml in the 119 * following-way: (overriding whatever is in the crawler-beans.cxml)</br> 120 * <ol> 121 * <li>sets the prefix of the archive files to the unique prefix defined in Heritrix3Files</li> 122 * <p> 123 * <li>if deduplication is enabled, sets the node pointing to index directory for deduplication (see step 3)</li> 124 * </ol> 125 * 2. saves the orderfile back to disk</p> 126 * <p> 127 * 3. if deduplication is enabled in the order.xml, it writes the absolute path of the lucene index used by the 128 * deduplication processor. 129 * 130 * @throws IOFailure - When the orderfile could not be saved to disk 131 * @throws IllegalState - When the orderfile is not a H3 template 132 */ 133 public static void makeTemplateReadyForHeritrix3(Heritrix3Files files) throws IOFailure { 134 HeritrixTemplate templ = HeritrixTemplate.read(files.getOrderXmlFile()); 135 if (templ instanceof H3HeritrixTemplate) { 136 H3HeritrixTemplate template = (H3HeritrixTemplate) templ; 137 template.setArchiveFilePrefix(files.getArchiveFilePrefix()); 138 139 if (template.IsDeduplicationEnabled()) { 140 template.setDeduplicationIndexLocation(files.getIndexDir().getAbsolutePath()); 141 } 142 // Remove superfluous placeholders in the template (maybe unnecessary) 143 template.removePlaceholders(); 144 files.writeOrderXml(template); 145 } else { 146 throw new IllegalState("The template is not a H3 template!"); 147 } 148 } 149 150}