001/* 002 * #%L 003 * Netarchivesuite - harvester 004 * %% 005 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library, 006 * the National Library of France and the Austrian National Library. 007 * %% 008 * This program is free software: you can redistribute it and/or modify 009 * it under the terms of the GNU Lesser General Public License as 010 * published by the Free Software Foundation, either version 2.1 of the 011 * License, or (at your option) any later version. 012 * 013 * This program is distributed in the hope that it will be useful, 014 * but WITHOUT ANY WARRANTY; without even the implied warranty of 015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 016 * GNU General Lesser Public License for more details. 017 * 018 * You should have received a copy of the GNU General Lesser Public 019 * License along with this program. If not, see 020 * <http://www.gnu.org/licenses/lgpl-2.1.html>. 021 * #L% 022 */ 023package dk.netarkivet.harvester.harvesting; 024 025import dk.netarkivet.common.exceptions.ArgumentNotValid; 026import dk.netarkivet.common.exceptions.IOFailure; 027import dk.netarkivet.common.exceptions.IllegalState; 028import dk.netarkivet.common.utils.Settings; 029import dk.netarkivet.harvester.HarvesterSettings; 030import dk.netarkivet.harvester.datamodel.H1HeritrixTemplate; 031import dk.netarkivet.harvester.datamodel.HeritrixTemplate; 032 033/** 034 * A HeritrixLauncher object wraps around an instance of the web crawler Heritrix. The object is constructed with the 035 * necessary information to do a crawl. The crawl is performed when doOneCrawl() is called. doOneCrawl() monitors 036 * progress and returns when the crawl is finished or must be stopped because it has stalled. 037 */ 038public abstract class HeritrixLauncher { 039 040 /** Class encapsulating placement of various files. */ 041 private HeritrixFiles files; 042 043 /** the arguments passed to the HeritricController constructor. */ 044 private Object[] args; 045 046 /** The period to wait in seconds before checking if Heritrix has done anything. */ 047 protected static final int CRAWL_CONTROL_WAIT_PERIOD = Settings.getInt(HarvesterSettings.CRAWL_LOOP_WAIT_TIME); 048 049 /** 050 * Private HeritrixLauncher constructor. Sets up the HeritrixLauncher from the given order file and seedsfile. 051 * 052 * @param files Object encapsulating location of Heritrix crawldir and configuration files. 053 * @throws ArgumentNotValid If either seedsfile or orderfile does not exist. 054 */ 055 protected HeritrixLauncher(HeritrixFiles files) throws ArgumentNotValid { 056 if (!files.getOrderXmlFile().isFile()) { 057 throw new ArgumentNotValid("File '" + files.getOrderXmlFile().getName() + "' must exist in order for " 058 + "Heritrix to run. This filepath does not refer to existing file: " 059 + files.getOrderXmlFile().getAbsolutePath()); 060 } 061 if (!files.getSeedsTxtFile().isFile()) { 062 throw new ArgumentNotValid("File '" + files.getSeedsTxtFile().getName() + "' must exist in order for " 063 + "Heritrix to run. This filepath does not refer to existing file: " 064 + files.getSeedsTxtFile().getAbsolutePath()); 065 } 066 this.files = files; 067 this.args = new Object[] {files}; 068 } 069 070 /** 071 * Generic constructor to allow HeritrixLauncher to use any implementation of HeritrixController. 072 * 073 * @param args the arguments to be passed to the constructor or non-static factory method of the HeritrixController 074 * class specified in settings 075 */ 076 public HeritrixLauncher(Object... args) { 077 this.args = args; 078 } 079 080 /** 081 * Launches the crawl and monitors its progress. 082 * 083 * @throws IOFailure 084 */ 085 public abstract void doCrawl() throws IOFailure; 086 087 /** 088 * @return an instance of the wrapper class for Heritrix files. 089 */ 090 protected HeritrixFiles getHeritrixFiles() { 091 return files; 092 } 093 094 /** 095 * @return the optional arguments used to initialize the chosen Heritrix1 controller implementation. 096 */ 097 protected Object[] getControllerArguments() { 098 return args; 099 } 100 101 public void setupOrderfile(HeritrixFiles files) { 102 makeTemplateReadyForHeritrix1(files); 103 } 104 105 /** 106 * 107 * Updates the diskpath value, archivefile_prefix, seedsfile, and deduplication -information. 108 * @param files Files associated with a Heritrix1 crawl-job. 109 * @throws IOFailure 110 * 111 * 112 * This method prepares the orderfile used by the Heritrix crawler. 113 * </p> 1. Verify that the template is in fact a H1HeritrixTemplate 114 * </p> 2. alters the orderfile in the 115 * following-way: (overriding whatever is in the orderfile)</br> 116 * <ol> 117 * <li>sets the disk-path to the outputdir specified in HeritrixFiles.</li> 118 * <li>sets the seedsfile to the seedsfile specified in HeritrixFiles.</li> 119 * <li>sets the prefix of the arcfiles to unique prefix defined in HeritrixFiles</li> 120 * <li>checks that the arcs-file dir is 'arcs' - to ensure that we know where the arc-files are when crawl finishes</li> 121 * <p> 122 * <li>if deduplication is enabled, sets the node pointing to index directory for deduplication (see step 3)</li> 123 * </ol> 124 * 3. saves the orderfile back to disk</p> 125 * <p> 126 * 4. if deduplication is enabled in the order.xml, it writes the absolute path of the lucene index used by the 127 * deduplication processor. 128 * 129 * @throws IOFailure - When the orderfile could not be saved to disk 130 * When a specific element cannot be found in the document. 131 */ 132 public static void makeTemplateReadyForHeritrix1(HeritrixFiles files) throws IOFailure { 133 HeritrixTemplate templ = HeritrixTemplate.read(files.getOrderXmlFile()); 134 // Verify that the template in the job is a Heritrix3Template 135 if (templ instanceof H1HeritrixTemplate) { 136 templ.setDiskPath(files.getCrawlDir().getAbsolutePath()); 137 templ.setArchiveFilePrefix(files.getArchiveFilePrefix()); 138 templ.setSeedsFilePath(files.getSeedsTxtFile().getAbsolutePath()); 139 if (templ.IsDeduplicationEnabled()) { 140 templ.setDeduplicationIndexLocation(files.getIndexDir().getAbsolutePath()); 141 } 142 files.writeOrderXml(templ); 143 } else { 144 throw new IllegalState("The template is not a H1 template!"); 145 } 146 } 147 148 149 150 151 152 153}