001/*
002 * #%L
003 * Netarchivesuite - harvester
004 * %%
005 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
006 *             the National Library of France and the Austrian National Library.
007 * %%
008 * This program is free software: you can redistribute it and/or modify
009 * it under the terms of the GNU Lesser General Public License as
010 * published by the Free Software Foundation, either version 2.1 of the
011 * License, or (at your option) any later version.
012 * 
013 * This program is distributed in the hope that it will be useful,
014 * but WITHOUT ANY WARRANTY; without even the implied warranty of
015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
016 * GNU General Lesser Public License for more details.
017 * 
018 * You should have received a copy of the GNU General Lesser Public
019 * License along with this program.  If not, see
020 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
021 * #L%
022 */
023package dk.netarkivet.harvester.harvesting;
024
025import dk.netarkivet.common.exceptions.ArgumentNotValid;
026import dk.netarkivet.common.exceptions.IOFailure;
027import dk.netarkivet.common.exceptions.IllegalState;
028import dk.netarkivet.common.utils.Settings;
029import dk.netarkivet.harvester.HarvesterSettings;
030import dk.netarkivet.harvester.datamodel.H1HeritrixTemplate;
031import dk.netarkivet.harvester.datamodel.HeritrixTemplate;
032
033/**
034 * A HeritrixLauncher object wraps around an instance of the web crawler Heritrix. The object is constructed with the
035 * necessary information to do a crawl. The crawl is performed when doOneCrawl() is called. doOneCrawl() monitors
036 * progress and returns when the crawl is finished or must be stopped because it has stalled.
037 */
038public abstract class HeritrixLauncher {
039
040    /** Class encapsulating placement of various files. */
041    private HeritrixFiles files;
042
043    /** the arguments passed to the HeritricController constructor. */
044    private Object[] args;
045
046    /** The period to wait in seconds before checking if Heritrix has done anything. */
047    protected static final int CRAWL_CONTROL_WAIT_PERIOD = Settings.getInt(HarvesterSettings.CRAWL_LOOP_WAIT_TIME);
048
049    /**
050     * Private HeritrixLauncher constructor. Sets up the HeritrixLauncher from the given order file and seedsfile.
051     *
052     * @param files Object encapsulating location of Heritrix crawldir and configuration files.
053     * @throws ArgumentNotValid If either seedsfile or orderfile does not exist.
054     */
055    protected HeritrixLauncher(HeritrixFiles files) throws ArgumentNotValid {
056        if (!files.getOrderXmlFile().isFile()) {
057            throw new ArgumentNotValid("File '" + files.getOrderXmlFile().getName() + "' must exist in order for "
058                    + "Heritrix to run. This filepath does not refer to existing file: "
059                    + files.getOrderXmlFile().getAbsolutePath());
060        }
061        if (!files.getSeedsTxtFile().isFile()) {
062            throw new ArgumentNotValid("File '" + files.getSeedsTxtFile().getName() + "' must exist in order for "
063                    + "Heritrix to run. This filepath does not refer to existing file: "
064                    + files.getSeedsTxtFile().getAbsolutePath());
065        }
066        this.files = files;
067        this.args = new Object[] {files};
068    }
069
070    /**
071     * Generic constructor to allow HeritrixLauncher to use any implementation of HeritrixController.
072     *
073     * @param args the arguments to be passed to the constructor or non-static factory method of the HeritrixController
074     * class specified in settings
075     */
076    public HeritrixLauncher(Object... args) {
077        this.args = args;
078    }
079
080    /**
081     * Launches the crawl and monitors its progress.
082     *
083     * @throws IOFailure
084     */
085    public abstract void doCrawl() throws IOFailure;
086
087    /**
088     * @return an instance of the wrapper class for Heritrix files.
089     */
090    protected HeritrixFiles getHeritrixFiles() {
091        return files;
092    }
093
094    /**
095     * @return the optional arguments used to initialize the chosen Heritrix1 controller implementation.
096     */
097    protected Object[] getControllerArguments() {
098        return args;
099    }
100
101    public void setupOrderfile(HeritrixFiles files) {
102        makeTemplateReadyForHeritrix1(files);
103    }
104    
105    /**
106     * 
107     * Updates the diskpath value, archivefile_prefix, seedsfile, and deduplication -information.
108     * @param files Files associated with a Heritrix1 crawl-job.
109     * @throws IOFailure
110     *  
111     *
112     * This method prepares the orderfile used by the Heritrix crawler. 
113     * </p> 1. Verify that the template is in fact a H1HeritrixTemplate 
114     * </p> 2. alters the orderfile in the
115     * following-way: (overriding whatever is in the orderfile)</br>
116     * <ol>
117     * <li>sets the disk-path to the outputdir specified in HeritrixFiles.</li>
118     * <li>sets the seedsfile to the seedsfile specified in HeritrixFiles.</li>
119     * <li>sets the prefix of the arcfiles to unique prefix defined in HeritrixFiles</li>
120     * <li>checks that the arcs-file dir is 'arcs' - to ensure that we know where the arc-files are when crawl finishes</li>
121     * <p>
122     * <li>if deduplication is enabled, sets the node pointing to index directory for deduplication (see step 3)</li>
123     * </ol>
124     * 3. saves the orderfile back to disk</p>
125     * <p>
126     * 4. if deduplication is enabled in the order.xml, it writes the absolute path of the lucene index used by the
127     * deduplication processor.
128     *
129     * @throws IOFailure - When the orderfile could not be saved to disk 
130     *                     When a specific element cannot be found in the document. 
131     */
132    public static void makeTemplateReadyForHeritrix1(HeritrixFiles files) throws IOFailure {
133        HeritrixTemplate templ = HeritrixTemplate.read(files.getOrderXmlFile());
134        // Verify that the template in the job is a Heritrix3Template
135        if (templ instanceof H1HeritrixTemplate) {
136                templ.setDiskPath(files.getCrawlDir().getAbsolutePath());
137                templ.setArchiveFilePrefix(files.getArchiveFilePrefix());
138                templ.setSeedsFilePath(files.getSeedsTxtFile().getAbsolutePath());
139                if (templ.IsDeduplicationEnabled()) {
140                        templ.setDeduplicationIndexLocation(files.getIndexDir().getAbsolutePath());
141                }
142                files.writeOrderXml(templ);
143        } else {
144                throw new IllegalState("The template is not a H1 template!");
145        }
146    }
147
148    
149    
150    
151    
152
153}