001/*
002 * #%L
003 * Netarchivesuite - harvester
004 * %%
005 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
006 *             the National Library of France and the Austrian National Library.
007 * %%
008 * This program is free software: you can redistribute it and/or modify
009 * it under the terms of the GNU Lesser General Public License as
010 * published by the Free Software Foundation, either version 2.1 of the
011 * License, or (at your option) any later version.
012 * 
013 * This program is distributed in the hope that it will be useful,
014 * but WITHOUT ANY WARRANTY; without even the implied warranty of
015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
016 * GNU General Lesser Public License for more details.
017 * 
018 * You should have received a copy of the GNU General Lesser Public
019 * License along with this program.  If not, see
020 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
021 * #L%
022 */
023package dk.netarkivet.harvester.heritrix3;
024
025import org.slf4j.Logger;
026import org.slf4j.LoggerFactory;
027
028import dk.netarkivet.common.exceptions.ArgumentNotValid;
029import dk.netarkivet.common.exceptions.IOFailure;
030import dk.netarkivet.common.exceptions.IllegalState;
031import dk.netarkivet.common.utils.Settings;
032import dk.netarkivet.harvester.datamodel.H3HeritrixTemplate;
033import dk.netarkivet.harvester.datamodel.HeritrixTemplate;
034
035/**
036 * A HeritrixLauncher object wraps around an instance of the web crawler Heritrix3. The object is constructed with the
037 * necessary information to do a crawl. The crawl is performed when doOneCrawl() is called. doOneCrawl() monitors
038 * progress and returns when the crawl is finished or must be stopped because it has stalled.
039 */
040public abstract class HeritrixLauncherAbstract {
041
042    /** The logger for this class. */
043    private static final Logger log = LoggerFactory.getLogger(HeritrixLauncherAbstract.class);
044    
045    /** Class encapsulating placement of various files. */
046    private Heritrix3Files files;
047
048    /** the arguments passed to the HeritrixController constructor. */
049    private Object[] args;
050
051    /** The period to wait in seconds before checking if Heritrix3 has done anything. */
052    protected static final int CRAWL_CONTROL_WAIT_PERIOD = Settings.getInt(Heritrix3Settings.CRAWL_LOOP_WAIT_TIME);
053
054    /**
055     * Private HeritrixLauncher constructor. Sets up the HeritrixLauncher from the given order file and seedsfile.
056     *
057     * @param files Object encapsulating location of Heritrix3 crawldir and configuration files.
058     * @throws ArgumentNotValid If either seedsfile or orderfile does not exist.
059     */
060    protected HeritrixLauncherAbstract(Heritrix3Files files) throws ArgumentNotValid {
061        if (!files.getOrderFile().isFile()) {
062            throw new ArgumentNotValid("File '" + files.getOrderFile().getName() + "' must exist in order for "
063                    + "Heritrix to run. This filepath does not refer to existing file: "
064                    + files.getOrderFile().getAbsolutePath());
065        }
066        if (!files.getSeedsFile().isFile()) {
067            throw new ArgumentNotValid("File '" + files.getSeedsFile().getName() + "' must exist in order for "
068                    + "Heritrix to run. This filepath does not refer to existing file: "
069                    + files.getSeedsFile().getAbsolutePath());
070        }
071        this.files = files;
072        this.args = new Object[] {files};
073    }
074
075    /**
076     * Generic constructor to allow HeritrixLauncher to use any implementation of HeritrixController.
077     *
078     * @param args the arguments to be passed to the constructor or non-static factory method of the HeritrixController
079     * class specified in settings
080     */
081    public HeritrixLauncherAbstract(Object... args) {
082        this.args = args;
083    }
084
085    /**
086     * Launches the crawl and monitors its progress.
087     *
088     * @throws IOFailure
089     */
090    public abstract void doCrawl() throws IOFailure;
091
092    /**
093     * @return an instance of the wrapper class for Heritrix files.
094     */
095    protected Heritrix3Files getHeritrixFiles() {
096        return files;
097    }
098
099    /**
100     * @return the optional arguments used to initialize the chosen Heritrix controller implementation.
101     */
102    protected Object[] getControllerArguments() {
103        return args;
104    }
105
106    public void setupOrderfile(Heritrix3Files files) {
107        // Here the last changes of the template is performed
108        log.info("Make the template ready for Heritrix3");
109        makeTemplateReadyForHeritrix3(files);
110    }
111
112    /**
113     * Updates the archivefile_prefix, and location of the deduplication index if needed.
114     * @param files a set of files associated with a Heritrix3 job
115     * @throws IOFailure 
116     */
117    /**
118     * This method prepares the crawler-beans.cxml file used by the Heritrix3 crawler. </p> 1. alters the crawler-beans.cxml in the
119     * following-way: (overriding whatever is in the crawler-beans.cxml)</br>
120     * <ol>
121     * <li>sets the prefix of the archive files to the unique prefix defined in Heritrix3Files</li>
122     * <p>
123     * <li>if deduplication is enabled, sets the node pointing to index directory for deduplication (see step 3)</li>
124     * </ol>
125     * 2. saves the orderfile back to disk</p>
126     * <p>
127     * 3. if deduplication is enabled in the order.xml, it writes the absolute path of the lucene index used by the
128     * deduplication processor.
129     *
130     * @throws IOFailure - When the orderfile could not be saved to disk 
131     * @throws IllegalState - When the orderfile is not a H3 template                  
132     */
133    public static void makeTemplateReadyForHeritrix3(Heritrix3Files files) throws IOFailure {           
134        HeritrixTemplate templ = HeritrixTemplate.read(files.getOrderXmlFile());
135        if (templ instanceof H3HeritrixTemplate) {
136                H3HeritrixTemplate template = (H3HeritrixTemplate) templ;
137                template.setArchiveFilePrefix(files.getArchiveFilePrefix());
138
139                if (template.IsDeduplicationEnabled()) {
140                        template.setDeduplicationIndexLocation(files.getIndexDir().getAbsolutePath());
141                }
142                // Remove superfluous placeholders in the template (maybe unnecessary)
143                template.removePlaceholders();
144                files.writeOrderXml(template);
145        } else {
146                throw new IllegalState("The template is not a H3 template!");
147        }
148    }
149
150}