/*
 * #%L
 * Netarchivesuite - harvester
 * %%
 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
 * the National Library of France and the Austrian National Library.
 * %%
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation, either version 2.1 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Lesser Public License for more details.
 *
 * You should have received a copy of the GNU General Lesser Public
 * License along with this program. If not, see
 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
 * #L%
 */

package dk.netarkivet.harvester.harvesting;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import dk.netarkivet.common.CommonSettings;
import dk.netarkivet.common.Constants;
import dk.netarkivet.common.exceptions.ArgumentNotValid;
import dk.netarkivet.common.exceptions.IOFailure;
import dk.netarkivet.common.utils.FileUtils;
import dk.netarkivet.common.utils.Settings;
import dk.netarkivet.common.utils.StreamUtils;
import dk.netarkivet.harvester.datamodel.HeritrixTemplate;

/**
 * This class encapsulates all the files that Heritrix gets from our system, and all files we read from Heritrix.
 */
public class HeritrixFiles {

    /** The logger. */
    private static final Logger log = LoggerFactory.getLogger(HeritrixFiles.class);

    /** The directory that crawls are performed in. */
    private final File crawlDir;
    /** The job ID this object represents files for. */
    private final Long jobID;
    /** The ID of the harvest definition this object represents files for. */
    private final Long harvestID;

    /** The prefix we put on generated ARC or WARC files. */
    private final String arcFilePrefix;

    /** The JMX password file to be used by Heritrix 1.X (null when running Heritrix 3). */
    private final File jmxPasswordFile;
    /** The JMX access file to be used by Heritrix 1.X (null when running Heritrix 3). */
    private final File jmxAccessFile;

    /** The name of the order.xml file. */
    private static final String ORDER_XML_FILENAME = "order.xml";

    /** The name of the seeds.txt file. */
    private static final String SEEDS_TXT_FILENAME = "seeds.txt";

    /** The name of the recoverBackup.gz file. */
    private static final String RECOVERBACKUP_GZ_FILENAME = "recoverBackup.gz";

    /** The deduplication index directory; null until {@link #setIndexDir(File)} is called. */
    private File indexDir;

    /** The name of the progress statistics log. */
    private static final String PROGRESS_STATISTICS_LOG_FILENAME = "progress-statistics.log";
    /** The name of the crawl log. */
    private static final String CRAWL_LOG_FILENAME = "crawl.log";
    /** The name of the stdout/stderr file from Heritrix. */
    private static final String OUTPUT_FILENAME = "heritrix.out";

    /** The version of Heritrix these files belong to. Never reassigned after construction. */
    private final Version version;

    /**
     * Create a new HeritrixFiles object for a job, implicitly for Heritrix 1.
     *
     * @param crawlDir The dir, where the crawl-files are placed. Assumes, that crawlDir exists already.
     * @param harvestJob The harvestjob behind this instance of HeritrixFiles
     * @param jmxPasswordFile The jmx password file to be used by Heritrix 1. The existence of this file is checked
     * another place.
     * @param jmxAccessFile The JMX access file to be used by Heritrix 1. The existence of this file is checked another
     * place.
     * @throws ArgumentNotValid if any argument is null.
     */
    public HeritrixFiles(File crawlDir, JobInfo harvestJob, File jmxPasswordFile, File jmxAccessFile) {
        ArgumentNotValid.checkNotNull(crawlDir, "crawlDir");
        ArgumentNotValid.checkNotNull(harvestJob, "harvestJob");
        ArgumentNotValid.checkNotNull(jmxPasswordFile, "jmxPasswordFile");
        ArgumentNotValid.checkNotNull(jmxAccessFile, "jmxAccessFile");
        this.crawlDir = crawlDir;
        this.jobID = harvestJob.getJobID();
        this.harvestID = harvestJob.getOrigHarvestDefinitionID();
        this.arcFilePrefix = harvestJob.getHarvestFilenamePrefix();
        this.jmxPasswordFile = jmxPasswordFile;
        this.jmxAccessFile = jmxAccessFile;
        this.version = Version.HERITRIX_1;
    }

    /**
     * Create a new HeritrixFiles object for a job for a specific Heritrix version.
     * <p>
     * Note: the JMX files are deliberately NOT null-checked here, because Heritrix 3 does not use them
     * (see {@link #getH3HeritrixFiles(File, JobInfo)}, which passes null for both).
     *
     * @param crawlDir The dir, where the crawl-files are placed. Assumes, that crawlDir exists already.
     * @param harvestJob The harvestjob behind this instance of HeritrixFiles
     * @param jmxPasswordFile The jmx password file to be used by Heritrix 1 (may be null for Heritrix 3).
     * @param jmxAccessFile The JMX access file to be used by Heritrix 1 (may be null for Heritrix 3).
     * @param version The version of Heritrix in use.
     * @throws ArgumentNotValid if crawlDir or harvestJob is null.
     */
    public HeritrixFiles(File crawlDir, JobInfo harvestJob, File jmxPasswordFile, File jmxAccessFile,
            Version version) {
        ArgumentNotValid.checkNotNull(crawlDir, "crawlDir");
        ArgumentNotValid.checkNotNull(harvestJob, "harvestJob");
        this.crawlDir = crawlDir;
        this.jobID = harvestJob.getJobID();
        this.harvestID = harvestJob.getOrigHarvestDefinitionID();
        this.arcFilePrefix = harvestJob.getHarvestFilenamePrefix();
        this.jmxPasswordFile = jmxPasswordFile;
        this.jmxAccessFile = jmxAccessFile;
        this.version = version;
    }

    /**
     * Factory for a Heritrix 1 instance, reading the JMX password and access files from the current settings.
     *
     * @param crawlDir The dir, where the crawl-files are placed.
     * @param harvestJob The harvestjob behind this instance of HeritrixFiles.
     * @return a HeritrixFiles for Heritrix 1 with JMX files taken from settings.
     */
    public static HeritrixFiles getH1HeritrixFilesWithDefaultJmxFiles(File crawlDir, JobInfo harvestJob) {
        return new HeritrixFiles(crawlDir, harvestJob,
                new File(Settings.get(CommonSettings.JMX_PASSWORD_FILE)),
                new File(Settings.get(CommonSettings.JMX_ACCESS_FILE)), Version.HERITRIX_1);
    }

    /**
     * Factory for a Heritrix 3 instance. Heritrix 3 does not use the JMX password/access files, so both are null.
     *
     * @param crawlDir The dir, where the crawl-files are placed.
     * @param harvestJob The harvestjob behind this instance of HeritrixFiles.
     * @return a HeritrixFiles for Heritrix 3.
     */
    public static HeritrixFiles getH3HeritrixFiles(File crawlDir, JobInfo harvestJob) {
        return new HeritrixFiles(crawlDir, harvestJob, null, null, Version.HERITRIX_3);
    }

    /**
     * Returns the directory that crawls are performed inside.
     *
     * @return A directory (that is created as part of harvest setup) that all of Heritrix' files live in.
     */
    public File getCrawlDir() {
        return crawlDir;
    }

    /**
     * Returns the prefix used to generate Archive files (ARC or WARC).
     *
     * @return The archive file prefix, currently jobID-harvestID.
     */
    public String getArchiveFilePrefix() {
        return this.arcFilePrefix;
    }

    /**
     * Returns the order.xml file object.
     *
     * @return A file object for the order.xml file (which may not have been written yet).
     */
    public File getOrderXmlFile() {
        return new File(crawlDir, ORDER_XML_FILENAME);
    }

    /**
     * Returns the seeds.txt file object.
     *
     * @return A file object for the seeds.txt file (which may not have been written yet).
     */
    public File getSeedsTxtFile() {
        return new File(crawlDir, SEEDS_TXT_FILENAME);
    }

    /**
     * Returns the recoverbackup file object.
     *
     * @return A file object for the recoverBackup.gz file (which may or may not exist).
     */
    public File getRecoverBackupGzFile() {
        return new File(crawlDir, RECOVERBACKUP_GZ_FILENAME);
    }

    /**
     * Try to write the recover-backup file.
     *
     * @param recoverlog The recoverlog in the form of an InputStream
     * @return true, if operation succeeds, otherwise false
     */
    public boolean writeRecoverBackupfile(InputStream recoverlog) {
        // try-with-resources guarantees the stream is closed even if the copy fails.
        try (OutputStream os = new FileOutputStream(getRecoverBackupGzFile())) {
            StreamUtils.copyInputStreamToOutputStream(recoverlog, os);
        } catch (IOException e) {
            log.debug("The writing of the recoverlog failed: ", e);
            return false;
        }
        return true;
    }

    /**
     * Writes the given content to the seeds.txt file.
     *
     * @param seeds The intended content of seeds.txt
     * @throws ArgumentNotValid if seeds is null or empty
     */
    public void writeSeedsTxt(String seeds) {
        ArgumentNotValid.checkNotNullOrEmpty(seeds, "String seeds");
        log.debug("Writing seeds to disk as file: {}", getSeedsTxtFile().getAbsolutePath());
        // NOTE(review): seeds.getBytes() uses the platform default charset; if non-ASCII seeds are
        // possible, an explicit charset (e.g. UTF-8) should be confirmed against what Heritrix expects.
        FileUtils.writeBinaryFile(getSeedsTxtFile(), seeds.getBytes());
    }

    /**
     * Writes the given order.xml content to the order.xml file.
     *
     * @param doc The intended content of order.xml
     * @throws ArgumentNotValid if doc is null or empty
     */
    public void writeOrderXml(HeritrixTemplate doc) {
        ArgumentNotValid.checkNotNull(doc, "Document doc");
        ArgumentNotValid.checkTrue(doc.hasContent(), "HeritrixTemplate document must not be empty");
        log.debug("Writing order-file to disk as file: {}", getOrderXmlFile().getAbsolutePath());
        doc.writeToFile(getOrderXmlFile());
    }

    /**
     * Get the file that contains output from Heritrix on stdout/stderr.
     *
     * @return File that contains output from Heritrix on stdout/stderr.
     */
    public File getHeritrixOutput() {
        return new File(crawlDir, OUTPUT_FILENAME);
    }

    /**
     * Set the deduplicate index dir.
     *
     * @param indexDir the cache dir containing unzipped files
     * @throws ArgumentNotValid if indexDir is not a directory or is null
     */
    public void setIndexDir(File indexDir) {
        ArgumentNotValid.checkNotNull(indexDir, "File indexDir");
        ArgumentNotValid.checkTrue(indexDir.isDirectory(), "indexDir '" + indexDir + "' should be a directory");
        this.indexDir = indexDir;
        log.debug("Setting deduplication index dir '{}'", indexDir);
    }

    /**
     * Returns the index directory, if one has been set.
     *
     * @return the index directory or null if no index has been set.
     */
    public File getIndexDir() {
        return indexDir;
    }

    /**
     * Return a list of disposable heritrix-files. Currently the list consists of the File "state.job", and the
     * directories: "checkpoints", "state", "scratch".
     *
     * @return a list of disposable heritrix-files.
     */
    public File[] getDisposableFiles() {
        return new File[] {new File(crawlDir, "state.job"), new File(crawlDir, "state"),
                new File(crawlDir, "checkpoints"), new File(crawlDir, "scratch")};
    }

    /**
     * Retrieve the crawlLog as a File object.
     *
     * @return the crawlLog as a File object.
     */
    public File getCrawlLog() {
        File logDir = new File(crawlDir, "logs");
        return new File(logDir, CRAWL_LOG_FILENAME);
    }

    /**
     * Retrieve the progress statistics log as a File object.
     *
     * @return the progress statistics log as a File object.
     */
    public File getProgressStatisticsLog() {
        File logDir = new File(crawlDir, "logs");
        return new File(logDir, PROGRESS_STATISTICS_LOG_FILENAME);
    }

    /**
     * Get the job ID.
     *
     * @return Job ID this heritrix files object is for.
     */
    public Long getJobID() {
        return jobID;
    }

    /**
     * Get the harvest ID.
     *
     * @return Harvest ID this heritrix files object is for.
     */
    public Long getHarvestID() {
        return harvestID;
    }

    /**
     * Delete statefile etc. and move crawl directory to oldjobs.
     *
     * @param oldJobsDir Directory to move the rest of any existing files to.
     */
    public void cleanUpAfterHarvest(File oldJobsDir) {
        // delete disposable files
        for (File disposable : getDisposableFiles()) {
            if (disposable.exists()) {
                try {
                    FileUtils.removeRecursively(disposable);
                } catch (IOFailure e) {
                    // Log harmless trouble
                    log.debug("Couldn't delete leftover file '{}'", disposable.getAbsolutePath(), e);
                }
            }
        }
        // move the rest to oldjobs
        FileUtils.createDir(oldJobsDir);
        File destDir = new File(oldJobsDir, crawlDir.getName());
        boolean success = crawlDir.renameTo(destDir);
        if (!success) {
            log.warn("Failed to rename jobdir '{}' to '{}'", crawlDir, destDir);
        }
    }

    /**
     * Helper method to delete the crawl.log and progress statistics log. Will log errors but otherwise continue.
     */
    public void deleteFinalLogs() {
        try {
            FileUtils.remove(getCrawlLog());
        } catch (IOFailure e) {
            // Log harmless trouble
            log.debug("Couldn't delete crawl log file.", e);
        }
        try {
            FileUtils.remove(getProgressStatisticsLog());
        } catch (IOFailure e) {
            // Log harmless trouble
            log.debug("Couldn't delete progress statistics log file.", e);
        }
    }

    /**
     * Return the directory, where Heritrix writes its arcfiles.
     *
     * @return the directory, where Heritrix writes its arcfiles.
     */
    public File getArcsDir() {
        return new File(crawlDir, Constants.ARCDIRECTORY_NAME);
    }

    /**
     * Return the directory, where Heritrix writes its warcfiles.
     *
     * @return the directory, where Heritrix writes its warcfiles.
     */
    public File getWarcsDir() {
        return new File(crawlDir, Constants.WARCDIRECTORY_NAME);
    }

    /**
     * Method for retrieving the jmxremote.password file.
     *
     * @return the jmxPasswordFile.
     */
    public File getJmxPasswordFile() {
        return jmxPasswordFile;
    }

    /**
     * Method for retrieving the jmxremote.access file.
     *
     * @return the jmxAccessFile.
     */
    public File getJmxAccessFile() {
        return jmxAccessFile;
    }

    /** The supported Heritrix versions. (Nested enums are implicitly static.) */
    public enum Version {
        HERITRIX_1,
        HERITRIX_3;
    }
}