/*
 * #%L
 * Netarchivesuite - harvester
 * %%
 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
 * the National Library of France and the Austrian National Library.
 * %%
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation, either version 2.1 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Lesser Public License for more details.
 *
 * You should have received a copy of the GNU General Lesser Public
 * License along with this program. If not, see
 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
 * #L%
 */
package dk.netarkivet.harvester.heritrix3;

import java.io.File;
import java.io.FilenameFilter;
import java.util.Arrays;
import java.util.LinkedList;
import java.util.List;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import dk.netarkivet.common.Constants;
import dk.netarkivet.common.exceptions.ArgumentNotValid;
import dk.netarkivet.common.exceptions.IOFailure;
import dk.netarkivet.common.exceptions.IllegalState;
import dk.netarkivet.common.exceptions.PermissionDenied;
import dk.netarkivet.common.utils.FileUtils;
import dk.netarkivet.harvester.harvesting.metadata.MetadataFileWriter;

/**
 * Encapsulation of files to be ingested into the archive. These files are presently placed subdirectories under the
 * crawldir.
 */
public class IngestableFiles {

    private static final Logger log = LoggerFactory.getLogger(IngestableFiles.class);

    /** Subdir with final metadata file in it. */
    protected static final String METADATA_SUB_DIR = "metadata";

    /** Subdir with temporary metadata file in it. */
    private static final String TMP_SUB_DIR = "tmp-meta";

    /** Suffix Heritrix appends to archive files it is still writing to. */
    private static final String OPEN_SUFFIX = ".open";

    /** jobId for present harvestjob. */
    private long jobId;

    /** crawlDir for present harvestjob. */
    private File crawlDir;

    /**
     * Writer to this jobs metadatafile. This is closed when the metadata is marked as ready.
     */
    private MetadataFileWriter writer = null;

    /** Whether we've had an error in metadata generation. */
    private boolean error = false;

    /** The archive file prefix for this harvest job. */
    private String harvestnamePrefix;

    /** The harvest definition ID for this harvest job. */
    private Long harvestId;

    /** The Heritrix 3 job directory; archive and report files live below {@code <heritrixJobDir>/latest}. */
    private File heritrixJobDir;

    /**
     * Constructor for this class. HeritrixFiles contains information about crawlDir, jobId, and harvestnameprefix for a
     * specific finished harvestjob.
     *
     * @param files An instance of Heritrix3Files
     * @throws ArgumentNotValid if null-arguments are given; if jobID &lt; 1; if crawlDir does not exist
     */
    public IngestableFiles(Heritrix3Files files) {
        ArgumentNotValid.checkNotNull(files, "files");
        ArgumentNotValid.checkNotNull(files.getCrawlDir(), "crawlDir");
        ArgumentNotValid.checkPositive(files.getJobID(), "jobID");
        ArgumentNotValid.checkNotNullOrEmpty(files.getArchiveFilePrefix(), "harvestnamePrefix");
        this.heritrixJobDir = files.getHeritrixJobDir();
        this.crawlDir = files.getCrawlDir();
        if (!crawlDir.exists()) {
            throw new ArgumentNotValid("The given crawlDir (" + crawlDir.getAbsolutePath() + ") does not exist");
        }
        this.jobId = files.getJobID();
        this.harvestnamePrefix = files.getArchiveFilePrefix();
        this.harvestId = files.getHarvestID();
        // Create subdir 'metadata' if not already exists.
        FileUtils.createDir(getMetadataDir());
        // Create/scratch subdir 'tmp-meta' so stale temporary metadata from a previous run never leaks in.
        FileUtils.removeRecursively(getTmpMetadataDir());
        FileUtils.createDir(getTmpMetadataDir());
    }

    /**
     * Check, if the metadatafile already exists. If this is true, metadata has been successfully generated. If false,
     * either metadata has not finished being generated, or there was an error generating them.
     *
     * @return true, if it does exist; false otherwise.
     */
    public boolean isMetadataReady() {
        return getMetadataFile().isFile();
    }

    /**
     * Return true if the metadata generation process is known to have failed.
     *
     * @return True if metadata generation is finished without success, false if generation is still ongoing or has been
     * successfully done.
     */
    public boolean isMetadataFailed() {
        return error;
    }

    /**
     * Marks generated metadata as final, closes the writer, and moves the temporary metadata file to its final
     * position, if successful.
     *
     * @param success True if metadata was successfully generated, false otherwise.
     * @throws PermissionDenied If the metadata has already been marked as ready, or if no metadata file exists upon
     * success.
     * @throws IOFailure if there is an error marking the metadata as ready, e.g. if the temporary metadata file cannot
     * be moved to its final location.
     */
    public void setMetadataGenerationSucceeded(boolean success) {
        if (isMetadataReady()) {
            throw new PermissionDenied("Metadata file " + getMetadataFile().getAbsolutePath() + " already exists");
        }

        if (success) {
            // Guard against NPE: the writer is only non-null if getMetadataWriter() was called.
            if (writer != null) {
                writer.close(); // close writer down
            }
            if (!getTmpMetadataFile().exists()) {
                String message = "No metadata was generated despite claims that metadata generation was successful.";
                throw new PermissionDenied(message);
            }
            // renameTo returns false on failure instead of throwing; ignoring it would silently lose the metadata.
            if (!getTmpMetadataFile().renameTo(getMetadataFile())) {
                throw new IOFailure("Failed to rename '" + getTmpMetadataFile().getAbsolutePath() + "' to '"
                        + getMetadataFile().getAbsolutePath() + "'");
            }
        } else {
            error = true;
        }
    }

    /**
     * Get a MetaDatafileWriter for the temporary metadata file. Successive calls to this method on the same object will
     * return the same writer. Once the metadata have been finalized, calling this method will fail.
     *
     * @return a MetaDatafileWriter for the temporary metadata file.
     * @throws PermissionDenied if metadata generation is already finished.
     */
    public MetadataFileWriter getMetadataWriter() {
        if (isMetadataReady()) {
            throw new PermissionDenied("Metadata file " + getMetadataFile().getAbsolutePath() + " already exists");
        }
        if (isMetadataFailed()) {
            throw new PermissionDenied("Metadata generation of file " + getMetadataFile().getAbsolutePath()
                    + " has already failed.");
        }
        // Lazily create the writer so repeated calls share the same instance.
        if (writer == null) {
            writer = MetadataFileWriter.createWriter(getTmpMetadataFile());
        }
        return writer;
    }

    /**
     * Gets the files containing the metadata.
     *
     * @return the files in the metadata dir
     * @throws IllegalState if the metadata file is not ready, either because generation is still going on or there
     * was an error generating the metadata.
     */
    public List<File> getMetadataArcFiles() {
        // Our one known metadata file must exist.
        if (!isMetadataReady()) {
            throw new IllegalState("Metadata file " + getMetadataFile().getAbsolutePath() + " does not exist");
        }
        return Arrays.asList(new File[] {getMetadataFile()});
    }

    /**
     * Constructs the metadata subdir from the crawlDir.
     *
     * @return The metadata subdir as a File
     */
    private File getMetadataDir() {
        return new File(crawlDir, METADATA_SUB_DIR);
    }

    /**
     * Constructs the single metadata arc file from the crawlDir and the jobID.
     *
     * @return metadata arc file as a File
     */
    protected File getMetadataFile() {
        return new File(getMetadataDir(), MetadataFileWriter.getMetadataArchiveFileName(Long.toString(jobId)));
    }

    /**
     * Constructs the TEMPORARY metadata subdir from the crawlDir.
     *
     * @return The tmp-metadata subdir as a File
     */
    public File getTmpMetadataDir() {
        return new File(crawlDir, TMP_SUB_DIR);
    }

    /**
     * Constructs the TEMPORARY metadata arc file from the crawlDir and the jobID.
     *
     * @return tmp-metadata arc file as a File
     */
    private File getTmpMetadataFile() {
        return new File(getTmpMetadataDir(), MetadataFileWriter.getMetadataArchiveFileName(Long.toString(jobId)));
    }

    /**
     * Get a list of all ARC files that should get ingested. Any open files should be closed with closeOpenFiles first.
     *
     * @return The ARC files that are ready to get ingested.
     * @throws IOFailure if the arcs dir exists but is not a directory, or if it cannot be listed.
     */
    public List<File> getArcFiles() {
        File arcsdir = getArcsDir();
        if (arcsdir.exists()) {
            if (!arcsdir.isDirectory()) {
                throw new IOFailure(arcsdir.getPath() + " is not a directory");
            }
            // listFiles() returns null on I/O error or if the dir vanished after the exists() check.
            File[] arcFiles = arcsdir.listFiles(FileUtils.ARCS_FILTER);
            if (arcFiles == null) {
                throw new IOFailure("Error listing files in directory " + arcsdir.getPath());
            }
            return Arrays.asList(arcFiles);
        } else {
            return new LinkedList<File>();
        }
    }

    /**
     * @return the arcs dir in the our crawl directory.
     */
    public File getArcsDir() {
        return new File(heritrixJobDir, "latest/" + Constants.ARCDIRECTORY_NAME);
    }

    /**
     * @return the warcs dir in the our crawl directory.
     */
    public File getWarcsDir() {
        return new File(heritrixJobDir, "latest/" + Constants.WARCDIRECTORY_NAME);
    }

    /**
     * @return the reports dir in the our crawl directory.
     */
    public File getReportsDir() {
        return new File(heritrixJobDir, "latest/reports");
    }

    /**
     * Get a list of all WARC files that should get ingested. Any open files should be closed with closeOpenFiles first.
     *
     * @return The WARC files that are ready to get ingested.
     * @throws IOFailure if the warcs dir exists but is not a directory, or if it cannot be listed.
     */
    public List<File> getWarcFiles() {
        File warcsdir = getWarcsDir();
        if (warcsdir.exists()) {
            if (!warcsdir.isDirectory()) {
                throw new IOFailure(warcsdir.getPath() + " is not a directory");
            }
            // listFiles() returns null on I/O error or if the dir vanished after the exists() check.
            File[] warcFiles = warcsdir.listFiles(FileUtils.WARCS_FILTER);
            if (warcFiles == null) {
                throw new IOFailure("Error listing files in directory " + warcsdir.getPath());
            }
            return Arrays.asList(warcFiles);
        } else {
            return new LinkedList<File>();
        }
    }

    /**
     * @return the Heritrix 3 job directory of the harvest job being processed.
     */
    public File getHeritrix3JobDir() {
        return this.heritrixJobDir;
    }

    /**
     * Close any ".open" files left by a crashed Heritrix. ARC and/or WARC files ending in .open indicate that Heritrix
     * is still writing to them. If Heritrix has died, we can just rename them before we upload. This must not be done
     * while harvesting is still in progress.
     *
     * @param waitSeconds How many seconds to wait before closing files. This may be done in order to allow Heritrix to
     * finish writing before we close the files.
     */
    public void closeOpenFiles(int waitSeconds) {
        // wait for Heritrix threads to create and close last arc or warc files
        try {
            Thread.sleep(waitSeconds * 1000L);
        } catch (InterruptedException e) {
            log.debug("Thread woken prematurely from sleep.", e);
            // Restore the interrupt status so callers can still observe the interruption.
            Thread.currentThread().interrupt();
        }

        closeOpenFiles(Constants.ARCDIRECTORY_NAME, FileUtils.OPEN_ARCS_FILTER);
        closeOpenFiles(Constants.WARCDIRECTORY_NAME, FileUtils.OPEN_WARCS_FILTER);
    }

    /**
     * Given an archive sub-directory name and a filter to match against this method tries to rename the matched files.
     * Files that can not be renamed generate a log message. The filter should always match files that end with ".open"
     * as a minimum.
     *
     * @param archiveDirName archive directory name, currently "arc" or "warc"
     * @param filter filename filter used to select ".open" files to rename
     */
    protected void closeOpenFiles(String archiveDirName, FilenameFilter filter) {
        File arcsdir = new File(crawlDir, archiveDirName);
        log.debug("Trying to close open archive files in directory {}", arcsdir);
        File[] files = arcsdir.listFiles(filter);
        if (files != null) {
            for (File file : files) {
                final String fname = file.getAbsolutePath();
                // The filter guarantees a trailing ".open"; strip it to obtain the closed-file name.
                File tofile = new File(fname.substring(0, fname.length() - OPEN_SUFFIX.length()));
                if (!file.renameTo(tofile)) {
                    log.warn("Failed to rename '{}' to '{}'", file.getAbsolutePath(), tofile.getAbsolutePath());
                }
            }
        }
    }

    /**
     * Remove any temporary files.
     */
    public void cleanup() {
        log.debug("Removing the directory '{}'", getTmpMetadataDir());
        FileUtils.removeRecursively(getTmpMetadataDir());
        writer = null;
    }

    /**
     * @return the jobID of the harvest job being processed.
     */
    public long getJobId() {
        return this.jobId;
    }

    /**
     * @return the harvestID of the harvest job being processed.
     */
    public long getHarvestID() {
        return this.harvestId;
    }

    /**
     * @return the harvestnamePrefix of the harvest job being processed.
     */
    public String getHarvestnamePrefix() {
        return this.harvestnamePrefix;
    }

    /**
     * @return the crawlDir of the harvest job being processed.
     */
    public File getCrawlDir() {
        return this.crawlDir;
    }

}