001/*
002 * #%L
003 * Netarchivesuite - harvester
004 * %%
005 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
006 *             the National Library of France and the Austrian National Library.
007 * %%
008 * This program is free software: you can redistribute it and/or modify
009 * it under the terms of the GNU Lesser General Public License as
010 * published by the Free Software Foundation, either version 2.1 of the
011 * License, or (at your option) any later version.
012 * 
013 * This program is distributed in the hope that it will be useful,
014 * but WITHOUT ANY WARRANTY; without even the implied warranty of
015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
016 * GNU General Lesser Public License for more details.
017 * 
018 * You should have received a copy of the GNU General Lesser Public
019 * License along with this program.  If not, see
020 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
021 * #L%
022 */
023package dk.netarkivet.harvester.harvesting;
024
025import java.io.File;
026import java.io.FilenameFilter;
027import java.util.Arrays;
028import java.util.LinkedList;
029import java.util.List;
030
031import org.slf4j.Logger;
032import org.slf4j.LoggerFactory;
033
034import dk.netarkivet.common.Constants;
035import dk.netarkivet.common.exceptions.ArgumentNotValid;
036import dk.netarkivet.common.exceptions.IOFailure;
037import dk.netarkivet.common.exceptions.PermissionDenied;
038import dk.netarkivet.common.utils.FileUtils;
039import dk.netarkivet.harvester.harvesting.metadata.MetadataFileWriter;
040
041/**
042 * Encapsulation of files to be ingested into the archive. These files are presently placed subdirectories under the
043 * crawldir.
044 */
045public class IngestableFiles {
046
047    private static final Logger log = LoggerFactory.getLogger(IngestableFiles.class);
048
049    /** Subdir with final metadata file in it. */
050    protected static final String METADATA_SUB_DIR = "metadata";
051
052    /** Subdir with temporary metadata file in it. */
053    private static final String TMP_SUB_DIR = "tmp-meta";
054
055    /** jobId for present harvestjob. */
056    private long jobId;
057
058    /** crawlDir for present harvestjob. */
059    private File crawlDir;
060
061    /**
062     * Writer to this jobs metadatafile. This is closed when the metadata is marked as ready.
063     */
064    private MetadataFileWriter writer = null;
065
066    /** Whether we've had an error in metadata generation. */
067    private boolean error = false;
068
069    private String harvestnamePrefix;
070
071    private Long harvestId;
072
073    /**
074     * Constructor for this class. HeritrixFiles contains information about crawlDir, jobId, and harvestnameprefix for a
075     * specific finished harvestjob.
076     *
077     * @param files An instance of HeritrixFiles
078     * @throws ArgumentNotValid if null-arguments are given; if jobID < 1; if crawlDir does not exist
079     */
080    public IngestableFiles(HeritrixFiles files) {
081        ArgumentNotValid.checkNotNull(files, "files");
082        ArgumentNotValid.checkNotNull(files.getCrawlDir(), "crawlDir");
083        ArgumentNotValid.checkPositive(files.getJobID(), "jobID");
084        ArgumentNotValid.checkNotNullOrEmpty(files.getArchiveFilePrefix(), "harvestnamePrefix");
085        this.crawlDir = files.getCrawlDir();
086        if (!crawlDir.exists()) {
087            throw new ArgumentNotValid("The given crawlDir (" + crawlDir.getAbsolutePath() + ") does not exist");
088        }
089        this.jobId = files.getJobID();
090        this.harvestnamePrefix = files.getArchiveFilePrefix();
091        this.harvestId = files.getHarvestID();
092        // Create subdir 'metadata' if not already exists.
093        FileUtils.createDir(getMetadataDir());
094        // Create/scratch subdir 'tmp-meta'
095        FileUtils.removeRecursively(getTmpMetadataDir());
096        FileUtils.createDir(getTmpMetadataDir());
097    }
098
099    /**
100     * Check, if the metadatafile already exists. If this is true, metadata has been successfully generated. If false,
101     * either metadata has not finished being generated, or there was an error generating them.
102     *
103     * @return true, if it does exist; false otherwise.
104     */
105    public boolean isMetadataReady() {
106        return getMetadataFile().isFile();
107    }
108
109    /**
110     * Return true if the metadata generation process is known to have failed.
111     *
112     * @return True if metadata generation is finished without success, false if generation is still ongoing or has been
113     * successfully done.
114     */
115    public boolean isMetadataFailed() {
116        return error;
117    }
118
119    /**
120     * Marks generated metadata as final, closes the writer, and moves the temporary metadata file to its final
121     * position, if successful.
122     *
123     * @param success True if metadata was successfully generated, false otherwise.
124     * @throws PermissionDenied If the metadata has already been marked as ready, or if no metadata file exists upon
125     * success.
126     * @throws IOFailure if there is an error marking the metadata as ready.
127     */
128    public void setMetadataGenerationSucceeded(boolean success) {
129        if (isMetadataReady()) {
130            throw new PermissionDenied("Metadata file " + getMetadataFile().getAbsolutePath() + " already exists");
131        }
132
133        if (success) {
134            writer.close(); // close writer down
135            if (!getTmpMetadataFile().exists()) {
136                String message = "No metadata was generated despite claims that metadata generation was successfull.";
137                throw new PermissionDenied(message);
138            }
139            getTmpMetadataFile().renameTo(getMetadataFile());
140        } else {
141            error = true;
142        }
143    }
144
145    /**
146     * Get a MetaDatafileWriter for the temporary metadata file. Successive calls to this method on the same object will
147     * return the same writer. Once the metadata have been finalized, calling this method will fail.
148     *
149     * @return a MetaDatafileWriter for the temporary metadata file.
150     * @throws PermissionDenied if metadata generation is already finished.
151     */
152    public MetadataFileWriter getMetadataWriter() {
153        if (isMetadataReady()) {
154            throw new PermissionDenied("Metadata file " + getMetadataFile().getAbsolutePath() + " already exists");
155        }
156        if (isMetadataFailed()) {
157            throw new PermissionDenied("Metadata generation of file " + getMetadataFile().getAbsolutePath()
158                    + " has already failed.");
159        }
160        if (writer == null) {
161            writer = MetadataFileWriter.createWriter(getTmpMetadataFile());
162        }
163        return writer;
164    }
165
166    /**
167     * Gets the files containing the metadata.
168     *
169     * @return the files in the metadata dir
170     * @throws PermissionDenied if the metadata file is not ready, either because generation is still going on or there
171     * was an error generating the metadata.
172     */
173    public List<File> getMetadataArcFiles() {
174        // Our one known metadata file must exist.
175        if (!isMetadataReady()) {
176            throw new PermissionDenied("Metadata file " + getMetadataFile().getAbsolutePath() + " does not exist");
177        }
178        return Arrays.asList(new File[] {getMetadataFile()});
179    }
180
181    /**
182     * Constructs the metadata subdir from the crawlDir.
183     *
184     * @return The metadata subdir as a File
185     */
186    private File getMetadataDir() {
187        return new File(crawlDir, METADATA_SUB_DIR);
188    }
189
190    /**
191     * Constructs the single metadata arc file from the crawlDir and the jobID.
192     *
193     * @return metadata arc file as a File
194     */
195    protected File getMetadataFile() {
196        return new File(getMetadataDir(), MetadataFileWriter.getMetadataArchiveFileName(Long.toString(jobId)));
197    }
198
199    /**
200     * Constructs the TEMPORARY metadata subdir from the crawlDir.
201     *
202     * @return The tmp-metadata subdir as a File
203     */
204    public File getTmpMetadataDir() {
205        return new File(crawlDir, TMP_SUB_DIR);
206    }
207
208    /**
209     * Constructs the TEMPORARY metadata arc file from the crawlDir and the jobID.
210     *
211     * @return tmp-metadata arc file as a File
212     */
213    private File getTmpMetadataFile() {
214        return new File(getTmpMetadataDir(), MetadataFileWriter.getMetadataArchiveFileName(Long.toString(jobId)));
215    }
216
217    /**
218     * Get a list of all ARC files that should get ingested. Any open files should be closed with closeOpenFiles first.
219     *
220     * @return The ARC files that are ready to get ingested.
221     */
222    public List<File> getArcFiles() {
223        File arcsdir = getArcsDir();
224        if (arcsdir.exists()) {
225            if (!arcsdir.isDirectory()) {
226                throw new IOFailure(arcsdir.getPath() + " is not a directory");
227            }
228            return Arrays.asList(arcsdir.listFiles(FileUtils.ARCS_FILTER));
229        } else {
230            return new LinkedList<File>();
231        }
232    }
233
234    /**
235     * @return the arcs dir in the our crawl directory.
236     */
237    public File getArcsDir() {
238        return new File(crawlDir, Constants.ARCDIRECTORY_NAME);
239    }
240
241    /**
242     * @return the warcs dir in the our crawl directory.
243     */
244    public File getWarcsDir() {
245        return new File(crawlDir, Constants.WARCDIRECTORY_NAME);
246    }
247
248    /**
249     * Get a list of all WARC files that should get ingested. Any open files should be closed with closeOpenFiles first.
250     *
251     * @return The WARC files that are ready to get ingested.
252     */
253    public List<File> getWarcFiles() {
254        File warcsdir = getWarcsDir();
255        if (warcsdir.exists()) {
256            if (!warcsdir.isDirectory()) {
257                throw new IOFailure(warcsdir.getPath() + " is not a directory");
258            }
259            return Arrays.asList(warcsdir.listFiles(FileUtils.WARCS_FILTER));
260        } else {
261            return new LinkedList<File>();
262        }
263    }
264
265    /**
266     * Close any ".open" files left by a crashed Heritrix. ARC and/or WARC files ending in .open indicate that Heritrix
267     * is still writing to them. If Heritrix has died, we can just rename them before we upload. This must not be done
268     * while harvesting is still in progress.
269     *
270     * @param waitSeconds How many seconds to wait before closing files. This may be done in order to allow Heritrix to
271     * finish writing before we close the files.
272     */
273    public void closeOpenFiles(int waitSeconds) {
274        // wait for Heritrix threads to create and close last arc or warc files
275        try {
276            Thread.sleep(waitSeconds * 1000L);
277        } catch (InterruptedException e) {
278            log.debug("Thread woken prematurely from sleep.", e);
279        }
280
281        closeOpenFiles(Constants.ARCDIRECTORY_NAME, FileUtils.OPEN_ARCS_FILTER);
282        closeOpenFiles(Constants.WARCDIRECTORY_NAME, FileUtils.OPEN_WARCS_FILTER);
283    }
284
285    /**
286     * Given an archive sub-directory name and a filter to match against this method tries to rename the matched files.
287     * Files that can not be renamed generate a log message. The filter should always match files that end with ".open"
288     * as a minimum.
289     *
290     * @param archiveDirName archive directory name, currently "arc" or "warc"
291     * @param filter filename filter used to select ".open" files to rename
292     */
293    protected void closeOpenFiles(String archiveDirName, FilenameFilter filter) {
294        File arcsdir = new File(crawlDir, archiveDirName);
295        File[] files = arcsdir.listFiles(filter);
296        if (files != null) {
297            for (File file : files) {
298                final String fname = file.getAbsolutePath();
299                // Note: Due to regexp we know filename is at least 5 characters
300                File tofile = new File(fname.substring(0, fname.length() - 5));
301                if (!file.renameTo(tofile)) {
302                    log.warn("Failed to rename '{}' to '{}'", file.getAbsolutePath(), tofile.getAbsolutePath());
303                }
304            }
305        }
306    }
307
308    /**
309     * Remove any temporary files.
310     */
311    public void cleanup() {
312        FileUtils.removeRecursively(getTmpMetadataDir());
313        writer = null;
314    }
315
316    /**
317     * @return the jobID of the harvest job being processed.
318     */
319    public long getJobId() {
320        return this.jobId;
321    }
322
323    /**
324     * @return the harvestID of the harvest job being processed.
325     */
326    public long getHarvestID() {
327        return this.harvestId;
328    }
329
330    /**
331     * @return the harvestnamePrefix of the harvest job being processed.
332     */
333    public String getHarvestnamePrefix() {
334        return this.harvestnamePrefix;
335    }
336
337    /**
338     * @return the crawlDir of the harvest job being processed.
339     */
340    public File getCrawlDir() {
341        return this.crawlDir;
342    }
343
344}