/*
 * #%L
 * Netarchivesuite - harvester
 * %%
 * Copyright (C) 2005 - 2018 The Royal Danish Library,
 *             the National Library of France and the Austrian National Library.
 * %%
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation, either version 2.1 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this program.  If not, see
 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
 * #L%
 */
package dk.netarkivet.harvester.heritrix3;

import java.io.File;
import java.io.FilenameFilter;
import java.util.Arrays;
import java.util.LinkedList;
import java.util.List;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import dk.netarkivet.common.Constants;
import dk.netarkivet.common.exceptions.ArgumentNotValid;
import dk.netarkivet.common.exceptions.IOFailure;
import dk.netarkivet.common.exceptions.IllegalState;
import dk.netarkivet.common.exceptions.PermissionDenied;
import dk.netarkivet.common.utils.FileUtils;
import dk.netarkivet.common.utils.Settings;
import dk.netarkivet.harvester.HarvesterSettings;
import dk.netarkivet.harvester.harvesting.metadata.MetadataFileWriter;

/**
 * Encapsulation of the files to be ingested into the archive for one harvest job. These files are presently placed in
 * subdirectories under the crawldir.
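 *
 * <p>A minimal usage sketch of the expected lifecycle (the variable names are hypothetical, and the
 * {@code Heritrix3Files} instance {@code files} is assumed to be supplied by the surrounding harvest-controller code):
 *
 * <pre>{@code
 * IngestableFiles ingestables = new IngestableFiles(files);
 * MetadataFileWriter mdWriter = ingestables.getMetadataWriter();
 * // ... write metadata records through mdWriter ...
 * ingestables.closeMetadataFile();                  // moves the file from tmp-meta/ to metadata/
 * List<File> metadata = ingestables.getMetadataArcFiles();
 * List<File> warcs = ingestables.getWarcFiles();    // harvested WARCs under the Heritrix job dir
 * }</pre>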
 */
public class IngestableFiles {

    private static final Logger log = LoggerFactory.getLogger(IngestableFiles.class);

    /** Subdir with the final metadata file in it. */
    protected static final String METADATA_SUB_DIR = "metadata";

    /** Subdir with the temporary metadata file in it. */
    private static final String TMP_SUB_DIR = "tmp-meta";

    /** The jobId of the current harvest job. */
    private long jobId;

    /** The crawlDir of the current harvest job. */
    private File crawlDir;

    /**
     * Writer to this job's metadata file. This is closed when the metadata is marked as ready.
     */
    private MetadataFileWriter writer = null;

    /** Whether we've had an error in metadata generation. */
    private boolean error = false;

    /** The prefix used for archive files belonging to this harvest job. */
    private String harvestnamePrefix;

    /** The filename format used for metadata files, as configured in the settings. */
    public static final String METADATA_FILENAME_FORMAT = Settings.get(HarvesterSettings.METADATA_FILENAME_FORMAT);

    /** The harvestId of the current harvest job. */
    private Long harvestId;

    /** The Heritrix3 job directory belonging to this harvest job. */
    private File heritrixJobDir;

    /**
     * Constructor for this class. The given Heritrix3Files instance contains information about crawlDir, jobId, and
     * harvestnamePrefix for a specific finished harvest job.
     *
     * @param files An instance of Heritrix3Files
     * @throws ArgumentNotValid if null arguments are given, if jobID &lt; 1, or if crawlDir does not exist
     */
    public IngestableFiles(Heritrix3Files files) {
        ArgumentNotValid.checkNotNull(files, "files");
        ArgumentNotValid.checkNotNull(files.getCrawlDir(), "crawlDir");
        ArgumentNotValid.checkPositive(files.getJobID(), "jobID");
        ArgumentNotValid.checkNotNullOrEmpty(files.getArchiveFilePrefix(), "harvestnamePrefix");
        this.heritrixJobDir = files.getHeritrixJobDir();
        this.crawlDir = files.getCrawlDir();
        if (!crawlDir.exists()) {
            throw new ArgumentNotValid("The given crawlDir (" + crawlDir.getAbsolutePath() + ") does not exist");
        }
        this.jobId = files.getJobID();
        this.harvestnamePrefix = files.getArchiveFilePrefix();
        this.harvestId = files.getHarvestID();
        // Create subdir 'metadata' if it does not already exist.
        FileUtils.createDir(getMetadataDir());
        // Create subdir 'tmp-meta', wiping any leftovers from an earlier run.
        if (getTmpMetadataDir().isDirectory()) {
            FileUtils.removeRecursively(getTmpMetadataDir());
            log.warn("Removed directory {} with contents", getTmpMetadataDir());
        }
        FileUtils.createDir(getTmpMetadataDir());
    }

    /**
     * Check if the metadata file already exists. If it does, metadata has been successfully generated; if not, either
     * metadata generation has not finished, or there was an error generating it.
     *
     * @return true if the metadata file exists; false otherwise.
     */
    public boolean isMetadataReady() {
        return getMetadataFile().isFile();
    }

    /**
     * Return true if the metadata generation process is known to have failed.
     *
     * @return true if metadata generation finished without success; false if generation is still ongoing or completed
     * successfully.
     */
    public boolean isMetadataFailed() {
        return error;
    }

    /**
     * Marks the generated metadata as final, closes the writer, and moves the temporary metadata file to its final
     * position.
     *
     * @throws PermissionDenied If the metadata has already been marked as ready, or if no temporary metadata file
     * exists to be moved.
     * @throws IOFailure if the temporary metadata file cannot be moved to its final position.
     */
    public void closeMetadataFile() {
        if (isMetadataReady()) {
            throw new PermissionDenied("Metadata file " + getMetadataFile().getAbsolutePath() + " already exists");
        }
        if (writer != null) {
            writer.close(); // close writer down
        }
        if (!getTmpMetadataFile().exists()) {
            String message = "No metadata was generated despite claims that metadata generation was successful.";
            throw new PermissionDenied(message);
        }
        if (!getTmpMetadataFile().renameTo(getMetadataFile())) {
            throw new IOFailure("Failed to rename '" + getTmpMetadataFile().getAbsolutePath() + "' to '"
                    + getMetadataFile().getAbsolutePath() + "'");
        }
    }

    /**
     * Set the error state.
     *
     * @param isError true if metadata generation failed, otherwise false.
     */
    public void setErrorState(boolean isError) {
        error = isError;
    }

    /**
     * Get a MetadataFileWriter for the temporary metadata file. Successive calls to this method on the same object
     * will return the same writer. Once the metadata has been finalized, calling this method will fail.
     *
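     * <p>A minimal sketch of the intended call sequence, including the error path (the actual record-writing calls
     * are elided, since they depend on the concrete {@link MetadataFileWriter} subclass in use):
     *
     * <pre>{@code
     * try {
     *     MetadataFileWriter mdWriter = ingestables.getMetadataWriter();
     *     // ... write metadata records through mdWriter ...
     *     ingestables.closeMetadataFile();
     * } catch (IOFailure e) {
     *     ingestables.setErrorState(true); // subsequent calls to getMetadataWriter() now fail fast
     * }
     * }</pre>
     *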
     * @return a MetadataFileWriter for the temporary metadata file.
     * @throws PermissionDenied if metadata generation has already finished or failed.
     */
    public MetadataFileWriter getMetadataWriter() {
        if (isMetadataReady()) {
            throw new PermissionDenied("Metadata file " + getMetadataFile().getAbsolutePath() + " already exists");
        }
        if (isMetadataFailed()) {
            throw new PermissionDenied("Metadata generation of file " + getMetadataFile().getAbsolutePath()
                    + " has already failed.");
        }
        if (writer == null) {
            writer = MetadataFileWriter.createWriter(getTmpMetadataFile());
        }
        return writer;
    }

    /**
     * Gets the files containing the metadata.
     *
     * @return the files in the metadata dir
     * @throws IllegalState if the metadata file is not ready, either because generation is still ongoing or because
     * there was an error generating the metadata.
     */
    public List<File> getMetadataArcFiles() {
        // Our one known metadata file must exist.
        if (!isMetadataReady()) {
            throw new IllegalState("Metadata file " + getMetadataFile().getAbsolutePath() + " does not exist");
        }
        return Arrays.asList(getMetadataFile());
    }

    /**
     * Constructs the metadata subdir from the crawlDir.
     *
     * @return The metadata subdir as a File
     */
    private File getMetadataDir() {
        return new File(crawlDir, METADATA_SUB_DIR);
    }

    /**
     * Constructs the single metadata archive file from the metadata dir, the jobID, and the harvestID.
     *
     * @return the metadata archive file as a File
     */
    protected File getMetadataFile() {
        return new File(getMetadataDir(), MetadataFileWriter.getMetadataArchiveFileName(Long.toString(jobId), harvestId));
    }

    /**
     * Constructs the TEMPORARY metadata subdir from the crawlDir.
     *
     * @return The tmp-metadata subdir as a File
     */
    public File getTmpMetadataDir() {
        return new File(crawlDir, TMP_SUB_DIR);
    }

    /**
     * Constructs the TEMPORARY metadata archive file from the tmp-metadata dir, the jobID, and the harvestID.
     *
     * @return the temporary metadata archive file as a File
     */
    private File getTmpMetadataFile() {
        return new File(getTmpMetadataDir(), MetadataFileWriter.getMetadataArchiveFileName(Long.toString(jobId), harvestId));
    }

    /**
     * Get a list of all ARC files that should get ingested. Any open files should be closed with closeOpenFiles first.
     *
     * @return The ARC files that are ready to get ingested.
     */
    public List<File> getArcFiles() {
        File arcsdir = getArcsDir();
        if (arcsdir.exists()) {
            if (!arcsdir.isDirectory()) {
                throw new IOFailure(arcsdir.getPath() + " is not a directory");
            }
            File[] arcFiles = arcsdir.listFiles(FileUtils.ARCS_FILTER);
            if (arcFiles == null) {
                throw new IOFailure("Failed to list the contents of " + arcsdir.getPath());
            }
            return Arrays.asList(arcFiles);
        } else {
            return new LinkedList<File>();
        }
    }

    /**
     * @return the arcs dir under the Heritrix job directory.
     */
    public File getArcsDir() {
        return new File(heritrixJobDir, "latest/" + Constants.ARCDIRECTORY_NAME);
    }

    /**
     * @return the warcs dir under the Heritrix job directory.
     */
    public File getWarcsDir() {
        return new File(heritrixJobDir, "latest/" + Constants.WARCDIRECTORY_NAME);
    }

    /**
     * @return the reports dir under the Heritrix job directory.
     */
    public File getReportsDir() {
        return new File(heritrixJobDir, "latest/reports");
    }

    /**
     * Get a list of all WARC files that should get ingested. Any open files should be closed with closeOpenFiles
     * first.
     *
     * @return The WARC files that are ready to get ingested.
     */
    public List<File> getWarcFiles() {
        File warcsdir = getWarcsDir();
        if (warcsdir.exists()) {
            if (!warcsdir.isDirectory()) {
                throw new IOFailure(warcsdir.getPath() + " is not a directory");
            }
            File[] warcFiles = warcsdir.listFiles(FileUtils.WARCS_FILTER);
            if (warcFiles == null) {
                throw new IOFailure("Failed to list the contents of " + warcsdir.getPath());
            }
            return Arrays.asList(warcFiles);
        } else {
            return new LinkedList<File>();
        }
    }

    /**
     * @return the Heritrix3 job directory for this harvest job.
     */
    public File getHeritrix3JobDir() {
        return this.heritrixJobDir;
    }

    /**
     * Close any ".open" files left by a crashed Heritrix. ARC and/or WARC files ending in ".open" indicate that
     * Heritrix is still writing to them. If Heritrix has died, we can just rename them before we upload. This must not
     * be done while harvesting is still in progress.
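     *
     * <p>A minimal sketch of the post-crawl sequence (the five-second wait is an arbitrary example value):
     *
     * <pre>{@code
     * ingestables.closeOpenFiles(5);                 // rename any leftover *.open files
     * List<File> warcs = ingestables.getWarcFiles(); // now safe to collect for upload
     * }</pre>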
     *
     * @param waitSeconds How many seconds to wait before closing the files. This may be done in order to allow
     * Heritrix to finish writing before we close them.
     */
    public void closeOpenFiles(int waitSeconds) {
        // Wait for the Heritrix threads to create and close the last arc or warc files.
        try {
            Thread.sleep(waitSeconds * 1000L);
        } catch (InterruptedException e) {
            log.debug("Thread woken prematurely from sleep.", e);
        }

        closeOpenFiles(Constants.ARCDIRECTORY_NAME, FileUtils.OPEN_ARCS_FILTER);
        closeOpenFiles(Constants.WARCDIRECTORY_NAME, FileUtils.OPEN_WARCS_FILTER);
    }

    /**
     * Given an archive sub-directory name and a filename filter, this method tries to rename the matched files by
     * stripping their ".open" suffix. Files that cannot be renamed generate a log message. As a minimum, the filter
     * should only match files that end with ".open".
     *
     * @param archiveDirName archive directory name, currently "arc" or "warc"
     * @param filter filename filter used to select ".open" files to rename
     */
    protected void closeOpenFiles(String archiveDirName, FilenameFilter filter) {
        File arcsdir = new File(crawlDir, archiveDirName);
        log.debug("Trying to close open archive files in directory {}", arcsdir);
        File[] files = arcsdir.listFiles(filter);
        if (files != null) {
            for (File file : files) {
                final String fname = file.getAbsolutePath();
                // Strip the 5-character ".open" suffix; the filter guarantees the filename ends with it.
                File tofile = new File(fname.substring(0, fname.length() - 5));
                if (!file.renameTo(tofile)) {
                    log.warn("Failed to rename '{}' to '{}'", file.getAbsolutePath(), tofile.getAbsolutePath());
                }
            }
        }
    }

    /**
     * Remove any temporary files.
     */
    public void cleanup() {
        log.debug("Removing the directory '{}'", getTmpMetadataDir());
        FileUtils.removeRecursively(getTmpMetadataDir());
        writer = null;
    }

    /**
     * @return the jobID of the harvest job being processed.
     */
    public long getJobId() {
        return this.jobId;
    }

    /**
     * @return the harvestID of the harvest job being processed.
     */
    public long getHarvestID() {
        return this.harvestId;
    }

    /**
     * @return the harvestnamePrefix of the harvest job being processed.
     */
    public String getHarvestnamePrefix() {
        return this.harvestnamePrefix;
    }

    /**
     * @return the crawlDir of the harvest job being processed.
     */
    public File getCrawlDir() {
        return this.crawlDir;
    }

}