001/*
002 * #%L
003 * Netarchivesuite - harvester
004 * %%
005 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
006 *             the National Library of France and the Austrian National Library.
007 * %%
008 * This program is free software: you can redistribute it and/or modify
009 * it under the terms of the GNU Lesser General Public License as
010 * published by the Free Software Foundation, either version 2.1 of the
011 * License, or (at your option) any later version.
012 * 
013 * This program is distributed in the hope that it will be useful,
014 * but WITHOUT ANY WARRANTY; without even the implied warranty of
015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
016 * GNU General Lesser Public License for more details.
017 * 
018 * You should have received a copy of the GNU General Lesser Public
019 * License along with this program.  If not, see
020 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
021 * #L%
022 */
023package dk.netarkivet.harvester.harvesting.metadata;
024
025import java.io.File;
026import java.io.FilenameFilter;
027import java.io.InputStream;
028import java.net.URI;
029import java.net.URISyntaxException;
030
031import org.slf4j.Logger;
032import org.slf4j.LoggerFactory;
033
034import dk.netarkivet.common.CommonSettings;
035import dk.netarkivet.common.exceptions.ArgumentNotValid;
036import dk.netarkivet.common.exceptions.IOFailure;
037import dk.netarkivet.common.exceptions.IllegalState;
038import dk.netarkivet.common.exceptions.UnknownID;
039import dk.netarkivet.common.utils.FileUtils;
040import dk.netarkivet.common.utils.Settings;
041import dk.netarkivet.harvester.HarvesterSettings;
042
043/**
044 * Abstract base class for Metadata file writer. Implementations must extend this class.
045 *
046 * @author nicl
047 */
048public abstract class MetadataFileWriter {
049
050    /** Logging output place. */
051    private static final Logger log = LoggerFactory.getLogger(MetadataFileWriter.class);
052
053    /** Constant representing the ARC format. */
054    public static final int MDF_ARC = 1;
055    /** Constant representing the WARC format. */
056    public static final int MDF_WARC = 2;
057    /** Constant representing the metadata Format. Recognized formats are either MDF_ARC or MDF_WARC */
058    protected static int metadataFormat = 0;
059
060    /** Constants used in constructing URI for CDX content. */
061    protected static final String CDX_URI_SCHEME = "metadata";
062    private static final String CDX_URI_AUTHORITY_HOST = Settings.get(CommonSettings.ORGANIZATION);
063    private static final String CDX_URI_PATH = "/crawl/index/cdx";
064    private static final String CDX_URI_VERSION_PARAMETERS = "majorversion=2&minorversion=0";
065    private static final String ALTERNATE_CDX_URI_VERSION_PARAMETERS = "majorversion=3&minorversion=0";
066
067    private static final String CDX_URI_HARVEST_ID_PARAMETER_NAME = "harvestid";
068    private static final String CDX_URI_JOB_ID_PARAMETER_NAME = "jobid";
069    private static final String CDX_URI_FILENAME_PARAMETER_NAME = "filename";
070
071    /**
072     * Initialize the used metadata format from settings.
073     */
074    protected static synchronized void initializeMetadataFormat() {
075        String metadataFormatSetting = Settings.get(HarvesterSettings.METADATA_FORMAT);
076        if ("arc".equalsIgnoreCase(metadataFormatSetting)) {
077            metadataFormat = MDF_ARC;
078        } else if ("warc".equalsIgnoreCase(metadataFormatSetting)) {
079            metadataFormat = MDF_WARC;
080        } else {
081            throw new ArgumentNotValid("Configuration of '" + HarvesterSettings.METADATA_FORMAT + "' is invalid! "
082                    + "Unrecognized format '" + metadataFormatSetting + "'.");
083        }
084    }
085
086    /**
087     * Generates a name for an archive(ARC/WARC) file containing metadata regarding a given job.
088     *
089     * @param jobID The number of the job that generated the archive file.
090     * @return A "flat" file name (i.e. no path) containing the jobID parameter and ending on "-metadata-N.(w)arc",
091     * where N is the serial number of the metadata files for this job, e.g. "42-metadata-1.(w)arc". Currently, only one
092     * file is ever made.
093     * @throws ArgumentNotValid if any parameter was null.
094     */
095    public static String getMetadataArchiveFileName(String jobID, Long harvestID) throws ArgumentNotValid {
096        ArgumentNotValid.checkNotNull(jobID, "jobID");
097        //retrieving the collectionName
098        String collectionName = "";
099        boolean isPrefix = false;
100        //try to retrieve settings for prefixing or not metadata files
101        String metadataFilenameFormat = "";
102        try {
103                metadataFilenameFormat = Settings.get(HarvesterSettings.METADATA_FILENAME_FORMAT);
104        } catch (UnknownID e) {
105                //nothing
106        }
107        if("prefix".equals(metadataFilenameFormat)) {
108            try {
109                //try to retrieve in both <heritrix> and <heritrix3> tags
110                collectionName = Settings.get(HarvesterSettings.HERITRIX_PREFIX_COLLECTION_NAME);
111                isPrefix = true;
112            } catch(UnknownID e) {
113                //nothing
114            }
115                }
116        if (metadataFormat == 0) {
117            initializeMetadataFormat();
118        }
119        boolean compressionOn = compressRecords();
120        String possibleGzSuffix = "";
121        if (compressionOn) {
122            possibleGzSuffix = ".gz";
123        }
124        int versionNumber = Settings.getInt(HarvesterSettings.METADATA_FILE_VERSION_NUMBER);
125        switch (metadataFormat) {
126        case MDF_ARC:
127            if(isPrefix) {
128                return collectionName + "-" + jobID + "-" + harvestID + "-metadata-" + versionNumber + ".arc" + possibleGzSuffix;
129            } else {
130                return jobID + "-metadata-" + versionNumber + ".arc" + possibleGzSuffix;
131            }
132        case MDF_WARC:
133            if(isPrefix) {
134                return collectionName + "-" + jobID + "-" + harvestID + "-metadata-" + versionNumber + ".warc" + possibleGzSuffix;
135            } else {
136                return jobID + "-metadata-" + versionNumber + ".warc" + possibleGzSuffix;
137            }
138        default:
139            throw new ArgumentNotValid("Configuration of '" + HarvesterSettings.METADATA_FORMAT + "' is invalid!");
140        }
141    }
142
143    /**
144     * Create a writer that writes data to the given archive file.
145     *
146     * @param metadataArchiveFile The archive file to write to.
147     * @return a writer that writes data to the given archive file.
148     */
149    public static MetadataFileWriter createWriter(File metadataArchiveFile) {
150        if (metadataFormat == 0) {
151            initializeMetadataFormat();
152        }
153        switch (metadataFormat) {
154        case MDF_ARC:
155            return MetadataFileWriterArc.createWriter(metadataArchiveFile);
156        case MDF_WARC:
157            return MetadataFileWriterWarc.createWriter(metadataArchiveFile);
158        default:
159            throw new ArgumentNotValid("Configuration of '" + HarvesterSettings.METADATA_FORMAT + "' is invalid!");
160        }
161    }
162
163    /**
164     * Close the metadatafile Writer.
165     */
166    public abstract void close();
167
168    /**
169     * @return the finished metadataFile
170     */
171    public abstract File getFile();
172
173    /**
174     * Write the given file to the metadata file.
175     *
176     * @param file A given file with metadata to write to the metadata archive file.
177     * @param uri The uri associated with the piece of metadata
178     * @param mime The mimetype associated with the piece of metadata
179     */
180    public abstract void writeFileTo(File file, String uri, String mime);
181
182    /**
183     * Writes a File to an ARCWriter, if available, otherwise logs the failure to the class-logger.
184     *
185     * @param fileToArchive the File to archive
186     * @param URL the URL with which it is stored in the arcfile
187     * @param mimetype The mimetype of the File-contents
188     * @return true, if file exists, and is written to the arcfile.
189     */
190    public abstract boolean writeTo(File fileToArchive, String URL, String mimetype);
191
192    /**
193     * Write a record to the archive file.
194     *
195     * @param uri record URI
196     * @param contentType content-type of record
197     * @param hostIP resource ip-address
198     * @param fetchBeginTimeStamp record datetime
199     * @param payload A byte array containing the payload
200     * @see org.archive.io.arc.ARCWriter#write(String uri, String contentType, String hostIP, long fetchBeginTimeStamp,
201     * long recordLength, InputStream in)
202     */
203    public abstract void write(String uri, String contentType, String hostIP, long fetchBeginTimeStamp, byte[] payload)
204            throws java.io.IOException;
205
206    /**
207     * Append the files contained in the directory to the metadata archive file, but only if the filename matches the
208     * supplied filter.
209     *
210     * @param parentDir directory containing the files to append to metadata
211     * @param filter filter describing which files to accept and which to ignore
212     * @param mimetype The content-type to write along with the files in the metadata output
213     * @param harvestId The harvestId of the harvest
214     * @param jobId The jobId of the harvest 
215     */
216    public void insertFiles(File parentDir, FilenameFilter filter, String mimetype, long harvestId, long jobId) {
217        // For each metadata source file in the parentDir that matches the filter ..
218        File[] metadataSourceFiles = parentDir.listFiles(filter);
219        log.debug("Now inserting " + metadataSourceFiles.length + " files from " + parentDir.getAbsolutePath() + "'.");
220        for (File metadataSourceFile : metadataSourceFiles) {
221            // ...write its content to the MetadataFileWriter
222            log.debug("Inserting the file '{}'", metadataSourceFile.getAbsolutePath());
223            writeFileTo(metadataSourceFile, getURIforFileName(metadataSourceFile, harvestId, jobId).toASCIIString(), mimetype);
224            // ...and delete it afterwards
225            try {
226                FileUtils.remove(metadataSourceFile);
227            } catch (IOFailure e) {
228                log.warn("Couldn't delete file '{}' after adding to metadata archive file, ignoring.",
229                        metadataSourceFile.getAbsolutePath(), e);
230            }
231        }
232    }
233
234    /**
235     * Parses the name of the given file and generates a URI representation of it.
236     *
237     * @param cdx A CDX file.
238     * @param harvestID The harvestId of the harvest
239     * @param jobId     The jobId of the harvest
240     * @return A URI appropriate for identifying the file's content in Netarkivet
241     * @throws UnknownID if something goes terribly wrong in the CDX URI construction
242     */
243    private static URI getURIforFileName(File cdx, long harvestId, long jobId) throws UnknownID {
244        String extensionToRemove = FileUtils.CDX_EXTENSION;
245        String filename = cdx.getName();
246        if (!filename.endsWith(extensionToRemove)) {
247            throw new IllegalState("Filename '" + cdx.getAbsolutePath() + "' has unexpected extension");
248        }
249        int suffix_index = cdx.getName().indexOf(extensionToRemove);
250        filename = filename.substring(0, suffix_index);
251        return getCDXURI("" + harvestId, "" + jobId, filename);
252    }
253
254    /**
255     * Reset the metadata format. Should only be used by a unittest.
256     */
257    public static void resetMetadataFormat() {
258        metadataFormat = 0;
259    }
260    
261    
262    /**
263     * Generates a URI identifying CDX info for one harvested (W)ARC file. In Netarkivet, all of the parameters below
264     * are in the (W)ARC file's name.
265     *
266     * @param harvestID The number of the harvest that generated the (W)ARC file.
267     * @param jobID The number of the job that generated the (W)ARC file.
268     * @param filename The name of the ARC or WARC file behind the cdx-data
269     * @return A URI in the proprietary schema "metadata".
270     * @throws ArgumentNotValid if any parameter is null.
271     * @throws UnknownID if something goes terribly wrong in our URI construction.
272     */
273    public static URI getCDXURI(String harvestID, String jobID, String filename) throws ArgumentNotValid, UnknownID {
274        ArgumentNotValid.checkNotNull(harvestID, "harvestID");
275        ArgumentNotValid.checkNotNull(jobID, "jobID");
276        ArgumentNotValid.checkNotNull(filename, "filename");
277        URI result;
278        try {
279            result = new URI(CDX_URI_SCHEME, null, // Don't include user info (e.g. "foo@")
280                    CDX_URI_AUTHORITY_HOST, -1, // Don't include port no. (e.g. ":8080")
281                    CDX_URI_PATH, getCDXURIQuery(harvestID, jobID, filename), null); // Don't include fragment (e.g.
282            // "#foo")
283        } catch (URISyntaxException e) {
284            throw new UnknownID("Failed to generate URI for " + harvestID + "," + jobID + "," + filename + ",", e);
285        }
286        return result;
287    }
288    
289    /**
290     * Generates a URI identifying CDX info for one harvested ARC file.
291     *
292     * @param jobID The number of the job that generated the ARC file.
293     * @param filename the filename.
294     * @return A URI in the proprietary schema "metadata".
295     * @throws ArgumentNotValid if any parameter is null.
296     * @throws UnknownID if something goes terribly wrong in our URI construction.
297     */
298    public static URI getAlternateCDXURI(long jobID, String filename) throws ArgumentNotValid, UnknownID {
299        ArgumentNotValid.checkNotNull(jobID, "jobID");
300        ArgumentNotValid.checkNotNull(filename, "filename");
301        URI result;
302        try {
303            result = new URI(CDX_URI_SCHEME, null, // Don't include user info (e.g. "foo@")
304                    CDX_URI_AUTHORITY_HOST, -1, // Don't include port no. (e.g. ":8080")
305                    CDX_URI_PATH, getAlternateCDXURIQuery(jobID, filename), null); // Don't include fragment (e.g.
306            // "#foo")
307        } catch (URISyntaxException e) {
308            throw new UnknownID("Failed to generate URI for " + jobID + "," + filename + ",", e);
309        }
310        return result;
311    }
312
313    /**
314     * Generate the query part of a CDX URI.
315     *
316     * @param harvestID The number of the harvest that generated the ARC file.
317     * @param jobID The number of the job that generated the ARC file.
318     * @param filename The name of the ARC file.
319     * @return An appropriate list of assigned parameters, separated by the "&" character.
320     */
321    private static String getCDXURIQuery(String harvestID, String jobID, String filename) {
322        String result = CDX_URI_VERSION_PARAMETERS;
323        result += "&" + CDX_URI_HARVEST_ID_PARAMETER_NAME + "=" + harvestID;
324        result += "&" + CDX_URI_JOB_ID_PARAMETER_NAME + "=" + jobID;
325        result += "&" + CDX_URI_FILENAME_PARAMETER_NAME + "=" + filename;
326
327        return result;
328    }
329
330    /**
331     * Generate the query part of a CDX URI. Alternate version
332     *
333     * @param jobID The number of the job that generated the (W)ARC file.
334     * @param filename the filename of the archive file
335     * @return An appropriate list of assigned parameters, separated by the "&" character.
336     */
337    private static String getAlternateCDXURIQuery(long jobID, String filename) {
338        String result = ALTERNATE_CDX_URI_VERSION_PARAMETERS;
339        result += "&" + CDX_URI_JOB_ID_PARAMETER_NAME + "=" + jobID;
340        result += "&" + CDX_URI_FILENAME_PARAMETER_NAME + "=" + filename;
341        return result;
342    }
343    
344    /**
345     * @return true, if we want to compress out metadata records, false, if not
346     */
347    public static boolean compressRecords() {
348        return Settings.getBoolean(HarvesterSettings.METADATA_COMPRESSION);
349    }
350    
351}