/*
 * #%L
 * Netarchivesuite - harvester
 * %%
 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
 * the National Library of France and the Austrian National Library.
 * %%
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation, either version 2.1 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Lesser Public License for more details.
 *
 * You should have received a copy of the GNU General Lesser Public
 * License along with this program. If not, see
 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
 * #L%
 */
package dk.netarkivet.harvester.harvesting.metadata;

import java.io.File;
import java.io.FilenameFilter;
import java.io.InputStream;
import java.net.URI;
import java.net.URISyntaxException;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import dk.netarkivet.common.CommonSettings;
import dk.netarkivet.common.exceptions.ArgumentNotValid;
import dk.netarkivet.common.exceptions.IOFailure;
import dk.netarkivet.common.exceptions.IllegalState;
import dk.netarkivet.common.exceptions.UnknownID;
import dk.netarkivet.common.utils.FileUtils;
import dk.netarkivet.common.utils.Settings;
import dk.netarkivet.harvester.HarvesterSettings;

/**
 * Abstract base class for Metadata file writer. Implementations must extend this class.
 *
 * @author nicl
 */
public abstract class MetadataFileWriter {

    /** Logging output place. */
    private static final Logger log = LoggerFactory.getLogger(MetadataFileWriter.class);

    /** Constant representing the ARC format. */
    public static final int MDF_ARC = 1;
    /** Constant representing the WARC format. */
    public static final int MDF_WARC = 2;
    /** Constant representing the metadata Format. Recognized formats are either MDF_ARC or MDF_WARC */
    protected static int metadataFormat = 0;

    /** Constants used in constructing URI for CDX content. */
    protected static final String CDX_URI_SCHEME = "metadata";
    private static final String CDX_URI_AUTHORITY_HOST = Settings.get(CommonSettings.ORGANIZATION);
    private static final String CDX_URI_PATH = "/crawl/index/cdx";
    private static final String CDX_URI_VERSION_PARAMETERS = "majorversion=2&minorversion=0";
    private static final String ALTERNATE_CDX_URI_VERSION_PARAMETERS = "majorversion=3&minorversion=0";

    private static final String CDX_URI_HARVEST_ID_PARAMETER_NAME = "harvestid";
    private static final String CDX_URI_JOB_ID_PARAMETER_NAME = "jobid";
    private static final String CDX_URI_FILENAME_PARAMETER_NAME = "filename";

    /**
     * Initialize the used metadata format from settings.
     *
     * @throws ArgumentNotValid if the configured format is neither "arc" nor "warc".
     */
    protected static synchronized void initializeMetadataFormat() {
        String metadataFormatSetting = Settings.get(HarvesterSettings.METADATA_FORMAT);
        if ("arc".equalsIgnoreCase(metadataFormatSetting)) {
            metadataFormat = MDF_ARC;
        } else if ("warc".equalsIgnoreCase(metadataFormatSetting)) {
            metadataFormat = MDF_WARC;
        } else {
            throw new ArgumentNotValid("Configuration of '" + HarvesterSettings.METADATA_FORMAT + "' is invalid! "
                    + "Unrecognized format '" + metadataFormatSetting + "'.");
        }
    }

    /**
     * Generates a name for an archive(ARC/WARC) file containing metadata regarding a given job.
     *
     * @param jobID The number of the job that generated the archive file.
     * @param harvestID The number of the harvest that generated the archive file; only used in the filename when
     * prefixing with a collection name is configured.
     * @return A "flat" file name (i.e. no path) containing the jobID parameter and ending on "-metadata-N.(w)arc",
     * where N is the serial number of the metadata files for this job, e.g. "42-metadata-1.(w)arc". Currently, only one
     * file is ever made.
     * @throws ArgumentNotValid if jobID is null, or if the configured metadata format is unrecognized.
     */
    public static String getMetadataArchiveFileName(String jobID, Long harvestID) throws ArgumentNotValid {
        ArgumentNotValid.checkNotNull(jobID, "jobID");
        // Retrieve the collection name, if metadata filenames are configured to be prefixed with it.
        String collectionName = "";
        boolean isPrefix = false;
        String metadataFilenameFormat = "";
        try {
            metadataFilenameFormat = Settings.get(HarvesterSettings.METADATA_FILENAME_FORMAT);
        } catch (UnknownID e) {
            // Setting absent: fall back to the default (unprefixed) format.
        }
        if ("prefix".equals(metadataFilenameFormat)) {
            try {
                // Try to retrieve in both <heritrix> and <heritrix3> tags.
                collectionName = Settings.get(HarvesterSettings.HERITRIX_PREFIX_COLLECTION_NAME);
                isPrefix = true;
            } catch (UnknownID e) {
                // No collection name configured: keep the unprefixed format.
            }
        }
        if (metadataFormat == 0) {
            initializeMetadataFormat();
        }
        boolean compressionOn = compressRecords();
        String possibleGzSuffix = "";
        if (compressionOn) {
            possibleGzSuffix = ".gz";
        }
        int versionNumber = Settings.getInt(HarvesterSettings.METADATA_FILE_VERSION_NUMBER);
        switch (metadataFormat) {
        case MDF_ARC:
            if (isPrefix) {
                return collectionName + "-" + jobID + "-" + harvestID + "-metadata-" + versionNumber + ".arc" + possibleGzSuffix;
            } else {
                return jobID + "-metadata-" + versionNumber + ".arc" + possibleGzSuffix;
            }
        case MDF_WARC:
            if (isPrefix) {
                return collectionName + "-" + jobID + "-" + harvestID + "-metadata-" + versionNumber + ".warc" + possibleGzSuffix;
            } else {
                return jobID + "-metadata-" + versionNumber + ".warc" + possibleGzSuffix;
            }
        default:
            throw new ArgumentNotValid("Configuration of '" + HarvesterSettings.METADATA_FORMAT + "' is invalid!");
        }
    }

    /**
     * Create a writer that writes data to the given archive file.
     *
     * @param metadataArchiveFile The archive file to write to.
     * @return a writer that writes data to the given archive file.
     * @throws ArgumentNotValid if the configured metadata format is unrecognized.
     */
    public static MetadataFileWriter createWriter(File metadataArchiveFile) {
        if (metadataFormat == 0) {
            initializeMetadataFormat();
        }
        switch (metadataFormat) {
        case MDF_ARC:
            return MetadataFileWriterArc.createWriter(metadataArchiveFile);
        case MDF_WARC:
            return MetadataFileWriterWarc.createWriter(metadataArchiveFile);
        default:
            throw new ArgumentNotValid("Configuration of '" + HarvesterSettings.METADATA_FORMAT + "' is invalid!");
        }
    }

    /**
     * Close the metadatafile Writer.
     */
    public abstract void close();

    /**
     * @return the finished metadataFile
     */
    public abstract File getFile();

    /**
     * Write the given file to the metadata file.
     *
     * @param file A given file with metadata to write to the metadata archive file.
     * @param uri The uri associated with the piece of metadata
     * @param mime The mimetype associated with the piece of metadata
     */
    public abstract void writeFileTo(File file, String uri, String mime);

    /**
     * Writes a File to an ARCWriter, if available, otherwise logs the failure to the class-logger.
     *
     * @param fileToArchive the File to archive
     * @param URL the URL with which it is stored in the arcfile
     * @param mimetype The mimetype of the File-contents
     * @return true, if file exists, and is written to the arcfile.
     */
    public abstract boolean writeTo(File fileToArchive, String URL, String mimetype);

    /**
     * Write a record to the archive file.
     *
     * @param uri record URI
     * @param contentType content-type of record
     * @param hostIP resource ip-address
     * @param fetchBeginTimeStamp record datetime
     * @param payload A byte array containing the payload
     * @see org.archive.io.arc.ARCWriter#write(String uri, String contentType, String hostIP, long fetchBeginTimeStamp,
     * long recordLength, InputStream in)
     */
    public abstract void write(String uri, String contentType, String hostIP, long fetchBeginTimeStamp, byte[] payload)
            throws java.io.IOException;

    /**
     * Append the files contained in the directory to the metadata archive file, but only if the filename matches the
     * supplied filter. Each appended source file is deleted afterwards (deletion failures are logged and ignored).
     *
     * @param parentDir directory containing the files to append to metadata
     * @param filter filter describing which files to accept and which to ignore
     * @param mimetype The content-type to write along with the files in the metadata output
     * @param harvestId The harvestId of the harvest
     * @param jobId The jobId of the harvest
     */
    public void insertFiles(File parentDir, FilenameFilter filter, String mimetype, long harvestId, long jobId) {
        // For each metadata source file in the parentDir that matches the filter ..
        File[] metadataSourceFiles = parentDir.listFiles(filter);
        // listFiles() returns null if parentDir does not denote a directory or an I/O error occurs.
        if (metadataSourceFiles == null) {
            log.warn("Could not list files in '{}'; no metadata files inserted.", parentDir.getAbsolutePath());
            return;
        }
        log.debug("Now inserting {} files from '{}'.", metadataSourceFiles.length, parentDir.getAbsolutePath());
        for (File metadataSourceFile : metadataSourceFiles) {
            // ...write its content to the MetadataFileWriter
            log.debug("Inserting the file '{}'", metadataSourceFile.getAbsolutePath());
            writeFileTo(metadataSourceFile, getURIforFileName(metadataSourceFile, harvestId, jobId).toASCIIString(), mimetype);
            // ...and delete it afterwards
            try {
                FileUtils.remove(metadataSourceFile);
            } catch (IOFailure e) {
                log.warn("Couldn't delete file '{}' after adding to metadata archive file, ignoring.",
                        metadataSourceFile.getAbsolutePath(), e);
            }
        }
    }

    /**
     * Parses the name of the given file and generates a URI representation of it.
     *
     * @param cdx A CDX file.
     * @param harvestId The harvestId of the harvest
     * @param jobId The jobId of the harvest
     * @return A URI appropriate for identifying the file's content in Netarkivet
     * @throws IllegalState if the filename does not end with the expected CDX extension
     * @throws UnknownID if something goes terribly wrong in the CDX URI construction
     */
    private static URI getURIforFileName(File cdx, long harvestId, long jobId) throws UnknownID {
        String extensionToRemove = FileUtils.CDX_EXTENSION;
        String filename = cdx.getName();
        if (!filename.endsWith(extensionToRemove)) {
            throw new IllegalState("Filename '" + cdx.getAbsolutePath() + "' has unexpected extension");
        }
        // Strip exactly the trailing extension. (indexOf would truncate too early for
        // filenames that also contain the extension string in the middle.)
        filename = filename.substring(0, filename.length() - extensionToRemove.length());
        return getCDXURI("" + harvestId, "" + jobId, filename);
    }

    /**
     * Reset the metadata format. Should only be used by a unittest.
     */
    public static void resetMetadataFormat() {
        metadataFormat = 0;
    }

    /**
     * Generates a URI identifying CDX info for one harvested (W)ARC file. In Netarkivet, all of the parameters below
     * are in the (W)ARC file's name.
     *
     * @param harvestID The number of the harvest that generated the (W)ARC file.
     * @param jobID The number of the job that generated the (W)ARC file.
     * @param filename The name of the ARC or WARC file behind the cdx-data
     * @return A URI in the proprietary schema "metadata".
     * @throws ArgumentNotValid if any parameter is null.
     * @throws UnknownID if something goes terribly wrong in our URI construction.
     */
    public static URI getCDXURI(String harvestID, String jobID, String filename) throws ArgumentNotValid, UnknownID {
        ArgumentNotValid.checkNotNull(harvestID, "harvestID");
        ArgumentNotValid.checkNotNull(jobID, "jobID");
        ArgumentNotValid.checkNotNull(filename, "filename");
        URI result;
        try {
            result = new URI(CDX_URI_SCHEME, null, // Don't include user info (e.g. "foo@")
                    CDX_URI_AUTHORITY_HOST, -1, // Don't include port no. (e.g. ":8080")
                    CDX_URI_PATH, getCDXURIQuery(harvestID, jobID, filename), null); // Don't include fragment (e.g.
                                                                                     // "#foo")
        } catch (URISyntaxException e) {
            throw new UnknownID("Failed to generate URI for " + harvestID + "," + jobID + "," + filename + ",", e);
        }
        return result;
    }

    /**
     * Generates a URI identifying CDX info for one harvested ARC file.
     *
     * @param jobID The number of the job that generated the ARC file.
     * @param filename the filename.
     * @return A URI in the proprietary schema "metadata".
     * @throws ArgumentNotValid if filename is null.
     * @throws UnknownID if something goes terribly wrong in our URI construction.
     */
    public static URI getAlternateCDXURI(long jobID, String filename) throws ArgumentNotValid, UnknownID {
        // Note: jobID is a primitive long and can never be null, so no null-check is needed for it.
        ArgumentNotValid.checkNotNull(filename, "filename");
        URI result;
        try {
            result = new URI(CDX_URI_SCHEME, null, // Don't include user info (e.g. "foo@")
                    CDX_URI_AUTHORITY_HOST, -1, // Don't include port no. (e.g. ":8080")
                    CDX_URI_PATH, getAlternateCDXURIQuery(jobID, filename), null); // Don't include fragment (e.g.
                                                                                   // "#foo")
        } catch (URISyntaxException e) {
            throw new UnknownID("Failed to generate URI for " + jobID + "," + filename + ",", e);
        }
        return result;
    }

    /**
     * Generate the query part of a CDX URI.
     *
     * @param harvestID The number of the harvest that generated the ARC file.
     * @param jobID The number of the job that generated the ARC file.
     * @param filename The name of the ARC file.
     * @return An appropriate list of assigned parameters, separated by the "&" character.
     */
    private static String getCDXURIQuery(String harvestID, String jobID, String filename) {
        String result = CDX_URI_VERSION_PARAMETERS;
        result += "&" + CDX_URI_HARVEST_ID_PARAMETER_NAME + "=" + harvestID;
        result += "&" + CDX_URI_JOB_ID_PARAMETER_NAME + "=" + jobID;
        result += "&" + CDX_URI_FILENAME_PARAMETER_NAME + "=" + filename;

        return result;
    }

    /**
     * Generate the query part of a CDX URI. Alternate version.
     *
     * @param jobID The number of the job that generated the (W)ARC file.
     * @param filename the filename of the archive file
     * @return An appropriate list of assigned parameters, separated by the "&" character.
     */
    private static String getAlternateCDXURIQuery(long jobID, String filename) {
        String result = ALTERNATE_CDX_URI_VERSION_PARAMETERS;
        result += "&" + CDX_URI_JOB_ID_PARAMETER_NAME + "=" + jobID;
        result += "&" + CDX_URI_FILENAME_PARAMETER_NAME + "=" + filename;
        return result;
    }

    /**
     * @return true, if we want to compress our metadata records, false, if not
     */
    public static boolean compressRecords() {
        return Settings.getBoolean(HarvesterSettings.METADATA_COMPRESSION);
    }

}