001/* 002 * #%L 003 * Netarchivesuite - harvester 004 * %% 005 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library, 006 * the National Library of France and the Austrian National Library. 007 * %% 008 * This program is free software: you can redistribute it and/or modify 009 * it under the terms of the GNU Lesser General Public License as 010 * published by the Free Software Foundation, either version 2.1 of the 011 * License, or (at your option) any later version. 012 * 013 * This program is distributed in the hope that it will be useful, 014 * but WITHOUT ANY WARRANTY; without even the implied warranty of 015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 016 * GNU General Lesser Public License for more details. 017 * 018 * You should have received a copy of the GNU General Lesser Public 019 * License along with this program. If not, see 020 * <http://www.gnu.org/licenses/lgpl-2.1.html>. 021 * #L% 022 */ 023package dk.netarkivet.harvester.harvesting.metadata; 024 025import java.io.File; 026import java.io.FileInputStream; 027import java.io.FileOutputStream; 028import java.io.FilenameFilter; 029import java.io.IOException; 030import java.io.ObjectInputStream; 031import java.io.ObjectOutputStream; 032import java.io.Serializable; 033import java.util.ArrayList; 034import java.util.List; 035import java.util.UUID; 036import java.util.regex.Pattern; 037 038import org.slf4j.Logger; 039import org.slf4j.LoggerFactory; 040 041import dk.netarkivet.common.CommonSettings; 042import dk.netarkivet.common.exceptions.ArgumentNotValid; 043import dk.netarkivet.common.exceptions.IOFailure; 044import dk.netarkivet.common.exceptions.IllegalState; 045import dk.netarkivet.common.utils.Settings; 046import dk.netarkivet.common.utils.StringUtils; 047import dk.netarkivet.harvester.datamodel.AliasInfo; 048 049/** 050 * Class used to carry metadata in DoOneCrawl messages, including the URL and mimetype necessary to write the metadata 051 * to metadata (W)ARC files. 052 */ 053@SuppressWarnings({"serial"}) 054public class MetadataEntry implements Serializable { 055 056 /** The instance logger. */ 057 private static final Logger log = LoggerFactory.getLogger(MetadataEntry.class); 058 /** The URL for this metadataEntry: Used as the unique identifier for this bit of metadata in the Netarchive. */ 059 private String url; 060 /** The mimetype for this metadataEntry: Identifies which type of document this bit of metadata is. */ 061 private String mimeType; 062 063 /** the metadata itself as byte array. */ 064 private byte[] data; 065 066 /** Regular expression for a valid mimetype. */ 067 private static final String MIMETYPE_REGEXP = "\\w+/\\w+"; 068 /** The corresponding pattern for the regexp MIMETYPE_REGEXP. */ 069 private static final Pattern MIMETYPE_PATTERN = Pattern.compile(MIMETYPE_REGEXP); 070 071 /** 072 * The url should be valid according to RFC 2396 This URL_REGEXP is taken from org.archive.util.SURT v. 1.12 1: 073 * scheme:// 2: userinfo (if present) 3: @ (if present) 4: host 5: :port 6: path 074 */ 075 private static String URL_REGEXP = "^(\\w+://)(?:([-\\w\\.!~\\*'\\(\\)%;:&=+$,]+?)(@))?(\\S+?)(:\\d+)?(/\\settingsStructure*)?$"; 076 // 1 2 3 4 5 6 077 /** The corresponding pattern for the regexp URL_REGEXP. */ 078 private static final Pattern URL_PATTERN = Pattern.compile(URL_REGEXP); 079 080 /** Mimetype for metadata url. */ 081 private static final String MIMETYPE_TEXT_PLAIN = "text/plain"; 082 083 /** Suffix for both metadata URLs. */ 084 private static final String METADATA_URL_SUFFIX = "?majorversion=1&minorversion=0&harvestid=%s&harvestnum=%s&jobid=%s"; 085 086 /** Metadata URL template for aliases. */ 087 private static final String ALIAS_METADATA_URL_TEMPLATE = "metadata://%s/crawl/setup/aliases" + METADATA_URL_SUFFIX; 088 089 /** Common template prefix for all deduplication metadata URLs. */ 090 private static final String DUPLICATEREDUCTION_METADATA_URL_PREFIX_TEMPLATE = "metadata://%s/crawl/setup/duplicatereductionjobs"; 091 092 /** 093 * Constructor for this class. 094 * 095 * @param url the URL assigned to this metadata (needed for it to be searchable) 096 * @param mimeType the mimeType for this metadata (normally text/plain or text/xml) 097 * @param data the metadata itself 098 * @throws ArgumentNotValid if arguments are null or empty strings, or if argument url is not valid URL or if 099 * argument mimeType is not valid MimeType 100 */ 101 public MetadataEntry(String url, String mimeType, String data) { 102 ArgumentNotValid.checkNotNullOrEmpty(url, "url"); 103 ArgumentNotValid.checkNotNullOrEmpty(mimeType, "mimetype"); 104 ArgumentNotValid.checkNotNull(data, "data"); 105 setURL(url); // Ensures this is a valid url 106 setMimetype(mimeType); // Ensures this is a valid mimetype 107 this.mimeType = mimeType; 108 this.data = data.getBytes(); 109 } 110 111 /** 112 * Generate a MetadataEntry from a list of AliasInfo objects (VERSION 2) Expired aliases is skipped by this method. 113 * 114 * @param aliases the list of aliases (possibly empty) 115 * @param origHarvestDefinitionID The harvestdefinition that is behind the job with the given jobId 116 * @param harvestNum The number of the harvest that the job with the given jobid belongs to 117 * @param jobId The id of the Job, which this metadata belongs to 118 * @return null, if the list if empty (or only consists of expired aliases), otherwise returns a MetadataEntry from 119 * a list of AliasInfo objects containing unexpired aliases. 120 */ 121 public static MetadataEntry makeAliasMetadataEntry(List<AliasInfo> aliases, Long origHarvestDefinitionID, 122 int harvestNum, Long jobId) { 123 ArgumentNotValid.checkNotNull(aliases, "aliases"); 124 ArgumentNotValid.checkNotNull(origHarvestDefinitionID, "Long origHarvestDefinitionID"); 125 ArgumentNotValid.checkNotNegative(harvestNum, "int harvestNum"); 126 ArgumentNotValid.checkNotNull(jobId, "Long jobId"); 127 if (aliases.isEmpty()) { 128 return null; 129 } 130 // Remove any expired aliases from the aliases collection 131 List<AliasInfo> nonExpiredAliases = new ArrayList<AliasInfo>(); 132 for (AliasInfo alias : aliases) { 133 if (!alias.isExpired()) { 134 nonExpiredAliases.add(alias); 135 } 136 } 137 if (nonExpiredAliases.isEmpty()) { 138 log.warn("All the aliases for the domains in job {} are expired. Aborting creation of an alias MetadataEntry", jobId); 139 return null; 140 } 141 142 String organization = Settings.get(CommonSettings.ORGANIZATION); 143 // construct metadata-URL for AliasMetadataEntry 144 String metadataUrl = String.format(ALIAS_METADATA_URL_TEMPLATE, organization, origHarvestDefinitionID, 145 harvestNum, jobId); 146 147 StringBuffer sb = new StringBuffer(); 148 for (AliasInfo alias : nonExpiredAliases) { 149 sb.append(alias.getDomain()).append(" is an alias for ").append(alias.getAliasOf()).append("\n"); 150 } 151 return new MetadataEntry(metadataUrl, MIMETYPE_TEXT_PLAIN, sb.toString()); 152 } 153 154 /** 155 * Generate a MetadataEntry from a list of job ids for duplicate reduction. 156 * 157 * @param jobIDsForDuplicateReduction the list of jobids (possibly empty) 158 * @param origHarvestDefinitionID The harvestdefinition that is behind the job with the given jobId 159 * @param harvestNum The number of the harvest that the job with the given jobid belongs to 160 * @param jobId The id of the Job, which this metadata belongs to 161 * @return null, if the list is empty, otherwise returns a MetadataEntry from the list of jobids. 162 */ 163 public static MetadataEntry makeDuplicateReductionMetadataEntry(List<Long> jobIDsForDuplicateReduction, 164 Long origHarvestDefinitionID, int harvestNum, Long jobId) { 165 ArgumentNotValid.checkNotNull(jobIDsForDuplicateReduction, "List<Long> jobIDsForDuplicateReduction"); 166 ArgumentNotValid.checkNotNull(origHarvestDefinitionID, "Long origHarvestDefinitionID"); 167 ArgumentNotValid.checkNotNegative(harvestNum, "int harvestNum"); 168 ArgumentNotValid.checkNotNull(jobId, "Long jobId"); 169 170 String organization = Settings.get(CommonSettings.ORGANIZATION); 171 // construct a metadata-URL for this MetadataEntry 172 String metadataUrl = String.format(DUPLICATEREDUCTION_METADATA_URL_PREFIX_TEMPLATE + METADATA_URL_SUFFIX, 173 organization, origHarvestDefinitionID, harvestNum, jobId); 174 175 return new MetadataEntry(metadataUrl, MIMETYPE_TEXT_PLAIN, 176 StringUtils.conjoin(",", jobIDsForDuplicateReduction)); 177 } 178 179 /** 180 * @return Returns the data. 181 */ 182 public byte[] getData() { 183 return data; 184 } 185 186 /** 187 * @return Returns the mimeType. 188 */ 189 public String getMimeType() { 190 return mimeType; 191 } 192 193 /** 194 * Set the mimetype for this object. 195 * 196 * @param mimetype a given mimetype 197 * @throws ArgumentNotValid if the mimetype is not valid 198 */ 199 private void setMimetype(String mimetype) { 200 if (isMimetypeValid(mimetype)) { 201 this.mimeType = mimetype; 202 } else { 203 throw new ArgumentNotValid("The given MimeType is not valid: " + mimetype); 204 } 205 } 206 207 /** 208 * @return Returns the URL 209 */ 210 public String getURL() { 211 return url; 212 } 213 214 /** 215 * Set the url for this object. 216 * 217 * @param aUrl a given URL 218 * @throws ArgumentNotValid if the URL is not valid 219 */ 220 private void setURL(String aUrl) { 221 if (isURLValid(aUrl)) { 222 this.url = aUrl; 223 } else { 224 throw new ArgumentNotValid("The given URL is not valid: " + aUrl); 225 } 226 } 227 228 /** 229 * Method needed to de-serializable an object of this class. 230 * 231 * @param s the given ObjectInputStream 232 * @throws ClassNotFoundException If the class of the serialized object could not be found 233 * @throws IOException If an I/O error occurred while reading the serialized object 234 */ 235 private void readObject(ObjectInputStream s) throws ClassNotFoundException, IOException { 236 s.defaultReadObject(); 237 } 238 239 /** 240 * Method needed to serializable an object of this class. 241 * 242 * @param s the given ObjectOutputStream 243 * @throws IOException If an I/O error occurred while writing to the outputstream 244 */ 245 private void writeObject(ObjectOutputStream s) throws IOException { 246 s.defaultWriteObject(); 247 } 248 249 /** 250 * Utility method for testing the validity of the mimetype. We need do this, because the ARCWriter does not do this 251 * check properly 252 * 253 * @param mimetype the given mimetype 254 * @return true, if the mimetype match the pattern: \\w+/\\w+ 255 */ 256 private static boolean isMimetypeValid(String mimetype) { 257 return MIMETYPE_PATTERN.matcher(mimetype).matches(); 258 } 259 260 /** 261 * Utility method for testing the validity of the URL. We need do this, because the ARCWriter does not do this check 262 * properly. 263 * 264 * @param url the given URL 265 * @return true, if the URL match the pattern: 266 */ 267 private static boolean isURLValid(String url) { 268 return URL_PATTERN.matcher(url).matches(); 269 } 270 271 /** 272 * Checks, if this is a duplicate reduction MetadataEntry. 273 * 274 * @return true, if this is a duplicate reduction MetadataEntry, otherwise false. 275 */ 276 public boolean isDuplicateReductionMetadataEntry() { 277 return this.getURL().startsWith(MetadataEntry.getDuplicatereductionMetadataURLPrefix()); 278 } 279 280 private static String getDuplicatereductionMetadataURLPrefix() { 281 String organization = Settings.get(CommonSettings.ORGANIZATION); 282 return String.format(DUPLICATEREDUCTION_METADATA_URL_PREFIX_TEMPLATE, organization); 283 } 284 285 /** 286 * @return a string representation of this object 287 */ 288 public String toString() { 289 return "URL= " + getURL() + " ; mimetype= " + getMimeType() + " ; data= " + new String(getData()); 290 } 291 292 /** 293 * Store a list of metadata entries to disk. 294 * 295 * @param metadata the given metadata 296 * @param destinationDir the directory to store the metadata. 297 */ 298 public static void storeMetadataToDisk(List<MetadataEntry> metadata, File destinationDir) { 299 try { 300 for (MetadataEntry m : metadata) { 301 File mFile = new File(destinationDir, UUID.randomUUID().toString() + ".ser"); 302 FileOutputStream fos = new FileOutputStream(mFile); 303 ObjectOutputStream out = new ObjectOutputStream(fos); 304 out.writeObject(m); 305 fos.close(); 306 } 307 } catch (IOException e) { 308 throw new IOFailure("Unable to store metadata temporarily in directory ' " 309 + destinationDir.getAbsolutePath() + "'", e); 310 } 311 } 312 313 /** 314 * Retrieve a list of serialized metadata entries on disk. 315 * 316 * @param sourceDir the directory where the metadata is stored. 317 * @return the list of deserialized MetadataEntry object. 318 */ 319 public static List<MetadataEntry> getMetadataFromDisk(File sourceDir) { 320 List<MetadataEntry> metadata = new ArrayList<MetadataEntry>(); 321 FilenameFilter filter = new FilenameFilter() { 322 323 @Override 324 public boolean accept(File dir, String name) { 325 if (name.endsWith(".ser")) { 326 return true; 327 } 328 return false; 329 } 330 }; 331 332 for (String file : sourceDir.list(filter)) { 333 File metadataEntryFile = new File(sourceDir, file); 334 try { 335 FileInputStream fileIn = new FileInputStream(metadataEntryFile); 336 ObjectInputStream in = new ObjectInputStream(fileIn); 337 MetadataEntry o = (MetadataEntry) in.readObject(); 338 metadata.add(o); 339 in.close(); 340 fileIn.close(); 341 } catch (IOException e) { 342 throw new IOFailure("Unable to read the serialized metadata", e); 343 } catch (ClassNotFoundException e) { 344 throw new IllegalState("Unable to read the serialized metadata", e); 345 } 346 } 347 return metadata; 348 } 349 350}