/*
 * #%L
 * Netarchivesuite - harvester
 * %%
 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
 *             the National Library of France and the Austrian National Library.
 * %%
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation, either version 2.1 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Lesser Public License for more details.
 *
 * You should have received a copy of the GNU General Lesser Public
 * License along with this program.  If not, see
 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
 * #L%
 */
package dk.netarkivet.harvester.harvesting.metadata;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FilenameFilter;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.List;
import java.util.UUID;
import java.util.regex.Pattern;

import dk.netarkivet.common.CommonSettings;
import dk.netarkivet.common.exceptions.ArgumentNotValid;
import dk.netarkivet.common.exceptions.IOFailure;
import dk.netarkivet.common.exceptions.IllegalState;
import dk.netarkivet.common.utils.Settings;
import dk.netarkivet.common.utils.StringUtils;
import dk.netarkivet.harvester.datamodel.AliasInfo;

/**
 * Class used to carry metadata in DoOneCrawl messages, including the URL and mimetype necessary to write the metadata
 * to metadata (W)ARC files.
 * <p>
 * Instances are written and read with Java serialization and the class declares no explicit serialVersionUID. The
 * instance fields and the non-private members are therefore deliberately left as they are: changing them would
 * change the default serialVersionUID and make previously serialized entries unreadable.
 */
@SuppressWarnings({"serial"})
public class MetadataEntry implements Serializable {

    /** The URL for this metadataEntry: Used as the unique identifier for this bit of metadata in the Netarchive. */
    private String url;
    /** The mimetype for this metadataEntry: Identifies which type of document this bit of metadata is. */
    private String mimeType;

    /** The metadata itself as a byte array. */
    private byte[] data;

    /** Regular expression for a valid mimetype. */
    private static final String MIMETYPE_REGEXP = "\\w+/\\w+";
    /** The corresponding pattern for the regexp MIMETYPE_REGEXP. */
    private static final Pattern MIMETYPE_PATTERN = Pattern.compile(MIMETYPE_REGEXP);

    /**
     * The url should be valid according to RFC 2396. This URL_REGEXP is taken from org.archive.util.SURT v. 1.12.
     * Capturing groups: 1: scheme:// 2: userinfo (if present) 3: @ (if present) 4: host 5: :port 6: path
     * <p>
     * Group 6 is "/" followed by any run of non-whitespace characters.
     */
    private static final String URL_REGEXP =
            "^(\\w+://)(?:([-\\w\\.!~\\*'\\(\\)%;:&=+$,]+?)(@))?(\\S+?)(:\\d+)?(/\\S*)?$";
    //        1           2                                 3    4      5       6
    /** The corresponding pattern for the regexp URL_REGEXP. */
    private static final Pattern URL_PATTERN = Pattern.compile(URL_REGEXP);

    /** Mimetype for metadata url. */
    private static final String MIMETYPE_TEXT_PLAIN = "text/plain";

    /** Suffix for both metadata URLs. */
    private static final String METADATA_URL_SUFFIX =
            "?majorversion=1&minorversion=0&harvestid=%s&harvestnum=%s&jobid=%s";

    /** Metadata URL template for aliases. */
    private static final String ALIAS_METADATA_URL_TEMPLATE = "metadata://%s/crawl/setup/aliases"
            + METADATA_URL_SUFFIX;

    /** Common template prefix for all deduplication metadata URLs. */
    private static final String DUPLICATEREDUCTION_METADATA_URL_PREFIX_TEMPLATE =
            "metadata://%s/crawl/setup/duplicatereductionjobs";

    /** Filename suffix of the files written by storeMetadataToDisk and read by getMetadataFromDisk. */
    private static final String SERIALIZED_FILE_SUFFIX = ".ser";

    /**
     * Constructor for this class.
     *
     * @param url the URL assigned to this metadata (needed for it to be searchable)
     * @param mimeType the mimeType for this metadata (normally text/plain or text/xml)
     * @param data the metadata itself
     * @throws ArgumentNotValid if arguments are null or empty strings, or if argument url is not valid URL or if
     * argument mimeType is not valid MimeType
     */
    public MetadataEntry(String url, String mimeType, String data) {
        ArgumentNotValid.checkNotNullOrEmpty(url, "url");
        ArgumentNotValid.checkNotNullOrEmpty(mimeType, "mimetype");
        ArgumentNotValid.checkNotNull(data, "data");
        setURL(url); // Ensures this is a valid url
        setMimetype(mimeType); // Ensures this is a valid mimetype, and stores it
        // NOTE(review): getBytes() uses the platform default charset, and toString() decodes with the default
        // charset of whichever JVM calls it. Confirm all components run with the same default charset.
        this.data = data.getBytes();
    }

    /**
     * Generate a MetadataEntry from a list of AliasInfo objects (VERSION 2). Expired aliases are skipped by this
     * method.
     *
     * @param aliases the list of aliases (possibly empty)
     * @param origHarvestDefinitionID The harvestdefinition that is behind the job with the given jobId
     * @param harvestNum The number of the harvest that the job with the given jobid belongs to
     * @param jobId The id of the Job, which this metadata belongs to
     * @return null, if the list is empty (or only consists of expired aliases), otherwise returns a MetadataEntry
     * from a list of AliasInfo objects containing unexpired aliases.
     * @throws ArgumentNotValid if aliases, origHarvestDefinitionID or jobId is null, or if harvestNum is negative
     */
    public static MetadataEntry makeAliasMetadataEntry(List<AliasInfo> aliases, Long origHarvestDefinitionID,
            int harvestNum, Long jobId) {
        ArgumentNotValid.checkNotNull(aliases, "aliases");
        ArgumentNotValid.checkNotNull(origHarvestDefinitionID, "Long origHarvestDefinitionID");
        ArgumentNotValid.checkNotNegative(harvestNum, "int harvestNum");
        ArgumentNotValid.checkNotNull(jobId, "Long jobId");

        // Keep only the aliases that have not expired. An empty input list simply yields an empty result here.
        List<AliasInfo> nonExpiredAliases = new ArrayList<AliasInfo>();
        for (AliasInfo alias : aliases) {
            if (!alias.isExpired()) {
                nonExpiredAliases.add(alias);
            }
        }
        if (nonExpiredAliases.isEmpty()) {
            return null;
        }

        String organization = Settings.get(CommonSettings.ORGANIZATION);
        // construct metadata-URL for AliasMetadataEntry
        String metadataUrl = String.format(ALIAS_METADATA_URL_TEMPLATE, organization, origHarvestDefinitionID,
                harvestNum, jobId);

        // One line per alias: "<domain> is an alias for <aliasOf>"
        StringBuilder sb = new StringBuilder();
        for (AliasInfo alias : nonExpiredAliases) {
            sb.append(alias.getDomain()).append(" is an alias for ").append(alias.getAliasOf()).append("\n");
        }
        return new MetadataEntry(metadataUrl, MIMETYPE_TEXT_PLAIN, sb.toString());
    }

    /**
     * Generate a MetadataEntry from a list of job ids for duplicate reduction.
     *
     * @param jobIDsForDuplicateReduction the list of jobids (possibly empty)
     * @param origHarvestDefinitionID The harvestdefinition that is behind the job with the given jobId
     * @param harvestNum The number of the harvest that the job with the given jobid belongs to
     * @param jobId The id of the Job, which this metadata belongs to
     * @return a MetadataEntry whose data is the given jobids joined with ",". An entry is returned for an empty
     * list as well; this method never returns null.
     * @throws ArgumentNotValid if jobIDsForDuplicateReduction, origHarvestDefinitionID or jobId is null, or if
     * harvestNum is negative
     */
    public static MetadataEntry makeDuplicateReductionMetadataEntry(List<Long> jobIDsForDuplicateReduction,
            Long origHarvestDefinitionID, int harvestNum, Long jobId) {
        ArgumentNotValid.checkNotNull(jobIDsForDuplicateReduction, "List<Long> jobIDsForDuplicateReduction");
        ArgumentNotValid.checkNotNull(origHarvestDefinitionID, "Long origHarvestDefinitionID");
        ArgumentNotValid.checkNotNegative(harvestNum, "int harvestNum");
        ArgumentNotValid.checkNotNull(jobId, "Long jobId");

        String organization = Settings.get(CommonSettings.ORGANIZATION);
        // construct a metadata-URL for this MetadataEntry
        String metadataUrl = String.format(DUPLICATEREDUCTION_METADATA_URL_PREFIX_TEMPLATE + METADATA_URL_SUFFIX,
                organization, origHarvestDefinitionID, harvestNum, jobId);

        return new MetadataEntry(metadataUrl, MIMETYPE_TEXT_PLAIN,
                StringUtils.conjoin(",", jobIDsForDuplicateReduction));
    }

    /**
     * @return Returns the data. This is the internal array, not a copy.
     */
    public byte[] getData() {
        return data;
    }

    /**
     * @return Returns the mimeType.
     */
    public String getMimeType() {
        return mimeType;
    }

    /**
     * Set the mimetype for this object.
     *
     * @param mimetype a given mimetype
     * @throws ArgumentNotValid if the mimetype is not valid
     */
    private void setMimetype(String mimetype) {
        if (isMimetypeValid(mimetype)) {
            this.mimeType = mimetype;
        } else {
            throw new ArgumentNotValid("The given MimeType is not valid: " + mimetype);
        }
    }

    /**
     * @return Returns the URL
     */
    public String getURL() {
        return url;
    }

    /**
     * Set the url for this object.
     *
     * @param aUrl a given URL
     * @throws ArgumentNotValid if the URL is not valid
     */
    private void setURL(String aUrl) {
        if (isURLValid(aUrl)) {
            this.url = aUrl;
        } else {
            throw new ArgumentNotValid("The given URL is not valid: " + aUrl);
        }
    }

    /**
     * Method needed to deserialize an object of this class.
     *
     * @param s the given ObjectInputStream
     * @throws ClassNotFoundException If the class of the serialized object could not be found
     * @throws IOException If an I/O error occurred while reading the serialized object
     */
    private void readObject(ObjectInputStream s) throws ClassNotFoundException, IOException {
        s.defaultReadObject();
    }

    /**
     * Method needed to serialize an object of this class.
     *
     * @param s the given ObjectOutputStream
     * @throws IOException If an I/O error occurred while writing to the outputstream
     */
    private void writeObject(ObjectOutputStream s) throws IOException {
        s.defaultWriteObject();
    }

    /**
     * Utility method for testing the validity of the mimetype. We need do this, because the ARCWriter does not do
     * this check properly.
     *
     * @param mimetype the given mimetype
     * @return true, if the mimetype matches the pattern: \\w+/\\w+
     */
    private static boolean isMimetypeValid(String mimetype) {
        return MIMETYPE_PATTERN.matcher(mimetype).matches();
    }

    /**
     * Utility method for testing the validity of the URL. We need do this, because the ARCWriter does not do this
     * check properly.
     *
     * @param url the given URL
     * @return true, if the URL matches the pattern URL_REGEXP
     */
    private static boolean isURLValid(String url) {
        return URL_PATTERN.matcher(url).matches();
    }

    /**
     * Checks, if this is a duplicate reduction MetadataEntry.
     *
     * @return true, if this is a duplicate reduction MetadataEntry, otherwise false.
     */
    public boolean isDuplicateReductionMetadataEntry() {
        return this.getURL().startsWith(MetadataEntry.getDuplicatereductionMetadataURLPrefix());
    }

    /**
     * Build the URL prefix shared by all duplicate reduction metadata URLs, using the organization read from the
     * settings.
     *
     * @return the duplicate reduction metadata URL prefix
     */
    private static String getDuplicatereductionMetadataURLPrefix() {
        String organization = Settings.get(CommonSettings.ORGANIZATION);
        return String.format(DUPLICATEREDUCTION_METADATA_URL_PREFIX_TEMPLATE, organization);
    }

    /**
     * @return a string representation of this object
     */
    @Override
    public String toString() {
        return "URL= " + getURL() + " ; mimetype= " + getMimeType() + " ; data= " + new String(getData());
    }

    /**
     * Store a list of metadata entries to disk. Each entry is serialized to its own file, named with a random UUID
     * and the suffix ".ser".
     *
     * @param metadata the given metadata
     * @param destinationDir the directory to store the metadata.
     * @throws ArgumentNotValid if metadata or destinationDir is null
     * @throws IOFailure if an entry could not be written
     */
    public static void storeMetadataToDisk(List<MetadataEntry> metadata, File destinationDir) {
        ArgumentNotValid.checkNotNull(metadata, "List<MetadataEntry> metadata");
        // A null parent would make new File(parent, name) resolve against the working directory.
        ArgumentNotValid.checkNotNull(destinationDir, "File destinationDir");
        try {
            for (MetadataEntry m : metadata) {
                File mFile = new File(destinationDir, UUID.randomUUID().toString() + SERIALIZED_FILE_SUFFIX);
                writeEntry(m, mFile);
            }
        } catch (IOException e) {
            throw new IOFailure("Unable to store metadata temporarily in directory ' "
                    + destinationDir.getAbsolutePath() + "'", e);
        }
    }

    /**
     * Serialize a single entry to the given file.
     *
     * @param entry the entry to serialize
     * @param target the file to write to
     * @throws IOException if the file could not be opened or written
     */
    private static void writeEntry(MetadataEntry entry, File target) throws IOException {
        FileOutputStream fos = new FileOutputStream(target);
        try {
            ObjectOutputStream out = new ObjectOutputStream(fos);
            out.writeObject(entry);
            // Closing the object stream flushes its buffered bytes to the file; closing only the underlying
            // file stream could leave the serialized object truncated.
            out.close();
        } finally {
            // Releases the file handle when writing failed; harmless after a successful close above.
            fos.close();
        }
    }

    /**
     * Retrieve a list of serialized metadata entries on disk. Only files whose name ends with ".ser" are read.
     * <p>
     * NOTE(review): this uses Java-native deserialization, so sourceDir must only contain files written by
     * trusted code.
     *
     * @param sourceDir the directory where the metadata is stored.
     * @return the list of deserialized MetadataEntry objects.
     * @throws ArgumentNotValid if sourceDir is null
     * @throws IOFailure if the directory could not be listed or a file could not be read
     * @throws IllegalState if the class of a serialized object could not be found
     */
    public static List<MetadataEntry> getMetadataFromDisk(File sourceDir) {
        ArgumentNotValid.checkNotNull(sourceDir, "File sourceDir");
        FilenameFilter filter = new FilenameFilter() {
            @Override
            public boolean accept(File dir, String name) {
                return name.endsWith(SERIALIZED_FILE_SUFFIX);
            }
        };

        // File.list returns null if the path is not a directory or if an I/O error occurs.
        String[] filenames = sourceDir.list(filter);
        if (filenames == null) {
            throw new IOFailure("Unable to list the serialized metadata in directory '"
                    + sourceDir.getAbsolutePath() + "'");
        }

        List<MetadataEntry> metadata = new ArrayList<MetadataEntry>();
        for (String filename : filenames) {
            File metadataEntryFile = new File(sourceDir, filename);
            try {
                metadata.add(readEntry(metadataEntryFile));
            } catch (IOException e) {
                throw new IOFailure("Unable to read the serialized metadata", e);
            } catch (ClassNotFoundException e) {
                throw new IllegalState("Unable to read the serialized metadata", e);
            }
        }
        return metadata;
    }

    /**
     * Deserialize a single entry from the given file.
     *
     * @param source the file to read from
     * @return the deserialized entry
     * @throws IOException if the file could not be opened or read
     * @throws ClassNotFoundException if the class of the serialized object could not be found
     */
    private static MetadataEntry readEntry(File source) throws IOException, ClassNotFoundException {
        FileInputStream fileIn = new FileInputStream(source);
        try {
            ObjectInputStream in = new ObjectInputStream(fileIn);
            return (MetadataEntry) in.readObject();
        } finally {
            // Closing the file stream releases the handle on success and on failure alike.
            fileIn.close();
        }
    }

}