001/*
002 * #%L
003 * Netarchivesuite - harvester
004 * %%
005 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
006 *             the National Library of France and the Austrian National Library.
007 * %%
008 * This program is free software: you can redistribute it and/or modify
009 * it under the terms of the GNU Lesser General Public License as
010 * published by the Free Software Foundation, either version 2.1 of the
011 * License, or (at your option) any later version.
012 * 
013 * This program is distributed in the hope that it will be useful,
014 * but WITHOUT ANY WARRANTY; without even the implied warranty of
015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
017 * 
 * You should have received a copy of the GNU Lesser General Public
019 * License along with this program.  If not, see
020 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
021 * #L%
022 */
023package dk.netarkivet.harvester.harvesting.metadata;
024
025import java.io.File;
026import java.io.FileInputStream;
027import java.io.FileOutputStream;
028import java.io.FilenameFilter;
029import java.io.IOException;
030import java.io.ObjectInputStream;
031import java.io.ObjectOutputStream;
032import java.io.Serializable;
033import java.util.ArrayList;
034import java.util.List;
035import java.util.UUID;
036import java.util.regex.Pattern;
037
038import org.slf4j.Logger;
039import org.slf4j.LoggerFactory;
040
041import dk.netarkivet.common.CommonSettings;
042import dk.netarkivet.common.exceptions.ArgumentNotValid;
043import dk.netarkivet.common.exceptions.IOFailure;
044import dk.netarkivet.common.exceptions.IllegalState;
045import dk.netarkivet.common.utils.Settings;
046import dk.netarkivet.common.utils.StringUtils;
047import dk.netarkivet.harvester.datamodel.AliasInfo;
048
049/**
050 * Class used to carry metadata in DoOneCrawl messages, including the URL and mimetype necessary to write the metadata
051 * to metadata (W)ARC files.
052 */
053@SuppressWarnings({"serial"})
054public class MetadataEntry implements Serializable {
055
056    /** The instance logger. */
057    private static final Logger log = LoggerFactory.getLogger(MetadataEntry.class);
058    /** The URL for this metadataEntry: Used as the unique identifier for this bit of metadata in the Netarchive. */
059    private String url;
060    /** The mimetype for this metadataEntry: Identifies which type of document this bit of metadata is. */
061    private String mimeType;
062
063    /** the metadata itself as byte array. */
064    private byte[] data;
065
066    /** Regular expression for a valid mimetype. */
067    private static final String MIMETYPE_REGEXP = "\\w+/\\w+";
068    /** The corresponding pattern for the regexp MIMETYPE_REGEXP. */
069    private static final Pattern MIMETYPE_PATTERN = Pattern.compile(MIMETYPE_REGEXP);
070
071    /**
072     * The url should be valid according to RFC 2396 This URL_REGEXP is taken from org.archive.util.SURT v. 1.12 1:
073     * scheme:// 2: userinfo (if present) 3: @ (if present) 4: host 5: :port 6: path
074     */
075    private static String URL_REGEXP = "^(\\w+://)(?:([-\\w\\.!~\\*'\\(\\)%;:&=+$,]+?)(@))?(\\S+?)(:\\d+)?(/\\settingsStructure*)?$";
076    // 1 2 3 4 5 6
077    /** The corresponding pattern for the regexp URL_REGEXP. */
078    private static final Pattern URL_PATTERN = Pattern.compile(URL_REGEXP);
079
080    /** Mimetype for metadata url. */
081    private static final String MIMETYPE_TEXT_PLAIN = "text/plain";
082
083    /** Suffix for both metadata URLs. */
084    private static final String METADATA_URL_SUFFIX = "?majorversion=1&minorversion=0&harvestid=%s&harvestnum=%s&jobid=%s";
085
086    /** Metadata URL template for aliases. */
087    private static final String ALIAS_METADATA_URL_TEMPLATE = "metadata://%s/crawl/setup/aliases" + METADATA_URL_SUFFIX;
088
089    /** Common template prefix for all deduplication metadata URLs. */
090    private static final String DUPLICATEREDUCTION_METADATA_URL_PREFIX_TEMPLATE = "metadata://%s/crawl/setup/duplicatereductionjobs";
091
092    /**
093     * Constructor for this class.
094     *
095     * @param url the URL assigned to this metadata (needed for it to be searchable)
096     * @param mimeType the mimeType for this metadata (normally text/plain or text/xml)
097     * @param data the metadata itself
098     * @throws ArgumentNotValid if arguments are null or empty strings, or if argument url is not valid URL or if
099     * argument mimeType is not valid MimeType
100     */
101    public MetadataEntry(String url, String mimeType, String data) {
102        ArgumentNotValid.checkNotNullOrEmpty(url, "url");
103        ArgumentNotValid.checkNotNullOrEmpty(mimeType, "mimetype");
104        ArgumentNotValid.checkNotNull(data, "data");
105        setURL(url); // Ensures this is a valid url
106        setMimetype(mimeType); // Ensures this is a valid mimetype
107        this.mimeType = mimeType;
108        this.data = data.getBytes();
109    }
110
111    /**
112     * Generate a MetadataEntry from a list of AliasInfo objects (VERSION 2) Expired aliases is skipped by this method.
113     *
114     * @param aliases the list of aliases (possibly empty)
115     * @param origHarvestDefinitionID The harvestdefinition that is behind the job with the given jobId
116     * @param harvestNum The number of the harvest that the job with the given jobid belongs to
117     * @param jobId The id of the Job, which this metadata belongs to
118     * @return null, if the list if empty (or only consists of expired aliases), otherwise returns a MetadataEntry from
119     * a list of AliasInfo objects containing unexpired aliases.
120     */
121    public static MetadataEntry makeAliasMetadataEntry(List<AliasInfo> aliases, Long origHarvestDefinitionID,
122            int harvestNum, Long jobId) {
123        ArgumentNotValid.checkNotNull(aliases, "aliases");
124        ArgumentNotValid.checkNotNull(origHarvestDefinitionID, "Long origHarvestDefinitionID");
125        ArgumentNotValid.checkNotNegative(harvestNum, "int harvestNum");
126        ArgumentNotValid.checkNotNull(jobId, "Long jobId");
127        if (aliases.isEmpty()) {
128            return null;
129        }
130        // Remove any expired aliases from the aliases collection
131        List<AliasInfo> nonExpiredAliases = new ArrayList<AliasInfo>();
132        for (AliasInfo alias : aliases) {
133            if (!alias.isExpired()) {
134                nonExpiredAliases.add(alias);
135            }
136        }
137        if (nonExpiredAliases.isEmpty()) {
138            log.warn("All the aliases for the domains in job {} are expired. Aborting creation of an alias MetadataEntry", jobId);
139            return null;
140        }
141
142        String organization = Settings.get(CommonSettings.ORGANIZATION);
143        // construct metadata-URL for AliasMetadataEntry
144        String metadataUrl = String.format(ALIAS_METADATA_URL_TEMPLATE, organization, origHarvestDefinitionID,
145                harvestNum, jobId);
146
147        StringBuffer sb = new StringBuffer();
148        for (AliasInfo alias : nonExpiredAliases) {
149            sb.append(alias.getDomain()).append(" is an alias for ").append(alias.getAliasOf()).append("\n");
150        }
151        return new MetadataEntry(metadataUrl, MIMETYPE_TEXT_PLAIN, sb.toString());
152    }
153
154    /**
155     * Generate a MetadataEntry from a list of job ids for duplicate reduction.
156     *
157     * @param jobIDsForDuplicateReduction the list of jobids (possibly empty)
158     * @param origHarvestDefinitionID The harvestdefinition that is behind the job with the given jobId
159     * @param harvestNum The number of the harvest that the job with the given jobid belongs to
160     * @param jobId The id of the Job, which this metadata belongs to
161     * @return null, if the list is empty, otherwise returns a MetadataEntry from the list of jobids.
162     */
163    public static MetadataEntry makeDuplicateReductionMetadataEntry(List<Long> jobIDsForDuplicateReduction,
164            Long origHarvestDefinitionID, int harvestNum, Long jobId) {
165        ArgumentNotValid.checkNotNull(jobIDsForDuplicateReduction, "List<Long> jobIDsForDuplicateReduction");
166        ArgumentNotValid.checkNotNull(origHarvestDefinitionID, "Long origHarvestDefinitionID");
167        ArgumentNotValid.checkNotNegative(harvestNum, "int harvestNum");
168        ArgumentNotValid.checkNotNull(jobId, "Long jobId");
169
170        String organization = Settings.get(CommonSettings.ORGANIZATION);
171        // construct a metadata-URL for this MetadataEntry
172        String metadataUrl = String.format(DUPLICATEREDUCTION_METADATA_URL_PREFIX_TEMPLATE + METADATA_URL_SUFFIX,
173                organization, origHarvestDefinitionID, harvestNum, jobId);
174
175        return new MetadataEntry(metadataUrl, MIMETYPE_TEXT_PLAIN,
176                StringUtils.conjoin(",", jobIDsForDuplicateReduction));
177    }
178
179    /**
180     * @return Returns the data.
181     */
182    public byte[] getData() {
183        return data;
184    }
185
186    /**
187     * @return Returns the mimeType.
188     */
189    public String getMimeType() {
190        return mimeType;
191    }
192
193    /**
194     * Set the mimetype for this object.
195     *
196     * @param mimetype a given mimetype
197     * @throws ArgumentNotValid if the mimetype is not valid
198     */
199    private void setMimetype(String mimetype) {
200        if (isMimetypeValid(mimetype)) {
201            this.mimeType = mimetype;
202        } else {
203            throw new ArgumentNotValid("The given MimeType is not valid: " + mimetype);
204        }
205    }
206
207    /**
208     * @return Returns the URL
209     */
210    public String getURL() {
211        return url;
212    }
213
214    /**
215     * Set the url for this object.
216     *
217     * @param aUrl a given URL
218     * @throws ArgumentNotValid if the URL is not valid
219     */
220    private void setURL(String aUrl) {
221        if (isURLValid(aUrl)) {
222            this.url = aUrl;
223        } else {
224            throw new ArgumentNotValid("The given URL is not valid: " + aUrl);
225        }
226    }
227
228    /**
229     * Method needed to de-serializable an object of this class.
230     *
231     * @param s the given ObjectInputStream
232     * @throws ClassNotFoundException If the class of the serialized object could not be found
233     * @throws IOException If an I/O error occurred while reading the serialized object
234     */
235    private void readObject(ObjectInputStream s) throws ClassNotFoundException, IOException {
236        s.defaultReadObject();
237    }
238
239    /**
240     * Method needed to serializable an object of this class.
241     *
242     * @param s the given ObjectOutputStream
243     * @throws IOException If an I/O error occurred while writing to the outputstream
244     */
245    private void writeObject(ObjectOutputStream s) throws IOException {
246        s.defaultWriteObject();
247    }
248
249    /**
250     * Utility method for testing the validity of the mimetype. We need do this, because the ARCWriter does not do this
251     * check properly
252     *
253     * @param mimetype the given mimetype
254     * @return true, if the mimetype match the pattern: \\w+/\\w+
255     */
256    private static boolean isMimetypeValid(String mimetype) {
257        return MIMETYPE_PATTERN.matcher(mimetype).matches();
258    }
259
260    /**
261     * Utility method for testing the validity of the URL. We need do this, because the ARCWriter does not do this check
262     * properly.
263     *
264     * @param url the given URL
265     * @return true, if the URL match the pattern:
266     */
267    private static boolean isURLValid(String url) {
268        return URL_PATTERN.matcher(url).matches();
269    }
270
271    /**
272     * Checks, if this is a duplicate reduction MetadataEntry.
273     *
274     * @return true, if this is a duplicate reduction MetadataEntry, otherwise false.
275     */
276    public boolean isDuplicateReductionMetadataEntry() {
277        return this.getURL().startsWith(MetadataEntry.getDuplicatereductionMetadataURLPrefix());
278    }
279
280    private static String getDuplicatereductionMetadataURLPrefix() {
281        String organization = Settings.get(CommonSettings.ORGANIZATION);
282        return String.format(DUPLICATEREDUCTION_METADATA_URL_PREFIX_TEMPLATE, organization);
283    }
284
285    /**
286     * @return a string representation of this object
287     */
288    public String toString() {
289        return "URL= " + getURL() + " ; mimetype= " + getMimeType() + " ; data= " + new String(getData());
290    }
291
292    /**
293     * Store a list of metadata entries to disk.
294     *
295     * @param metadata the given metadata
296     * @param destinationDir the directory to store the metadata.
297     */
298    public static void storeMetadataToDisk(List<MetadataEntry> metadata, File destinationDir) {
299        try {
300            for (MetadataEntry m : metadata) {
301                File mFile = new File(destinationDir, UUID.randomUUID().toString() + ".ser");
302                FileOutputStream fos = new FileOutputStream(mFile);
303                ObjectOutputStream out = new ObjectOutputStream(fos);
304                out.writeObject(m);
305                fos.close();
306            }
307        } catch (IOException e) {
308            throw new IOFailure("Unable to store metadata temporarily in directory ' "
309                    + destinationDir.getAbsolutePath() + "'", e);
310        }
311    }
312
313    /**
314     * Retrieve a list of serialized metadata entries on disk.
315     *
316     * @param sourceDir the directory where the metadata is stored.
317     * @return the list of deserialized MetadataEntry object.
318     */
319    public static List<MetadataEntry> getMetadataFromDisk(File sourceDir) {
320        List<MetadataEntry> metadata = new ArrayList<MetadataEntry>();
321        FilenameFilter filter = new FilenameFilter() {
322
323            @Override
324            public boolean accept(File dir, String name) {
325                if (name.endsWith(".ser")) {
326                    return true;
327                }
328                return false;
329            }
330        };
331
332        for (String file : sourceDir.list(filter)) {
333            File metadataEntryFile = new File(sourceDir, file);
334            try {
335                FileInputStream fileIn = new FileInputStream(metadataEntryFile);
336                ObjectInputStream in = new ObjectInputStream(fileIn);
337                MetadataEntry o = (MetadataEntry) in.readObject();
338                metadata.add(o);
339                in.close();
340                fileIn.close();
341            } catch (IOException e) {
342                throw new IOFailure("Unable to read the serialized metadata", e);
343            } catch (ClassNotFoundException e) {
344                throw new IllegalState("Unable to read the serialized metadata", e);
345            }
346        }
347        return metadata;
348    }
349
350}