001/*
002 * #%L
003 * Netarchivesuite - harvester
004 * %%
005 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
006 *             the National Library of France and the Austrian National Library.
007 * %%
008 * This program is free software: you can redistribute it and/or modify
009 * it under the terms of the GNU Lesser General Public License as
010 * published by the Free Software Foundation, either version 2.1 of the
011 * License, or (at your option) any later version.
012 * 
013 * This program is distributed in the hope that it will be useful,
014 * but WITHOUT ANY WARRANTY; without even the implied warranty of
015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
016 * GNU General Lesser Public License for more details.
017 * 
018 * You should have received a copy of the GNU General Lesser Public
019 * License along with this program.  If not, see
020 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
021 * #L%
022 */
023package dk.netarkivet.harvester.harvesting.metadata;
024
025import java.io.File;
026import java.io.FileInputStream;
027import java.io.FileOutputStream;
028import java.io.FilenameFilter;
029import java.io.IOException;
030import java.io.ObjectInputStream;
031import java.io.ObjectOutputStream;
032import java.io.Serializable;
033import java.util.ArrayList;
034import java.util.List;
035import java.util.UUID;
036import java.util.regex.Pattern;
037
038import dk.netarkivet.common.CommonSettings;
039import dk.netarkivet.common.exceptions.ArgumentNotValid;
040import dk.netarkivet.common.exceptions.IOFailure;
041import dk.netarkivet.common.exceptions.IllegalState;
042import dk.netarkivet.common.utils.Settings;
043import dk.netarkivet.common.utils.StringUtils;
044import dk.netarkivet.harvester.datamodel.AliasInfo;
045
046/**
047 * Class used to carry metadata in DoOneCrawl messages, including the URL and mimetype necessary to write the metadata
048 * to metadata (W)ARC files.
049 */
050@SuppressWarnings({"serial"})
051public class MetadataEntry implements Serializable {
052
053    /** The URL for this metadataEntry: Used as the unique identifier for this bit of metadata in the Netarchive. */
054    private String url;
055    /** The mimetype for this metadataEntry: Identifies which type of document this bit of metadata is. */
056    private String mimeType;
057
058    /** the metadata itself as byte array. */
059    private byte[] data;
060
061    /** Regular expression for a valid mimetype. */
062    private static final String MIMETYPE_REGEXP = "\\w+/\\w+";
063    /** The corresponding pattern for the regexp MIMETYPE_REGEXP. */
064    private static final Pattern MIMETYPE_PATTERN = Pattern.compile(MIMETYPE_REGEXP);
065
066    /**
067     * The url should be valid according to RFC 2396 This URL_REGEXP is taken from org.archive.util.SURT v. 1.12 1:
068     * scheme:// 2: userinfo (if present) 3: @ (if present) 4: host 5: :port 6: path
069     */
070    private static String URL_REGEXP = "^(\\w+://)(?:([-\\w\\.!~\\*'\\(\\)%;:&=+$,]+?)(@))?(\\S+?)(:\\d+)?(/\\settingsStructure*)?$";
071    // 1 2 3 4 5 6
072    /** The corresponding pattern for the regexp URL_REGEXP. */
073    private static final Pattern URL_PATTERN = Pattern.compile(URL_REGEXP);
074
075    /** Mimetype for metadata url. */
076    private static final String MIMETYPE_TEXT_PLAIN = "text/plain";
077
078    /** Suffix for both metadata URLs. */
079    private static final String METADATA_URL_SUFFIX = "?majorversion=1&minorversion=0&harvestid=%s&harvestnum=%s&jobid=%s";
080
081    /** Metadata URL template for aliases. */
082    private static final String ALIAS_METADATA_URL_TEMPLATE = "metadata://%s/crawl/setup/aliases" + METADATA_URL_SUFFIX;
083
084    /** Common template prefix for all deduplication metadata URLs. */
085    private static final String DUPLICATEREDUCTION_METADATA_URL_PREFIX_TEMPLATE = "metadata://%s/crawl/setup/duplicatereductionjobs";
086
087    /**
088     * Constructor for this class.
089     *
090     * @param url the URL assigned to this metadata (needed for it to be searchable)
091     * @param mimeType the mimeType for this metadata (normally text/plain or text/xml)
092     * @param data the metadata itself
093     * @throws ArgumentNotValid if arguments are null or empty strings, or if argument url is not valid URL or if
094     * argument mimeType is not valid MimeType
095     */
096    public MetadataEntry(String url, String mimeType, String data) {
097        ArgumentNotValid.checkNotNullOrEmpty(url, "url");
098        ArgumentNotValid.checkNotNullOrEmpty(mimeType, "mimetype");
099        ArgumentNotValid.checkNotNull(data, "data");
100        setURL(url); // Ensures this is a valid url
101        setMimetype(mimeType); // Ensures this is a valid mimetype
102        this.mimeType = mimeType;
103        this.data = data.getBytes();
104    }
105
106    /**
107     * Generate a MetadataEntry from a list of AliasInfo objects (VERSION 2) Expired aliases is skipped by this method.
108     *
109     * @param aliases the list of aliases (possibly empty)
110     * @param origHarvestDefinitionID The harvestdefinition that is behind the job with the given jobId
111     * @param harvestNum The number of the harvest that the job with the given jobid belongs to
112     * @param jobId The id of the Job, which this metadata belongs to
113     * @return null, if the list if empty (or only consists of expired aliases), otherwise returns a MetadataEntry from
114     * a list of AliasInfo objects containing unexpired aliases.
115     */
116    public static MetadataEntry makeAliasMetadataEntry(List<AliasInfo> aliases, Long origHarvestDefinitionID,
117            int harvestNum, Long jobId) {
118        ArgumentNotValid.checkNotNull(aliases, "aliases");
119        ArgumentNotValid.checkNotNull(origHarvestDefinitionID, "Long origHarvestDefinitionID");
120        ArgumentNotValid.checkNotNegative(harvestNum, "int harvestNum");
121        ArgumentNotValid.checkNotNull(jobId, "Long jobId");
122        if (aliases.isEmpty()) {
123            return null;
124        }
125        // Remove any expired aliases from the aliases collection
126        List<AliasInfo> nonExpiredAliases = new ArrayList<AliasInfo>();
127        for (AliasInfo alias : aliases) {
128            if (!alias.isExpired()) {
129                nonExpiredAliases.add(alias);
130            }
131        }
132        if (nonExpiredAliases.isEmpty()) {
133            return null;
134        }
135
136        String organization = Settings.get(CommonSettings.ORGANIZATION);
137        // construct metadata-URL for AliasMetadataEntry
138        String metadataUrl = String.format(ALIAS_METADATA_URL_TEMPLATE, organization, origHarvestDefinitionID,
139                harvestNum, jobId);
140
141        StringBuffer sb = new StringBuffer();
142        for (AliasInfo alias : nonExpiredAliases) {
143            sb.append(alias.getDomain()).append(" is an alias for ").append(alias.getAliasOf()).append("\n");
144        }
145        return new MetadataEntry(metadataUrl, MIMETYPE_TEXT_PLAIN, sb.toString());
146    }
147
148    /**
149     * Generate a MetadataEntry from a list of job ids for duplicate reduction.
150     *
151     * @param jobIDsForDuplicateReduction the list of jobids (possibly empty)
152     * @param origHarvestDefinitionID The harvestdefinition that is behind the job with the given jobId
153     * @param harvestNum The number of the harvest that the job with the given jobid belongs to
154     * @param jobId The id of the Job, which this metadata belongs to
155     * @return null, if the list is empty, otherwise returns a MetadataEntry from the list of jobids.
156     */
157    public static MetadataEntry makeDuplicateReductionMetadataEntry(List<Long> jobIDsForDuplicateReduction,
158            Long origHarvestDefinitionID, int harvestNum, Long jobId) {
159        ArgumentNotValid.checkNotNull(jobIDsForDuplicateReduction, "List<Long> jobIDsForDuplicateReduction");
160        ArgumentNotValid.checkNotNull(origHarvestDefinitionID, "Long origHarvestDefinitionID");
161        ArgumentNotValid.checkNotNegative(harvestNum, "int harvestNum");
162        ArgumentNotValid.checkNotNull(jobId, "Long jobId");
163
164        String organization = Settings.get(CommonSettings.ORGANIZATION);
165        // construct a metadata-URL for this MetadataEntry
166        String metadataUrl = String.format(DUPLICATEREDUCTION_METADATA_URL_PREFIX_TEMPLATE + METADATA_URL_SUFFIX,
167                organization, origHarvestDefinitionID, harvestNum, jobId);
168
169        return new MetadataEntry(metadataUrl, MIMETYPE_TEXT_PLAIN,
170                StringUtils.conjoin(",", jobIDsForDuplicateReduction));
171    }
172
173    /**
174     * @return Returns the data.
175     */
176    public byte[] getData() {
177        return data;
178    }
179
180    /**
181     * @return Returns the mimeType.
182     */
183    public String getMimeType() {
184        return mimeType;
185    }
186
187    /**
188     * Set the mimetype for this object.
189     *
190     * @param mimetype a given mimetype
191     * @throws ArgumentNotValid if the mimetype is not valid
192     */
193    private void setMimetype(String mimetype) {
194        if (isMimetypeValid(mimetype)) {
195            this.mimeType = mimetype;
196        } else {
197            throw new ArgumentNotValid("The given MimeType is not valid: " + mimetype);
198        }
199    }
200
201    /**
202     * @return Returns the URL
203     */
204    public String getURL() {
205        return url;
206    }
207
208    /**
209     * Set the url for this object.
210     *
211     * @param aUrl a given URL
212     * @throws ArgumentNotValid if the URL is not valid
213     */
214    private void setURL(String aUrl) {
215        if (isURLValid(aUrl)) {
216            this.url = aUrl;
217        } else {
218            throw new ArgumentNotValid("The given URL is not valid: " + aUrl);
219        }
220    }
221
222    /**
223     * Method needed to de-serializable an object of this class.
224     *
225     * @param s the given ObjectInputStream
226     * @throws ClassNotFoundException If the class of the serialized object could not be found
227     * @throws IOException If an I/O error occurred while reading the serialized object
228     */
229    private void readObject(ObjectInputStream s) throws ClassNotFoundException, IOException {
230        s.defaultReadObject();
231    }
232
233    /**
234     * Method needed to serializable an object of this class.
235     *
236     * @param s the given ObjectOutputStream
237     * @throws IOException If an I/O error occurred while writing to the outputstream
238     */
239    private void writeObject(ObjectOutputStream s) throws IOException {
240        s.defaultWriteObject();
241    }
242
243    /**
244     * Utility method for testing the validity of the mimetype. We need do this, because the ARCWriter does not do this
245     * check properly
246     *
247     * @param mimetype the given mimetype
248     * @return true, if the mimetype match the pattern: \\w+/\\w+
249     */
250    private static boolean isMimetypeValid(String mimetype) {
251        return MIMETYPE_PATTERN.matcher(mimetype).matches();
252    }
253
254    /**
255     * Utility method for testing the validity of the URL. We need do this, because the ARCWriter does not do this check
256     * properly.
257     *
258     * @param url the given URL
259     * @return true, if the URL match the pattern:
260     */
261    private static boolean isURLValid(String url) {
262        return URL_PATTERN.matcher(url).matches();
263    }
264
265    /**
266     * Checks, if this is a duplicate reduction MetadataEntry.
267     *
268     * @return true, if this is a duplicate reduction MetadataEntry, otherwise false.
269     */
270    public boolean isDuplicateReductionMetadataEntry() {
271        return this.getURL().startsWith(MetadataEntry.getDuplicatereductionMetadataURLPrefix());
272    }
273
274    private static String getDuplicatereductionMetadataURLPrefix() {
275        String organization = Settings.get(CommonSettings.ORGANIZATION);
276        return String.format(DUPLICATEREDUCTION_METADATA_URL_PREFIX_TEMPLATE, organization);
277    }
278
279    /**
280     * @return a string representation of this object
281     */
282    public String toString() {
283        return "URL= " + getURL() + " ; mimetype= " + getMimeType() + " ; data= " + new String(getData());
284    }
285
286    /**
287     * Store a list of metadata entries to disk.
288     *
289     * @param metadata the given metadata
290     * @param destinationDir the directory to store the metadata.
291     */
292    public static void storeMetadataToDisk(List<MetadataEntry> metadata, File destinationDir) {
293        try {
294            for (MetadataEntry m : metadata) {
295                File mFile = new File(destinationDir, UUID.randomUUID().toString() + ".ser");
296                FileOutputStream fos = new FileOutputStream(mFile);
297                ObjectOutputStream out = new ObjectOutputStream(fos);
298                out.writeObject(m);
299                fos.close();
300            }
301        } catch (IOException e) {
302            throw new IOFailure("Unable to store metadata temporarily in directory ' "
303                    + destinationDir.getAbsolutePath() + "'", e);
304        }
305    }
306
307    /**
308     * Retrieve a list of serialized metadata entries on disk.
309     *
310     * @param sourceDir the directory where the metadata is stored.
311     * @return the list of deserialized MetadataEntry object.
312     */
313    public static List<MetadataEntry> getMetadataFromDisk(File sourceDir) {
314        List<MetadataEntry> metadata = new ArrayList<MetadataEntry>();
315        FilenameFilter filter = new FilenameFilter() {
316
317            @Override
318            public boolean accept(File dir, String name) {
319                if (name.endsWith(".ser")) {
320                    return true;
321                }
322                return false;
323            }
324        };
325
326        for (String file : sourceDir.list(filter)) {
327            File metadataEntryFile = new File(sourceDir, file);
328            try {
329                FileInputStream fileIn = new FileInputStream(metadataEntryFile);
330                ObjectInputStream in = new ObjectInputStream(fileIn);
331                MetadataEntry o = (MetadataEntry) in.readObject();
332                metadata.add(o);
333                in.close();
334                fileIn.close();
335            } catch (IOException e) {
336                throw new IOFailure("Unable to read the serialized metadata", e);
337            } catch (ClassNotFoundException e) {
338                throw new IllegalState("Unable to read the serialized metadata", e);
339            }
340        }
341        return metadata;
342    }
343
344}