001/*
002 * #%L
003 * Netarchivesuite - harvester
004 * %%
005 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
006 *             the National Library of France and the Austrian National Library.
007 * %%
008 * This program is free software: you can redistribute it and/or modify
009 * it under the terms of the GNU Lesser General Public License as
010 * published by the Free Software Foundation, either version 2.1 of the
011 * License, or (at your option) any later version.
012 * 
013 * This program is distributed in the hope that it will be useful,
014 * but WITHOUT ANY WARRANTY; without even the implied warranty of
015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
016 * GNU General Lesser Public License for more details.
017 * 
018 * You should have received a copy of the GNU General Lesser Public
019 * License along with this program.  If not, see
020 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
021 * #L%
022 */
023package dk.netarkivet.common.utils.archive;
024
025import java.io.IOException;
026import java.io.InputStream;
027import java.io.OutputStream;
028import java.util.regex.Pattern;
029
030import org.slf4j.Logger;
031import org.slf4j.LoggerFactory;
032
033import dk.netarkivet.common.Constants;
034import dk.netarkivet.common.exceptions.IOFailure;
035
036/** A batch job that extracts metadata. */
037@SuppressWarnings({"serial"})
038public class GetMetadataArchiveBatchJob extends ArchiveBatchJob {
039
040    /** The logger for this class. */
041    private static final Logger log = LoggerFactory.getLogger(GetMetadataArchiveBatchJob.class);
042
043    /** The pattern for matching the urls. */
044    private final Pattern urlMatcher;
045    /** The pattern for the mimetype matcher. */
046    private final Pattern mimeMatcher;
047
048    /**
049     * Constructor.
050     *
051     * @param urlMatcher A pattern for matching URLs of the desired entries. If null, a .* pattern will be used.
052     * @param mimeMatcher A pattern for matching mime-types of the desired entries. If null, a .* pattern will be used.
053     * <p>
054     * The batchJobTimeout is set to one day.
055     */
056    public GetMetadataArchiveBatchJob(Pattern urlMatcher, Pattern mimeMatcher) {
057        this.urlMatcher = urlMatcher;
058        this.mimeMatcher = mimeMatcher;
059
060        batchJobTimeout = Constants.ONE_DAY_IN_MILLIES;
061    }
062
063    /**
064     * Initialize method. Run before the arc-records are being processed. Currently does nothing.
065     *
066     * @param os The output stream to print any pre-processing data.
067     */
068    @Override
069    public void initialize(OutputStream os) {
070    }
071
072    /**
073     * The method for processing the arc-records.
074     *
075     * @param record The arc-record to process.
076     * @param os The output stream to write the results of the processing.
077     * @throws IOFailure In an IOException is caught during handling of the arc record.
078     */
079    @Override
080    public void processRecord(ArchiveRecordBase record, OutputStream os) throws IOFailure {
081        ArchiveHeaderBase header = record.getHeader();
082        InputStream in = record.getInputStream();
083
084        if (header.getUrl() == null) {
085            return;
086        }
087        log.info(header.getUrl() + " - " + header.getMimetype());
088        if (urlMatcher.matcher(header.getUrl()).matches() && mimeMatcher.matcher(header.getMimetype()).matches()) {
089            try {
090                byte[] buf = new byte[Constants.IO_BUFFER_SIZE];
091                int bytesRead;
092                while ((bytesRead = in.read(buf)) != -1) {
093                    os.write(buf, 0, bytesRead);
094                }
095            } catch (IOException e) {
096                // TODO is getOffset() correct using the IA archiveReader?
097                String message = "Error writing body of Archive entry '" + header.getArchiveFile() + "' offset '"
098                        + header.getOffset() + "'";
099                throw new IOFailure(message, e);
100            }
101        }
102
103        try {
104            in.close();
105        } catch (IOException e) {
106            String message = "Error closing Archive input stream";
107            throw new IOFailure(message, e);
108        }
109    }
110
111    /**
112     * Method for post-processing the data. Currently does nothing.
113     *
114     * @param os The output stream to write the results of the post-processing data.
115     */
116    @Override
117    public void finish(OutputStream os) {
118    }
119
120    /**
121     * Humanly readable description of this instance.
122     *
123     * @return The human readable description of this instance.
124     */
125    @Override
126    public String toString() {
127        return getClass().getName() + ", with arguments: URLMatcher = " + urlMatcher + ", mimeMatcher = " + mimeMatcher;
128    }
129
130}