001/*
002 * #%L
003 * Netarchivesuite - common
004 * %%
005 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
006 *             the National Library of France and the Austrian National Library.
007 * %%
008 * This program is free software: you can redistribute it and/or modify
009 * it under the terms of the GNU Lesser General Public License as
010 * published by the Free Software Foundation, either version 2.1 of the
011 * License, or (at your option) any later version.
012 * 
013 * This program is distributed in the hope that it will be useful,
014 * but WITHOUT ANY WARRANTY; without even the implied warranty of
015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
016 * GNU General Lesser Public License for more details.
017 * 
018 * You should have received a copy of the GNU General Lesser Public
019 * License along with this program.  If not, see
020 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
021 * #L%
022 */
023
024package dk.netarkivet.common.utils.cdx;
025
026import java.io.IOException;
027import java.io.OutputStream;
028import java.util.regex.Pattern;
029
030import org.archive.io.arc.ARCRecord;
031
032import dk.netarkivet.common.Constants;
033import dk.netarkivet.common.exceptions.IOFailure;
034import dk.netarkivet.common.utils.arc.ARCBatchJob;
035
036/**
037 * Job to get cdx records out of metadata files.
038 */
039@SuppressWarnings({"serial"})
040public class GetCDXRecordsBatchJob extends ARCBatchJob {
041
042    /** The URL pattern used to retrieve the CDX-records. */
043    private final Pattern URLMatcher;
044    /** The MIME pattern used to retrieve the CDX-records. */
045    private final Pattern mimeMatcher;
046
047    /**
048     * Constructor.
049     */
050    public GetCDXRecordsBatchJob() {
051        URLMatcher = Pattern.compile(Constants.ALL_PATTERN);
052        mimeMatcher = Pattern.compile(Constants.CDX_MIME_PATTERN);
053        batchJobTimeout = 7 * Constants.ONE_DAY_IN_MILLIES;
054    }
055
056    /**
057     * Initialize job. Does nothing
058     *
059     * @param os The output stream (unused in this implementation)
060     */
061    public void initialize(OutputStream os) {
062    }
063
064    /**
065     * Process a single ARCRecord if the record contains cdx.
066     *
067     * @param sar The record we want to process
068     * @param os The output stream to write the result to
069     */
070    public void processRecord(ARCRecord sar, OutputStream os) {
071        if (URLMatcher.matcher(sar.getMetaData().getUrl()).matches()
072                && mimeMatcher.matcher(sar.getMetaData().getMimetype()).matches()) {
073            try {
074                try {
075                    byte[] buf = new byte[Constants.IO_BUFFER_SIZE];
076                    int bytesRead;
077                    while ((bytesRead = sar.read(buf)) != -1) {
078                        os.write(buf, 0, bytesRead);
079                    }
080                } finally {
081                    // TODO Should we close ARCRecord here???
082                    // if (is != null) {
083                    // is.close();
084                    // }
085                }
086            } catch (IOException e) {
087                String message = "Error writing body of ARC entry '" + sar.getMetaData().getArcFile() + "' offset '"
088                        + sar.getMetaData().getOffset() + "'";
089                throw new IOFailure(message, e);
090            }
091        }
092    }
093
094    /**
095     * Finish job. Does nothing
096     *
097     * @param os The Outputstream (unused in this implementation)
098     */
099    public void finish(OutputStream os) {
100    }
101
102}