Source code

001/*
002 * #%L
003 * Netarchivesuite - wayback
004 * %%
005 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
006 *             the National Library of France and the Austrian National Library.
007 * %%
008 * This program is free software: you can redistribute it and/or modify
009 * it under the terms of the GNU Lesser General Public License as
010 * published by the Free Software Foundation, either version 2.1 of the
011 * License, or (at your option) any later version.
012 * 
013 * This program is distributed in the hope that it will be useful,
014 * but WITHOUT ANY WARRANTY; without even the implied warranty of
015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
016 * GNU General Lesser Public License for more details.
017 * 
018 * You should have received a copy of the GNU General Lesser Public
019 * License along with this program.  If not, see
020 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
021 * #L%
022 */
023
024package dk.netarkivet.wayback.batch;
025
026import java.io.OutputStream;
027import java.util.regex.Pattern;
028
029import dk.netarkivet.common.utils.archive.ArchiveBatchJob;
030import dk.netarkivet.common.utils.archive.ArchiveRecordBase;
031
032/**
033 * This batch batch job takes deduplication records from a crawl log in a metadata arcfile and converts them to cdx
034 * records for use in wayback.
035 */
036@SuppressWarnings({"serial"})
037public class DeduplicationCDXExtractionBatchJob extends ArchiveBatchJob {
038
039    /**
040     * A utility which has methods for converting a deduplicate crawl-log entry to a CDX entry.
041     */
042    private DeduplicateToCDXAdapter adapter;
043
044    /**
045     * A regular expression representing the url in a metadata arcfile of a crawl log entry.
046     */
047    private static final String CRAWL_LOG_URL_PATTERN_STRING = "metadata://(.*)crawl[.]log(.*)";
048
049    /**
050     * A Pattern representing a compiled expression representing the url in a metadata arcfile of a crawl log entry.
051     */
052    private Pattern crawlLogUrlPattern;
053
054    /**
055     * Initializes various fields of this class.
056     *
057     * @param os unused parameter
058     */
059    @Override
060    public void initialize(OutputStream os) {
061        adapter = new DeduplicateToCDXAdapter();
062        crawlLogUrlPattern = Pattern.compile(CRAWL_LOG_URL_PATTERN_STRING);
063    }
064
065    /**
066     * If the ArchiveRecord is a crawl-log entry then any duplicate entries in the crawl log are converted to CDX
067     * entries and written to the output. Otherwise this method returns without doing anything. If the ArchiveRecord is
068     * a WarcRecord, and the record is the warcinfo, the record is skipped.
069     *
070     * @param record The ArchiveRecord to be processed
071     * @param os the stream to which output is written
072     */
073    @Override
074    public void processRecord(ArchiveRecordBase record, OutputStream os) {
075        if (record.bIsWarc && record.getHeader().getHeaderStringValue("warc-type").equalsIgnoreCase("warcinfo")) {
076            // Skip the warc-info record
077            return;
078        }
079        if (crawlLogUrlPattern.matcher(record.getHeader().getUrl()).matches()) {
080            adapter.adaptStream(record.getInputStream(), os);
081        } else {
082            return;
083        }
084    }
085
086    /**
087     * Does nothing.
088     *
089     * @param os an outputstream
090     */
091    @Override
092    public void finish(OutputStream os) {
093        // Nothing to finalise
094    }
095
096}