001/* 002 * #%L 003 * Netarchivesuite - wayback 004 * %% 005 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library, 006 * the National Library of France and the Austrian National Library. 007 * %% 008 * This program is free software: you can redistribute it and/or modify 009 * it under the terms of the GNU Lesser General Public License as 010 * published by the Free Software Foundation, either version 2.1 of the 011 * License, or (at your option) any later version. 012 * 013 * This program is distributed in the hope that it will be useful, 014 * but WITHOUT ANY WARRANTY; without even the implied warranty of 015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 016 * GNU General Lesser Public License for more details. 017 * 018 * You should have received a copy of the GNU General Lesser Public 019 * License along with this program. If not, see 020 * <http://www.gnu.org/licenses/lgpl-2.1.html>. 021 * #L% 022 */ 023 024package dk.netarkivet.wayback.batch; 025 026import java.io.OutputStream; 027import java.util.regex.Pattern; 028 029import dk.netarkivet.common.utils.archive.ArchiveBatchJob; 030import dk.netarkivet.common.utils.archive.ArchiveRecordBase; 031 032/** 033 * This batch batch job takes deduplication records from a crawl log in a metadata arcfile and converts them to cdx 034 * records for use in wayback. 035 */ 036@SuppressWarnings({"serial"}) 037public class DeduplicationCDXExtractionBatchJob extends ArchiveBatchJob { 038 039 /** 040 * A utility which has methods for converting a deduplicate crawl-log entry to a CDX entry. 041 */ 042 private DeduplicateToCDXAdapter adapter; 043 044 /** 045 * A regular expression representing the url in a metadata arcfile of a crawl log entry. 046 */ 047 private static final String CRAWL_LOG_URL_PATTERN_STRING = "metadata://(.*)crawl[.]log(.*)"; 048 049 /** 050 * A Pattern representing a compiled expression representing the url in a metadata arcfile of a crawl log entry. 051 */ 052 private Pattern crawlLogUrlPattern; 053 054 /** 055 * Initializes various fields of this class. 056 * 057 * @param os unused parameter 058 */ 059 @Override 060 public void initialize(OutputStream os) { 061 adapter = new DeduplicateToCDXAdapter(); 062 crawlLogUrlPattern = Pattern.compile(CRAWL_LOG_URL_PATTERN_STRING); 063 } 064 065 /** 066 * If the ArchiveRecord is a crawl-log entry then any duplicate entries in the crawl log are converted to CDX 067 * entries and written to the output. Otherwise this method returns without doing anything. If the ArchiveRecord is 068 * a WarcRecord, and the record is the warcinfo, the record is skipped. 069 * 070 * @param record The ArchiveRecord to be processed 071 * @param os the stream to which output is written 072 */ 073 @Override 074 public void processRecord(ArchiveRecordBase record, OutputStream os) { 075 if (record.bIsWarc && record.getHeader().getHeaderStringValue("warc-type").equalsIgnoreCase("warcinfo")) { 076 // Skip the warc-info record 077 return; 078 } 079 if (crawlLogUrlPattern.matcher(record.getHeader().getUrl()).matches()) { 080 adapter.adaptStream(record.getInputStream(), os); 081 } else { 082 return; 083 } 084 } 085 086 /** 087 * Does nothing. 088 * 089 * @param os an outputstream 090 */ 091 @Override 092 public void finish(OutputStream os) { 093 // Nothing to finalise 094 } 095 096}