001/* 002 * #%L 003 * Netarchivesuite - wayback 004 * %% 005 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library, 006 * the National Library of France and the Austrian National Library. 007 * %% 008 * This program is free software: you can redistribute it and/or modify 009 * it under the terms of the GNU Lesser General Public License as 010 * published by the Free Software Foundation, either version 2.1 of the 011 * License, or (at your option) any later version. 012 * 013 * This program is distributed in the hope that it will be useful, 014 * but WITHOUT ANY WARRANTY; without even the implied warranty of 015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 016 * GNU General Lesser Public License for more details. 017 * 018 * You should have received a copy of the GNU General Lesser Public 019 * License along with this program. If not, see 020 * <http://www.gnu.org/licenses/lgpl-2.1.html>. 021 * #L% 022 */ 023package dk.netarkivet.wayback.batch; 024 025import java.io.InputStream; 026import java.io.OutputStream; 027 028/** 029 * Interface describing a class which can be used to convert duplicate records in a crawl log to wayback-compatible cdx 030 * records. 031 */ 032public interface DeduplicateToCDXAdapterInterface { 033 034 /** 035 * Takes a deduplicate line from a crawl log and converts it to a line in a cdx file suitable for searching in 036 * wayback. The target url in the line is canonicalized by this method. The type of canonicalization is determined 037 * by the default canonicalizer from the wayback settings.xml file. If the input String is not a crawl-log duplicate 038 * line, null is returned. 039 * 040 * @param line a line from a crawl log 041 * @return a line for a cdx file or null if the input is not a duplicate line 042 */ 043 String adaptLine(String line); 044 045 /** 046 * Scans an input stream from a crawl log and converts all lines containing deduplicate information to cdx records 047 * which it outputs to an output stream. 048 * 049 * @param is the input stream 050 * @param os the output stream 051 */ 052 void adaptStream(InputStream is, OutputStream os); 053 054}