001/*
002 * #%L
003 * Netarchivesuite - wayback
004 * %%
005 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
006 *             the National Library of France and the Austrian National Library.
007 * %%
008 * This program is free software: you can redistribute it and/or modify
009 * it under the terms of the GNU Lesser General Public License as
010 * published by the Free Software Foundation, either version 2.1 of the
011 * License, or (at your option) any later version.
012 * 
013 * This program is distributed in the hope that it will be useful,
014 * but WITHOUT ANY WARRANTY; without even the implied warranty of
015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
016 * GNU General Lesser Public License for more details.
017 * 
018 * You should have received a copy of the GNU General Lesser Public
019 * License along with this program.  If not, see
020 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
021 * #L%
022 */
023package dk.netarkivet.wayback.batch;
024
025import java.io.InputStream;
026import java.io.OutputStream;
027
/**
 * Contract for adapters that turn the duplicate entries of a crawl log into cdx records which wayback can use.
 */
public interface DeduplicateToCDXAdapterInterface {

    /**
     * Converts a single crawl-log line describing a deduplicated record into the corresponding cdx-file line, so that
     * the record can be searched for in wayback. As part of the conversion the target url of the line is
     * canonicalized; which canonicalization is applied is decided by the default canonicalizer given in the wayback
     * settings.xml file.
     *
     * @param line one line read from a crawl log
     * @return the matching cdx-file line, or null when the given line is not a crawl-log duplicate line
     */
    String adaptLine(String line);

    /**
     * Reads crawl-log content from an input stream and, for every line that carries deduplicate information, emits
     * the resulting cdx record on the output stream.
     *
     * @param is the stream the crawl log is read from
     * @param os the stream the cdx records are written to
     */
    void adaptStream(InputStream is, OutputStream os);

}