001/*
002 * #%L
003 * Netarchivesuite - wayback
004 * %%
005 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
006 *             the National Library of France and the Austrian National Library.
007 * %%
008 * This program is free software: you can redistribute it and/or modify
009 * it under the terms of the GNU Lesser General Public License as
010 * published by the Free Software Foundation, either version 2.1 of the
011 * License, or (at your option) any later version.
012 * 
013 * This program is distributed in the hope that it will be useful,
014 * but WITHOUT ANY WARRANTY; without even the implied warranty of
015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
016 * GNU General Lesser Public License for more details.
017 * 
018 * You should have received a copy of the GNU General Lesser Public
019 * License along with this program.  If not, see
020 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
021 * #L%
022 */
023
024package dk.netarkivet.wayback;
025
026import java.io.File;
027import java.io.FileInputStream;
028import java.io.FileNotFoundException;
029import java.io.IOException;
030
031import dk.netarkivet.common.exceptions.ArgumentNotValid;
032import dk.netarkivet.wayback.batch.DeduplicateToCDXAdapter;
033
034/**
035 * A simple command line application to generate cdx files from local crawl-log files.
036 */
037
038public class DeduplicateToCDXApplication {
039
040    /**
041     * Takes an array of file names (relative or full paths) of crawl.log files from which duplicate records are to be
042     * extracted. Writes the concatenated cdx files of all duplicate records in these files to standard out. An
043     * exception will be thrown if any of the files cannot be read for any reason or if the argument is null
044     *
045     * @param localCrawlLogs a list of file names
046     * @throws FileNotFoundException if one of the files cannot be found
047     */
048    public void generateCDX(String[] localCrawlLogs) throws IOException {
049        ArgumentNotValid.checkNotNull(localCrawlLogs, "localCrawlLogs");
050        DeduplicateToCDXAdapter adapter = new DeduplicateToCDXAdapter();
051        for (String filename : localCrawlLogs) {
052            File file = new File(filename);
053            FileInputStream inputStream = new FileInputStream(file);
054            adapter.adaptStream(inputStream, System.out);
055            inputStream.close();
056        }
057    }
058
059    /**
060     * An application to generate unsorted cdx files from duplicate records present in a crawl.log file. The only
061     * parameters are a list of file-paths. Output is written to standard out.
062     *
063     * @param args the file names (relative or absolute paths)
064     * @throws FileNotFoundException if one or more of the files does not exist
065     */
066    public static void main(String[] args) throws IOException {
067        if (args.length == 0) {
068            System.err.println("No files specified on command line");
069            System.err.println("Usage: java dk.netarkivet.wayback.DeduplicateToCDXApplication <files>");
070        } else {
071            DeduplicateToCDXApplication app = new DeduplicateToCDXApplication();
072            app.generateCDX(args);
073        }
074
075    }
076
077}