001/* 002 * #%L 003 * Netarchivesuite - wayback 004 * %% 005 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library, 006 * the National Library of France and the Austrian National Library. 007 * %% 008 * This program is free software: you can redistribute it and/or modify 009 * it under the terms of the GNU Lesser General Public License as 010 * published by the Free Software Foundation, either version 2.1 of the 011 * License, or (at your option) any later version. 012 * 013 * This program is distributed in the hope that it will be useful, 014 * but WITHOUT ANY WARRANTY; without even the implied warranty of 015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 016 * GNU General Lesser Public License for more details. 017 * 018 * You should have received a copy of the GNU General Lesser Public 019 * License along with this program. If not, see 020 * <http://www.gnu.org/licenses/lgpl-2.1.html>. 021 * #L% 022 */ 023 024package dk.netarkivet.wayback; 025 026import java.io.File; 027import java.io.FileInputStream; 028import java.io.FileNotFoundException; 029import java.io.IOException; 030 031import dk.netarkivet.common.exceptions.ArgumentNotValid; 032import dk.netarkivet.wayback.batch.DeduplicateToCDXAdapter; 033 034/** 035 * A simple command line application to generate cdx files from local crawl-log files. 036 */ 037 038public class DeduplicateToCDXApplication { 039 040 /** 041 * Takes an array of file names (relative or full paths) of crawl.log files from which duplicate records are to be 042 * extracted. Writes the concatenated cdx files of all duplicate records in these files to standard out. An 043 * exception will be thrown if any of the files cannot be read for any reason or if the argument is null 044 * 045 * @param localCrawlLogs a list of file names 046 * @throws FileNotFoundException if one of the files cannot be found 047 */ 048 public void generateCDX(String[] localCrawlLogs) throws IOException { 049 ArgumentNotValid.checkNotNull(localCrawlLogs, "localCrawlLogs"); 050 DeduplicateToCDXAdapter adapter = new DeduplicateToCDXAdapter(); 051 for (String filename : localCrawlLogs) { 052 File file = new File(filename); 053 FileInputStream inputStream = new FileInputStream(file); 054 adapter.adaptStream(inputStream, System.out); 055 inputStream.close(); 056 } 057 } 058 059 /** 060 * An application to generate unsorted cdx files from duplicate records present in a crawl.log file. The only 061 * parameters are a list of file-paths. Output is written to standard out. 062 * 063 * @param args the file names (relative or absolute paths) 064 * @throws FileNotFoundException if one or more of the files does not exist 065 */ 066 public static void main(String[] args) throws IOException { 067 if (args.length == 0) { 068 System.err.println("No files specified on command line"); 069 System.err.println("Usage: java dk.netarkivet.wayback.DeduplicateToCDXApplication <files>"); 070 } else { 071 DeduplicateToCDXApplication app = new DeduplicateToCDXApplication(); 072 app.generateCDX(args); 073 } 074 075 } 076 077}