001/* 002 * #%L 003 * Netarchivesuite - common 004 * %% 005 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library, 006 * the National Library of France and the Austrian National Library. 007 * %% 008 * This program is free software: you can redistribute it and/or modify 009 * it under the terms of the GNU Lesser General Public License as 010 * published by the Free Software Foundation, either version 2.1 of the 011 * License, or (at your option) any later version. 012 * 013 * This program is distributed in the hope that it will be useful, 014 * but WITHOUT ANY WARRANTY; without even the implied warranty of 015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 016 * GNU General Lesser Public License for more details. 017 * 018 * You should have received a copy of the GNU General Lesser Public 019 * License along with this program. If not, see 020 * <http://www.gnu.org/licenses/lgpl-2.1.html>. 021 * #L% 022 */ 023package dk.netarkivet.common.tools; 024 025import java.io.File; 026import java.util.ArrayList; 027import java.util.List; 028 029import dk.netarkivet.common.exceptions.IOFailure; 030import dk.netarkivet.common.utils.FileUtils; 031import dk.netarkivet.common.utils.batch.BatchLocalFiles; 032import dk.netarkivet.common.utils.cdx.WARCExtractCDXJob; 033 034/** 035 * Command line tool for extracting CDX information from given WARC files. 036 * <p> 037 * Usage: java dk.netarkivet.common.tools.ExtractCDX file1.ext [file2.ext ...] > myindex.cdx 038 * <p> 039 * "ext" can be warc or warc.gz 040 * <p> 041 * Note: Does not depend on logging - communicates failures on stderr. 042 */ 043public class WARCExtractCDX { 044 045 /** 046 * Main method. Extracts CDX from all given files and outputs the index on stdout. 047 * 048 * @param argv A list of (absolute paths to) files to index. 049 */ 050 public static void main(String[] argv) { 051 if (argv.length == 0) { 052 System.err.println("Missing parameter: " + "Must supply one or more WARC file(s) to be indexed"); 053 dieWithUsage(); 054 } 055 List<File> arcFiles = new ArrayList<File>(); 056 for (String arg : argv) { 057 File f = toArcFile(arg); 058 arcFiles.add(f); 059 } 060 File[] arcFileArray = arcFiles.toArray(new File[] {}); 061 BatchLocalFiles batchRunner = new BatchLocalFiles(arcFileArray); 062 batchRunner.run(new WARCExtractCDXJob(), System.out); 063 } 064 065 /** 066 * Verifies that the filename (absolute path) points to an existing file and that it is an arc or warc file. 067 * 068 * @param filename The filename to verify. 069 * @return The arc or warc file, as a File. 070 */ 071 private static File toArcFile(String filename) { 072 File f; 073 try { 074 f = FileUtils.makeValidFileFromExisting(filename).getAbsoluteFile(); 075 if (!FileUtils.WARCS_FILTER.accept(f.getParentFile(), f.getName())) { 076 dieWithError("Could not accept " + filename + ": was not an warc file"); 077 } 078 return f; 079 } catch (IOFailure e) { 080 dieWithError("Could not accept " + filename + ":" + e); 081 return null; // Compiler does not recognize System.exit() 082 } 083 } 084 085 /** 086 * Prints out a message on stderr and exits with an error code. 087 * 088 * @param msg The message to print. 089 */ 090 private static void dieWithError(String msg) { 091 System.err.println(msg); 092 System.err.println("Exiting - output is not OK"); 093 System.exit(1); 094 } 095 096 /** 097 * Prints out proper usage of this tool on stderr and exits with an error code. 098 */ 099 private static void dieWithUsage() { 100 System.err.println("Usage: java " + WARCExtractCDX.class.getName() + " file1.warc[.gz] [file2.warc[.gz] ...]"); 101 System.exit(1); 102 } 103 104}