001/*
002 * #%L
003 * Netarchivesuite - common
004 * %%
005 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
006 *             the National Library of France and the Austrian National Library.
007 * %%
008 * This program is free software: you can redistribute it and/or modify
009 * it under the terms of the GNU Lesser General Public License as
010 * published by the Free Software Foundation, either version 2.1 of the
011 * License, or (at your option) any later version.
012 * 
013 * This program is distributed in the hope that it will be useful,
014 * but WITHOUT ANY WARRANTY; without even the implied warranty of
015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
016 * GNU General Lesser Public License for more details.
017 * 
018 * You should have received a copy of the GNU General Lesser Public
019 * License along with this program.  If not, see
020 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
021 * #L%
022 */
023package dk.netarkivet.common.tools;
024
025import java.io.File;
026import java.util.ArrayList;
027import java.util.List;
028
029import dk.netarkivet.common.exceptions.IOFailure;
030import dk.netarkivet.common.utils.FileUtils;
031import dk.netarkivet.common.utils.batch.BatchLocalFiles;
032import dk.netarkivet.common.utils.cdx.WARCExtractCDXJob;
033
034/**
035 * Command line tool for extracting CDX information from given WARC files.
036 * <p>
037 * Usage: java dk.netarkivet.common.tools.ExtractCDX file1.ext [file2.ext ...] > myindex.cdx
038 * <p>
039 * "ext" can be warc or warc.gz
040 * <p>
041 * Note: Does not depend on logging - communicates failures on stderr.
042 */
043public class WARCExtractCDX {
044
045    /**
046     * Main method. Extracts CDX from all given files and outputs the index on stdout.
047     *
048     * @param argv A list of (absolute paths to) files to index.
049     */
050    public static void main(String[] argv) {
051        if (argv.length == 0) {
052            System.err.println("Missing parameter: " + "Must supply one or more WARC file(s) to be indexed");
053            dieWithUsage();
054        }
055        List<File> arcFiles = new ArrayList<File>();
056        for (String arg : argv) {
057            File f = toArcFile(arg);
058            arcFiles.add(f);
059        }
060        File[] arcFileArray = arcFiles.toArray(new File[] {});
061        BatchLocalFiles batchRunner = new BatchLocalFiles(arcFileArray);
062        batchRunner.run(new WARCExtractCDXJob(), System.out);
063    }
064
065    /**
066     * Verifies that the filename (absolute path) points to an existing file and that it is an arc or warc file.
067     *
068     * @param filename The filename to verify.
069     * @return The arc or warc file, as a File.
070     */
071    private static File toArcFile(String filename) {
072        File f;
073        try {
074            f = FileUtils.makeValidFileFromExisting(filename).getAbsoluteFile();
075            if (!FileUtils.WARCS_FILTER.accept(f.getParentFile(), f.getName())) {
076                dieWithError("Could not accept " + filename + ": was not an warc file");
077            }
078            return f;
079        } catch (IOFailure e) {
080            dieWithError("Could not accept " + filename + ":" + e);
081            return null; // Compiler does not recognize System.exit()
082        }
083    }
084
085    /**
086     * Prints out a message on stderr and exits with an error code.
087     *
088     * @param msg The message to print.
089     */
090    private static void dieWithError(String msg) {
091        System.err.println(msg);
092        System.err.println("Exiting - output is not OK");
093        System.exit(1);
094    }
095
096    /**
097     * Prints out proper usage of this tool on stderr and exits with an error code.
098     */
099    private static void dieWithUsage() {
100        System.err.println("Usage: java " + WARCExtractCDX.class.getName() + " file1.warc[.gz] [file2.warc[.gz] ...]");
101        System.exit(1);
102    }
103
104}