001/*
002 * #%L
003 * Netarchivesuite - common
004 * %%
005 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
006 *             the National Library of France and the Austrian National Library.
007 * %%
008 * This program is free software: you can redistribute it and/or modify
009 * it under the terms of the GNU Lesser General Public License as
010 * published by the Free Software Foundation, either version 2.1 of the
011 * License, or (at your option) any later version.
012 * 
013 * This program is distributed in the hope that it will be useful,
014 * but WITHOUT ANY WARRANTY; without even the implied warranty of
015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
016 * GNU General Lesser Public License for more details.
017 * 
018 * You should have received a copy of the GNU General Lesser Public
019 * License along with this program.  If not, see
020 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
021 * #L%
022 */
023
024package dk.netarkivet.common.utils.cdx;
025
026import java.io.File;
027import java.io.FileOutputStream;
028import java.io.OutputStream;
029import java.util.HashMap;
030import java.util.Map;
031
032import org.slf4j.Logger;
033import org.slf4j.LoggerFactory;
034
035import dk.netarkivet.common.exceptions.ArgumentNotValid;
036import dk.netarkivet.common.utils.ExceptionUtils;
037import dk.netarkivet.common.utils.FileUtils;
038import dk.netarkivet.common.utils.archive.ArchiveProfile;
039import dk.netarkivet.common.utils.batch.BatchLocalFiles;
040
041/**
042 * Utility class for creating CDX-files. The CDX-format is described here:
043 * http://www.archive.org/web/researcher/cdx_file_format.php
044 */
045public class CDXUtils {
046
047    /** The logger. */
048    private static final Logger log = LoggerFactory.getLogger(CDXUtils.class);
049
050    /**
051     * Add cdx info for a given archive file to a given OutputStream. Note, any exceptions are logged on level FINE but
052     * otherwise ignored.
053     *
054     * @param archivefile A file with archive records
055     * @param cdxstream An output stream to add CDX lines to
056     */
057    public static void writeCDXInfo(File archivefile, OutputStream cdxstream) {
058        ArchiveExtractCDXJob job = new ArchiveExtractCDXJob();
059        BatchLocalFiles runner = new BatchLocalFiles(new File[] {archivefile});
060        runner.run(job, cdxstream);
061        log.trace("Created index for {} records on file '{}'", job.noOfRecordsProcessed(), archivefile);
062        Exception[] exceptions = job.getExceptionArray();
063        if (exceptions.length > 0) {
064            StringBuilder msg = new StringBuilder();
065            for (Exception e : exceptions) {
066                msg.append(ExceptionUtils.getStackTrace(e));
067                msg.append('\n');
068            }
069            log.debug("Exceptions during generation of index on file '{}': {}", archivefile, msg.toString());
070        }
071        log.debug("Created index of {} records on file '{}'", job.noOfRecordsProcessed(), archivefile);
072    }
073
074    /**
075     * Applies createCDXRecord() to all ARC/WARC files in a directory, creating one CDX file per ARC/WARC file. Note,
076     * any exceptions during index generation are logged at level FINE but otherwise ignored. Exceptions creating any
077     * cdx file are logged at level WARNING but otherwise ignored. CDX files are named as the ARC/WARC files except
078     * ".(w)arc" or ".(w)arc.gz" is extended with ".cdx"
079     *
080     * @param archiveProfile archive profile including filters, patterns, etc.
081     * @param archiveFileDirectory A directory with archive files to generate index for
082     * @param cdxFileDirectory A directory to generate CDX files in
083     * @throws ArgumentNotValid if any of directories are null or is not an existing directory, or if cdxFileDirectory
084     * is not writable.
085     */
086    public static void generateCDX(ArchiveProfile archiveProfile, File archiveFileDirectory, File cdxFileDirectory)
087            throws ArgumentNotValid {
088        ArgumentNotValid.checkNotNull(archiveProfile, "ArchiveProfile archiveProfile");
089        ArgumentNotValid.checkNotNull(archiveFileDirectory, "File archiveFileDirectory");
090        ArgumentNotValid.checkNotNull(cdxFileDirectory, "File cdxFileDirectory");
091        if (!archiveFileDirectory.isDirectory() || !archiveFileDirectory.canRead()) {
092            throw new ArgumentNotValid("The directory for arc files '" + archiveFileDirectory
093                    + "' is not a readable directory");
094        }
095        if (!cdxFileDirectory.isDirectory() || !cdxFileDirectory.canWrite()) {
096            throw new ArgumentNotValid("The directory for cdx files '" + archiveFileDirectory
097                    + "' is not a writable directory");
098        }
099        Map<File, Exception> exceptions = new HashMap<File, Exception>();
100        File[] filesToProcess = archiveFileDirectory.listFiles(archiveProfile.filename_filter);
101        if (filesToProcess.length == 0) {
102            log.warn("Found no related arcfiles to process in the archive dir '{}'.",
103                    archiveFileDirectory.getAbsolutePath());
104        } else {
105            log.debug("Found {} related arcfiles to process in the archive dir '{}'.", filesToProcess.length,
106                    archiveFileDirectory.getAbsolutePath());
107        }
108        for (File arcfile : filesToProcess) {
109            File cdxfile = new File(cdxFileDirectory, arcfile.getName() + FileUtils.CDX_EXTENSION);
110            try {
111                OutputStream cdxstream = null;
112                try {
113                    cdxstream = new FileOutputStream(cdxfile);
114                    writeCDXInfo(arcfile, cdxstream);
115                } finally {
116                    if (cdxstream != null) {
117                        cdxstream.close();
118                    }
119                }
120            } catch (Exception e) {
121                exceptions.put(cdxfile, e);
122            }
123        }
124        // Log any errors
125        if (exceptions.size() > 0) {
126            StringBuilder errorMsg = new StringBuilder("Exceptions during cdx file generation:\n");
127            for (Map.Entry<File, Exception> fileException : exceptions.entrySet()) {
128                errorMsg.append("Could not create cdxfile '");
129                errorMsg.append(fileException.getKey().getAbsolutePath());
130                errorMsg.append("':\n");
131                errorMsg.append(ExceptionUtils.getStackTrace(fileException.getValue()));
132                errorMsg.append('\n');
133            }
134            log.debug(errorMsg.toString());
135        }
136    }
137
138}