001/* 002 * #%L 003 * Netarchivesuite - common 004 * %% 005 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library, 006 * the National Library of France and the Austrian National Library. 007 * %% 008 * This program is free software: you can redistribute it and/or modify 009 * it under the terms of the GNU Lesser General Public License as 010 * published by the Free Software Foundation, either version 2.1 of the 011 * License, or (at your option) any later version. 012 * 013 * This program is distributed in the hope that it will be useful, 014 * but WITHOUT ANY WARRANTY; without even the implied warranty of 015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 016 * GNU General Lesser Public License for more details. 017 * 018 * You should have received a copy of the GNU General Lesser Public 019 * License along with this program. If not, see 020 * <http://www.gnu.org/licenses/lgpl-2.1.html>. 021 * #L% 022 */ 023 024package dk.netarkivet.common.utils.cdx; 025 026import java.io.File; 027import java.io.FileOutputStream; 028import java.io.OutputStream; 029import java.util.HashMap; 030import java.util.Map; 031 032import org.slf4j.Logger; 033import org.slf4j.LoggerFactory; 034 035import dk.netarkivet.common.exceptions.ArgumentNotValid; 036import dk.netarkivet.common.utils.ExceptionUtils; 037import dk.netarkivet.common.utils.FileUtils; 038import dk.netarkivet.common.utils.archive.ArchiveProfile; 039import dk.netarkivet.common.utils.batch.BatchLocalFiles; 040 041/** 042 * Utility class for creating CDX-files. The CDX-format is described here: 043 * http://www.archive.org/web/researcher/cdx_file_format.php 044 */ 045public class CDXUtils { 046 047 /** The logger. */ 048 private static final Logger log = LoggerFactory.getLogger(CDXUtils.class); 049 050 /** 051 * Add cdx info for a given archive file to a given OutputStream. Note, any exceptions are logged on level FINE but 052 * otherwise ignored. 053 * 054 * @param archivefile A file with archive records 055 * @param cdxstream An output stream to add CDX lines to 056 */ 057 public static void writeCDXInfo(File archivefile, OutputStream cdxstream) { 058 ArchiveExtractCDXJob job = new ArchiveExtractCDXJob(); 059 BatchLocalFiles runner = new BatchLocalFiles(new File[] {archivefile}); 060 runner.run(job, cdxstream); 061 log.trace("Created index for {} records on file '{}'", job.noOfRecordsProcessed(), archivefile); 062 Exception[] exceptions = job.getExceptionArray(); 063 if (exceptions.length > 0) { 064 StringBuilder msg = new StringBuilder(); 065 for (Exception e : exceptions) { 066 msg.append(ExceptionUtils.getStackTrace(e)); 067 msg.append('\n'); 068 } 069 log.debug("Exceptions during generation of index on file '{}': {}", archivefile, msg.toString()); 070 } 071 log.debug("Created index of {} records on file '{}'", job.noOfRecordsProcessed(), archivefile); 072 } 073 074 /** 075 * Applies createCDXRecord() to all ARC/WARC files in a directory, creating one CDX file per ARC/WARC file. Note, 076 * any exceptions during index generation are logged at level FINE but otherwise ignored. Exceptions creating any 077 * cdx file are logged at level WARNING but otherwise ignored. CDX files are named as the ARC/WARC files except 078 * ".(w)arc" or ".(w)arc.gz" is extended with ".cdx" 079 * 080 * @param archiveProfile archive profile including filters, patterns, etc. 081 * @param archiveFileDirectory A directory with archive files to generate index for 082 * @param cdxFileDirectory A directory to generate CDX files in 083 * @throws ArgumentNotValid if any of directories are null or is not an existing directory, or if cdxFileDirectory 084 * is not writable. 085 */ 086 public static void generateCDX(ArchiveProfile archiveProfile, File archiveFileDirectory, File cdxFileDirectory) 087 throws ArgumentNotValid { 088 ArgumentNotValid.checkNotNull(archiveProfile, "ArchiveProfile archiveProfile"); 089 ArgumentNotValid.checkNotNull(archiveFileDirectory, "File archiveFileDirectory"); 090 ArgumentNotValid.checkNotNull(cdxFileDirectory, "File cdxFileDirectory"); 091 if (!archiveFileDirectory.isDirectory() || !archiveFileDirectory.canRead()) { 092 throw new ArgumentNotValid("The directory for arc files '" + archiveFileDirectory 093 + "' is not a readable directory"); 094 } 095 if (!cdxFileDirectory.isDirectory() || !cdxFileDirectory.canWrite()) { 096 throw new ArgumentNotValid("The directory for cdx files '" + archiveFileDirectory 097 + "' is not a writable directory"); 098 } 099 Map<File, Exception> exceptions = new HashMap<File, Exception>(); 100 File[] filesToProcess = archiveFileDirectory.listFiles(archiveProfile.filename_filter); 101 if (filesToProcess.length == 0) { 102 log.warn("Found no related arcfiles to process in the archive dir '{}'.", 103 archiveFileDirectory.getAbsolutePath()); 104 } else { 105 log.debug("Found {} related arcfiles to process in the archive dir '{}'.", filesToProcess.length, 106 archiveFileDirectory.getAbsolutePath()); 107 } 108 for (File arcfile : filesToProcess) { 109 File cdxfile = new File(cdxFileDirectory, arcfile.getName() + FileUtils.CDX_EXTENSION); 110 try { 111 OutputStream cdxstream = null; 112 try { 113 cdxstream = new FileOutputStream(cdxfile); 114 writeCDXInfo(arcfile, cdxstream); 115 } finally { 116 if (cdxstream != null) { 117 cdxstream.close(); 118 } 119 } 120 } catch (Exception e) { 121 exceptions.put(cdxfile, e); 122 } 123 } 124 // Log any errors 125 if (exceptions.size() > 0) { 126 StringBuilder errorMsg = new StringBuilder("Exceptions during cdx file generation:\n"); 127 for (Map.Entry<File, Exception> fileException : exceptions.entrySet()) { 128 errorMsg.append("Could not create cdxfile '"); 129 errorMsg.append(fileException.getKey().getAbsolutePath()); 130 errorMsg.append("':\n"); 131 errorMsg.append(ExceptionUtils.getStackTrace(fileException.getValue())); 132 errorMsg.append('\n'); 133 } 134 log.debug(errorMsg.toString()); 135 } 136 } 137 138}