/*
 * #%L
 * Netarchivesuite - common
 * %%
 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
 * the National Library of France and the Austrian National Library.
 * %%
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation, either version 2.1 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this program. If not, see
 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
 * #L%
 */
package dk.netarkivet.common.utils.cdx;

import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.Map;

import org.archive.io.arc.ARCRecord;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import dk.netarkivet.common.Constants;
import dk.netarkivet.common.exceptions.IOFailure;
import dk.netarkivet.common.utils.ChecksumCalculator;
import dk.netarkivet.common.utils.arc.ARCBatchJob;
import dk.netarkivet.common.utils.batch.ARCBatchFilter;

/**
 * Batch job that extracts information to create a CDX file.
 * <p>
 * A CDX file contains sorted lines of metadata from the ARC files, with each line followed by the file and offset
 * at which the record was found, and optionally a checksum. The timeout of this job is 7 days. See
 * http://www.archive.org/web/researcher/cdx_file_format.php
 */
@SuppressWarnings("serial")
public class ExtractCDXJob extends ARCBatchJob {

    /** Logger for this class. */
    private static final Logger log = LoggerFactory.getLogger(ExtractCDXJob.class);

    /** An encoding for the standard included metadata fields without checksum. */
    private static final String[] STD_FIELDS_EXCL_CHECKSUM = {"A", "e", "b", "m", "n", "g", "v"};

    /** An encoding for the standard included metadata fields with checksum. */
    private static final String[] STD_FIELDS_INCL_CHECKSUM = {"A", "e", "b", "m", "n", "g", "v", "c"};

    /** The fields to be included in CDX output. */
    private final String[] fields;

    /** True if an MD5 checksum is also included in each CDX line. */
    private final boolean includeChecksum;

    /**
     * Constructs a new job for extracting CDX indexes.
     *
     * @param includeChecksum If true, an MD5 checksum is also written for each record. If false, it is not.
     */
    public ExtractCDXJob(boolean includeChecksum) {
        this.fields = includeChecksum ? STD_FIELDS_INCL_CHECKSUM : STD_FIELDS_EXCL_CHECKSUM;
        this.includeChecksum = includeChecksum;
        batchJobTimeout = 7 * Constants.ONE_DAY_IN_MILLIES;
    }

    /**
     * Equivalent to {@code ExtractCDXJob(true)}.
     */
    public ExtractCDXJob() {
        this(true);
    }

    /**
     * Filter out the filedesc: headers.
     *
     * @return The filter that defines what ARC records are wanted in the output CDX file.
     * @see dk.netarkivet.common.utils.arc.ARCBatchJob#getFilter()
     */
    @Override
    public ARCBatchFilter getFilter() {
        // By default we want to index all records except ARC file headers:
        return ARCBatchFilter.EXCLUDE_FILE_HEADERS;
    }

    /**
     * Initialize any data needed (none).
     *
     * @param os The output stream for the batch job (unused here).
     * @see dk.netarkivet.common.utils.arc.ARCBatchJob#initialize(OutputStream)
     */
    @Override
    public void initialize(OutputStream os) {
    }

    /**
     * Process this entry, reading metadata into the output stream.
     *
     * @param sar The ARC record to process.
     * @param os The stream that the resulting CDX line is written to.
     * @throws IOFailure on trouble reading ARC record data.
     * @see dk.netarkivet.common.utils.arc.ARCBatchJob#processRecord(ARCRecord, OutputStream)
     */
    @Override
    public void processRecord(ARCRecord sar, OutputStream os) {
        log.trace("Processing ARCRecord with offset: {}", sar.getMetaData().getOffset());
        /*
         * Fields are stored in a map so that they are easy to pull out when iterating over the field array.
         */
        Map<String, String> fieldsread = new HashMap<>();
        fieldsread.put("A", sar.getMetaData().getUrl());
        fieldsread.put("e", sar.getMetaData().getIp());
        fieldsread.put("b", sar.getMetaData().getDate());
        fieldsread.put("m", sar.getMetaData().getMimetype());
        fieldsread.put("n", Long.toString(sar.getMetaData().getLength()));

        /*
         * Note about offset: The original dk.netarkivet.ArcUtils.ExtractCDX yields offsets that are consistently 1
         * lower than this version, which pulls the offset value from the org.archive.io.arc classes. The difference
         * arises because the former counts the preceding newline as part of the ARC header.
         */
        fieldsread.put("v", Long.toString(sar.getMetaData().getOffset()));
        fieldsread.put("g", sar.getMetaData().getArcFile().getName());

        /* Only include checksum if necessary: */
        if (includeChecksum) {
            // To avoid reading the whole record into an array, we
            // compute the checksum directly from the ARCRecord's stream.
            // This leaves sar in an inconsistent state, so it must not
            // be used afterwards.
            InputStream instream = sar; // Note: ARCRecord extends InputStream
            fieldsread.put("c", ChecksumCalculator.calculateMd5(instream));
        }

        printFields(fieldsread, os);
    }

    /**
     * End of the batch job.
     *
     * @param os The output stream for the batch job (unused here).
     * @see dk.netarkivet.common.utils.arc.ARCBatchJob#finish(OutputStream)
     */
    @Override
    public void finish(OutputStream os) {
    }

    /**
     * Print the values found for a set of fields. Prints the '-' character for any null values.
     *
     * @param fieldsread A map of values indexed by field letters
     * @param outstream The output stream to write the values to
     */
    private void printFields(Map<String, String> fieldsread, OutputStream outstream) {
        StringBuilder sb = new StringBuilder();

        for (int i = 0; i < fields.length; i++) {
            String value = fieldsread.get(fields[i]);
            sb.append((i > 0) ? " " : "");
            sb.append((value == null) ? "-" : value);
        }
        sb.append("\n");
        try {
            outstream.write(sb.toString().getBytes(StandardCharsets.UTF_8));
        } catch (IOException e) {
            throw new IOFailure("Error writing CDX line '" + sb + "' to batch outstream", e);
        }
    }

    /**
     * @return Human-readable description of this instance.
     */
    @Override
    public String toString() {
        return getClass().getName() + ", with Filter: " + getFilter() + ", include checksum = " + includeChecksum;
    }

}
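
/*
 * Usage sketch: one way to run this job over a set of local ARC files is through a batch runner such as
 * dk.netarkivet.common.utils.batch.BatchLocalFiles. The helper name and its run(FileBatchJob, OutputStream)
 * signature are assumptions here; verify them against the NetarchiveSuite version in use.
 *
 *     File[] arcFiles = new File("/path/to/arcs").listFiles();     // hypothetical input directory
 *     ExtractCDXJob job = new ExtractCDXJob(true);                 // include MD5 checksums ("c" field)
 *     try (OutputStream out = new FileOutputStream("index.cdx")) { // hypothetical output file
 *         new BatchLocalFiles(arcFiles).run(job, out);             // one CDX line per non-header record
 *     }
 *
 * Note that the job emits lines in record order; the sorted CDX file described in the class Javadoc is
 * produced by sorting the output afterwards.
 */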