/*
 * #%L
 * Netarchivesuite - common
 * %%
 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
 *             the National Library of France and the Austrian National Library.
 * %%
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation, either version 2.1 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this program.  If not, see
 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
 * #L%
 */
package dk.netarkivet.common.utils.cdx;

import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.Map;

import org.archive.io.arc.ARCRecord;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import dk.netarkivet.common.Constants;
import dk.netarkivet.common.exceptions.IOFailure;
import dk.netarkivet.common.utils.ChecksumCalculator;
import dk.netarkivet.common.utils.arc.ARCBatchJob;
import dk.netarkivet.common.utils.batch.ARCBatchFilter;
/**
 * Batch job that extracts information to create a CDX file.
 * <p>
 * A CDX file contains sorted lines of metadata from the ARC files, each line ending with the file and offset at
 * which the record was found, and optionally a checksum. The timeout of this job is 7 days. See
 * http://www.archive.org/web/researcher/cdx_file_format.php
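 * <p>
 * For illustration, a minimal sketch of how this job might be run over local ARC files. It assumes the
 * {@code BatchLocalFiles} helper from {@code dk.netarkivet.common.utils.batch}; the file names are made up:
 *
 * <pre>{@code
 * // Sketch: extract CDX lines (with MD5 checksums) from two local ARC files.
 * ExtractCDXJob job = new ExtractCDXJob(true);
 * File[] arcFiles = {new File("file-1.arc"), new File("file-2.arc")};
 * try (OutputStream cdxOut = new FileOutputStream("out.cdx")) {
 *     new BatchLocalFiles(arcFiles).run(job, cdxOut);
 * }
 * }</pre>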
 */
@SuppressWarnings({"rawtypes", "serial"})
public class ExtractCDXJob extends ARCBatchJob {

    /** Logger for this class. */
    private static final Logger log = LoggerFactory.getLogger(ExtractCDXJob.class);

    /** An encoding of the standard included metadata fields without checksum: URL, IP, date, mimetype, length, filename, offset. */
    private static final String[] STD_FIELDS_EXCL_CHECKSUM = {"A", "e", "b", "m", "n", "g", "v"};

    /** An encoding of the standard included metadata fields with checksum: the same fields plus an MD5 checksum. */
    private static final String[] STD_FIELDS_INCL_CHECKSUM = {"A", "e", "b", "m", "n", "g", "v", "c"};

    /** The fields to be included in CDX output. */
    private String[] fields;

    /** True if we put an MD5 checksum in each CDX line as well. */
    private boolean includeChecksum;

    /**
     * Constructs a new job for extracting CDX indexes.
     *
     * @param includeChecksum If true, an MD5 checksum is also written for each record. If false, it is not.
     */
    public ExtractCDXJob(boolean includeChecksum) {
        this.fields = includeChecksum ? STD_FIELDS_INCL_CHECKSUM : STD_FIELDS_EXCL_CHECKSUM;
        this.includeChecksum = includeChecksum;
        batchJobTimeout = 7 * Constants.ONE_DAY_IN_MILLIES;
    }

    /**
     * Equivalent to {@code ExtractCDXJob(true)}.
     */
    public ExtractCDXJob() {
        this(true);
    }

    /**
     * Filter out the filedesc: headers.
     *
     * @return The filter that defines what ARC records are wanted in the output CDX file.
     * @see dk.netarkivet.common.utils.arc.ARCBatchJob#getFilter()
     */
    @Override
    public ARCBatchFilter getFilter() {
        // By default we want to index all records except ARC file headers:
        return ARCBatchFilter.EXCLUDE_FILE_HEADERS;
    }

    /**
     * Initialize any data needed (none).
     *
     * @see dk.netarkivet.common.utils.arc.ARCBatchJob#initialize(OutputStream)
     */
    @Override
    public void initialize(OutputStream os) {
    }

    /**
     * Process this entry, reading metadata into the output stream.
     *
     * @throws IOFailure on trouble reading ARC record data
     * @see dk.netarkivet.common.utils.arc.ARCBatchJob#processRecord(ARCRecord, OutputStream)
     */
    @Override
    public void processRecord(ARCRecord sar, OutputStream os) {
        log.trace("Processing ARCRecord with offset: {}", sar.getMetaData().getOffset());
        /*
         * Fields are stored in a map so that it is easy to pull them out when looking at the field array.
         */
        Map<String, String> fieldsread = new HashMap<>();
        fieldsread.put("A", sar.getMetaData().getUrl());
        fieldsread.put("e", sar.getMetaData().getIp());
        fieldsread.put("b", sar.getMetaData().getDate());
        fieldsread.put("m", sar.getMetaData().getMimetype());
        fieldsread.put("n", Long.toString(sar.getMetaData().getLength()));

        /*
         * Note about offset: The original dk.netarkivet.ArcUtils.ExtractCDX yields offsets that are consistently 1
         * lower than this version, which pulls the offset value from the org.archive.io.arc classes. The difference
         * is that the former classes count the preceding newline as part of the ARC header.
         */
        fieldsread.put("v", Long.toString(sar.getMetaData().getOffset()));
        fieldsread.put("g", sar.getMetaData().getArcFile().getName());

        /* Only include checksum if necessary: */
        if (includeChecksum) {
            // To avoid reading the whole record into an in-memory array,
            // we stream it directly from the ARCRecord. This leaves the
            // sar in an inconsistent state, so it must not be used
            // afterwards.
            InputStream instream = sar; // Note: ARCRecord extends InputStream
            fieldsread.put("c", ChecksumCalculator.calculateMd5(instream));
        }

        printFields(fieldsread, os);
    }

    /**
     * End of the batch job.
     *
     * @see dk.netarkivet.common.utils.arc.ARCBatchJob#finish(OutputStream)
     */
    @Override
    public void finish(OutputStream os) {
    }

    /**
     * Print the values found for a set of fields. Prints the '-' character for any null values.
     *
     * @param fieldsread A map of values indexed by field letters
     * @param outstream The output stream to write the values to
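     * <p>
     * For example (illustrative values only), a line written with checksums enabled carries the fields in the
     * order URL, IP, date, mimetype, length, filename, offset, checksum, separated by single spaces:
     *
     * <pre>
     * http://www.example.com/ 192.0.2.10 20140101120000 text/html 2048 file-1.arc 7661 5d41402abc4b2a76b9719d911017c592
     * </pre>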
     */
    private void printFields(Map<String, String> fieldsread, OutputStream outstream) {
        StringBuilder sb = new StringBuilder();

        for (int i = 0; i < fields.length; i++) {
            String value = fieldsread.get(fields[i]);
            sb.append((i > 0) ? " " : "");
            sb.append((value == null) ? "-" : value);
        }
        sb.append("\n");
        try {
            outstream.write(sb.toString().getBytes(StandardCharsets.UTF_8));
        } catch (IOException e) {
            throw new IOFailure("Error writing CDX line '" + sb + "' to batch outstream", e);
        }
    }

    /**
     * @return Human-readable description of this instance.
     */
    @Override
    public String toString() {
        return getClass().getName() + ", with Filter: " + getFilter() + ", include checksum = " + includeChecksum;
    }

}