/*
 * #%L
 * Netarchivesuite - common
 * %%
 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
 *             the National Library of France and the Austrian National Library.
 * %%
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation, either version 2.1 of the
 * License, or (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 * 
 * You should have received a copy of the GNU Lesser General Public
 * License along with this program.  If not, see
 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
 * #L%
 */
package dk.netarkivet.common.utils.cdx;

import java.io.IOException;
import java.io.OutputStream;
import java.util.HashMap;
import java.util.Map;

import org.archive.io.warc.WARCRecord;
import org.jwat.common.ByteCountingPushBackInputStream;
import org.jwat.common.ContentType;
import org.jwat.common.HttpHeader;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import dk.netarkivet.common.Constants;
import dk.netarkivet.common.exceptions.IOFailure;
import dk.netarkivet.common.utils.ChecksumCalculator;
import dk.netarkivet.common.utils.archive.ArchiveHeaderBase;
import dk.netarkivet.common.utils.archive.ArchiveRecordBase;
import dk.netarkivet.common.utils.archive.HeritrixArchiveRecordWrapper;
import dk.netarkivet.common.utils.batch.WARCBatchFilter;
import dk.netarkivet.common.utils.warc.WARCBatchJob;
/**
 * Batch job that extracts information to create a CDX file.
 * <p>
 * A CDX file contains sorted lines of metadata from the WARC files, each line also giving the file and offset at
 * which the record was found, and optionally a checksum. The timeout of this job is 7 days. See
 * http://www.archive.org/web/researcher/cdx_file_format.php
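 * <p>
 * With checksums enabled, a line holds the fields {A e b m n g v c} in that order (see the field arrays and the
 * letter-to-value mapping below). A hypothetical example line:
 * <pre>
 * http://www.example.org/ 192.0.2.1 20140101120000 text/html 1234 job-1.warc 2048 900150983cd24fb0d6963f7d28e17f72
 * </pre>
 * A minimal usage sketch (the batch runner is assumed; any mechanism that feeds WARC records to a
 * {@link dk.netarkivet.common.utils.warc.WARCBatchJob} will do):
 * <pre>
 * WARCExtractCDXJob job = new WARCExtractCDXJob(true); // include MD5 checksums
 * // hand the job, plus an OutputStream to receive the CDX lines, to a batch runner
 * </pre>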
 */
@SuppressWarnings({"serial"})
public class WARCExtractCDXJob extends WARCBatchJob {

    /** Logger for this class. */
    private static final Logger log = LoggerFactory.getLogger(WARCExtractCDXJob.class);

    /** An encoding for the standard included metadata fields without checksum. */
    private static final String[] STD_FIELDS_EXCL_CHECKSUM = {"A", "e", "b", "m", "n", "g", "v"};

    /** An encoding for the standard included metadata fields with checksum. */
    private static final String[] STD_FIELDS_INCL_CHECKSUM = {"A", "e", "b", "m", "n", "g", "v", "c"};
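
    /*
     * The field letters correspond to the values read in processRecord(): A = URL, e = IP address,
     * b = archive date, m = MIME type, n = record length, g = file the record came from (the reader
     * identifier), v = offset, c = MD5 checksum.
     */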

    /** The fields to be included in CDX output. */
    private String[] fields;

    /** True if we put an MD5 in each CDX line as well. */
    private boolean includeChecksum;

    /**
     * Constructs a new job for extracting CDX indexes.
     *
     * @param includeChecksum If true, an MD5 checksum is also written for each record. If false, it is not.
     */
    public WARCExtractCDXJob(boolean includeChecksum) {
        this.fields = includeChecksum ? STD_FIELDS_INCL_CHECKSUM : STD_FIELDS_EXCL_CHECKSUM;
        this.includeChecksum = includeChecksum;
        batchJobTimeout = 7 * Constants.ONE_DAY_IN_MILLIES;
    }

    /**
     * Equivalent to {@code WARCExtractCDXJob(true)}.
     */
    public WARCExtractCDXJob() {
        this(true);
    }

    /**
     * Filters out the NON-RESPONSE records.
     *
     * @return The filter that defines what WARC records are wanted in the output CDX file.
     * @see dk.netarkivet.common.utils.warc.WARCBatchJob#getFilter()
     */
    @Override
    public WARCBatchFilter getFilter() {
        // By default we want to index all response records.
        return WARCBatchFilter.EXCLUDE_NON_RESPONSE_RECORDS;
    }

    /**
     * Initialize any data needed (none).
     *
     * @see dk.netarkivet.common.utils.warc.WARCBatchJob#initialize(OutputStream)
     */
    @Override
    public void initialize(OutputStream os) {
    }

    /**
     * Process this entry, reading metadata into the output stream.
     *
     * @throws IOFailure on trouble reading WARC record data
     * @see dk.netarkivet.common.utils.warc.WARCBatchJob#processRecord(WARCRecord, OutputStream)
     */
    @Override
    public void processRecord(WARCRecord sar, OutputStream os) {
        log.trace("Processing WARCRecord with offset: {}", sar.getHeader().getOffset());
        /*
         * Fields are stored in a map so that it's easy to pull them out when looking at the field array.
         */
        ArchiveRecordBase record = new HeritrixArchiveRecordWrapper(sar);
        ArchiveHeaderBase header = record.getHeader();
        Map<String, String> fieldsread = new HashMap<String, String>();
        fieldsread.put("A", header.getUrl());
        fieldsread.put("e", header.getIp());
        fieldsread.put("b", header.getArcDateStr());
        fieldsread.put("n", Long.toString(header.getLength()));

        /*
         * Note about offset: The original dk.netarkivet.ArcUtils.ExtractCDX yields offsets that are consistently 1
         * lower than this version, which pulls the offset value from the org.archive.io.arc classes. The difference
         * is that the former classes count the preceding newline as part of the ARC header.
         */
        fieldsread.put("v", Long.toString(sar.getHeader().getOffset()));
        fieldsread.put("g", sar.getHeader().getReaderIdentifier());

        String mimeType = header.getMimetype();
        ContentType contentType = ContentType.parseContentType(mimeType);
        boolean bResponse = false;
        if (contentType != null) {
            if ("application".equals(contentType.contentType) && "http".equals(contentType.mediaType)) {
                // An "application/http; msgtype=response" record wraps an HTTP response whose
                // headers must be parsed to find the real content type of the payload.
                bResponse = "response".equals(contentType.getParameter("msgtype"));
            }
            mimeType = contentType.toStringShort();
        }
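        // Wrap the record stream so the HTTP response header (if any) can be parsed off it and the
        // remaining payload can still be read for the optional checksum below.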
        ByteCountingPushBackInputStream pbin = new ByteCountingPushBackInputStream(sar, 8192);
        HttpHeader httpResponse = null;
        if (bResponse) {
            try {
                httpResponse = HttpHeader.processPayload(HttpHeader.HT_RESPONSE, pbin, header.getLength(), null);
                if (httpResponse != null && httpResponse.contentType != null) {
                    contentType = ContentType.parseContentType(httpResponse.contentType);
                    if (contentType != null) {
                        mimeType = contentType.toStringShort();
                    }
                }
            } catch (IOException e) {
                throw new IOFailure("Error reading WARC httpresponse header", e);
            }
        }
        fieldsread.put("m", mimeType);

        /* Only include checksum if necessary: */
        if (includeChecksum) {
            // For response records the HTTP header has already been consumed from pbin above, so
            // the MD5 covers the remaining record payload.
            fieldsread.put("c", ChecksumCalculator.calculateMd5(pbin));
        }

        if (httpResponse != null) {
            try {
                httpResponse.close();
            } catch (IOException e) {
                throw new IOFailure("Error closing WARC httpresponse header", e);
            }
        }

        printFields(fieldsread, os);
    }

    /**
     * End of the batch job.
     *
     * @see dk.netarkivet.common.utils.warc.WARCBatchJob#finish(OutputStream)
     */
    @Override
    public void finish(OutputStream os) {
    }

    /**
     * Print the values found for a set of fields. Prints the '-' character for any null values.
     *
     * @param fieldsread A map of values indexed by field letters
     * @param outstream The output stream to write the values to
     */
    private void printFields(Map<String, String> fieldsread, OutputStream outstream) {
        StringBuilder sb = new StringBuilder();

        for (int i = 0; i < fields.length; i++) {
            String value = fieldsread.get(fields[i]);
            sb.append((i > 0) ? " " : "");
            sb.append((value == null) ? "-" : value);
        }
        sb.append("\n");
        try {
            outstream.write(sb.toString().getBytes("UTF-8"));
        } catch (IOException e) {
            throw new IOFailure("Error writing CDX line '" + sb + "' to batch outstream", e);
        }
    }

    /**
     * @return Human-readable description of this instance.
     */
    @Override
    public String toString() {
        return getClass().getName() + ", with Filter: " + getFilter() + ", include checksum = " + includeChecksum;
    }

}