Source code

001/*
002 * #%L
003 * Netarchivesuite - common
004 * %%
005 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
006 *             the National Library of France and the Austrian National Library.
007 * %%
008 * This program is free software: you can redistribute it and/or modify
009 * it under the terms of the GNU Lesser General Public License as
010 * published by the Free Software Foundation, either version 2.1 of the
011 * License, or (at your option) any later version.
012 * 
013 * This program is distributed in the hope that it will be useful,
014 * but WITHOUT ANY WARRANTY; without even the implied warranty of
015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
016 * GNU General Lesser Public License for more details.
017 * 
018 * You should have received a copy of the GNU General Lesser Public
019 * License along with this program.  If not, see
020 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
021 * #L%
022 */
023package dk.netarkivet.common.utils.warc;
024
025import java.io.File;
026import java.io.FileOutputStream;
027import java.io.IOException;
028import java.io.InputStream;
029import java.io.PrintStream;
030import java.net.URI;
031import java.net.URISyntaxException;
032import java.util.Date;
033import java.util.HashMap;
034import java.util.HashSet;
035import java.util.Iterator;
036import java.util.Map;
037import java.util.Map.Entry;
038import java.util.Set;
039import java.util.concurrent.atomic.AtomicInteger;
040
041import org.archive.format.warc.WARCConstants;
042import org.archive.format.warc.WARCConstants.WARCRecordType;
043import org.archive.io.ArchiveRecord;
044import org.archive.io.ArchiveRecordHeader;
045import org.archive.io.warc.WARCReader;
046import org.archive.io.warc.WARCReaderFactory;
047import org.archive.io.warc.WARCRecord;
048import org.archive.io.warc.WARCRecordInfo;
049import org.archive.io.warc.WARCWriter;
050import org.archive.io.warc.WARCWriterPoolSettings;
051import org.archive.io.warc.WARCWriterPoolSettingsData;
052import org.archive.uid.UUIDGenerator;
053import org.archive.util.anvl.ANVLRecord;
054import org.slf4j.Logger;
055import org.slf4j.LoggerFactory;
056
057import dk.netarkivet.common.Constants;
058import dk.netarkivet.common.exceptions.ArgumentNotValid;
059import dk.netarkivet.common.exceptions.IOFailure;
060import dk.netarkivet.common.exceptions.IllegalState;
061import dk.netarkivet.common.utils.archive.ArchiveDateConverter;
062import dk.netarkivet.common.utils.archive.HeritrixArchiveHeaderWrapper;
063
064/**
065 * Various utilities on WARC-records. We have borrowed code from wayback. See
066 * org.archive.wayback.resourcestore.indexer.WARCRecordToSearchResultAdapter
067 */
068public class WARCUtils {
069
070    /** Logging output place. */
071    protected static final Logger log = LoggerFactory.getLogger(WARCUtils.class);
072
073    /**
074     * Create new WARCWriter, writing to warcfile newFile.
075     *
076     * @param newFile the WARCfile, that the WARCWriter writes to.
077     * @return new WARCWriter, writing to warcfile newFile.
078     */
079    public static WARCWriter createWARCWriter(File newFile) {
080        WARCWriter writer;
081        PrintStream ps = null;
082        try {
083            ps = new PrintStream(new FileOutputStream(newFile));
084            /*
085            writer = new WARCWriter(new AtomicInteger(), ps,
086            // This name is used for the first (file metadata) record
087                    newFile, false, // Don't compress
088                    // Use current time
089                    ArchiveDateConverter.getWarcDateFormat().format(new Date()), null // No particular file metadata to
090                                                                                      // add
091            );
092            */
093            WARCWriterPoolSettings settings = new WARCWriterPoolSettingsData(
094                        WARCConstants.WARC_FILE_EXTENSION, null, WARCConstants.DEFAULT_MAX_WARC_FILE_SIZE, false,
095                        null, null, new UUIDGenerator());
096            writer = new WARCWriter(new AtomicInteger(), ps, newFile, settings);
097        } catch (IOException e) {
098            if (ps != null) {
099                ps.close();
100            }
101            String message = "Could not create WARCWriter to file '" + newFile + "'.\n";
102            log.warn(message);
103            throw new IOFailure(message, e);
104        }
105        return writer;
106    }
107
108    /**
109     * Insert the contents of a WARC file into another WARCFile.
110     *
111     * @param warcFile An WARC file to read.
112     * @param writer A place to write the arc records
113     * @throws IOFailure if there are problems reading the file.
114     */
115    public static void insertWARCFile(File warcFile, WARCWriter writer) {
116        ArgumentNotValid.checkNotNull(writer, "WARCWriter aw");
117        ArgumentNotValid.checkNotNull(warcFile, "File warcFile");
118        WARCReader r;
119
120        try {
121            r = WARCReaderFactory.get(warcFile);
122        } catch (IOException e) {
123            String message = "Error while copying WARC records from " + warcFile;
124            log.warn(message, e);
125            throw new IOFailure(message, e);
126        }
127        Iterator<ArchiveRecord> it = r.iterator();
128        WARCRecord record;
129        while (it.hasNext()) {
130            record = (WARCRecord) it.next();
131            copySingleRecord(writer, record);
132        }
133    }
134
135    private static final Set<String> ignoreHeadersMap = new HashSet<String>();
136
137    private static final Map<String, String> headerNamesCaseMap = new HashMap<String, String>();
138
139    static {
140        ignoreHeadersMap.add("content-type");
141        ignoreHeadersMap.add("reader-identifier");
142        ignoreHeadersMap.add("absolute-offset");
143        ignoreHeadersMap.add("content-length");
144        //ignoreHeadersMap.add("warc-date");
145        ignoreHeadersMap.add("warc-record-id");
146        ignoreHeadersMap.add("warc-type");
147        ignoreHeadersMap.add("warc-target-uri");
148        String[] headerNames = {"WARC-Type", "WARC-Record-ID", "WARC-Date", "Content-Length", "Content-Type",
149                "WARC-Concurrent-To", "WARC-Block-Digest", "WARC-Payload-Digest", "WARC-IP-Address", "WARC-Refers-To",
150                "WARC-Target-URI", "WARC-Truncated", "WARC-Warcinfo-ID", "WARC-Filename", "WARC-Profile",
151                "WARC-Identified-Payload-Type", "WARC-Segment-Origin-ID", "WARC-Segment-Number",
152                "WARC-Segment-Total-Length"};
153        for (int i = 0; i < headerNames.length; ++i) {
154            headerNamesCaseMap.put(headerNames[i].toLowerCase(), headerNames[i]);
155        }
156    }
157
158    /**
159     * Writes the given WARCRecord on the given WARCWriter.
160     * <p>
161     * Creates a new unique UUID for the copied record.
162     *
163     * @param aw The WARCWriter to output the record on.
164     * @param record The record to output
165     */
166    private static void copySingleRecord(WARCWriter aw, WARCRecord record) {
167        try {
168            // Prepare metadata...
169            HeritrixArchiveHeaderWrapper header = HeritrixArchiveHeaderWrapper.wrapArchiveHeader(null, record);
170            String warcType = header.getHeaderStringValue("WARC-Type");
171
172            String url = header.getUrl();
173            Date date = header.getDate();
174            String dateStr = ArchiveDateConverter.getWarcDateFormat().format(date);
175            String mimetype = header.getMimetype();
176            String recordIdStr;
177            URI recordId;
178            try {
179                recordIdStr = header.getHeaderStringValue("warc-record-id");
180                if (recordIdStr.startsWith("<") && recordIdStr.endsWith(">")) {
181                    recordIdStr = recordIdStr.substring(1, recordIdStr.length() - 1);
182                }
183                recordId = new URI(recordIdStr);
184            } catch (URISyntaxException e) {
185                throw new IllegalState("Epic fail creating URI from UUID!");
186            }
187
188            //ANVLRecord namedFields = new ANVLRecord();
189
190            // Copy to headers from the original WARC record to the new one.
191            // Since we store the headers lowercase, we recase them.
192            // Non WARC header header are lowercase and loose their case.
193            /*
194            Iterator<Entry<String, Object>> headerIter = header.getHeaderFields().entrySet().iterator();
195            Entry<String, Object> headerEntry;
196            String headerName;
197            String headerNameCased;
198            while (headerIter.hasNext()) {
199                headerEntry = headerIter.next();
200                if (!ignoreHeadersMap.contains(headerEntry.getKey())) {
201                    headerName = headerEntry.getKey();
202                    headerNameCased = headerNamesCaseMap.get(headerName);
203                    if (headerNameCased != null) {
204                        headerName = headerNameCased;
205                    }
206                    namedFields.addLabelValue(headerName, headerEntry.getValue().toString());
207                }
208            }
209            */
210
211            InputStream in = record;
212            // getContentBegin only works for WARC and in H1.44.x!
213            Long payloadLength = header.getLength() - record.getHeader().getContentBegin();
214
215            WARCRecordType type = WARCRecordType.valueOf(warcType);
216            WARCRecordInfo newRecord = new WARCRecordInfo();
217            Iterator<Entry<String, Object>> headerIter = header.getHeaderFields().entrySet().iterator();
218            Entry<String, Object> headerEntry;
219            String headerName;
220            String headerNameCased;
221            while (headerIter.hasNext()) {
222                headerEntry = headerIter.next();
223                if (!ignoreHeadersMap.contains(headerEntry.getKey())) {
224                    headerName = headerEntry.getKey();
225                    headerNameCased = headerNamesCaseMap.get(headerName);
226                    if (headerNameCased != null) {
227                        headerName = headerNameCased;
228                    }
229                    newRecord.addExtraHeader(headerName, headerEntry.getValue().toString());
230                }
231            }
232            newRecord.setType(type);
233            newRecord.setUrl(url);
234            newRecord.setMimetype(mimetype);
235            newRecord.setRecordId(recordId);
236            newRecord.setContentStream(in);
237            newRecord.setContentLength(payloadLength);
238                aw.writeRecord(newRecord);
239
240            // Write WARC record with type=warcType
241                /*
242            if ("metadata".equals(warcType)) {
243                aw.writeMetadataRecord(url, dateStr, mimetype, recordId, namedFields, in, payloadLength);
244            } else if ("request".equals(warcType)) {
245                aw.writeRequestRecord(url, dateStr, mimetype, recordId, namedFields, in, payloadLength);
246            } else if ("resource".equals(warcType)) {
247                aw.writeResourceRecord(url, dateStr, mimetype, recordId, namedFields, in, payloadLength);
248            } else if ("response".equals(warcType)) {
249                aw.writeResponseRecord(url, dateStr, mimetype, recordId, namedFields, in, payloadLength);
250            } else if ("revisit".equals(warcType)) {
251                aw.writeRevisitRecord(url, dateStr, mimetype, recordId, namedFields, in, payloadLength);
252            } else if ("warcinfo".equals(warcType)) {
253                aw.writeWarcinfoRecord(dateStr, mimetype, recordId, namedFields, in, payloadLength);
254            } else {
255                throw new IOFailure("Unknown WARC-Type!");
256            }
257            */
258        } catch (Exception e) {
259            throw new IOFailure("Error occurred while writing an WARC record" + record, e);
260        }
261    }
262
263    /**
264     * Read the contents (payload) of an WARC record into a byte array.
265     *
266     * @param record An WARC record to read from. After reading, the WARC Record will no longer have its own data
267     * available for reading.
268     * @return A byte array containing the payload of the WARC record. Note that the size of the payload is calculated
269     * by subtracting the contentBegin value from the length of the record (both values included in the record header).
270     * @throws IOFailure If there is an error reading the data, or if the record is longer than Integer.MAX_VALUE (since
271     * we can't make bigger arrays).
272     */
273    public static byte[] readWARCRecord(WARCRecord record) throws IOFailure {
274        ArgumentNotValid.checkNotNull(record, "WARCRecord record");
275        if (record.getHeader().getLength() > Integer.MAX_VALUE) {
276            throw new IOFailure("WARC Record too long to fit in array: " + record.getHeader().getLength() + " > "
277                    + Integer.MAX_VALUE);
278        }
279        // Calculate the length of the payload.
280        // the size of the payload is calculated by subtracting
281        // the contentBegin value from the length of the record.
282
283        ArchiveRecordHeader header = record.getHeader();
284        long length = header.getLength();
285
286        int payloadLength = (int) (length - header.getContentBegin());
287
288        // read from stream
289        byte[] tmpbuffer = new byte[payloadLength];
290        byte[] buffer = new byte[Constants.IO_BUFFER_SIZE];
291        int bytesRead;
292        int totalBytes = 0;
293        try {
294            for (; (totalBytes < payloadLength) && ((bytesRead = record.read(buffer)) != -1); totalBytes += bytesRead) {
295                System.arraycopy(buffer, 0, tmpbuffer, totalBytes, bytesRead);
296            }
297        } catch (IOException e) {
298            throw new IOFailure("Failure when reading the WARC-record", e);
299        }
300
301        // Check if the number of bytes read (= totalbytes) matches the
302        // size of the buffer.
303        if (tmpbuffer.length != totalBytes) {
304            // make sure we only return an array with bytes we actually read
305            byte[] truncateBuffer = new byte[totalBytes];
306            System.arraycopy(tmpbuffer, 0, truncateBuffer, 0, totalBytes);
307            log.debug("Storing {} bytes. Expected to store: {}", totalBytes, tmpbuffer.length);
308            return truncateBuffer;
309        } else {
310            return tmpbuffer;
311        }
312
313    }
314
315    /**
316     * Find out what type of WARC-record this is.
317     *
318     * @param record a given WARCRecord
319     * @return the type of WARCRecord as a String.
320     */
321    public static String getRecordType(WARCRecord record) {
322        ArgumentNotValid.checkNotNull(record, "record");
323        ArchiveRecordHeader header = record.getHeader();
324        return (String) header.getHeaderValue(WARCConstants.HEADER_KEY_TYPE);
325    }
326
327    /**
328     * Check if the given filename represents a WARC file.
329     *
330     * @param filename A given filename
331     * @return true, if the filename ends with .warc or .warc.gz
332     */
333    public static boolean isWarc(String filename) {
334        ArgumentNotValid.checkNotNullOrEmpty(filename, "filename");
335        String lowercaseFilename = filename.toLowerCase();
336        return (lowercaseFilename.endsWith(".warc") || lowercaseFilename.endsWith(".warc.gz"));
337    }
338
339}