001/*
002 * #%L
003 * Netarchivesuite - common
004 * %%
005 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
006 *             the National Library of France and the Austrian National Library.
007 * %%
008 * This program is free software: you can redistribute it and/or modify
009 * it under the terms of the GNU Lesser General Public License as
010 * published by the Free Software Foundation, either version 2.1 of the
011 * License, or (at your option) any later version.
012 * 
013 * This program is distributed in the hope that it will be useful,
014 * but WITHOUT ANY WARRANTY; without even the implied warranty of
015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
017 * 
 * You should have received a copy of the GNU Lesser General Public
 * License along with this program.  If not, see
020 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
021 * #L%
022 */
023
024package dk.netarkivet.common.utils.arc;
025
026import java.io.File;
027import java.io.FileInputStream;
028import java.io.FileOutputStream;
029import java.io.IOException;
030import java.io.InputStream;
031import java.io.PrintStream;
032import java.util.HashMap;
033import java.util.Iterator;
034import java.util.Map;
035import java.util.concurrent.atomic.AtomicInteger;
036import java.util.regex.Matcher;
037import java.util.regex.Pattern;
038
039import org.archive.format.arc.ARCConstants;
040import org.archive.io.ArchiveRecord;
041import org.archive.io.WriterPoolSettings;
042import org.archive.io.arc.ARCReader;
043import org.archive.io.arc.ARCReaderFactory;
044import org.archive.io.arc.ARCRecord;
045import org.archive.io.arc.ARCRecordMetaData;
046import org.archive.io.arc.ARCWriter;
047import org.archive.io.arc.WriterPoolSettingsData;
048import org.archive.util.ArchiveUtils;
049import org.slf4j.Logger;
050import org.slf4j.LoggerFactory;
051
052import dk.netarkivet.common.Constants;
053import dk.netarkivet.common.exceptions.ArgumentNotValid;
054import dk.netarkivet.common.exceptions.IOFailure;
055import dk.netarkivet.common.utils.InputStreamUtils;
056import dk.netarkivet.common.utils.SystemUtils;
057
058/**
059 * Various utilities that do stuff that ARCWriter does not provide. Also includes method for converting an ARCRecord to
060 * a byte array.
061 */
062public final class ARCUtils {
063
064    /** The log. */
065    private static final Logger log = LoggerFactory.getLogger(ARCUtils.class);
066
067    /** Default constructor to avoid initialization. */
068    private ARCUtils() {
069    }
070
071    /**
072     * Matches HTTP header lines like HTTP/1.1 404 Page has gone south Groups: 111 2222222222222222222.
073     */
074    private static final Pattern HTTP_HEADER_PATTERN = Pattern.compile("^HTTP/1\\.[01] (\\d+) (.*)$");
075
076    /** Extra ARC Record metadata. */
077    public static final String RESPONSETEXT = "RESPONSETEXT";
078
079    /**
080     * Insert the contents of an ARC file (skipping an optional initial filedesc: header) in another ARCfile.
081     *
082     * @param arcFile An ARC file to read.
083     * @param aw A place to write the arc records
084     * @throws IOFailure if there are problems reading the file.
085     */
086    public static void insertARCFile(File arcFile, ARCWriter aw) {
087        ArgumentNotValid.checkNotNull(aw, "ARCWriter aw");
088        ArgumentNotValid.checkNotNull(arcFile, "File arcFile");
089        ARCReader r;
090
091        try {
092            r = ARCReaderFactory.get(arcFile);
093        } catch (IOException e) {
094            String message = "Error while copying ARC records from " + arcFile;
095            log.warn(message, e);
096            throw new IOFailure(message, e);
097        }
098        Iterator<ArchiveRecord> it = r.iterator();
099        ARCRecord record;
100        it.next(); // Skip ARC file header
101        // ARCReaderFactory guarantees the first record exists and is a
102        // filedesc, or it would throw exception
103        while (it.hasNext()) {
104            record = (ARCRecord) it.next();
105            copySingleRecord(aw, record);
106        }
107    }
108
109    /**
110     * Writes the given ARCRecord on the given ARCWriter.
111     * <p>
112     * Note that the ARCWriter.write method takes the metadata fields as separate arguments instead of accepting an
113     * ARCRecordMetaData object. It uses the ArchiveUtils.getDate method to convert an ARCstyle datestring to a Date
114     * object.
115     *
116     * @param aw The ARCWriter to output the record on.
117     * @param record The record to output
118     * @see ArchiveUtils#getDate(java.lang.String)
119     */
120    private static void copySingleRecord(ARCWriter aw, ARCRecord record) {
121        try {
122            // Prepare metadata...
123            ARCRecordMetaData meta = record.getMetaData();
124            String uri = meta.getUrl();
125            String mime = meta.getMimetype();
126            String ip = meta.getIp();
127            // Note the ArchiveUtils.getDate() converts an ARC-style datestring
128            // to a Date object
129            long timeStamp = ArchiveUtils.getDate(meta.getDate()).getTime();
130            // ...and write the given files content into the writer
131            // Note ARCRecord extends InputStream
132            aw.write(uri, mime, ip, timeStamp, meta.getLength(), record);
133        } catch (Exception e) {
134            throw new IOFailure("Error occurred while writing an ARC record" + record, e);
135        }
136    }
137
138    /**
139     * Create new ARCWriter, writing to arcfile newFile.
140     *
141     * @param newFile the ARCfile, that the ARCWriter writes to.
142     * @return new ARCWriter, writing to arcfile newFile.
143     */
144    public static ARCWriter createARCWriter(File newFile) {
145        ARCWriter aw;
146        PrintStream ps = null;
147        try {
148            ps = new PrintStream(new FileOutputStream(newFile));
149            /*
150            aw = new ARCWriter(new AtomicInteger(), ps,
151            // This name is used for the first (file metadata) record
152                    newFile, false, // Don't compress
153                    // Use current time
154                    ArchiveUtils.get14DigitDate(System.currentTimeMillis()), null // No particular file metadata to add
155            );
156            */
157            WriterPoolSettings settings = new WriterPoolSettingsData(
158                        ARCConstants.ARC_FILE_EXTENSION, null, ARCConstants.DEFAULT_MAX_ARC_FILE_SIZE, false, null, null);
159            // This name is used for the first (file metadata) record
160            aw = new ARCWriter(new AtomicInteger(), ps, newFile, settings);
161        } catch (IOException e) {
162            if (ps != null) {
163                ps.close();
164            }
165            String message = "Could not create ARCWriter to file '" + newFile + "'.\n";
166            log.warn(message);
167            throw new IOFailure(message, e);
168        }
169        return aw;
170    }
171
172    /**
173     * Write a file to an ARC file. The writing is done by an existing ARCWriter. An ARCRecord will be added, which
174     * contains a header and the contents of the file. The date of the record written will be set to the lastModified
175     * value of the file being written.
176     *
177     * @param aw The ARCWriter doing the writing
178     * @param file The file we want to write to the ARC file
179     * @param uri The uri for the ARCRecord being written
180     * @param mime The mimetype for the ARCRecord being written
181     * @throws ArgumentNotValid if any arguments aw and file are null and arguments uri and mime are null or empty.
182     */
183    public static void writeFileToARC(ARCWriter aw, File file, String uri, String mime) {
184        ArgumentNotValid.checkNotNull(aw, "ARCWriter aw");
185        ArgumentNotValid.checkNotNull(file, "File file");
186        ArgumentNotValid.checkNotNullOrEmpty(uri, "String uri");
187        ArgumentNotValid.checkNotNullOrEmpty(mime, "String mime");
188
189        InputStream is = null;
190        try {
191            try {
192                // Prepare metadata...
193                String ip = SystemUtils.getLocalIP();
194                long timeStamp = file.lastModified();
195                long length = file.length();
196                // ...and write the CDX file's content into the writer
197                is = new FileInputStream(file);
198                aw.write(uri, mime, ip, timeStamp, length, is);
199            } finally {
200                if (is != null) {
201                    is.close();
202                }
203            }
204        } catch (IOException e) {
205            String msg = "Error writing '" + file + "' to " + aw + " as " + uri;
206            log.warn(msg, e);
207            throw new IOFailure(msg, e);
208        }
209    }
210
211    /**
212     * Return an ARCWriter suitable for the tools ArcMerge and ArcWrap.
213     *
214     * @param stream the given PrintStream.
215     * @param destinationArcfile the given destination ARC file.
216     * @return ARCWriter to be used by tools ArcMerge and ArcWrap
217     * @throws IOException redirect from ARCWriter constructure
218     */
219    public static ARCWriter getToolsARCWriter(PrintStream stream, File destinationArcfile) throws IOException {
220        /*
221        return new ARCWriter(new AtomicInteger(), stream, destinationArcfile, false, // Don't compress
222                // Use current time
223                ArchiveUtils.get14DigitDate(System.currentTimeMillis()), null // //No particular file metadata to add
224        );
225        */
226        WriterPoolSettings settings = new WriterPoolSettingsData(
227                        ARCConstants.ARC_FILE_EXTENSION, null, ARCConstants.DEFAULT_MAX_ARC_FILE_SIZE, false, null, null);
228        return new ARCWriter(new AtomicInteger(), stream, destinationArcfile, settings);
229    }
230
231    /**
232     * Read the contents of an ARC record into a byte array.
233     *
234     * @param in An ARC record to read from. After reading, the ARC Record will no longer have its own data available
235     * for reading.
236     * @return A byte array containing the contents of the ARC record. Note that the size of this may be different from
237     * the size given in the ARC record metadata.
238     * @throws IOException If there is an error reading the data, or if the record is longer than Integer.MAX_VALUE
239     * (since we can't make bigger arrays).
240     */
241    public static byte[] readARCRecord(ARCRecord in) throws IOException {
242        ArgumentNotValid.checkNotNull(in, "ARCRecord in");
243        if (in.getMetaData().getLength() > Integer.MAX_VALUE) {
244            throw new IOFailure("ARC Record too long to fit in array: " + in.getMetaData().getLength() + " > "
245                    + Integer.MAX_VALUE);
246        }
247        // read from stream
248        // The arcreader has a number of "features" that complicates the read
249        // 1) the record at offset 0, returns too large a length
250        // 2) readfully does not work
251        // 3) ARCRecord.read(buf, offset, length) is broken.
252        // TODO verify if these "features" are still around: See bugs #903, #904,
253        // #905
254        int dataLength = (int) in.getMetaData().getLength();
255        byte[] tmpbuffer = new byte[dataLength];
256        byte[] buffer = new byte[Constants.IO_BUFFER_SIZE];
257        int bytesRead;
258        int totalBytes = 0;
259        for (; (totalBytes < dataLength) && ((bytesRead = in.read(buffer)) != -1); totalBytes += bytesRead) {
260            System.arraycopy(buffer, 0, tmpbuffer, totalBytes, bytesRead);
261        }
262        // Check if the number of bytes read (=i) matches the
263        // size of the buffer.
264        if (tmpbuffer.length != totalBytes) {
265            // make sure we only return an array with bytes we actualy read
266            byte[] truncateBuffer = new byte[totalBytes];
267            System.arraycopy(tmpbuffer, 0, truncateBuffer, 0, totalBytes);
268            return truncateBuffer;
269        } else {
270            return tmpbuffer;
271        }
272    }
273
274    /**
275     * TODO write unit test.
276     *
277     * @param in pointing at start of ARC record.
278     * @param offset into ARC file.
279     * @return pairwise headers.
280     * @throws IOException if fails to read ARC files or ARC files isn't valid.
281     */
282    public static Map<String, Object> getHeadersFromARCFile(InputStream in, Long offset) throws IOException {
283        Map<String, Object> headers = new HashMap<String, Object>();
284        // extra needed headers.
285        headers.put(ARCRecordMetaData.VERSION_FIELD_KEY, "");
286        headers.put(ARCRecordMetaData.ABSOLUTE_OFFSET_KEY, offset);
287
288        String line = InputStreamUtils.readLine(in);
289        String[] tmp = line.split(" ");
290
291        // decode header.
292        if (tmp.length == 5) {
293            headers.put(ARCRecordMetaData.URL_FIELD_KEY, tmp[0]);
294            headers.put(ARCRecordMetaData.IP_HEADER_FIELD_KEY, tmp[1]);
295            headers.put(ARCRecordMetaData.DATE_FIELD_KEY, tmp[2]);
296            headers.put(ARCRecordMetaData.MIMETYPE_FIELD_KEY, tmp[3]);
297            headers.put(ARCRecordMetaData.LENGTH_FIELD_KEY, tmp[4]);
298        } else {
299            throw new IOException("Does not include required metadata to be a valid ARC header: " + line);
300        }
301        // Matches rest of header lines.
302        line = InputStreamUtils.readLine(in);
303        Matcher m = HTTP_HEADER_PATTERN.matcher(line);
304
305        if (m.matches()) {
306            headers.put(ARCRecordMetaData.STATUSCODE_FIELD_KEY, m.group(1));
307            // not valid META DATA
308            headers.put(RESPONSETEXT, line);
309        }
310        /* arc/warc header */
311        while ((line = InputStreamUtils.readLine(in)) != null && line.length() > 0 && line.startsWith("<")) {
312            int index = line.indexOf(':');
313            if (index != -1) {
314                headers.put(line.substring(0, index), line.substring(index + 2));
315            } else {
316                throw new IOException("Inputstream doesn't not point to valid ARC record");
317            }
318        }
319
320        return headers;
321    }
322
323    /**
324     * Check if the filename belongs to an ARC file.
325     *
326     * @param filename a given filename
327     * @return true, if the filename converted to lowercase ends with .arc or .arc.gz
328     */
329    public static boolean isARC(String filename) {
330        ArgumentNotValid.checkNotNullOrEmpty(filename, "filename");
331        String filenameLowercase = filename.toLowerCase();
332        return (filenameLowercase.endsWith(".arc") || filenameLowercase.endsWith(".arc.gz"));
333    }
334
335}