001/*
002 * #%L
003 * Netarchivesuite - common
004 * %%
005 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
006 *             the National Library of France and the Austrian National Library.
007 * %%
008 * This program is free software: you can redistribute it and/or modify
009 * it under the terms of the GNU Lesser General Public License as
010 * published by the Free Software Foundation, either version 2.1 of the
011 * License, or (at your option) any later version.
012 * 
013 * This program is distributed in the hope that it will be useful,
014 * but WITHOUT ANY WARRANTY; without even the implied warranty of
015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
016 * GNU General Lesser Public License for more details.
017 * 
018 * You should have received a copy of the GNU General Lesser Public
019 * License along with this program.  If not, see
020 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
021 * #L%
022 */
023
024package dk.netarkivet.common.utils;
025
026import java.io.IOException;
027import java.io.InputStream;
028import java.util.zip.GZIPInputStream;
029
/**
 * Subclass of {@link GZIPInputStream}, including a workaround to support streams whose uncompressed content is
 * larger than 2GB.
 * <p>
 * Java has had a bug that does not allow unzipping Gzip files with contents larger than 2GB. The result is an
 * IOException with the message "Corrupt GZIP trailer". This class works around that bug by treating that exception as
 * a normal end of stream whenever more than {@code Integer.MAX_VALUE} bytes have been decompressed. This sacrifices
 * the CRC and size checks for those streams, though.
 * <p>
 * See sun bug: http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=5092263
 *
 * @see GZIPInputStream
 */
public class LargeFileGZIPInputStream extends GZIPInputStream {

    /** Message of the exception that is ignored for large streams. */
    private static final String CORRUPT_TRAILER_MESSAGE = "Corrupt GZIP trailer";

    /**
     * Creates a new input stream with a default buffer size.
     *
     * @param in the input stream
     * @throws IOException if an I/O error has occurred. Note: We usually don't allow IOException in our code, but this
     * is done here to closely mimic GZIPInputStream
     */
    public LargeFileGZIPInputStream(InputStream in) throws IOException {
        super(in);
    }

    /**
     * Creates a new input stream with the specified buffer size.
     *
     * @param in the input stream
     * @param size the input buffer size, passed on unchanged to {@link GZIPInputStream#GZIPInputStream(InputStream,
     * int)}
     * @throws IOException if an I/O error has occurred. Note: We usually don't allow IOException in our code, but this
     * is done here to closely mimic GZIPInputStream
     */
    public LargeFileGZIPInputStream(InputStream in, int size) throws IOException {
        super(in, size);
    }

    /**
     * Reads uncompressed data into an array of bytes. Blocks until enough input is available for decompression.
     *
     * @param buf the buffer into which the data is read
     * @param off the start offset of the data
     * @param len the maximum number of bytes read
     * @return the actual number of bytes read, or -1 if the end of the compressed input stream is reached
     * @throws IOException if an I/O error has occurred or the compressed input data is corrupt. Note that a "Corrupt
     * GZIP trailer" failure is ignored in this workaround class if more than Integer.MAX_VALUE bytes have been
     * decompressed. Note: We usually don't allow IOException in our code, but this is done here to closely mimic
     * GZIPInputStream
     */
    @Override
    public int read(byte[] buf, int off, int len) throws IOException {
        try {
            return super.read(buf, off, len);
        } catch (IOException e) {
            if (!isLargeFileTrailerFailure(e)) {
                throw e;
            }
            // Mimic success: mark the stream as finished so that this and every later read reports end of stream.
            eos = true;
            return -1;
        }
    }

    /**
     * Given an IOException caused by read, return whether this is the exception we are working around. This is the
     * case if 1) the message is "Corrupt GZIP trailer" and 2) more than Integer.MAX_VALUE bytes have been written by
     * the inflater.
     * <p>
     * The comparison is strict: the gzip trailer stores the uncompressed size modulo 2^32, so a 32-bit signed count is
     * still exact at Integer.MAX_VALUE bytes, and a trailer failure at that size is not the bug being worked around.
     *
     * @param e An IOException thrown by GZIPInputStream.read
     * @return Whether it is one caused by the bug we are working around
     */
    private boolean isLargeFileTrailerFailure(IOException e) {
        // Constant-first equals also covers exceptions without a message.
        return CORRUPT_TRAILER_MESSAGE.equals(e.getMessage()) && inf.getBytesWritten() > Integer.MAX_VALUE;
    }

}