001/*
002 * #%L
003 * Netarchivesuite - harvester
004 * %%
005 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
006 *             the National Library of France and the Austrian National Library.
007 * %%
008 * This program is free software: you can redistribute it and/or modify
009 * it under the terms of the GNU Lesser General Public License as
010 * published by the Free Software Foundation, either version 2.1 of the
011 * License, or (at your option) any later version.
012 * 
013 * This program is distributed in the hope that it will be useful,
014 * but WITHOUT ANY WARRANTY; without even the implied warranty of
015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
016 * GNU General Lesser Public License for more details.
017 * 
018 * You should have received a copy of the GNU General Lesser Public
019 * License along with this program.  If not, see
020 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
021 * #L%
022 */
023package dk.netarkivet.viewerproxy;
024
025import java.io.BufferedInputStream;
026import java.io.BufferedOutputStream;
027import java.io.ByteArrayInputStream;
028import java.io.ByteArrayOutputStream;
029import java.io.File;
030import java.io.IOException;
031import java.io.InputStream;
032import java.io.OutputStream;
033import java.net.URI;
034import java.util.regex.Matcher;
035import java.util.regex.Pattern;
036
037import org.slf4j.Logger;
038import org.slf4j.LoggerFactory;
039
040import dk.netarkivet.common.Constants;
041import dk.netarkivet.common.distribute.arcrepository.ARCLookup;
042import dk.netarkivet.common.distribute.arcrepository.ResultStream;
043import dk.netarkivet.common.distribute.arcrepository.ViewerArcRepositoryClient;
044import dk.netarkivet.common.exceptions.ArgumentNotValid;
045import dk.netarkivet.common.exceptions.IOFailure;
046import dk.netarkivet.common.utils.Settings;
047import dk.netarkivet.harvester.HarvesterSettings;
048
049/**
050 * The ARCArchiveAccess class implements reading of ARC indexes and files. It builds on the Java ARC utils and Lucene
051 * indexes, and handles using these in an HTTP context.
052 */
053public class ARCArchiveAccess implements URIResolver {
054    // Class constants
055    /** Transfer encoding header. */
056    private static final String TRANSFER_ENCODING_HTTP_HEADER = "Transfer-encoding";
057
058    /** HTTP status code for page not found. */
059    private static final int HTTP_NOTFOUND_VALUE = 404;
060    /** HTTP header for page not found. */
061    private static final String NOTFOUND_HEADER = "HTTP/1.1 404 Not found";
062    /** Content-type header used for page not found. */
063    private static final String CONTENT_TYPE_STRING = "Content-type: text/html";
064    /** Inserted before page not found response. */
065    private static final String HTML_HEADER = "<html><head><title>" + "Not found</title></head><body>";
066    /** Inserted after page not found response. */
067    private static final String HTML_FOOTER = "</body></html>";
068
069    /**
070     * Matches HTTP header lines like HTTP/1.1 404 Page has gone south Groups: 111 2222222222222222222.
071     */
072    private static final Pattern HTTP_HEADER_PATTERN = Pattern.compile("^HTTP/1\\.[01] (\\d+) (.*)$");
073
074    /** The underlying ARC record lookup object. */
075    private ARCLookup lookup;
076
077    /** Logger for this class. */
078    private static final Logger log = LoggerFactory.getLogger(ARCArchiveAccess.class);
079
080    /**
081     * If the value is true, we will try to lookup w/ ftp instead of http, if we don't get a hit in the index.
082     */
083    private static final boolean tryToLookupUriAsFtp = Settings.getBoolean(HarvesterSettings.TRY_LOOKUP_URI_AS_FTP);
084
085    /**
086     * Initialise new ARCArchiveAccess with no index file.
087     *
088     * @param arcRepositoryClient The arcRepositoryClient to use when retrieving
089     * @throws ArgumentNotValid if arcRepositoryClient is null.
090     */
091    public ARCArchiveAccess(ViewerArcRepositoryClient arcRepositoryClient) {
092        ArgumentNotValid.checkNotNull(arcRepositoryClient, "ArcRepositoryClient arcRepositoryClient");
093        lookup = new ARCLookup(arcRepositoryClient);
094        lookup.setTryToLookupUriAsFtp(tryToLookupUriAsFtp);
095        log.info("Constructed instance of ARCArchiveAccess with TryToLookupUriAsFtp: {}", tryToLookupUriAsFtp);
096    }
097
098    /**
099     * This method resets the Lucene index this object works on, and replaces it with the given index.
100     *
101     * @param index The new index file, a directory containing Lucene files.
102     * @throws ArgumentNotValid If argument is null
103     * @throws IOFailure if the file cannot be read
104     */
105    public void setIndex(File index) {
106        lookup.setIndex(index);
107        log.info("ARCArchiveAccess instance now uses indexfile {}", index);
108    }
109
110    /**
111     * Look up a given URI and add its contents to the Response given.
112     *
113     * @param request The request to look up record for
114     * @param response The response to return to the browser
115     * @return The response code for this page if found, or URIResolver.NOT_FOUND otherwise.
116     * @throws IOFailure on trouble looking up the request (timeout, i/o, etc.)
117     * @see URIResolver#lookup(Request, Response)
118     */
119    public int lookup(Request request, Response response) {
120        ArgumentNotValid.checkNotNull(request, "Request request");
121        ArgumentNotValid.checkNotNull(response, "Response response");
122        URI uri = request.getURI();
123        ResultStream content = null;
124        InputStream contentStream = null;
125        log.debug("Doing Lookup of URI '{}'", uri);
126        try {
127            content = lookup.lookup(uri);
128            if (content == null) {
129                // If the object wasn't found, return an appropriate message.
130                log.debug("Missing URL '{}'", uri);
131                createNotFoundResponse(uri, response);
132                return URIResolver.NOT_FOUND;
133            }
134            contentStream = content.getInputStream();
135            // First write the original header.
136            if (content.containsHeader()) {
137                log.debug("Write first the original header");
138                writeHeader(contentStream, response);
139            }
140            // Now flush the content to the browser.
141            readPage(contentStream, response.getOutputStream());
142        } finally {
143            if (contentStream != null) {
144                try {
145                    contentStream.close();
146                } catch (IOException e) {
147                    log.debug("Error writing response to browser for '{}'. Giving up!", uri, e);
148                }
149            }
150        }
151        return response.getStatus();
152    }
153
154    /**
155     * Generate an appropriate response when a URI is not found. If this fails, it is logged, but otherwise ignored.
156     *
157     * @param uri The URI attempted read that could not be found
158     * @param response The Response object to write the error response into.
159     */
160    protected void createNotFoundResponse(URI uri, Response response) {
161        try {
162            // first write a header telling the browser to expect text/html
163            response.setStatus(HTTP_NOTFOUND_VALUE);
164            writeHeader(new ByteArrayInputStream((NOTFOUND_HEADER + '\n' + CONTENT_TYPE_STRING).getBytes()), response);
165            // Now flush an error screen to the browser
166            OutputStream browserOut = response.getOutputStream();
167            browserOut.write((HTML_HEADER + "Can't find URL: " + uri + HTML_FOOTER).getBytes());
168            browserOut.flush();
169        } catch (IOFailure e) {
170            log.debug("Error writing error response to browser " + "for '" + uri + "'. Giving up!", e);
171        } catch (IOException e) {
172            log.debug("Error writing error response to browser " + "for '" + uri + "'. Giving up!", e);
173        }
174        // Do not close stream! That is left to the servlet.
175    }
176
177    /**
178     * Apply filters to HTTP headers. Can be overridden in subclasses. Currently only removes Transfer-encoding headers.
179     *
180     * @param headername The name of the header field, e.g. Content-Type Remember that this is not case sensitive
181     * @param headercontents The contents of the header field, e.g. text/html
182     * @return A (possibly modified) header contents string, or null if the header should be skipped.
183     */
184    protected String filterHeader(String headername, String headercontents) {
185        // Cannot get chunked output to work, so we must remove
186        // any chunked encoding lines
187        if (headername.equalsIgnoreCase(TRANSFER_ENCODING_HTTP_HEADER)) {
188                log.debug("Ignoring headerline: '{}','{}'", headername, headercontents);
189            return null;
190        }
191        return headercontents;
192    }
193
194    /**
195     * Write HTTP header, including status and status reason.
196     *
197     * @param is A stream to read the header from.
198     * @param response A Response to write the header, status and reason to.
199     * @throws IOFailure If the underlying reads or writes fail.
200     */
201    private void writeHeader(InputStream is, Response response) {
202        // Reads until the end of the header (indicated by an empty line)
203        try {
204            for (String line = readLine(is); (line != null) && (line.length() > 0); line = readLine(is)) {
205                // Try to match lines like "HTTP/1.0 200 OK"
206                Matcher m = HTTP_HEADER_PATTERN.matcher(line);
207                if (m.matches()) {
208                    String responsecode = m.group(1);
209                    String responsetext = m.group(2);
210                    // Note: Always parsable int, due to the regexp, so no reason
211                    // to check for parse errors
212                    log.debug("SetStatus '{}':'{}", responsecode, responsetext);
213                    response.setStatus(Integer.parseInt(responsecode), responsetext);
214                } else {
215                    // try to match header-lines containing colon,
216                    // like "Content-Type: text/html"
217                    String[] parts = line.split(":", 2);
218                    if (parts.length != 2) {
219                        log.debug("Malformed header line '" + line + "'");
220                    } else {
221                        String name = parts[0];
222                        String contents = filterHeader(name, parts[1].trim());
223                        if (contents != null) {
224                            // filter out unwanted headers
225                                log.debug("Added header-field '{}' with contents '{}'", name, contents);
226                            response.addHeaderField(name, contents);
227                        }
228                    }
229                }
230            }
231        } catch (IOException e) {
232            throw new IOFailure("Trouble reading from input stream or writing" + " to output stream", e);
233        }
234    }
235
236    /**
237     * Read an entire page body into some stream.
238     *
239     * @param content The stream to read the page from. Not closed afterwards.
240     * @param out The stream to write the results to. Not closed afterwards.
241     * @throws IOFailure If the underlying reads or writes fail
242     */
243    private void readPage(InputStream content, OutputStream out) {
244        BufferedInputStream page = new BufferedInputStream(content);
245        BufferedOutputStream responseOut = new BufferedOutputStream(out);
246        ByteArrayOutputStream baos = new ByteArrayOutputStream(); 
247        try {
248            byte[] buffer = new byte[Constants.IO_BUFFER_SIZE];
249            int bytesRead;
250            while ((bytesRead = page.read(buffer)) != -1) {
251                baos.write(buffer, 0, bytesRead);
252                responseOut.write(buffer, 0, bytesRead);
253            }
254            responseOut.flush();
255            log.debug("pagecontents: ", new String(baos.toByteArray(), "UTF-8"));
256        } catch (IOException e) {
257            throw new IOFailure("Could not read or write data", e);
258        }
259    }
260
261    /**
262     * Read a line of bytes from an InputStream. Useful when an InputStream may contain both text and binary data.
263     *
264     * @param inputStream A source of data
265     * @return A line of text read from inputStream, with terminating \r\n or \n removed, or null if no data is
266     * available.
267     * @throws IOException on trouble reading from input stream
268     */
269    private String readLine(InputStream inputStream) throws IOException {
270        byte[] rawdata = readRawLine(inputStream);
271        if (rawdata == null) {
272            return null;
273        }
274        int len = rawdata.length;
275        if (len > 0) {
276            if (rawdata[len - 1] == '\n') {
277                len--;
278                if (len > 0) {
279                    if (rawdata[len - 1] == '\r') {
280                        len--;
281                    }
282                }
283            }
284        }
285        return new String(rawdata, 0, len);
286    }
287
288    /**
289     * Reads a raw line from an InputStream, up till \n. Since HTTP allows \r\n and \n as terminators, this gets the
290     * whole line. This code is adapted from org.apache.commons.httpclient.HttpParser
291     *
292     * @param inputStream A stream to read from.
293     * @return Array of bytes read or null if none are available.
294     * @throws IOException if the underlying reads fail
295     */
296    private static byte[] readRawLine(InputStream inputStream) throws IOException {
297        ByteArrayOutputStream buf = new ByteArrayOutputStream();
298        int ch;
299        while ((ch = inputStream.read()) >= 0) {
300            buf.write(ch);
301            if (ch == '\n') {
302                break;
303            }
304        }
305        if (buf.size() == 0) {
306            return null;
307        }
308        return buf.toByteArray();
309    }
310}