001/* 002 * #%L 003 * Netarchivesuite - harvester 004 * %% 005 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library, 006 * the National Library of France and the Austrian National Library. 007 * %% 008 * This program is free software: you can redistribute it and/or modify 009 * it under the terms of the GNU Lesser General Public License as 010 * published by the Free Software Foundation, either version 2.1 of the 011 * License, or (at your option) any later version. 012 * 013 * This program is distributed in the hope that it will be useful, 014 * but WITHOUT ANY WARRANTY; without even the implied warranty of 015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 016 * GNU General Lesser Public License for more details. 017 * 018 * You should have received a copy of the GNU General Lesser Public 019 * License along with this program. If not, see 020 * <http://www.gnu.org/licenses/lgpl-2.1.html>. 021 * #L% 022 */ 023package dk.netarkivet.viewerproxy; 024 025import java.io.BufferedInputStream; 026import java.io.BufferedOutputStream; 027import java.io.ByteArrayInputStream; 028import java.io.ByteArrayOutputStream; 029import java.io.File; 030import java.io.IOException; 031import java.io.InputStream; 032import java.io.OutputStream; 033import java.net.URI; 034import java.util.regex.Matcher; 035import java.util.regex.Pattern; 036 037import org.slf4j.Logger; 038import org.slf4j.LoggerFactory; 039 040import dk.netarkivet.common.Constants; 041import dk.netarkivet.common.distribute.arcrepository.ARCLookup; 042import dk.netarkivet.common.distribute.arcrepository.ResultStream; 043import dk.netarkivet.common.distribute.arcrepository.ViewerArcRepositoryClient; 044import dk.netarkivet.common.exceptions.ArgumentNotValid; 045import dk.netarkivet.common.exceptions.IOFailure; 046import dk.netarkivet.common.utils.Settings; 047import dk.netarkivet.harvester.HarvesterSettings; 048 049/** 050 * The ARCArchiveAccess class implements reading of ARC indexes and files. It builds on the Java ARC utils and Lucene 051 * indexes, and handles using these in an HTTP context. 052 */ 053public class ARCArchiveAccess implements URIResolver { 054 // Class constants 055 /** Transfer encoding header. */ 056 private static final String TRANSFER_ENCODING_HTTP_HEADER = "Transfer-encoding"; 057 058 /** HTTP status code for page not found. */ 059 private static final int HTTP_NOTFOUND_VALUE = 404; 060 /** HTTP header for page not found. */ 061 private static final String NOTFOUND_HEADER = "HTTP/1.1 404 Not found"; 062 /** Content-type header used for page not found. */ 063 private static final String CONTENT_TYPE_STRING = "Content-type: text/html"; 064 /** Inserted before page not found response. */ 065 private static final String HTML_HEADER = "<html><head><title>" + "Not found</title></head><body>"; 066 /** Inserted after page not found response. */ 067 private static final String HTML_FOOTER = "</body></html>"; 068 069 /** 070 * Matches HTTP header lines like HTTP/1.1 404 Page has gone south Groups: 111 2222222222222222222. 071 */ 072 private static final Pattern HTTP_HEADER_PATTERN = Pattern.compile("^HTTP/1\\.[01] (\\d+) (.*)$"); 073 074 /** The underlying ARC record lookup object. */ 075 private ARCLookup lookup; 076 077 /** Logger for this class. */ 078 private static final Logger log = LoggerFactory.getLogger(ARCArchiveAccess.class); 079 080 /** 081 * If the value is true, we will try to lookup w/ ftp instead of http, if we don't get a hit in the index. 082 */ 083 private static final boolean tryToLookupUriAsFtp = Settings.getBoolean(HarvesterSettings.TRY_LOOKUP_URI_AS_FTP); 084 085 /** 086 * Initialise new ARCArchiveAccess with no index file. 087 * 088 * @param arcRepositoryClient The arcRepositoryClient to use when retrieving 089 * @throws ArgumentNotValid if arcRepositoryClient is null. 090 */ 091 public ARCArchiveAccess(ViewerArcRepositoryClient arcRepositoryClient) { 092 ArgumentNotValid.checkNotNull(arcRepositoryClient, "ArcRepositoryClient arcRepositoryClient"); 093 lookup = new ARCLookup(arcRepositoryClient); 094 lookup.setTryToLookupUriAsFtp(tryToLookupUriAsFtp); 095 log.info("Constructed instance of ARCArchiveAccess with TryToLookupUriAsFtp: {}", tryToLookupUriAsFtp); 096 } 097 098 /** 099 * This method resets the Lucene index this object works on, and replaces it with the given index. 100 * 101 * @param index The new index file, a directory containing Lucene files. 102 * @throws ArgumentNotValid If argument is null 103 * @throws IOFailure if the file cannot be read 104 */ 105 public void setIndex(File index) { 106 lookup.setIndex(index); 107 log.info("ARCArchiveAccess instance now uses indexfile {}", index); 108 } 109 110 /** 111 * Look up a given URI and add its contents to the Response given. 112 * 113 * @param request The request to look up record for 114 * @param response The response to return to the browser 115 * @return The response code for this page if found, or URIResolver.NOT_FOUND otherwise. 116 * @throws IOFailure on trouble looking up the request (timeout, i/o, etc.) 117 * @see URIResolver#lookup(Request, Response) 118 */ 119 public int lookup(Request request, Response response) { 120 ArgumentNotValid.checkNotNull(request, "Request request"); 121 ArgumentNotValid.checkNotNull(response, "Response response"); 122 URI uri = request.getURI(); 123 ResultStream content = null; 124 InputStream contentStream = null; 125 log.debug("Doing Lookup of URI '{}'", uri); 126 try { 127 content = lookup.lookup(uri); 128 if (content == null) { 129 // If the object wasn't found, return an appropriate message. 130 log.debug("Missing URL '{}'", uri); 131 createNotFoundResponse(uri, response); 132 return URIResolver.NOT_FOUND; 133 } 134 contentStream = content.getInputStream(); 135 // First write the original header. 136 if (content.containsHeader()) { 137 log.debug("Write first the original header"); 138 writeHeader(contentStream, response); 139 } 140 // Now flush the content to the browser. 141 readPage(contentStream, response.getOutputStream()); 142 } finally { 143 if (contentStream != null) { 144 try { 145 contentStream.close(); 146 } catch (IOException e) { 147 log.debug("Error writing response to browser for '{}'. Giving up!", uri, e); 148 } 149 } 150 } 151 return response.getStatus(); 152 } 153 154 /** 155 * Generate an appropriate response when a URI is not found. If this fails, it is logged, but otherwise ignored. 156 * 157 * @param uri The URI attempted read that could not be found 158 * @param response The Response object to write the error response into. 159 */ 160 protected void createNotFoundResponse(URI uri, Response response) { 161 try { 162 // first write a header telling the browser to expect text/html 163 response.setStatus(HTTP_NOTFOUND_VALUE); 164 writeHeader(new ByteArrayInputStream((NOTFOUND_HEADER + '\n' + CONTENT_TYPE_STRING).getBytes()), response); 165 // Now flush an error screen to the browser 166 OutputStream browserOut = response.getOutputStream(); 167 browserOut.write((HTML_HEADER + "Can't find URL: " + uri + HTML_FOOTER).getBytes()); 168 browserOut.flush(); 169 } catch (IOFailure e) { 170 log.debug("Error writing error response to browser " + "for '" + uri + "'. Giving up!", e); 171 } catch (IOException e) { 172 log.debug("Error writing error response to browser " + "for '" + uri + "'. Giving up!", e); 173 } 174 // Do not close stream! That is left to the servlet. 175 } 176 177 /** 178 * Apply filters to HTTP headers. Can be overridden in subclasses. Currently only removes Transfer-encoding headers. 179 * 180 * @param headername The name of the header field, e.g. Content-Type Remember that this is not case sensitive 181 * @param headercontents The contents of the header field, e.g. text/html 182 * @return A (possibly modified) header contents string, or null if the header should be skipped. 183 */ 184 protected String filterHeader(String headername, String headercontents) { 185 // Cannot get chunked output to work, so we must remove 186 // any chunked encoding lines 187 if (headername.equalsIgnoreCase(TRANSFER_ENCODING_HTTP_HEADER)) { 188 log.debug("Ignoring headerline: '{}','{}'", headername, headercontents); 189 return null; 190 } 191 return headercontents; 192 } 193 194 /** 195 * Write HTTP header, including status and status reason. 196 * 197 * @param is A stream to read the header from. 198 * @param response A Response to write the header, status and reason to. 199 * @throws IOFailure If the underlying reads or writes fail. 200 */ 201 private void writeHeader(InputStream is, Response response) { 202 // Reads until the end of the header (indicated by an empty line) 203 try { 204 for (String line = readLine(is); (line != null) && (line.length() > 0); line = readLine(is)) { 205 // Try to match lines like "HTTP/1.0 200 OK" 206 Matcher m = HTTP_HEADER_PATTERN.matcher(line); 207 if (m.matches()) { 208 String responsecode = m.group(1); 209 String responsetext = m.group(2); 210 // Note: Always parsable int, due to the regexp, so no reason 211 // to check for parse errors 212 log.debug("SetStatus '{}':'{}", responsecode, responsetext); 213 response.setStatus(Integer.parseInt(responsecode), responsetext); 214 } else { 215 // try to match header-lines containing colon, 216 // like "Content-Type: text/html" 217 String[] parts = line.split(":", 2); 218 if (parts.length != 2) { 219 log.debug("Malformed header line '" + line + "'"); 220 } else { 221 String name = parts[0]; 222 String contents = filterHeader(name, parts[1].trim()); 223 if (contents != null) { 224 // filter out unwanted headers 225 log.debug("Added header-field '{}' with contents '{}'", name, contents); 226 response.addHeaderField(name, contents); 227 } 228 } 229 } 230 } 231 } catch (IOException e) { 232 throw new IOFailure("Trouble reading from input stream or writing" + " to output stream", e); 233 } 234 } 235 236 /** 237 * Read an entire page body into some stream. 238 * 239 * @param content The stream to read the page from. Not closed afterwards. 240 * @param out The stream to write the results to. Not closed afterwards. 241 * @throws IOFailure If the underlying reads or writes fail 242 */ 243 private void readPage(InputStream content, OutputStream out) { 244 BufferedInputStream page = new BufferedInputStream(content); 245 BufferedOutputStream responseOut = new BufferedOutputStream(out); 246 ByteArrayOutputStream baos = new ByteArrayOutputStream(); 247 try { 248 byte[] buffer = new byte[Constants.IO_BUFFER_SIZE]; 249 int bytesRead; 250 while ((bytesRead = page.read(buffer)) != -1) { 251 baos.write(buffer, 0, bytesRead); 252 responseOut.write(buffer, 0, bytesRead); 253 } 254 responseOut.flush(); 255 log.debug("pagecontents: ", new String(baos.toByteArray(), "UTF-8")); 256 } catch (IOException e) { 257 throw new IOFailure("Could not read or write data", e); 258 } 259 } 260 261 /** 262 * Read a line of bytes from an InputStream. Useful when an InputStream may contain both text and binary data. 263 * 264 * @param inputStream A source of data 265 * @return A line of text read from inputStream, with terminating \r\n or \n removed, or null if no data is 266 * available. 267 * @throws IOException on trouble reading from input stream 268 */ 269 private String readLine(InputStream inputStream) throws IOException { 270 byte[] rawdata = readRawLine(inputStream); 271 if (rawdata == null) { 272 return null; 273 } 274 int len = rawdata.length; 275 if (len > 0) { 276 if (rawdata[len - 1] == '\n') { 277 len--; 278 if (len > 0) { 279 if (rawdata[len - 1] == '\r') { 280 len--; 281 } 282 } 283 } 284 } 285 return new String(rawdata, 0, len); 286 } 287 288 /** 289 * Reads a raw line from an InputStream, up till \n. Since HTTP allows \r\n and \n as terminators, this gets the 290 * whole line. This code is adapted from org.apache.commons.httpclient.HttpParser 291 * 292 * @param inputStream A stream to read from. 293 * @return Array of bytes read or null if none are available. 294 * @throws IOException if the underlying reads fail 295 */ 296 private static byte[] readRawLine(InputStream inputStream) throws IOException { 297 ByteArrayOutputStream buf = new ByteArrayOutputStream(); 298 int ch; 299 while ((ch = inputStream.read()) >= 0) { 300 buf.write(ch); 301 if (ch == '\n') { 302 break; 303 } 304 } 305 if (buf.size() == 0) { 306 return null; 307 } 308 return buf.toByteArray(); 309 } 310}