/*
 * #%L
 * Netarchivesuite - wayback
 * %%
 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
 *             the National Library of France and the Austrian National Library.
 * %%
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation, either version 2.1 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Lesser Public License for more details.
 *
 * You should have received a copy of the GNU General Lesser Public
 * License along with this program.  If not, see
 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
 * #L%
 */
package dk.netarkivet.wayback;

import java.io.IOException;
import java.io.InputStream;
import java.util.HashMap;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.archive.io.ArchiveReader;
import org.archive.io.ArchiveRecordHeader;
import org.archive.io.arc.ARCRecord;
import org.archive.io.arc.ARCRecordMetaData;
import org.archive.wayback.ResourceStore;
import org.archive.wayback.core.CaptureSearchResult;
import org.archive.wayback.core.Resource;
import org.archive.wayback.exception.ResourceNotAvailableException;
import org.archive.wayback.resourcestore.resourcefile.ArcResource;

import dk.netarkivet.common.distribute.arcrepository.ArcRepositoryClientFactory;
import dk.netarkivet.common.distribute.arcrepository.BitarchiveRecord;
import dk.netarkivet.common.distribute.arcrepository.ViewerArcRepositoryClient;
import dk.netarkivet.common.utils.InputStreamUtils;

/**
 * This is the connector between netarchivesuite and wayback. And is based on PrototypeNetarchiveResourceStore.java
 * which was made as a prototype connector.
 */
public class NetarchiveResourceStore implements ResourceStore {

    /** JMS ArcRepositoryClient. */
    protected ViewerArcRepositoryClient client;

    /** Pattern for matching http version header. */
    private static final Pattern HTTP_HEADER_PATTERN = Pattern.compile("^HTTP/1\\.[01] (\\d+) (.*)$");

    /** Logger. */
    private final Log logger = LogFactory.getLog(getClass().getName());

    /**
     * Constructor. Obtains the viewer client from {@link ArcRepositoryClientFactory}.
     */
    public NetarchiveResourceStore() {
        client = ArcRepositoryClientFactory.getViewerInstance();
    }

    /**
     * Transforms search result into a resource, according to the ResourceStore interface.
     * <p>
     * The record is fetched through the arcrepository client, its HTTP header lines are consumed from the record
     * stream, and an {@link ARCRecord} is built on the remainder of the stream using metadata assembled from the
     * search result. If anything fails after the record stream has been obtained, the stream is closed before the
     * exception propagates.
     *
     * @param captureSearchResult the search result.
     * @return a valid resource containing metadata and a link to the ARC record.
     * @throws ResourceNotAvailableException if something went wrong fetching record.
     */
    public Resource retrieveResource(CaptureSearchResult captureSearchResult) throws ResourceNotAvailableException {
        long offset;
        String arcfile = captureSearchResult.getFile();
        try {
            offset = captureSearchResult.getOffset();
        } catch (NumberFormatException e) {
            logger.error("Error looking for non existing resource", e);
            throw new ResourceNotAvailableException("NetarchiveResourceStore "
                    + "thows NumberFormatException when reading offset.");
        } catch (NullPointerException e) {
            logger.error("Error looking for non existing resource", e);
            throw new ResourceNotAvailableException("NetarchiveResourceStore "
                    + "throws NullPointerException when accessing " + "CaptureResult given from Wayback.");
        }
        logger.info("Received request for resource from file '" + arcfile + "' at offset '" + offset + "'");
        BitarchiveRecord bitarchiveRecord = client.get(arcfile, offset);
        if (bitarchiveRecord == null) {
            throw new ResourceNotAvailableException("NetarchiveResourceStore: "
                    + "Bitarchive didn't return the requested record.");
        }
        logger.info("Retrieved resource from file '" + arcfile + "' at offset '" + offset + "'");

        InputStream is = bitarchiveRecord.getData();
        // On success the stream is handed over to the returned resource; on any failure nobody else
        // holds a reference to it, so it has to be closed here.
        boolean handedOver = false;
        try {
            Map<String, Object> metadata = new HashMap<String, Object>();

            // Match header-lines (until empty line).
            String responseCode = readHttpHeaders(is, metadata);

            // fill metadata for ARC record. These values are written after the header-derived ones and
            // therefore take precedence for the mimetype and length fields.
            addCaptureMetadata(metadata, captureSearchResult, offset, bitarchiveRecord.getLength());
            if (responseCode != null) {
                metadata.put(ARCRecordMetaData.STATUSCODE_FIELD_KEY, responseCode);
            }

            // create header.
            ArchiveRecordHeader header;
            try {
                header = new ARCRecordMetaData(arcfile, metadata);
            } catch (IOException e) {
                logger.error("Could not create header", e);
                throw new ResourceNotAvailableException(e.getMessage());
            }

            // create ARCRecord.
            ARCRecord arcRecord;
            try {
                arcRecord = new ARCRecord(is, header, 0, false, false, true);
                int code = arcRecord.getStatusCode();
                logger.debug("ARCRecord created with code '" + code + "'");
                arcRecord.skipHttpHeader();
            } catch (NullPointerException e) {
                logger.error("Could not create ARCRecord", e);
                throw new ResourceNotAvailableException("ARC record doesn't contain" + " valid http URL");
            } catch (IOException e) {
                logger.error("Could not create ARCRecord", e);
                throw new ResourceNotAvailableException(e.getMessage());
            }

            // Parse once up front. A null result means no usable status line was found in the record.
            final Integer parsedStatus = parseStatusCode(responseCode);
            if (parsedStatus == null) {
                logger.debug("No usable HTTP status code found in record; using the ARCRecord status code");
            }
            // TODO This the sleaziest thing in this class. Why does the
            // ARCRecord give the wrong status code if we don't override this method?
            Resource resource = new ArcResource(arcRecord, (ArchiveReader) null) {
                @Override
                public int getStatusCode() {
                    // Without a parsed status line there is nothing to override with; defer to the
                    // superclass instead of failing with a NumberFormatException.
                    return parsedStatus != null ? parsedStatus.intValue() : super.getStatusCode();
                }
                // FIXME a getHttpHeaders() override returning the collected metadata was tried here but is
                // incompatible with the Map<String, String> return type. Is it needed?
            };
            logger.info("Returning resource '" + resource + "'");
            handedOver = true;
            return resource;
        } finally {
            if (!handedOver) {
                closeQuietly(is);
            }
        }
    }

    /**
     * Reads header lines from the record stream up to (and including) the first empty line or end of stream.
     * <p>
     * A line matching the HTTP status-line pattern sets the response code. Other lines are split at the first
     * colon; the Content-Length, Content-Type and Location fields are stored in the given metadata map. Field
     * names are compared case-insensitively, since HTTP header field names are case-insensitive.
     *
     * @param is the record data stream, positioned at the start of the HTTP header.
     * @param metadata the map receiving the recognised header values.
     * @return the response code from the last matching status line, or null if no status line matched.
     * @throws ResourceNotAvailableException if reading from the stream fails.
     */
    private String readHttpHeaders(InputStream is, Map<String, Object> metadata)
            throws ResourceNotAvailableException {
        String responseCode = null;
        try {
            String line = InputStreamUtils.readLine(is);
            while (line != null && line.length() > 0) {
                Matcher m = HTTP_HEADER_PATTERN.matcher(line);
                if (m.matches()) {
                    responseCode = m.group(1);
                    logger.debug("Setting response code '" + responseCode + "'");
                } else {
                    String[] parts = line.split(":", 2);
                    if (parts.length != 2) {
                        logger.debug("Malformed header line '" + line + "'");
                    } else {
                        String name = parts[0];
                        String contents = parts[1].trim();
                        if (name.equalsIgnoreCase("Content-Length")) {
                            logger.info("Setting length header to '" + contents + "'");
                            metadata.put(ARCRecordMetaData.LENGTH_FIELD_KEY, contents);
                        } else if (name.equalsIgnoreCase("Content-Type")) {
                            logger.info("Setting Content-Type header to '" + contents + "'");
                            metadata.put(ARCRecordMetaData.MIMETYPE_FIELD_KEY, contents);
                        } else if (name.equalsIgnoreCase("Location")) {
                            logger.info("Setting redirect Location header to '" + contents + "'");
                            metadata.put("Location", contents);
                        }
                    }
                }
                line = InputStreamUtils.readLine(is);
            }
        } catch (IOException e) {
            logger.error("Error looking for empty line", e);
            throw new ResourceNotAvailableException(e.getMessage());
        }
        return responseCode;
    }

    /**
     * Adds the ARC metadata fields that are derived from the search result and the fetched record.
     *
     * @param metadata the map to fill.
     * @param captureSearchResult the search result supplying url key, host, capture date, mimetype and http code.
     * @param offset the offset of the record in the arc file.
     * @param length the length reported by the fetched bitarchive record.
     */
    private static void addCaptureMetadata(Map<String, Object> metadata, CaptureSearchResult captureSearchResult,
            long offset, long length) {
        metadata.put(ARCRecordMetaData.URL_FIELD_KEY, captureSearchResult.getUrlKey());
        // TODO the following is the correct way to set the URL. If we do
        // things this way then we should be able to get arcrecord to parse
        // the headers for us.
        /*
         * metadata.put(ARCRecordMetaData.URL_FIELD_KEY, captureSearchResult.getOriginalUrl());
         */
        try {
            metadata.put(ARCRecordMetaData.IP_HEADER_FIELD_KEY, captureSearchResult.getOriginalHost());
        } catch (NullPointerException ex) {
            // No host could be derived from the search result; store an empty value instead.
            metadata.put(ARCRecordMetaData.IP_HEADER_FIELD_KEY, "");
        }
        metadata.put(ARCRecordMetaData.DATE_FIELD_KEY, captureSearchResult.getCaptureDate().toString());
        metadata.put(ARCRecordMetaData.MIMETYPE_FIELD_KEY, captureSearchResult.getMimeType());
        metadata.put(ARCRecordMetaData.VERSION_FIELD_KEY, captureSearchResult.getHttpCode());
        metadata.put(ARCRecordMetaData.ABSOLUTE_OFFSET_KEY, "" + offset);
        metadata.put(ARCRecordMetaData.LENGTH_FIELD_KEY, "" + length);
    }

    /**
     * Converts the textual response code to a number.
     *
     * @param responseCode the digits captured from the status line, or null if none was found.
     * @return the parsed code, or null if the argument is null or not a valid int.
     */
    private static Integer parseStatusCode(String responseCode) {
        if (responseCode == null) {
            return null;
        }
        try {
            return Integer.valueOf(responseCode);
        } catch (NumberFormatException e) {
            // The pattern only guarantees digits, not that they fit in an int.
            return null;
        }
    }

    /**
     * Closes the given stream, logging rather than propagating any failure, so that the exception which
     * triggered the cleanup is not masked.
     *
     * @param is the stream to close; may be null.
     */
    private void closeQuietly(InputStream is) {
        if (is == null) {
            return;
        }
        try {
            is.close();
        } catch (IOException e) {
            logger.debug("Could not close record stream", e);
        }
    }

    /**
     * Shuts down this resource store, closing the arcrepository client.
     *
     * @throws IOException if an exception occurred while closing the client.
     */
    public void shutdown() throws IOException {
        // Close JMS connection.
        client.close();
    }
}