/*
 * #%L
 * Netarchivesuite - wayback
 * %%
 * Copyright (C) 2005 - 2018 The Royal Danish Library, 
 *             the National Library of France and the Austrian National Library.
 * %%
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation, either version 2.1 of the
 * License, or (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Lesser Public License for more details.
 * 
 * You should have received a copy of the GNU General Lesser Public
 * License along with this program.  If not, see
 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
 * #L%
 */
package dk.netarkivet.wayback;

import java.io.IOException;
import java.io.InputStream;
import java.util.HashMap;
import java.util.Map;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.archive.format.ArchiveFileConstants;
import org.archive.io.ArchiveRecordHeader;
import org.archive.io.arc.ARCRecord;
import org.archive.io.arc.ARCRecordMetaData;
import org.archive.wayback.ResourceStore;
import org.archive.wayback.core.CaptureSearchResult;
import org.archive.wayback.core.Resource;
import org.archive.wayback.exception.ResourceNotAvailableException;
import org.archive.wayback.resourcestore.resourcefile.ArcResource;

import dk.netarkivet.common.distribute.arcrepository.ArcRepositoryClientFactory;
import dk.netarkivet.common.distribute.arcrepository.BitarchiveRecord;
import dk.netarkivet.common.distribute.arcrepository.ViewerArcRepositoryClient;

/**
 * This is the connector between netarchivesuite and wayback. It is based on PrototypeNetarchiveResourceStore.java,
 * which was made as a prototype connector.
 */
public class NetarchiveResourceStore implements ResourceStore {

    /** JMS ArcRepositoryClient. */
    protected ViewerArcRepositoryClient client;

    /** Logger. */
    private final Log logger = LogFactory.getLog(getClass().getName());

    /**
     * Constructor. Obtains the viewer client from {@link ArcRepositoryClientFactory}.
     */
    public NetarchiveResourceStore() {
        client = ArcRepositoryClientFactory.getViewerInstance();
    }

    /**
     * Transforms search result into a resource, according to the ResourceStore interface.
     * <p>
     * The record is fetched from the arcrepository client using the file name and offset of the search result, wrapped
     * in an {@link ARCRecord} with metadata copied from the search result, and returned as an {@link ArcResource}.
     * If anything fails after the record data stream has been obtained, the stream is closed before the failure is
     * propagated.
     *
     * @param captureSearchResult the search result.
     * @return a valid resource containing metadata and a link to the ARC record.
     * @throws ResourceNotAvailableException if something went wrong fetching record, including the case where the
     * given search result is null.
     */
    public Resource retrieveResource(CaptureSearchResult captureSearchResult) throws ResourceNotAvailableException {
        // Guard explicitly: getFile() below would otherwise throw a raw NullPointerException
        // before the offset handling gets a chance to translate it.
        if (captureSearchResult == null) {
            logger.error("Error looking for non existing resource: the given CaptureSearchResult is null");
            throw new ResourceNotAvailableException("NetarchiveResourceStore "
                    + "throws NullPointerException when accessing " + "CaptureResult given from Wayback.");
        }

        String filename = captureSearchResult.getFile();
        long offset;
        try {
            offset = captureSearchResult.getOffset();
        } catch (NumberFormatException e) {
            logger.error("Error looking for non existing resource", e);
            throw new ResourceNotAvailableException("NetarchiveResourceStore "
                    + "throws NumberFormatException when reading offset.");
        } catch (NullPointerException e) {
            logger.error("Error looking for non existing resource", e);
            throw new ResourceNotAvailableException("NetarchiveResourceStore "
                    + "throws NullPointerException when accessing " + "CaptureResult given from Wayback.");
        }

        logger.info("Received request for resource from file '" + filename + "' at offset '" + offset + "'");
        BitarchiveRecord bitarchiveRecord = client.get(filename, offset);
        if (bitarchiveRecord == null) {
            throw new ResourceNotAvailableException("NetarchiveResourceStore: "
                    + "Bitarchive didn't return the requested record.");
        }
        logger.info("Retrieved resource from file '" + filename + "' at offset '" + offset + "'");

        // This InputStream is just the http-response, starting with the HTTP headers.
        InputStream is = bitarchiveRecord.getData();
        // Stays false until the stream is owned by the resource we return; used to avoid leaking it on failure.
        boolean handedOver = false;
        try {
            Map<String, Object> metadata = buildMetadata(captureSearchResult, offset,
                    "" + bitarchiveRecord.getLength());
            ArchiveRecordHeader arcRecordMetaData = createHeader(filename, metadata);
            ARCRecord arcRecord = createArcRecord(is, arcRecordMetaData);
            Resource resource = new ArcResource(arcRecord, null);
            parseHeadersLeniently(resource);
            logger.info("Returning resource '" + resource + "'");
            handedOver = true;
            return resource;
        } finally {
            if (!handedOver) {
                closeQuietly(is);
            }
        }
    }

    /**
     * Collects the ARC metadata fields for a record from the search result.
     *
     * @param captureSearchResult the search result the field values are read from.
     * @param offset the offset of the record in its file.
     * @param length the record length, already converted to a string.
     * @return a new map holding the metadata fields.
     */
    private Map<String, Object> buildMetadata(CaptureSearchResult captureSearchResult, long offset, String length) {
        Map<String, Object> metadata = new HashMap<String, Object>();
        metadata.put(ARCRecordMetaData.URL_FIELD_KEY, captureSearchResult.getOriginalUrl());
        try {
            metadata.put(ARCRecordMetaData.IP_HEADER_FIELD_KEY, captureSearchResult.getOriginalHost());
        } catch (NullPointerException ex) {
            // No host could be derived from the search result; fall back to an empty value.
            metadata.put(ARCRecordMetaData.IP_HEADER_FIELD_KEY, "");
        }
        metadata.put(ARCRecordMetaData.DATE_FIELD_KEY, captureSearchResult.getCaptureDate().toString());
        metadata.put(ARCRecordMetaData.MIMETYPE_FIELD_KEY, captureSearchResult.getMimeType());
        // NOTE(review): the version field is filled with the HTTP code, same as the status code field below;
        // kept as in the original implementation - confirm this is intended.
        metadata.put(ARCRecordMetaData.VERSION_FIELD_KEY, captureSearchResult.getHttpCode());
        metadata.put(ARCRecordMetaData.ABSOLUTE_OFFSET_KEY, "" + offset);
        metadata.put(ARCRecordMetaData.LENGTH_FIELD_KEY, length);
        metadata.put(ARCRecordMetaData.STATUSCODE_FIELD_KEY, captureSearchResult.getHttpCode());
        metadata.put(ArchiveFileConstants.ORIGIN_FIELD_KEY, captureSearchResult.getOriginalUrl());
        return metadata;
    }

    /**
     * Creates the ARC record header from the collected metadata.
     *
     * @param filename the name of the file the record comes from.
     * @param metadata the metadata fields of the record.
     * @return the record header.
     * @throws ResourceNotAvailableException if the header could not be created.
     */
    private ArchiveRecordHeader createHeader(String filename, Map<String, Object> metadata)
            throws ResourceNotAvailableException {
        try {
            return new ARCRecordMetaData(filename, metadata);
        } catch (IOException e) {
            logger.error("Could not create arcRecordMetaData", e);
            throw new ResourceNotAvailableException(e.getMessage());
        }
    }

    /**
     * Wraps the record data in an ARCRecord.
     *
     * @param is the record data.
     * @param arcRecordMetaData the header describing the record.
     * @return the created ARCRecord.
     * @throws ResourceNotAvailableException if the ARCRecord could not be created.
     */
    private ARCRecord createArcRecord(InputStream is, ArchiveRecordHeader arcRecordMetaData)
            throws ResourceNotAvailableException {
        try {
            ARCRecord arcRecord = new ARCRecord(is, arcRecordMetaData, 0, false, false, true);
            logger.debug("ARCRecord created with code '" + arcRecord.getStatusCode() + "'");
            logger.debug("Headers: " + arcRecord.getHeaderString());
            return arcRecord;
        } catch (NullPointerException e) {
            logger.error("Could not create ARCRecord", e);
            throw new ResourceNotAvailableException("ARC record doesn't contain" + " valid http URL");
        } catch (IOException e) {
            logger.error("Could not create ARCRecord", e);
            throw new ResourceNotAvailableException(e.getMessage());
        }
    }

    /**
     * Parses the headers of the resource, logging (but not propagating) an IOException.
     *
     * @param resource the resource to parse headers for.
     */
    private void parseHeadersLeniently(Resource resource) {
        try {
            // This call has the side-effect of queueing up the resource at the start of the response-body,
            // after the http headers.
            resource.parseHeaders();
        } catch (IOException e) {
            // Best effort: the resource is still returned, but make the failure visible in the log.
            logger.warn("Could not parse headers of resource '" + resource + "'", e);
        }
    }

    /**
     * Closes the given stream, logging any failure to do so instead of throwing.
     *
     * @param stream the stream to close; may be null.
     */
    private void closeQuietly(InputStream stream) {
        if (stream == null) {
            return;
        }
        try {
            stream.close();
        } catch (IOException e) {
            logger.debug("Could not close record data stream", e);
        }
    }

    /**
     * Shuts down this resource store, closing the arcrepository client.
     *
     * @throws IOException if an exception occurred while closing the client.
     */
    public void shutdown() throws IOException {
        // Close JMS connection.
        client.close();
    }
}