001/*
002 * #%L
003 * Netarchivesuite - wayback
004 * %%
005 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
006 *             the National Library of France and the Austrian National Library.
007 * %%
008 * This program is free software: you can redistribute it and/or modify
009 * it under the terms of the GNU Lesser General Public License as
010 * published by the Free Software Foundation, either version 2.1 of the
011 * License, or (at your option) any later version.
012 * 
013 * This program is distributed in the hope that it will be useful,
014 * but WITHOUT ANY WARRANTY; without even the implied warranty of
015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
016 * GNU General Lesser Public License for more details.
017 * 
018 * You should have received a copy of the GNU General Lesser Public
019 * License along with this program.  If not, see
020 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
021 * #L%
022 */
023package dk.netarkivet.wayback;
024
025import java.io.IOException;
026import java.io.InputStream;
027import java.util.HashMap;
028import java.util.Map;
029import java.util.regex.Matcher;
030import java.util.regex.Pattern;
031
032import org.apache.commons.logging.Log;
033import org.apache.commons.logging.LogFactory;
034import org.archive.io.ArchiveReader;
035import org.archive.io.ArchiveRecordHeader;
036import org.archive.io.arc.ARCRecord;
037import org.archive.io.arc.ARCRecordMetaData;
038import org.archive.wayback.ResourceStore;
039import org.archive.wayback.core.CaptureSearchResult;
040import org.archive.wayback.core.Resource;
041import org.archive.wayback.exception.ResourceNotAvailableException;
042import org.archive.wayback.resourcestore.resourcefile.ArcResource;
043
044import dk.netarkivet.common.distribute.arcrepository.ArcRepositoryClientFactory;
045import dk.netarkivet.common.distribute.arcrepository.BitarchiveRecord;
046import dk.netarkivet.common.distribute.arcrepository.ViewerArcRepositoryClient;
047import dk.netarkivet.common.utils.InputStreamUtils;
048
049/**
050 * This is the connector between netarchivesuite and wayback. And is based on PrototypeNetarchiveResourceStore.java
051 * which was made as a prototype connector.
052 */
053public class NetarchiveResourceStore implements ResourceStore {
054
055    /** JMS ArcRepositoryClient. */
056    protected ViewerArcRepositoryClient client;
057
058    /** Pattern for matching http version header. */
059    private static final Pattern HTTP_HEADER_PATTERN = Pattern.compile("^HTTP/1\\.[01] (\\d+) (.*)$");
060
061    /** Logger. */
062    private Log logger = LogFactory.getLog(getClass().getName());
063
064    /**
065     * Constructor.
066     */
067    public NetarchiveResourceStore() {
068        client = ArcRepositoryClientFactory.getViewerInstance();
069    }
070
071    /**
072     * Transforms search result into a resource, according to the ResourceStore interface.
073     *
074     * @param captureSearchResult the search result.
075     * @return a valid resource containing metadata and a link to the ARC record.
076     * @throws ResourceNotAvailableException if something went wrong fetching record.
077     */
078    public Resource retrieveResource(CaptureSearchResult captureSearchResult) throws ResourceNotAvailableException {
079        long offset;
080        String responseCode = null;
081        Map<String, Object> metadata = new HashMap<String, Object>();
082        ARCRecord arcRecord;
083        ArchiveRecordHeader header;
084
085        String arcfile = captureSearchResult.getFile();
086        try {
087            offset = captureSearchResult.getOffset();
088        } catch (NumberFormatException e) {
089            logger.error("Error looking for non existing resource", e);
090            throw new ResourceNotAvailableException("NetarchiveResourceStore "
091                    + "thows NumberFormatException when reading offset.");
092        } catch (NullPointerException e) {
093            logger.error("Error looking for non existing resource", e);
094            throw new ResourceNotAvailableException("NetarchiveResourceStore "
095                    + "throws NullPointerException when accessing " + "CaptureResult given from Wayback.");
096        }
097        logger.info("Received request for resource from file '" + arcfile + "' at offset '" + offset + "'");
098        BitarchiveRecord bitarchiveRecord = client.get(arcfile, offset);
099        if (bitarchiveRecord == null) {
100            throw new ResourceNotAvailableException("NetarchiveResourceStore: "
101                    + "Bitarchive didn't return the requested record.");
102        }
103        logger.info("Retrieved resource from file '" + arcfile + "' at offset '" + offset + "'");
104
105        InputStream is = bitarchiveRecord.getData();
106        // Match header-lines (until empty line).
107        try {
108            for (String line = InputStreamUtils.readLine(is); line != null && line.length() > 0; line = InputStreamUtils
109                    .readLine(is)) {
110                Matcher m = HTTP_HEADER_PATTERN.matcher(line);
111                if (m.matches()) {
112                    responseCode = m.group(1);
113                    logger.debug("Setting response code '" + responseCode + "'");
114
115                } else {
116                    String[] parts = line.split(":", 2);
117                    if (parts.length != 2) {
118                        logger.debug("Malformed header line '" + line + "'");
119                    } else {
120                        String name = parts[0];
121                        String contents = parts[1].trim();
122                        if (contents != null) {
123                            if (name.equals("Content-Length")) {
124                                logger.info("Setting length header to '" + contents + "'");
125                                metadata.put(ARCRecordMetaData.LENGTH_FIELD_KEY, contents);
126                            } else if (name.equals("Content-Type")) {
127                                logger.info("Setting Content-Type header to '" + contents + "'");
128                                metadata.put(ARCRecordMetaData.MIMETYPE_FIELD_KEY, contents);
129                            } else if (name.equals("Location")) {
130                                logger.info("Setting redirect Location header to '" + contents + "'");
131                                metadata.put("Location", contents);
132                            }
133                        }
134                    }
135                }
136            }
137        } catch (IOException e) {
138            logger.error("Error looking for empty line", e);
139            throw new ResourceNotAvailableException(e.getMessage());
140        }
141        // fill metadata for ARC record.
142        metadata.put(ARCRecordMetaData.URL_FIELD_KEY, captureSearchResult.getUrlKey());
143        // TODO the following is the correct way to set the URL. If we do
144        // things this way then we should be able to get arcrecord to parse
145        // the headers for us.
146        /*
147         * metadata.put(ARCRecordMetaData.URL_FIELD_KEY, captureSearchResult.getOriginalUrl());
148         */
149        try {
150            metadata.put(ARCRecordMetaData.IP_HEADER_FIELD_KEY, captureSearchResult.getOriginalHost());
151        } catch (NullPointerException ex) {
152            metadata.put(ARCRecordMetaData.IP_HEADER_FIELD_KEY, "");
153        }
154        metadata.put(ARCRecordMetaData.DATE_FIELD_KEY, captureSearchResult.getCaptureDate().toString());
155        metadata.put(ARCRecordMetaData.MIMETYPE_FIELD_KEY, captureSearchResult.getMimeType());
156        metadata.put(ARCRecordMetaData.VERSION_FIELD_KEY, captureSearchResult.getHttpCode());
157        metadata.put(ARCRecordMetaData.ABSOLUTE_OFFSET_KEY, "" + offset);
158        metadata.put(ARCRecordMetaData.LENGTH_FIELD_KEY, "" + bitarchiveRecord.getLength());
159        if (responseCode != null) {
160            metadata.put(ARCRecordMetaData.STATUSCODE_FIELD_KEY, responseCode);
161        }
162
163        // create header.
164        try {
165            header = new ARCRecordMetaData(arcfile, metadata);
166        } catch (IOException e) {
167            logger.error("Could not create header", e);
168            throw new ResourceNotAvailableException(e.getMessage());
169        }
170
171        // create ARCRecord.
172        try {
173            arcRecord = new ARCRecord(is, header, 0, false, false, true);
174            int code = arcRecord.getStatusCode();
175            logger.debug("ARCRecord created with code '" + code + "'");
176            arcRecord.skipHttpHeader();
177        } catch (NullPointerException e) {
178            logger.error("Could not create ARCRecord", e);
179            throw new ResourceNotAvailableException("ARC record doesn't contain" + " valid http URL");
180        } catch (IOException e) {
181            logger.error("Could not create ARCRecord", e);
182            throw new ResourceNotAvailableException(e.getMessage());
183        }
184        final String statusCode = responseCode;
185        final Map<String, Object> metadataF = metadata;
186        // TODO This the sleaziest thing in this class. Why does the
187        // ARCRecord give the wrong status code if we don't override this method?
188        Resource resource = new ArcResource(arcRecord, (ArchiveReader) null) {
189            public int getStatusCode() {
190                return Integer.parseInt(statusCode);
191            }
192            // FIXME incompatible, needed?
193            /*
194            @Override
195            public Map<String, String> getHttpHeaders() {
196                return metadataF;
197            }
198            */
199        };
200        logger.info("Returning resource '" + resource + "'");
201        return resource;
202    }
203
204    /**
205     * Shuts down this resource store, closing the arcrepository client.
206     *
207     * @throws IOException if an exception occurred while closing the client.
208     */
209    public void shutdown() throws IOException {
210        // Close JMS connection.
211        client.close();
212    }
213}