/*
 * #%L
 * Netarchivesuite - common
 * %%
 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
 *             the National Library of France and the Austrian National Library.
 * %%
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation, either version 2.1 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Lesser Public License for more details.
 *
 * You should have received a copy of the GNU General Lesser Public
 * License along with this program.  If not, see
 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
 * #L%
 */

package dk.netarkivet.common.distribute.arcrepository;

import java.io.File;
import java.io.IOException;
import java.net.URI;
import java.util.List;

import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.ConstantScoreQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermRangeFilter;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.BytesRef;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import dk.netarkivet.common.exceptions.ArgumentNotValid;
import dk.netarkivet.common.exceptions.IOFailure;
import dk.netarkivet.common.exceptions.IllegalState;
import dk.netarkivet.common.utils.AllDocsCollector;
import dk.netarkivet.common.utils.arc.ARCKey;
import is.hi.bok.deduplicator.DigestIndexer;

/**
 * This class allows lookup of URLs in the ArcRepository, using full Lucene indexes to find offsets. The input takes the
 * form of a directory containing a Lucene index.
 */
public class ARCLookup {

    /** Logger for this class. */
    private static final Logger log = LoggerFactory.getLogger(ARCLookup.class);

    /** The scheme substituted for the original one when retrying a failed lookup as ftp. */
    private static final String FTP_SCHEME = "ftp";

    /** The ArcRepositoryClient we use to retrieve records. */
    private final ViewerArcRepositoryClient arcRepositoryClient;

    /** The currently active lucene search engine. Null until an index has been set. */
    private IndexSearcher luceneSearcher;
    /** The IndexReader used by the index-searcher. Null until an index has been set. */
    private IndexReader luceneReader;

    /** If the value is true, we will try to lookup w/ ftp instead of http, if we don't get a hit in the index. */
    private boolean tryToLookupUriAsFtp;

    /**
     * Create a new ARCLookup object. No index is set initially; {@link #setIndex(File)} must be called before
     * {@link #lookup(URI)}.
     *
     * @param arcRepositoryClient The interface to the ArcRepository
     * @throws ArgumentNotValid if arcRepositoryClient is null.
     */
    public ARCLookup(ViewerArcRepositoryClient arcRepositoryClient) {
        ArgumentNotValid.checkNotNull(arcRepositoryClient, "ArcRepositoryClient arcRepositoryClient");
        this.arcRepositoryClient = arcRepositoryClient;
        luceneSearcher = null;
    }

    /**
     * The setter for the option to search for URIs with ftp instead of http as the scheme. Note that the scheme
     * information is absent from the original URI request when the request arrives here.
     *
     * @param searchForFtpUri if true, we replace the original scheme with ftp and try again whenever the lookup with
     * the original scheme is unsuccessful
     */
    public void setTryToLookupUriAsFtp(boolean searchForFtpUri) {
        this.tryToLookupUriAsFtp = searchForFtpUri;
    }

    /**
     * This method sets the current Lucene index this object works on, replacing and closing the current index if one is
     * already set.
     *
     * @param indexDir The new index, a directory containing Lucene files.
     * @throws ArgumentNotValid If argument is null or is not a directory
     * @throws IOFailure If the previous index cannot be closed or the new index cannot be opened. In the former case
     * no index is set afterwards.
     */
    public void setIndex(File indexDir) {
        ArgumentNotValid.checkNotNull(indexDir, "File indexDir");
        ArgumentNotValid.checkTrue(indexDir.isDirectory(), "indexDir '" + indexDir + "' should be a directory");
        if (luceneSearcher != null) {
            try {
                // Existing lucene indices must be shut down
                luceneReader.close();
            } catch (IOException e) {
                throw new IOFailure("Unable to close index " + luceneSearcher, e);
            } finally {
                // Must be careful to shut down only once: drop both references so that neither the searcher
                // nor the (possibly half-closed) reader can be reused.
                luceneSearcher = null;
                luceneReader = null;
            }
        }
        try {
            luceneReader = DirectoryReader.open(FSDirectory.open(indexDir));
            luceneSearcher = new IndexSearcher(luceneReader);
        } catch (IOException e) {
            throw new IOFailure("Unable to find/open index " + indexDir, e);
        }
    }

    /**
     * Look up a given URI and return the contents as a ResultStream. The uri is first looked up in its url-decoded
     * form (e.g. "%2C" in the argument is decoded to ","). If this returns no match, the method then searches for the
     * raw, non-url-decoded form. If neither returns a match the method returns null.
     * <p>
     * If the tryToLookupUriAsFtp field is set to true, we will try exchanging the scheme with ftp, whenever we can't
     * lookup the uri with the original scheme. Records found that way are flagged as not containing a HTTP header.
     *
     * @param uri The URI to find in the archive. If the URI does not match any entries in the archive, null is
     * returned.
     * @return A ResultStream containing all the data in the entry, or null if the entry was not found
     * @throws ArgumentNotValid If uri is null.
     * @throws IOFailure If the ARC file was found in the Lucene index but not in the bit archive, or if some other
     * failure happened while finding the file (including no index having been set).
     */
    public ResultStream lookup(URI uri) {
        ArgumentNotValid.checkNotNull(uri, "uri");
        log.debug("Doing lookup of {}", uri);
        boolean containsHeader = true;
        ARCKey key = lookupDecodedThenRaw(uri.getScheme(), uri);

        if (key == null && tryToLookupUriAsFtp) {
            log.debug("Url not found with the schema '{}'. Now trying with 'ftp' as the schema", uri.getScheme());
            key = lookupDecodedThenRaw(FTP_SCHEME, uri);
            if (key != null) {
                // Remember, that the found ftp-records don't have any HTTP Header
                containsHeader = false;
            }
        }

        if (key == null) {
            log.debug("Lookup failed for uri '{}'", uri);
            return null; // key not found
        }

        final String arcFileName = key.getFile().getName();
        final long offset = key.getOffset();
        log.debug("Retrieving record {},{} from archive", arcFileName, offset);
        final BitarchiveRecord bitarchiveRecord = arcRepositoryClient.get(arcFileName, offset);
        if (bitarchiveRecord == null) {
            String message = "ARC file '" + arcFileName + "' mentioned in index file was not found by"
                    + " arc repository. This may mean we have a timeout, or that the index is wrong; or"
                    + " it may mean we have lost a record in the bitarchives.";
            log.debug(message);
            throw new IOFailure(message);
        }
        log.debug("Retrieved record {},{} from archive and returning it as ResultStream", arcFileName, offset);
        return new ResultStream(bitarchiveRecord.getData(), containsHeader);
    }

    /**
     * Looks up the given URI under the given scheme, first in url-decoded form and then, if that gives no hit, in raw
     * (non-decoded) form.
     *
     * @param scheme The scheme to prefix the scheme-specific part of the URI with.
     * @param uri The URI whose scheme-specific part is searched for.
     * @return The key of the first match, or null if neither form is present in the index.
     */
    private ARCKey lookupDecodedThenRaw(String scheme, URI uri) {
        // URI.getSchemeSpecificPart() carries out the url-decoding
        ARCKey key = luceneLookup(scheme + ":" + uri.getSchemeSpecificPart());
        if (key == null) {
            // URI.getRawSchemeSpecificPart() returns the uri in non-decoded form
            key = luceneLookup(scheme + ":" + uri.getRawSchemeSpecificPart());
        }
        return key;
    }

    /**
     * Looks up a URI in our lucene index and extracts a key.
     *
     * @param uri A URI to look for.
     * @return The file and offset where that URI can be found, or null if it doesn't exist.
     * @throws IllegalState If a URL is found with a malformed origin field.
     * @throws IOFailure if no index is set or Lucene gives problems.
     */
    private ARCKey luceneLookup(String uri) {
        if (luceneSearcher == null) {
            throw new IOFailure("No index set while searching for '" + uri + "'");
        }
        return luceneLookUp(uri);
    }

    /**
     * Lucene Lookup. Searches the URL field of the index for an exact match and returns the key built from the origin
     * field ("file,offset") of the first hit that has one. Assumes an index has been set.
     *
     * @param uri A URI to look for.
     * @return The file and offset where that URI can be found, or null if it doesn't exist.
     * @throws IllegalState If a hit has an origin that does not consist of exactly two comma-separated parts.
     * @throws NumberFormatException If the offset part of an origin is not a valid long.
     * @throws IOFailure If Lucene fails while searching or reading a document.
     */
    private ARCKey luceneLookUp(String uri) {
        // BytesRef(CharSequence) encodes as UTF-8, independent of the platform default charset.
        BytesRef uriRef = new BytesRef(uri);

        // A range with identical, inclusive bounds matches exactly the given term.
        Query query = new ConstantScoreQuery(new TermRangeFilter(DigestIndexer.FIELD_URL, uriRef, uriRef, true, true));

        try {
            AllDocsCollector allResultsCollector = new AllDocsCollector();
            luceneSearcher.search(query, allResultsCollector);
            List<ScoreDoc> hits = allResultsCollector.getHits();
            if (hits != null) {
                log.debug("Found {} hits for uri: {}", hits.size(), uri);
                int hitIndex = 0;
                for (ScoreDoc hit : hits) {
                    Document doc = luceneSearcher.doc(hit.doc);
                    String origin = doc.get(DigestIndexer.FIELD_ORIGIN);
                    // Here is where we will handle multiple hits in the future
                    if (origin == null) {
                        log.debug("No origin for URL '{}' hit {}", uri, hitIndex);
                        hitIndex++;
                        continue;
                    }
                    String[] originParts = origin.split(",");
                    if (originParts.length != 2) {
                        throw new IllegalState("Bad origin for URL '" + uri + "': '" + origin + "'");
                    }
                    log.debug("Found document with origin: {}", origin);
                    return new ARCKey(originParts[0], Long.parseLong(originParts[1]));
                }
            }
        } catch (IOException e) {
            throw new IOFailure("Fatal error looking up '" + uri + "'", e);
        }
        return null;
    }

}