001/*
002 * #%L
003 * Netarchivesuite - common
004 * %%
005 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
006 *             the National Library of France and the Austrian National Library.
007 * %%
008 * This program is free software: you can redistribute it and/or modify
009 * it under the terms of the GNU Lesser General Public License as
010 * published by the Free Software Foundation, either version 2.1 of the
011 * License, or (at your option) any later version.
012 * 
013 * This program is distributed in the hope that it will be useful,
014 * but WITHOUT ANY WARRANTY; without even the implied warranty of
015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
016 * GNU General Lesser Public License for more details.
017 * 
018 * You should have received a copy of the GNU General Lesser Public
019 * License along with this program.  If not, see
020 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
021 * #L%
022 */
023
024package dk.netarkivet.common.distribute.arcrepository;
025
026import java.io.File;
027import java.io.IOException;
028import java.net.URI;
029import java.util.List;
030
031import org.apache.lucene.document.Document;
032import org.apache.lucene.index.IndexReader;
033import org.apache.lucene.search.ConstantScoreQuery;
034import org.apache.lucene.search.IndexSearcher;
035import org.apache.lucene.search.Query;
036import org.apache.lucene.search.ScoreDoc;
037import org.apache.lucene.search.TermRangeFilter;
038import org.apache.lucene.store.FSDirectory;
039import org.apache.lucene.util.BytesRef;
040import org.slf4j.Logger;
041import org.slf4j.LoggerFactory;
042
043import dk.netarkivet.common.exceptions.ArgumentNotValid;
044import dk.netarkivet.common.exceptions.IOFailure;
045import dk.netarkivet.common.exceptions.IllegalState;
046import dk.netarkivet.common.utils.AllDocsCollector;
047import dk.netarkivet.common.utils.arc.ARCKey;
048import is.hi.bok.deduplicator.DigestIndexer;
049
050/**
051 * This class allows lookup of URLs in the ArcRepository, using full Lucene indexes to find offsets. The input takes the
052 * form of a directory containing a Lucene index.
053 */
054public class ARCLookup {
055
056    /** Logger for this class. */
057    private static final Logger log = LoggerFactory.getLogger(ARCLookup.class);
058
059    /** The ArcRepositoryClient we use to retrieve records. */
060    private final ViewerArcRepositoryClient arcRepositoryClient;
061
062    /** The currently active lucene search engine. */
063    private IndexSearcher luceneSearcher;
064    /** The Indexreader used by the index-searcher. */
065    private IndexReader luceneReader;
066
067    /** If the value is true, we will try to lookup w/ ftp instead of http, if we don't get a hit in the index. */
068    private boolean tryToLookupUriAsFtp;
069
070    /**
071     * Create a new ARCLookup object.
072     *
073     * @param arcRepositoryClient The interface to the ArcRepository
074     * @throws ArgumentNotValid if arcRepositoryClient is null.
075     */
076    public ARCLookup(ViewerArcRepositoryClient arcRepositoryClient) {
077        ArgumentNotValid.checkNotNull(arcRepositoryClient, "ArcRepositoryClient arcRepositoryClient");
078        this.arcRepositoryClient = arcRepositoryClient;
079        luceneSearcher = null;
080    }
081
082    /**
083     * The setter for the option to search for URIs with ftp instead of http as the scheme. 
084     * Note that The scheme information is absent from the original URI request, when the request arrives here
085     * 
086     * @param searchForFtpUri if true, we replace the http schema with ftp and try again, if unsuccessful with http as
087     * the schema
088     */
089    public void setTryToLookupUriAsFtp(boolean searchForFtpUri) {
090        this.tryToLookupUriAsFtp = searchForFtpUri;
091    }
092
093    /**
094     * This method sets the current Lucene index this object works on, replacing and closing the current index if one is
095     * already set.
096     *
097     * @param indexDir The new index, a directory containing Lucene files.
098     * @throws ArgumentNotValid If argument is null
099     */
100    public void setIndex(File indexDir) {
101        ArgumentNotValid.checkNotNull(indexDir, "File indexDir");
102        ArgumentNotValid.checkTrue(indexDir.isDirectory(), "indexDir '" + indexDir + "' should be a directory");
103        if (luceneSearcher != null) {
104            try {
105                // Existing lucene indices must be shut down
106                luceneReader.close();
107            } catch (IOException e) {
108                throw new IOFailure("Unable to close index " + luceneSearcher, e);
109            } finally {
110                // Must be careful to shut down only once.
111                luceneSearcher = null;
112            }
113        }
114        try {
115            luceneReader = org.apache.lucene.index.DirectoryReader.open(FSDirectory.open(indexDir));
116            luceneSearcher = new IndexSearcher(luceneReader);
117        } catch (IOException e) {
118            throw new IOFailure("Unable to find/open index " + indexDir, e);
119        }
120    }
121
122    /**
123     * Look up a given URI and return the contents as an InputStream. The uri is first checked using url-decoding (e.g.
124     * "," in the argument is converted to "%2C"). If this returns no match, the method then searches for a
125     * non-url-decoded match. If neither returns a match the method returns null.
126     * <p>
127     * If the tryToLookupUriAsFtp field is set to true, we will try exchanging the schema with ftp, whenever we can't
128     * lookup the uri with the original schema.
129     *
130     * @param uri The URI to find in the archive. If the URI does not match any entries in the archive, null is
131     * returned.
132     * @return An InputStream Containing all the data in the entry, or null if the entry was not found
133     * @throws IOFailure If the ARC file was found in the Lucene index but not in the bit archive, or if some other
134     * failure happened while finding the file.
135     */
136    public ResultStream lookup(URI uri) {
137        ArgumentNotValid.checkNotNull(uri, "uri");
138        log.debug("Doing lookup of {}", uri);
139        boolean containsHeader = true;
140        // the URI.getSchemeSpecificPart() carries out the url-decoding
141        ARCKey key = luceneLookup(uri.getScheme() + ":" + uri.getSchemeSpecificPart());
142        if (key == null) {
143            // the URI.getRawSchemeSpecificPart() returns the uri in non-decoded form
144            key = luceneLookup(uri.getScheme() + ":" + uri.getRawSchemeSpecificPart());
145        }
146
147        if (key == null && tryToLookupUriAsFtp) {
148            log.debug("Url not found with the schema '{}'. Now trying with 'ftp' as the schema", uri.getScheme());
149            final String ftpSchema = "ftp";
150            key = luceneLookup(ftpSchema + ":" + uri.getSchemeSpecificPart());
151            if (key == null) {
152                key = luceneLookup(ftpSchema + ":" + uri.getRawSchemeSpecificPart());
153                if (key != null) {
154                    // Remember, that the found ftp-records don't have any HTTP
155                    // Header
156                    containsHeader = false;
157                }
158            } else {
159                // Remember, that the found ftp-record don't have any HTTP
160                // Header
161                containsHeader = false;
162            }
163        }
164
165        if (key == null) {
166                log.debug("Lookup failed for uri '{}'");
167            return null; // key not found
168        } else {
169                log.debug("Retrieving record {},{} from archive", key.getFile().getName(), key.getOffset());
170            final BitarchiveRecord bitarchiveRecord = arcRepositoryClient.get(key.getFile().getName(), key.getOffset());
171            if (bitarchiveRecord == null) {
172                String message = "ARC file '" + key.getFile().getName() + "' mentioned in index file was not found by"
173                        + " arc repository. This may mean we have a timeout, or that the index is wrong; or"
174                        + " it may mean we have lost a record in the bitarchives.";
175                log.debug(message);
176                throw new IOFailure(message);
177            }
178            log.debug("Retrieved record {},{} from archive and returning it as ResultStream", key.getFile().getName(), key.getOffset());
179            return new ResultStream(bitarchiveRecord.getData(), containsHeader);
180        }
181    }
182
183    /**
184     * Looks up a URI in our lucene index and extracts a key.
185     *
186     * @param uri A URI to look for.
187     * @return The file and offset where that URI can be found, or null if it doesn't exist.
188     * @throws IllegalState If a URL is found with a malformed origin field.
189     * @throws IOFailure if no index is set or Lucene gives problems.
190     */
191    private ARCKey luceneLookup(String uri) {
192        if (luceneSearcher == null) {
193            throw new IOFailure("No index set while searching for '" + uri + "'");
194        }
195        return luceneLookUp(uri);
196    }
197
198    /**
199     * Lucene Lookup. 
200     *
201     * @param uri A URI to look for.
202     * @return The file and offset where that URI can be found, or null if it doesn't exist. 
203     */
204    private ARCKey luceneLookUp(String uri) {
205        BytesRef uriRef = new BytesRef(uri.getBytes()); // Should we decide which charset?
206
207        Query query = new ConstantScoreQuery(new TermRangeFilter(DigestIndexer.FIELD_URL, uriRef, uriRef, true, true));
208
209        try {
210            AllDocsCollector allResultsCollector = new AllDocsCollector();
211            luceneSearcher.search(query, allResultsCollector);
212            Document doc = null;
213            List<ScoreDoc> hits = allResultsCollector.getHits();
214            if (hits != null) {
215                log.debug("Found {} hits for uri: {}", hits.size(), uri);
216                int i = 0;
217                for (ScoreDoc hit : hits) {
218                    int docId = hit.doc;
219                    doc = luceneSearcher.doc(docId);
220                    String origin = doc.get(DigestIndexer.FIELD_ORIGIN);
221                    // Here is where we will handle multiple hits in the future
222                    if (origin == null) {
223                        log.debug("No origin for URL '{}' hit {}", uri, i++);
224                        continue;
225                    }
226                    String[] originParts = origin.split(",");
227                    if (originParts.length != 2) {
228                        throw new IllegalState("Bad origin for URL '" + uri + "': '" + origin + "'");
229                    }
230                    log.debug("Found document with origin: {}", origin);
231                    return new ARCKey(originParts[0], Long.parseLong(originParts[1]));
232                }
233            }
234        } catch (IOException e) {
235            throw new IOFailure("Fatal error looking up '" + uri + "'", e);
236        }
237        return null;
238    }
239
240}