001/*
002 * #%L
003 * Netarchivesuite - harvester
004 * %%
005 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
006 *             the National Library of France and the Austrian National Library.
007 * %%
008 * This program is free software: you can redistribute it and/or modify
009 * it under the terms of the GNU Lesser General Public License as
010 * published by the Free Software Foundation, either version 2.1 of the
011 * License, or (at your option) any later version.
012 * 
013 * This program is distributed in the hope that it will be useful,
014 * but WITHOUT ANY WARRANTY; without even the implied warranty of
015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
017 * 
 * You should have received a copy of the GNU Lesser General Public
019 * License along with this program.  If not, see
020 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
021 * #L%
022 */
023package dk.netarkivet.harvester.indexserver;
024
025import java.io.BufferedReader;
026import java.io.ByteArrayInputStream;
027import java.io.File;
028import java.io.IOException;
029import java.io.InputStreamReader;
030
031import dk.netarkivet.common.exceptions.ArgumentNotValid;
032import dk.netarkivet.common.exceptions.IOFailure;
033import is.hi.bok.deduplicator.CrawlLogIterator;
034import is.hi.bok.deduplicator.DigestIndexer;
035
036/**
037 * Some Lucene Utilities used in some of our tests.
038 */
039public class LuceneUtils {
040
041    // FIXME WTF?!
042    static final File ORIGINALS_DIR = new File("tests/dk/netarkivet/harvester/harvesting/data/launcher/originals");
043    static final File EMPTY_CRAWLLOG_FILE = new File(ORIGINALS_DIR, "empty_crawl.log");
044
045    /**
046     * Create Dummy Lucene index. uses an empty file as basis for the lucene-index.
047     *
048     * @param indexLocation location of index
049     * @throws IOFailure
050     */
051    public static void makeDummyIndex(File indexLocation) throws IOFailure {
052        try {
053            // use empty crawl.log to generate default lucene index
054            generateIndex(EMPTY_CRAWLLOG_FILE, new BufferedReader(new InputStreamReader(new ByteArrayInputStream(
055                    new byte[0]))), indexLocation);
056        } catch (IOFailure e) {
057            throw new IOFailure("Unable to create dummy lucene index", e);
058        }
059    }
060
061    /**
062     * Generate a Lucene index from a crawllog, and a CDXReader.
063     *
064     * @param CrawlLog some crawllog
065     * @param cdxreader some CDXReader
066     * @param indexDir Destinationdirector for the Lucene index.
067     */
068    public static void generateIndex(File CrawlLog, BufferedReader cdxreader, File indexDir) {
069        ArgumentNotValid.checkNotNull(CrawlLog, "File CrawlLog");
070        ArgumentNotValid.checkNotNull(cdxreader, "BufferedReader cdxreader");
071        ArgumentNotValid.checkNotNull(indexDir, "File indexDir");
072        ArgumentNotValid.checkTrue(CrawlLog.exists(), "The crawl log '" + CrawlLog.getAbsolutePath()
073                + "' does not exist.");
074
075        try {
076            // Setup Lucene for indexing our crawllogs
077            String indexLocation = indexDir.getAbsolutePath();
078            // MODE_BOTH: Both URL's and Hash are indexed: Alternatives:
079            // DigestIndexer.MODE_HASH or DigestIndexer.MODE_URL
080            String indexingMode = DigestIndexer.MODE_BOTH;
081            boolean includeNormalizedURL = false; // used to be 'equivalent' setting
082            boolean includeTimestamp = true; // used to be 'timestamp' setting
083            boolean includeEtag = true; // used to be 'etag' setting
084            boolean addToExistingIndex = false;
085            DigestIndexer indexer = new DigestIndexer(indexLocation, indexingMode, includeNormalizedURL,
086                    includeTimestamp, includeEtag, addToExistingIndex);
087
088            /** The blacklist set to true results in docs matching the mimefilter being ignored. */
089            boolean blacklist = true;
090            final String mimefilter = "^text/.*";
091            final boolean verbose = false; // Avoids System.out.println's
092            String defaultOrigin = "defaultOrigin";
093
094            CrawlLogIterator reader = null;
095            try {
096                reader = new CDXOriginCrawlLogIterator(CrawlLog, cdxreader);
097                indexer.writeToIndex(reader, mimefilter, blacklist, defaultOrigin, verbose);
098            } finally {
099                if (reader != null) {
100                    reader.close();
101                }
102            }
103            indexer.close();
104        } catch (IOException e) {
105            throw new IOFailure("Error setting up craw.log index framework for " + indexDir, e);
106        }
107    }
108
109}