/*
 * #%L
 * Netarchivesuite - harvester
 * %%
 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
 * the National Library of France and the Austrian National Library.
 * %%
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation, either version 2.1 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Lesser Public License for more details.
 *
 * You should have received a copy of the GNU General Lesser Public
 * License along with this program. If not, see
 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
 * #L%
 */
package dk.netarkivet.harvester.indexserver;

import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStreamReader;

import dk.netarkivet.common.exceptions.ArgumentNotValid;
import dk.netarkivet.common.exceptions.IOFailure;
import is.hi.bok.deduplicator.CrawlLogIterator;
import is.hi.bok.deduplicator.DigestIndexer;

/**
 * Some Lucene Utilities used in some of our tests.
 */
public class LuceneUtils {

    // FIXME: hard-coded test-data path relative to the working directory; only resolves
    // when tests are run from the project root.
    static final File ORIGINALS_DIR = new File("tests/dk/netarkivet/harvester/harvesting/data/launcher/originals");
    /** An empty crawl.log used as the basis for a dummy Lucene index. */
    static final File EMPTY_CRAWLLOG_FILE = new File(ORIGINALS_DIR, "empty_crawl.log");

    /** Utility class; no instances. */
    private LuceneUtils() {
    }

    /**
     * Create Dummy Lucene index. Uses an empty file as basis for the lucene-index.
     *
     * @param indexLocation location of index
     * @throws IOFailure if the dummy index could not be created
     */
    public static void makeDummyIndex(File indexLocation) throws IOFailure {
        try {
            // use empty crawl.log to generate default lucene index
            generateIndex(EMPTY_CRAWLLOG_FILE, new BufferedReader(new InputStreamReader(new ByteArrayInputStream(
                    new byte[0]))), indexLocation);
        } catch (IOFailure e) {
            throw new IOFailure("Unable to create dummy lucene index", e);
        }
    }

    /**
     * Generate a Lucene index from a crawllog, and a CDXReader.
     *
     * @param crawlLog some crawllog
     * @param cdxreader some CDXReader
     * @param indexDir destination directory for the Lucene index
     * @throws ArgumentNotValid if any argument is null, or if the crawl log does not exist
     * @throws IOFailure if the index could not be set up or written
     */
    public static void generateIndex(File crawlLog, BufferedReader cdxreader, File indexDir) {
        ArgumentNotValid.checkNotNull(crawlLog, "File CrawlLog");
        ArgumentNotValid.checkNotNull(cdxreader, "BufferedReader cdxreader");
        ArgumentNotValid.checkNotNull(indexDir, "File indexDir");
        ArgumentNotValid.checkTrue(crawlLog.exists(), "The crawl log '" + crawlLog.getAbsolutePath()
                + "' does not exist.");

        try {
            // Setup Lucene for indexing our crawllogs
            String indexLocation = indexDir.getAbsolutePath();
            // MODE_BOTH: Both URL's and Hash are indexed: Alternatives:
            // DigestIndexer.MODE_HASH or DigestIndexer.MODE_URL
            String indexingMode = DigestIndexer.MODE_BOTH;
            boolean includeNormalizedURL = false; // used to be 'equivalent' setting
            boolean includeTimestamp = true; // used to be 'timestamp' setting
            boolean includeEtag = true; // used to be 'etag' setting
            boolean addToExistingIndex = false;
            DigestIndexer indexer = new DigestIndexer(indexLocation, indexingMode, includeNormalizedURL,
                    includeTimestamp, includeEtag, addToExistingIndex);

            /** The blacklist set to true results in docs matching the mimefilter being ignored. */
            boolean blacklist = true;
            final String mimefilter = "^text/.*";
            final boolean verbose = false; // Avoids System.out.println's
            String defaultOrigin = "defaultOrigin";

            // Close the indexer in a finally block so the underlying Lucene writer is
            // released even if reading the crawl log or writing the index fails.
            try {
                CrawlLogIterator reader = null;
                try {
                    reader = new CDXOriginCrawlLogIterator(crawlLog, cdxreader);
                    indexer.writeToIndex(reader, mimefilter, blacklist, defaultOrigin, verbose);
                } finally {
                    if (reader != null) {
                        reader.close();
                    }
                }
            } finally {
                indexer.close();
            }
        } catch (IOException e) {
            throw new IOFailure("Error setting up crawl.log index framework for " + indexDir, e);
        }
    }

}