/* DigestIndexer
 *
 * Created on 10.04.2006
 *
 * Copyright (C) 2006 National and University Library of Iceland
 *
 * This file is part of the DeDuplicator (Heritrix add-on module).
 *
 * DeDuplicator is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * DeDuplicator is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with DeDuplicator; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */
package is.hi.bok.deduplicator;

import java.io.File;
import java.io.IOException;
import java.io.PrintWriter;
import java.lang.reflect.Constructor;
import java.util.List;
import java.util.regex.Pattern;

import org.apache.commons.cli.Option;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.archive.util.ArchiveUtils;

import dk.netarkivet.common.Constants;

/**
 * A class for building a de-duplication index.
048 * <p> 049 * The indexing can be done via the command line options (Run with --help parameter to print usage information) or 050 * natively embedded in other applications. 051 * <p> 052 * This class also defines string constants for the lucene field names. 053 * 054 * @author Kristinn Sigurðsson 055 * @author Søren Vejrup Carlsen 056 */ 057public class DigestIndexer { 058 059 // Lucene index field names 060 /** The URL. * */ 061 public static final String FIELD_URL = "url"; 062 /** The content digest as String. * */ 063 public static final String FIELD_DIGEST = "digest"; 064 /** 065 * The URLs timestamp (time of fetch). The exact nature of this time may vary slightly depending on the source (i.e. 066 * crawl.log and ARCs contain slightly different times but both indicate roughly when the document was obtained. The 067 * time is encoded as a String with the Java date format yyyyMMddHHmmssSSS 068 */ 069 public static final String FIELD_TIMESTAMP = "date"; 070 /** The document's etag. * */ 071 public static final String FIELD_ETAG = "etag"; 072 /** A stripped (normalized) version of the URL. * */ 073 public static final String FIELD_URL_NORMALIZED = "url-normalized"; 074 /** 075 * A field containing meta-data on where the original version of a document is stored. 076 */ 077 public static final String FIELD_ORIGIN = "origin"; 078 079 // Indexing modes (by url, by hash or both) 080 /** 081 * Index URL enabling lookups by URL. If normalized URLs are included in the index they will also be indexed and 082 * searchable. * 083 */ 084 public static final String MODE_URL = "URL"; 085 /** Index HASH enabling lookups by hash (content digest). * */ 086 public static final String MODE_HASH = "HASH"; 087 /** Both URL and hash are indexed. * */ 088 public static final String MODE_BOTH = "BOTH"; 089 090 /** Lucene Storage used by the indexwriter. */ 091 private Directory luceneDirectory; 092 093 /** The index being manipulated. 
* */ 094 private IndexWriter index; 095 096 /** 097 * @return the IndexWriter 098 */ 099 public IndexWriter getIndex() { 100 return index; 101 } 102 103 // The options with default settings 104 /** Should etags be included in the index. */ 105 private boolean etag = false; 106 /** 107 * Should a normalized version of the URL be added to the index. 108 */ 109 private boolean equivalent = false; 110 /** Should a timestamp be included in the index. */ 111 private boolean timestamp = false; 112 /** Should we index the url. */ 113 private boolean indexURL = true; 114 /** Should we index the digest. */ 115 private boolean indexDigest = true; 116 117 /** 118 * Each instance of this class wraps one Lucene index for writing deduplication information to it. 119 * 120 * @param indexLocation The location of the index (path). 121 * @param indexingMode Index {@link #MODE_URL}, {@link #MODE_HASH} or {@link #MODE_BOTH}. 122 * @param includeNormalizedURL Should a normalized version of the URL be added to the index. See 123 * {@link #stripURL(String)}. 124 * @param includeTimestamp Should a timestamp be included in the index. 125 * @param includeEtag Should an Etag be included in the index. 126 * @param addToExistingIndex Are we opening up an existing index. Setting this to false will cause any index at 127 * <code>indexLocation</code> to be overwritten. 128 * @throws IOException If an error occurs opening the index. 
129 */ 130 public DigestIndexer(String indexLocation, String indexingMode, boolean includeNormalizedURL, 131 boolean includeTimestamp, boolean includeEtag, boolean addToExistingIndex) throws IOException { 132 133 this.etag = includeEtag; 134 this.equivalent = includeNormalizedURL; 135 this.timestamp = includeTimestamp; 136 137 if (indexingMode.equals(MODE_URL)) { 138 indexDigest = false; 139 } else if (indexingMode.equals(MODE_HASH)) { 140 indexURL = false; 141 } 142 143 // Set up the index writer 144 IndexWriterConfig config = new IndexWriterConfig(Constants.LUCENE_VERSION, new WhitespaceAnalyzer( 145 Constants.LUCENE_VERSION)); 146 // TODO Possibly change the default MergePolicy, see NAS-2119 147 if (!addToExistingIndex) { 148 config.setOpenMode(OpenMode.CREATE); 149 } else { 150 config.setOpenMode(OpenMode.CREATE_OR_APPEND); 151 } 152 luceneDirectory = FSDirectory.open(new File(indexLocation)); 153 index = new IndexWriter(luceneDirectory, config); 154 } 155 156 /** 157 * Writes the contents of a {@link CrawlDataIterator} to this index. 158 * <p> 159 * This method may be invoked multiple times with different CrawlDataIterators until {@link #close} has been called. 160 * 161 * @param dataIt The CrawlDataIterator that provides the data to index. 162 * @param mimefilter A regular expression that is used as a filter on the mimetypes to include in the index. 163 * @param blacklist If true then the <code>mimefilter</code> is used as a blacklist for mimetypes. If false then the 164 * <code>mimefilter</code> is treated as a whitelist. 165 * @param defaultOrigin If an item is missing an origin, this default value will be assigned to it. Can be null if 166 * no default origin value should be assigned. 167 * @param verbose If true then progress information will be sent to System.out. 168 * @return The number of items added to the index. 169 * @throws IOException If an error occurs writing the index. 
170 */ 171 public long writeToIndex(CrawlDataIterator dataIt, String mimefilter, boolean blacklist, String defaultOrigin, 172 boolean verbose) throws IOException { 173 return writeToIndex(dataIt, mimefilter, blacklist, defaultOrigin, verbose, false); 174 } 175 176 /** 177 * Writes the contents of a {@link CrawlDataIterator} to this index. 178 * <p> 179 * This method may be invoked multiple times with different CrawlDataIterators until {@link #close} has been called. 180 * 181 * @param dataIt The CrawlDataIterator that provides the data to index. 182 * @param mimefilter A regular expression that is used as a filter on the mimetypes to include in the index. 183 * @param blacklist If true then the <code>mimefilter</code> is used as a blacklist for mimetypes. If false then the 184 * <code>mimefilter</code> is treated as a whitelist. 185 * @param defaultOrigin If an item is missing an origin, this default value will be assigned to it. Can be null if 186 * no default origin value should be assigned. 187 * @param verbose If true then progress information will be sent to System.out. 188 * @param skipDuplicates Do not add URLs that are marked as duplicates to the index 189 * @return The number of items added to the index. 190 * @throws IOException If an error occurs writing the index. 
191 */ 192 public long writeToIndex(CrawlDataIterator dataIt, String mimefilter, boolean blacklist, String defaultOrigin, 193 boolean verbose, boolean skipDuplicates) throws IOException { 194 int count = 0; 195 int skipped = 0; 196 while (dataIt.hasNext()) { 197 CrawlDataItem item = dataIt.next(); 198 if (!(skipDuplicates && item.duplicate) && item.mimetype.matches(mimefilter) != blacklist) { 199 // Ok, we wish to index this URL/Digest 200 count++; 201 if (verbose && count % 10000 == 0) { 202 System.out.println("Indexed " + count + " - Last URL " + "from " + item.getTimestamp()); 203 } 204 205 Document doc = createDocument(item, defaultOrigin); 206 index.addDocument(doc); 207 // needed with new IndexWriter (see line 144) 208 // index.commit(); 209 } else { 210 skipped++; 211 } 212 } 213 index.commit(); 214 if (verbose) { 215 System.out.println("Indexed " + count + " items (skipped " + skipped + ")"); 216 } 217 return count; 218 } 219 220 /** 221 * Create Lucene Document for given CrawlDataItem. 222 * @param item A CrawlDataItem 223 * @param defaultOrigin 224 * @return Lucene Document for the given CrawlDataItem 225 */ 226 private Document createDocument(CrawlDataItem item, String defaultOrigin) { 227 Document doc = new Document(); 228 229 FieldType storedNotIndexed = new FieldType(StringField.TYPE_STORED); 230 storedNotIndexed.setIndexed(false); 231 232 FieldType storedNotAnalyzed = new FieldType(StringField.TYPE_STORED); 233 storedNotAnalyzed.setOmitNorms(false); 234 235 // Add URL to index. 
236 if (indexURL) { 237 doc.add(new Field(FIELD_URL, item.getURL(), storedNotAnalyzed)); 238 if (equivalent) { 239 doc.add(new Field(FIELD_URL_NORMALIZED, stripURL(item.getURL()), storedNotAnalyzed)); 240 } 241 } else { 242 doc.add(new Field(FIELD_URL, item.getURL(), storedNotIndexed)); 243 if (equivalent) { 244 doc.add(new Field(FIELD_URL_NORMALIZED, stripURL(item.getURL()), storedNotIndexed)); 245 } 246 } 247 248 // Add digest to index 249 if (indexDigest) { 250 doc.add(new Field(FIELD_DIGEST, item.getContentDigest(), storedNotAnalyzed)); 251 } else { 252 doc.add(new Field(FIELD_DIGEST, item.getContentDigest(), storedNotIndexed)); 253 } 254 // Add timestamp to index 255 if (timestamp) { 256 doc.add(new Field(FIELD_TIMESTAMP, item.getTimestamp(), storedNotIndexed)); 257 } 258 // Add etag to index 259 if (etag && item.getEtag() != null) { 260 doc.add(new Field(FIELD_ETAG, item.getEtag(), storedNotIndexed)); 261 } 262 if (defaultOrigin != null) { 263 String tmp = item.getOrigin(); 264 if (tmp == null) { 265 tmp = defaultOrigin; 266 } 267 doc.add(new Field(FIELD_ORIGIN, tmp, storedNotIndexed)); 268 } 269 return doc; 270 } 271 272 /** 273 * Close the index. 274 * 275 * @throws IOException If an error occurs while closing the index. 276 */ 277 public void close() throws IOException { 278 index.close(true); 279 luceneDirectory.close(); 280 } 281 282 /** 283 * An aggressive URL normalizer. This methods removes any www[0-9]. segments from an URL, along with any trailing 284 * slashes and all parameters. 285 * <p> 286 * Example: <code>http://www.bok.hi.is/?lang=ice</code> would become <code>http://bok.hi.is</code> 287 * 288 * @param url The url to strip 289 * @return A normalized URL. 
290 */ 291 public static String stripURL(String url) { 292 url = url.replaceAll("www[0-9]*\\.", ""); 293 url = url.replaceAll("\\?.*$", ""); 294 url = url.replaceAll("/$", ""); 295 return url; 296 } 297 298 @SuppressWarnings({"unchecked", "rawtypes"}) 299 public static void main(String[] args) throws Exception { 300 CommandLineParser clp = new CommandLineParser(args, new PrintWriter(System.out)); 301 long start = System.currentTimeMillis(); 302 303 // Set default values for all settings. 304 boolean etag = false; 305 boolean equivalent = false; 306 boolean timestamp = false; 307 String indexMode = MODE_BOTH; 308 boolean addToIndex = false; 309 String mimefilter = "^text/.*"; 310 boolean blacklist = true; 311 String iteratorClassName = CrawlLogIterator.class.getName(); 312 String origin = null; 313 boolean skipDuplicates = false; 314 315 // Process the options 316 Option[] opts = clp.getCommandLineOptions(); 317 for (int i = 0; i < opts.length; i++) { 318 Option opt = opts[i]; 319 switch (opt.getId()) { 320 case 'w': 321 blacklist = false; 322 break; 323 case 'a': 324 addToIndex = true; 325 break; 326 case 'e': 327 etag = true; 328 break; 329 case 'h': 330 clp.usage(0); 331 break; 332 case 'i': 333 iteratorClassName = opt.getValue(); 334 break; 335 case 'm': 336 mimefilter = opt.getValue(); 337 break; 338 case 'o': 339 indexMode = opt.getValue(); 340 break; 341 case 's': 342 equivalent = true; 343 break; 344 case 't': 345 timestamp = true; 346 break; 347 case 'r': 348 origin = opt.getValue(); 349 break; 350 case 'd': 351 skipDuplicates = true; 352 break; 353 default: 354 System.err.println("Unhandled option id: " + opt.getId()); 355 } 356 } 357 358 List cargs = clp.getCommandLineArguments(); 359 360 if (cargs.size() != 2) { 361 // Should be exactly two arguments. Source and target! 362 clp.usage(0); 363 } 364 365 // Get the CrawlDataIterator 366 // Get the iterator classname or load default. 
367 Class cl = Class.forName(iteratorClassName); 368 Constructor co = cl.getConstructor(new Class[] {String.class}); 369 CrawlDataIterator iterator = (CrawlDataIterator) co.newInstance(new Object[] {(String) cargs.get(0)}); 370 371 // Print initial stuff 372 System.out.println("Indexing: " + cargs.get(0)); 373 System.out.println(" - Mode: " + indexMode); 374 System.out.println(" - Mime filter: " + mimefilter + " (" + (blacklist ? "blacklist" : "whitelist") + ")"); 375 System.out.println(" - Includes" + (equivalent ? " <equivalent URL>" : "") + (timestamp ? " <timestamp>" : "") 376 + (etag ? " <etag>" : "")); 377 System.out.println(" - Skip duplicates: " + (skipDuplicates ? "yes" : "no")); 378 System.out.println(" - Iterator: " + iteratorClassName); 379 System.out.println(" - " + iterator.getSourceType()); 380 System.out.println("Target: " + cargs.get(1)); 381 if (addToIndex) { 382 System.out.println(" - Add to existing index (if any)"); 383 } else { 384 System.out.println(" - New index (erases any existing index at " + "that location)"); 385 } 386 387 DigestIndexer di = new DigestIndexer((String) cargs.get(1), indexMode, equivalent, timestamp, etag, addToIndex); 388 389 // Create the index 390 di.writeToIndex(iterator, mimefilter, blacklist, origin, true, skipDuplicates); 391 392 // Clean-up 393 di.close(); 394 395 System.out.println("Total run time: " 396 + ArchiveUtils.formatMillisecondsToConventional(System.currentTimeMillis() - start)); 397 } 398}