001/* DigestIndexer 002 * 003 * Created on 10.04.2006 004 * 005 * Copyright (C) 2006 National and University Library of Iceland 006 * 007 * This file is part of the DeDuplicator (Heritrix add-on module). 008 * 009 * DeDuplicator is free software; you can redistribute it and/or modify 010 * it under the terms of the GNU Lesser Public License as published by 011 * the Free Software Foundation; either version 2.1 of the License, or 012 * any later version. 013 * 014 * DeDuplicator is distributed in the hope that it will be useful, 015 * but WITHOUT ANY WARRANTY; without even the implied warranty of 016 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 017 * GNU Lesser Public License for more details. 018 * 019 * You should have received a copy of the GNU Lesser Public License 020 * along with DeDuplicator; if not, write to the Free Software 021 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 022 */ 023package is.hi.bok.deduplicator; 024 025import java.io.File; 026import java.io.IOException; 027import java.io.PrintWriter; 028import java.lang.reflect.Constructor; 029import java.util.List; 030 031import org.apache.commons.cli.Option; 032import org.apache.lucene.analysis.core.WhitespaceAnalyzer; 033import org.apache.lucene.document.Document; 034import org.apache.lucene.document.Field; 035import org.apache.lucene.document.FieldType; 036import org.apache.lucene.document.StringField; 037import org.apache.lucene.index.IndexWriter; 038import org.apache.lucene.index.IndexWriterConfig; 039import org.apache.lucene.index.IndexWriterConfig.OpenMode; 040import org.apache.lucene.store.Directory; 041import org.apache.lucene.store.FSDirectory; 042import org.archive.util.ArchiveUtils; 043 044import dk.netarkivet.common.Constants; 045 046/** 047 * A class for building a de-duplication index. 048 * <p> 049 * The indexing can be done via the command line options (Run with --help parameter to print usage information) or 050 * natively embedded in other applications. 051 * <p> 052 * This class also defines string constants for the lucene field names. 053 * 054 * @author Kristinn Sigurðsson 055 * @author Søren Vejrup Carlsen 056 */ 057public class DigestIndexer { 058 059 // Lucene index field names 060 /** The URL. * */ 061 public static final String FIELD_URL = "url"; 062 /** The content digest as String. * */ 063 public static final String FIELD_DIGEST = "digest"; 064 /** 065 * The URLs timestamp (time of fetch). The exact nature of this time may vary slightly depending on the source (i.e. 066 * crawl.log and ARCs contain slightly different times but both indicate roughly when the document was obtained. The 067 * time is encoded as a String with the Java date format yyyyMMddHHmmssSSS 068 */ 069 public static final String FIELD_TIMESTAMP = "date"; 070 /** The document's etag. * */ 071 public static final String FIELD_ETAG = "etag"; 072 /** A stripped (normalized) version of the URL. * */ 073 public static final String FIELD_URL_NORMALIZED = "url-normalized"; 074 /** 075 * A field containing meta-data on where the original version of a document is stored. 076 */ 077 public static final String FIELD_ORIGIN = "origin"; 078 079 // Indexing modes (by url, by hash or both) 080 /** 081 * Index URL enabling lookups by URL. If normalized URLs are included in the index they will also be indexed and 082 * searchable. * 083 */ 084 public static final String MODE_URL = "URL"; 085 /** Index HASH enabling lookups by hash (content digest). * */ 086 public static final String MODE_HASH = "HASH"; 087 /** Both URL and hash are indexed. * */ 088 public static final String MODE_BOTH = "BOTH"; 089 090 /** Lucene Storage used by the indexwriter. */ 091 private Directory luceneDirectory; 092 093 /** The index being manipulated. * */ 094 private IndexWriter index; 095 096 /** 097 * @return the IndexWriter 098 */ 099 public IndexWriter getIndex() { 100 return index; 101 } 102 103 // The options with default settings 104 /** Should etags be included in the index. */ 105 private boolean etag = false; 106 /** 107 * Should a normalized version of the URL be added to the index. 108 */ 109 private boolean equivalent = false; 110 /** Should a timestamp be included in the index. */ 111 private boolean timestamp = false; 112 /** Should we index the url. */ 113 private boolean indexURL = true; 114 /** Should we index the digest. */ 115 private boolean indexDigest = true; 116 117 /** 118 * Each instance of this class wraps one Lucene index for writing deduplication information to it. 119 * 120 * @param indexLocation The location of the index (path). 121 * @param indexingMode Index {@link #MODE_URL}, {@link #MODE_HASH} or {@link #MODE_BOTH}. 122 * @param includeNormalizedURL Should a normalized version of the URL be added to the index. See 123 * {@link #stripURL(String)}. 124 * @param includeTimestamp Should a timestamp be included in the index. 125 * @param includeEtag Should an Etag be included in the index. 126 * @param addToExistingIndex Are we opening up an existing index. Setting this to false will cause any index at 127 * <code>indexLocation</code> to be overwritten. 128 * @throws IOException If an error occurs opening the index. 129 */ 130 public DigestIndexer(String indexLocation, String indexingMode, boolean includeNormalizedURL, 131 boolean includeTimestamp, boolean includeEtag, boolean addToExistingIndex) throws IOException { 132 133 this.etag = includeEtag; 134 this.equivalent = includeNormalizedURL; 135 this.timestamp = includeTimestamp; 136 137 if (indexingMode.equals(MODE_URL)) { 138 indexDigest = false; 139 } else if (indexingMode.equals(MODE_HASH)) { 140 indexURL = false; 141 } 142 143 // Set up the index writer 144 IndexWriterConfig config = new IndexWriterConfig(Constants.LUCENE_VERSION, new WhitespaceAnalyzer( 145 Constants.LUCENE_VERSION)); 146 // TODO Possibly change the default MergePolicy, see NAS-2119 147 if (!addToExistingIndex) { 148 config.setOpenMode(OpenMode.CREATE); 149 } else { 150 config.setOpenMode(OpenMode.CREATE_OR_APPEND); 151 } 152 luceneDirectory = FSDirectory.open(new File(indexLocation)); 153 index = new IndexWriter(luceneDirectory, config); 154 } 155 156 /** 157 * Writes the contents of a {@link CrawlDataIterator} to this index. 158 * <p> 159 * This method may be invoked multiple times with different CrawlDataIterators until {@link #close} has been called. 160 * 161 * @param dataIt The CrawlDataIterator that provides the data to index. 162 * @param mimefilter A regular expression that is used as a filter on the mimetypes to include in the index. 163 * @param blacklist If true then the <code>mimefilter</code> is used as a blacklist for mimetypes. If false then the 164 * <code>mimefilter</code> is treated as a whitelist. 165 * @param defaultOrigin If an item is missing an origin, this default value will be assigned to it. Can be null if 166 * no default origin value should be assigned. 167 * @param verbose If true then progress information will be sent to System.out. 168 * @return The number of items added to the index. 169 * @throws IOException If an error occurs writing the index. 170 */ 171 public long writeToIndex(CrawlDataIterator dataIt, String mimefilter, boolean blacklist, String defaultOrigin, 172 boolean verbose) throws IOException { 173 return writeToIndex(dataIt, mimefilter, blacklist, defaultOrigin, verbose, false); 174 } 175 176 /** 177 * Writes the contents of a {@link CrawlDataIterator} to this index. 178 * <p> 179 * This method may be invoked multiple times with different CrawlDataIterators until {@link #close} has been called. 180 * 181 * @param dataIt The CrawlDataIterator that provides the data to index. 182 * @param mimefilter A regular expression that is used as a filter on the mimetypes to include in the index. 183 * @param blacklist If true then the <code>mimefilter</code> is used as a blacklist for mimetypes. If false then the 184 * <code>mimefilter</code> is treated as a whitelist. 185 * @param defaultOrigin If an item is missing an origin, this default value will be assigned to it. Can be null if 186 * no default origin value should be assigned. 187 * @param verbose If true then progress information will be sent to System.out. 188 * @param skipDuplicates Do not add URLs that are marked as duplicates to the index 189 * @return The number of items added to the index. 190 * @throws IOException If an error occurs writing the index. 191 */ 192 public long writeToIndex(CrawlDataIterator dataIt, String mimefilter, boolean blacklist, String defaultOrigin, 193 boolean verbose, boolean skipDuplicates) throws IOException { 194 int count = 0; 195 int skipped = 0; 196 while (dataIt.hasNext()) { 197 CrawlDataItem item = dataIt.next(); 198 if (!(skipDuplicates && item.duplicate) && item.mimetype.matches(mimefilter) != blacklist) { 199 // Ok, we wish to index this URL/Digest 200 count++; 201 if (verbose && count % 10000 == 0) { 202 System.out.println("Indexed " + count + " - Last URL " + "from " + item.getTimestamp()); 203 } 204 205 Document doc = createDocument(item, defaultOrigin); 206 index.addDocument(doc); 207 // needed with new IndexWriter (see line 144) 208 // index.commit(); 209 } else { 210 skipped++; 211 } 212 } 213 index.commit(); 214 if (verbose) { 215 System.out.println("Indexed " + count + " items (skipped " + skipped + ")"); 216 } 217 return count; 218 } 219 220 /** 221 * @param item 222 * @param defaultOrigin 223 * @return 224 */ 225 private Document createDocument(CrawlDataItem item, String defaultOrigin) { 226 Document doc = new Document(); 227 228 FieldType storedNotIndexed = new FieldType(StringField.TYPE_STORED); 229 storedNotIndexed.setIndexed(false); 230 231 FieldType storedNotAnalyzed = new FieldType(StringField.TYPE_STORED); 232 storedNotAnalyzed.setOmitNorms(false); 233 234 // Add URL to index. 235 if (indexURL) { 236 doc.add(new Field(FIELD_URL, item.getURL(), storedNotAnalyzed)); 237 if (equivalent) { 238 doc.add(new Field(FIELD_URL_NORMALIZED, stripURL(item.getURL()), storedNotAnalyzed)); 239 } 240 } else { 241 doc.add(new Field(FIELD_URL, item.getURL(), storedNotIndexed)); 242 if (equivalent) { 243 doc.add(new Field(FIELD_URL_NORMALIZED, stripURL(item.getURL()), storedNotIndexed)); 244 } 245 } 246 247 // Add digest to index 248 if (indexDigest) { 249 doc.add(new Field(FIELD_DIGEST, item.getContentDigest(), storedNotAnalyzed)); 250 } else { 251 doc.add(new Field(FIELD_DIGEST, item.getContentDigest(), storedNotIndexed)); 252 } 253 // Add timestamp to index 254 if (timestamp) { 255 doc.add(new Field(FIELD_TIMESTAMP, item.getTimestamp(), storedNotIndexed)); 256 } 257 // Add etag to index 258 if (etag && item.getEtag() != null) { 259 doc.add(new Field(FIELD_ETAG, item.getEtag(), storedNotIndexed)); 260 } 261 if (defaultOrigin != null) { 262 String tmp = item.getOrigin(); 263 if (tmp == null) { 264 tmp = defaultOrigin; 265 } 266 doc.add(new Field(FIELD_ORIGIN, tmp, storedNotIndexed)); 267 } 268 return doc; 269 } 270 271 /** 272 * Close the index. 273 * 274 * @throws IOException If an error occurs while closing the index. 275 */ 276 public void close() throws IOException { 277 index.close(true); 278 luceneDirectory.close(); 279 } 280 281 /** 282 * An aggressive URL normalizer. This methods removes any www[0-9]. segments from an URL, along with any trailing 283 * slashes and all parameters. 284 * <p> 285 * Example: <code>http://www.bok.hi.is/?lang=ice</code> would become <code>http://bok.hi.is</code> 286 * 287 * @param url The url to strip 288 * @return A normalized URL. 289 */ 290 public static String stripURL(String url) { 291 url = url.replaceAll("www[0-9]*\\.", ""); 292 url = url.replaceAll("\\?.*$", ""); 293 url = url.replaceAll("/$", ""); 294 return url; 295 } 296 297 @SuppressWarnings({"unchecked", "rawtypes"}) 298 public static void main(String[] args) throws Exception { 299 CommandLineParser clp = new CommandLineParser(args, new PrintWriter(System.out)); 300 long start = System.currentTimeMillis(); 301 302 // Set default values for all settings. 303 boolean etag = false; 304 boolean equivalent = false; 305 boolean timestamp = false; 306 String indexMode = MODE_BOTH; 307 boolean addToIndex = false; 308 String mimefilter = "^text/.*"; 309 boolean blacklist = true; 310 String iteratorClassName = CrawlLogIterator.class.getName(); 311 String origin = null; 312 boolean skipDuplicates = false; 313 314 // Process the options 315 Option[] opts = clp.getCommandLineOptions(); 316 for (int i = 0; i < opts.length; i++) { 317 Option opt = opts[i]; 318 switch (opt.getId()) { 319 case 'w': 320 blacklist = false; 321 break; 322 case 'a': 323 addToIndex = true; 324 break; 325 case 'e': 326 etag = true; 327 break; 328 case 'h': 329 clp.usage(0); 330 break; 331 case 'i': 332 iteratorClassName = opt.getValue(); 333 break; 334 case 'm': 335 mimefilter = opt.getValue(); 336 break; 337 case 'o': 338 indexMode = opt.getValue(); 339 break; 340 case 's': 341 equivalent = true; 342 break; 343 case 't': 344 timestamp = true; 345 break; 346 case 'r': 347 origin = opt.getValue(); 348 break; 349 case 'd': 350 skipDuplicates = true; 351 break; 352 default: 353 System.err.println("Unhandled option id: " + opt.getId()); 354 } 355 } 356 357 List cargs = clp.getCommandLineArguments(); 358 359 if (cargs.size() != 2) { 360 // Should be exactly two arguments. Source and target! 361 clp.usage(0); 362 } 363 364 // Get the CrawlDataIterator 365 // Get the iterator classname or load default. 366 Class cl = Class.forName(iteratorClassName); 367 Constructor co = cl.getConstructor(new Class[] {String.class}); 368 CrawlDataIterator iterator = (CrawlDataIterator) co.newInstance(new Object[] {(String) cargs.get(0)}); 369 370 // Print initial stuff 371 System.out.println("Indexing: " + cargs.get(0)); 372 System.out.println(" - Mode: " + indexMode); 373 System.out.println(" - Mime filter: " + mimefilter + " (" + (blacklist ? "blacklist" : "whitelist") + ")"); 374 System.out.println(" - Includes" + (equivalent ? " <equivalent URL>" : "") + (timestamp ? " <timestamp>" : "") 375 + (etag ? " <etag>" : "")); 376 System.out.println(" - Skip duplicates: " + (skipDuplicates ? "yes" : "no")); 377 System.out.println(" - Iterator: " + iteratorClassName); 378 System.out.println(" - " + iterator.getSourceType()); 379 System.out.println("Target: " + cargs.get(1)); 380 if (addToIndex) { 381 System.out.println(" - Add to existing index (if any)"); 382 } else { 383 System.out.println(" - New index (erases any existing index at " + "that location)"); 384 } 385 386 DigestIndexer di = new DigestIndexer((String) cargs.get(1), indexMode, equivalent, timestamp, etag, addToIndex); 387 388 // Create the index 389 di.writeToIndex(iterator, mimefilter, blacklist, origin, true, skipDuplicates); 390 391 // Clean-up 392 di.close(); 393 394 System.out.println("Total run time: " 395 + ArchiveUtils.formatMillisecondsToConventional(System.currentTimeMillis() - start)); 396 } 397}