/* DigestIndexer
 *
 * Created on 10.04.2006
 *
 * Copyright (C) 2006 National and University Library of Iceland
 *
 * This file is part of the DeDuplicator (Heritrix add-on module).
 *
 * DeDuplicator is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * DeDuplicator is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with DeDuplicator; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */
package is.hi.bok.deduplicator;

import java.io.File;
import java.io.IOException;
import java.io.PrintWriter;
import java.lang.reflect.Constructor;
import java.util.List;

import org.apache.commons.cli.Option;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.archive.util.ArchiveUtils;

import dk.netarkivet.common.Constants;

/**
 * A class for building a de-duplication index.
 * <p>
 * Indexing can be done via the command line (run with the --help parameter to print usage information) or by
 * embedding this class in other applications.
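 * <p>
 * A minimal embedding sketch (the file names here are hypothetical; the parameters mirror the constructor and
 * {@link #writeToIndex(CrawlDataIterator, String, boolean, String, boolean)} below):
 *
 * <pre>
 * DigestIndexer indexer = new DigestIndexer("/tmp/dedupindex", DigestIndexer.MODE_BOTH, true, true, false, false);
 * indexer.writeToIndex(new CrawlLogIterator("/tmp/crawl.log"), "^text/.*", false, null, true);
 * indexer.close();
 * </pre>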
 * <p>
 * This class also defines string constants for the Lucene field names.
 *
 * @author Kristinn Sigur&eth;sson
 * @author Søren Vejrup Carlsen
 */
public class DigestIndexer {

    // Lucene index field names
    /** The URL. */
    public static final String FIELD_URL = "url";
    /** The content digest as a String. */
    public static final String FIELD_DIGEST = "digest";
    /**
     * The URL's timestamp (time of fetch). The exact nature of this time may vary slightly depending on the source
     * (i.e. crawl.log and ARCs contain slightly different times, but both indicate roughly when the document was
     * obtained). The time is encoded as a String with the Java date format yyyyMMddHHmmssSSS.
     */
    public static final String FIELD_TIMESTAMP = "date";
    /** The document's etag. */
    public static final String FIELD_ETAG = "etag";
    /** A stripped (normalized) version of the URL. */
    public static final String FIELD_URL_NORMALIZED = "url-normalized";
    /**
     * A field containing meta-data on where the original version of a document is stored.
     */
    public static final String FIELD_ORIGIN = "origin";

    // Indexing modes (by URL, by hash or both)
    /**
     * Index the URL, enabling lookups by URL. If normalized URLs are included in the index, they will also be indexed
     * and searchable.
     */
    public static final String MODE_URL = "URL";
    /** Index the hash (content digest), enabling lookups by hash. */
    public static final String MODE_HASH = "HASH";
    /** Both URL and hash are indexed. */
    public static final String MODE_BOTH = "BOTH";

    /** Lucene storage used by the IndexWriter. */
    private Directory luceneDirectory;

    /** The index being manipulated. */
    private IndexWriter index;

    /**
     * @return the IndexWriter
     */
    public IndexWriter getIndex() {
        return index;
    }

    // The options with default settings
    /** Should etags be included in the index? */
    private boolean etag = false;
    /**
     * Should a normalized version of the URL be added to the index?
     */
    private boolean equivalent = false;
    /** Should a timestamp be included in the index? */
    private boolean timestamp = false;
    /** Should we index the URL? */
    private boolean indexURL = true;
    /** Should we index the digest? */
    private boolean indexDigest = true;

    /**
     * Each instance of this class wraps one Lucene index for writing deduplication information to it.
     *
     * @param indexLocation The location of the index (path).
     * @param indexingMode Index {@link #MODE_URL}, {@link #MODE_HASH} or {@link #MODE_BOTH}.
     * @param includeNormalizedURL Should a normalized version of the URL be added to the index? See
     * {@link #stripURL(String)}.
     * @param includeTimestamp Should a timestamp be included in the index?
     * @param includeEtag Should an etag be included in the index?
     * @param addToExistingIndex Are we opening an existing index? Setting this to false will cause any index at
     * <code>indexLocation</code> to be overwritten.
     * @throws IOException If an error occurs opening the index.
     */
    public DigestIndexer(String indexLocation, String indexingMode, boolean includeNormalizedURL,
            boolean includeTimestamp, boolean includeEtag, boolean addToExistingIndex) throws IOException {

        this.etag = includeEtag;
        this.equivalent = includeNormalizedURL;
        this.timestamp = includeTimestamp;

        if (indexingMode.equals(MODE_URL)) {
            indexDigest = false;
        } else if (indexingMode.equals(MODE_HASH)) {
            indexURL = false;
        }

        // Set up the index writer
        IndexWriterConfig config = new IndexWriterConfig(Constants.LUCENE_VERSION, new WhitespaceAnalyzer(
                Constants.LUCENE_VERSION));
        // TODO Possibly change the default MergePolicy, see NAS-2119
        if (addToExistingIndex) {
            config.setOpenMode(OpenMode.CREATE_OR_APPEND);
        } else {
            config.setOpenMode(OpenMode.CREATE);
        }
        luceneDirectory = FSDirectory.open(new File(indexLocation));
        index = new IndexWriter(luceneDirectory, config);
    }

    /**
     * Writes the contents of a {@link CrawlDataIterator} to this index.
     * <p>
     * This method may be invoked multiple times with different CrawlDataIterators until {@link #close} has been called.
     *
     * @param dataIt The CrawlDataIterator that provides the data to index.
     * @param mimefilter A regular expression that is used as a filter on the mimetypes to include in the index.
     * @param blacklist If true then the <code>mimefilter</code> is used as a blacklist for mimetypes. If false then the
     * <code>mimefilter</code> is treated as a whitelist.
     * @param defaultOrigin If an item is missing an origin, this default value will be assigned to it. Can be null if
     * no default origin value should be assigned.
     * @param verbose If true then progress information will be sent to System.out.
     * @return The number of items added to the index.
     * @throws IOException If an error occurs writing the index.
     */
    public long writeToIndex(CrawlDataIterator dataIt, String mimefilter, boolean blacklist, String defaultOrigin,
            boolean verbose) throws IOException {
        return writeToIndex(dataIt, mimefilter, blacklist, defaultOrigin, verbose, false);
    }

    /**
     * Writes the contents of a {@link CrawlDataIterator} to this index.
     * <p>
     * This method may be invoked multiple times with different CrawlDataIterators until {@link #close} has been called.
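     * <p>
     * For example, two crawl logs can be merged into one index (a sketch; the file names are hypothetical, and any
     * other {@link CrawlDataIterator} implementation could be used instead):
     *
     * <pre>
     * indexer.writeToIndex(new CrawlLogIterator("crawl-1.log"), "^text/.*", false, null, false, true);
     * indexer.writeToIndex(new CrawlLogIterator("crawl-2.log"), "^text/.*", false, null, false, true);
     * indexer.close();
     * </pre>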
     *
     * @param dataIt The CrawlDataIterator that provides the data to index.
     * @param mimefilter A regular expression that is used as a filter on the mimetypes to include in the index.
     * @param blacklist If true then the <code>mimefilter</code> is used as a blacklist for mimetypes. If false then the
     * <code>mimefilter</code> is treated as a whitelist.
     * @param defaultOrigin If an item is missing an origin, this default value will be assigned to it. Can be null if
     * no default origin value should be assigned.
     * @param verbose If true then progress information will be sent to System.out.
     * @param skipDuplicates Do not add URLs that are marked as duplicates to the index.
     * @return The number of items added to the index.
     * @throws IOException If an error occurs writing the index.
     */
    public long writeToIndex(CrawlDataIterator dataIt, String mimefilter, boolean blacklist, String defaultOrigin,
            boolean verbose, boolean skipDuplicates) throws IOException {
        int count = 0;
        int skipped = 0;
        while (dataIt.hasNext()) {
            CrawlDataItem item = dataIt.next();
            if (!(skipDuplicates && item.duplicate) && item.mimetype.matches(mimefilter) != blacklist) {
                // Ok, we wish to index this URL/Digest
                count++;
                if (verbose && count % 10000 == 0) {
                    System.out.println("Indexed " + count + " - Last URL from " + item.getTimestamp());
                }

                Document doc = createDocument(item, defaultOrigin);
                index.addDocument(doc);
                // No per-document commit is needed with the IndexWriter configured in the constructor;
                // a single commit is performed after the loop.
            } else {
                skipped++;
            }
        }
        index.commit();
        if (verbose) {
            System.out.println("Indexed " + count + " items (skipped " + skipped + ")");
        }
        return count;
    }

    /**
     * Create a Lucene document for a given CrawlDataItem.
     *
     * @param item A CrawlDataItem.
     * @param defaultOrigin The origin to assign if the item has none (may be null).
     * @return A Lucene document for the given CrawlDataItem.
     */
    private Document createDocument(CrawlDataItem item, String defaultOrigin) {
        Document doc = new Document();

        // Field type for values that are stored in the index but not searchable
        FieldType storedNotIndexed = new FieldType(StringField.TYPE_STORED);
        storedNotIndexed.setIndexed(false);

        // Field type for values that are stored and indexed verbatim as a single term
        FieldType storedNotAnalyzed = new FieldType(StringField.TYPE_STORED);
        storedNotAnalyzed.setOmitNorms(false);

        // Add URL to index.
        if (indexURL) {
            doc.add(new Field(FIELD_URL, item.getURL(), storedNotAnalyzed));
            if (equivalent) {
                doc.add(new Field(FIELD_URL_NORMALIZED, stripURL(item.getURL()), storedNotAnalyzed));
            }
        } else {
            doc.add(new Field(FIELD_URL, item.getURL(), storedNotIndexed));
            if (equivalent) {
                doc.add(new Field(FIELD_URL_NORMALIZED, stripURL(item.getURL()), storedNotIndexed));
            }
        }

        // Add digest to index
        if (indexDigest) {
            doc.add(new Field(FIELD_DIGEST, item.getContentDigest(), storedNotAnalyzed));
        } else {
            doc.add(new Field(FIELD_DIGEST, item.getContentDigest(), storedNotIndexed));
        }
        // Add timestamp to index
        if (timestamp) {
            doc.add(new Field(FIELD_TIMESTAMP, item.getTimestamp(), storedNotIndexed));
        }
        // Add etag to index
        if (etag && item.getEtag() != null) {
            doc.add(new Field(FIELD_ETAG, item.getEtag(), storedNotIndexed));
        }
        // Add origin to index, falling back to the default origin if the item has none
        if (defaultOrigin != null) {
            String tmp = item.getOrigin();
            if (tmp == null) {
                tmp = defaultOrigin;
            }
            doc.add(new Field(FIELD_ORIGIN, tmp, storedNotIndexed));
        }
        return doc;
    }

    /**
     * Close the index.
     *
     * @throws IOException If an error occurs while closing the index.
     */
    public void close() throws IOException {
        index.close(true);
        luceneDirectory.close();
    }

    /**
     * An aggressive URL normalizer. This method removes any www[0-9]*. segments from a URL, along with a trailing
     * slash and all parameters (the query string).
     * <p>
     * Example: <code>http://www.bok.hi.is/?lang=ice</code> would become <code>http://bok.hi.is</code>.
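     * Similarly, <code>http://www2.example.com/index/?q=1</code> would become <code>http://example.com/index</code>
     * (a worked example tracing the three replacements below).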
     *
     * @param url The URL to strip.
     * @return A normalized URL.
     */
    public static String stripURL(String url) {
        url = url.replaceAll("www[0-9]*\\.", ""); // drop www, www2, ... segments
        url = url.replaceAll("\\?.*$", ""); // drop the query string
        url = url.replaceAll("/$", ""); // drop a trailing slash
        return url;
    }
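    /**
     * Command-line entry point. Expects exactly two arguments, the crawl-data source and the target index location,
     * in that order, plus the options handled in the switch below. A sketch of an invocation (the jar and file names
     * are hypothetical; the exact option syntax is defined by {@link CommandLineParser}):
     *
     * <pre>
     * java -cp deduplicator.jar is.hi.bok.deduplicator.DigestIndexer -o URL -t crawl.log /tmp/dedupindex
     * </pre>
     */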
    @SuppressWarnings({"unchecked", "rawtypes"})
    public static void main(String[] args) throws Exception {
        CommandLineParser clp = new CommandLineParser(args, new PrintWriter(System.out));
        long start = System.currentTimeMillis();

        // Set default values for all settings.
        boolean etag = false;
        boolean equivalent = false;
        boolean timestamp = false;
        String indexMode = MODE_BOTH;
        boolean addToIndex = false;
        String mimefilter = "^text/.*";
        boolean blacklist = true;
        String iteratorClassName = CrawlLogIterator.class.getName();
        String origin = null;
        boolean skipDuplicates = false;

        // Process the options
        Option[] opts = clp.getCommandLineOptions();
        for (int i = 0; i < opts.length; i++) {
            Option opt = opts[i];
            switch (opt.getId()) {
            case 'w':
                blacklist = false;
                break;
            case 'a':
                addToIndex = true;
                break;
            case 'e':
                etag = true;
                break;
            case 'h':
                clp.usage(0);
                break;
            case 'i':
                iteratorClassName = opt.getValue();
                break;
            case 'm':
                mimefilter = opt.getValue();
                break;
            case 'o':
                indexMode = opt.getValue();
                break;
            case 's':
                equivalent = true;
                break;
            case 't':
                timestamp = true;
                break;
            case 'r':
                origin = opt.getValue();
                break;
            case 'd':
                skipDuplicates = true;
                break;
            default:
                System.err.println("Unhandled option id: " + opt.getId());
            }
        }

        List cargs = clp.getCommandLineArguments();

        if (cargs.size() != 2) {
            // Should be exactly two arguments: source and target.
            clp.usage(0);
        }

        // Create the CrawlDataIterator from the given class name (or the default)
        Class cl = Class.forName(iteratorClassName);
        Constructor co = cl.getConstructor(new Class[] {String.class});
        CrawlDataIterator iterator = (CrawlDataIterator) co.newInstance(new Object[] {(String) cargs.get(0)});

        // Print the run configuration
        System.out.println("Indexing: " + cargs.get(0));
        System.out.println(" - Mode: " + indexMode);
        System.out.println(" - Mime filter: " + mimefilter + " (" + (blacklist ? "blacklist" : "whitelist") + ")");
        System.out.println(" - Includes" + (equivalent ? " <equivalent URL>" : "") + (timestamp ? " <timestamp>" : "")
                + (etag ? " <etag>" : ""));
        System.out.println(" - Skip duplicates: " + (skipDuplicates ? "yes" : "no"));
        System.out.println(" - Iterator: " + iteratorClassName);
        System.out.println("   - " + iterator.getSourceType());
        System.out.println("Target: " + cargs.get(1));
        if (addToIndex) {
            System.out.println(" - Add to existing index (if any)");
        } else {
            System.out.println(" - New index (erases any existing index at that location)");
        }

        DigestIndexer di = new DigestIndexer((String) cargs.get(1), indexMode, equivalent, timestamp, etag, addToIndex);

        // Create the index
        di.writeToIndex(iterator, mimefilter, blacklist, origin, true, skipDuplicates);

        // Clean-up
        di.close();

        System.out.println("Total run time: "
                + ArchiveUtils.formatMillisecondsToConventional(System.currentTimeMillis() - start));
    }
}