Source code

001/* DigestIndexer
002 * 
003 * Created on 10.04.2006
004 *
005 * Copyright (C) 2006 National and University Library of Iceland
006 * 
007 * This file is part of the DeDuplicator (Heritrix add-on module).
008 * 
009 * DeDuplicator is free software; you can redistribute it and/or modify
010 * it under the terms of the GNU Lesser Public License as published by
011 * the Free Software Foundation; either version 2.1 of the License, or
012 * any later version.
013 * 
014 * DeDuplicator is distributed in the hope that it will be useful, 
015 * but WITHOUT ANY WARRANTY; without even the implied warranty of
016 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
017 * GNU Lesser Public License for more details.
018 * 
019 * You should have received a copy of the GNU Lesser Public License
020 * along with DeDuplicator; if not, write to the Free Software
021 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
022 */
023package is.hi.bok.deduplicator;
024
import java.io.File;
import java.io.IOException;
import java.io.PrintWriter;
import java.lang.reflect.Constructor;
import java.util.List;
import java.util.regex.Pattern;

import org.apache.commons.cli.Option;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.archive.util.ArchiveUtils;

import dk.netarkivet.common.Constants;
045
046/**
047 * A class for building a de-duplication index.
048 * <p>
049 * The indexing can be done via the command line options (Run with --help parameter to print usage information) or
050 * natively embedded in other applications.
051 * <p>
052 * This class also defines string constants for the lucene field names.
053 *
054 * @author Kristinn Sigur&eth;sson
055 * @author Søren Vejrup Carlsen
056 */
057public class DigestIndexer {
058
059    // Lucene index field names (used as field keys when documents are built)
060    /** Name of the field holding the URL of the indexed item. */
061    public static final String FIELD_URL = "url";
062    /** Name of the field holding the content digest of the item, as a String. */
063    public static final String FIELD_DIGEST = "digest";
064    /**
065     * Name of the field holding the URL's timestamp (time of fetch). The exact nature of this time may vary slightly
066     * depending on the source (i.e. crawl.log and ARCs contain slightly different times, but both indicate roughly when
067     * the document was obtained). The time is encoded as a String with the Java date format yyyyMMddHHmmssSSS.
068     */
069    public static final String FIELD_TIMESTAMP = "date";
070    /** Name of the field holding the document's etag. */
071    public static final String FIELD_ETAG = "etag";
072    /** Name of the field holding a stripped (normalized) version of the URL, see {@link #stripURL(String)}. */
073    public static final String FIELD_URL_NORMALIZED = "url-normalized";
074    /**
075     * Name of the field containing meta-data on where the original version of a document is stored.
076     */
077    public static final String FIELD_ORIGIN = "origin";
078
079    // Indexing modes (by url, by hash or both); passed to the constructor as the indexingMode argument
080    /**
081     * Index URL enabling lookups by URL. If normalized URLs are included in the index they will also be indexed and
082     * searchable. With this mode the constructor switches digest indexing off.
083     */
084    public static final String MODE_URL = "URL";
085    /** Index HASH enabling lookups by hash (content digest). With this mode the constructor switches URL indexing off. */
086    public static final String MODE_HASH = "HASH";
087    /** Both URL and hash are indexed. */
088    public static final String MODE_BOTH = "BOTH";
089
090    /** Lucene storage used by the index writer; opened in the constructor and closed in {@link #close()}. */
091    private Directory luceneDirectory;
092
093    /** The writer for the index being manipulated; created in the constructor and closed in {@link #close()}. */
094    private IndexWriter index;
095
096    /**
097     * @return the IndexWriter
098     */
099    public IndexWriter getIndex() {
100        return index;
101    }
102
103    // The options with default settings; the constructor overrides them from its arguments
104    /** Should etags be included in the index (as a stored field, when the item has one). */
105    private boolean etag = false;
106    /**
107     * Should a normalized version of the URL (see stripURL) be added to the index.
108     */
109    private boolean equivalent = false;
110    /** Should a timestamp be included in the index (as a stored field). */
111    private boolean timestamp = false;
112    /** Should we index the url (true unless the indexing mode is MODE_HASH). */
113    private boolean indexURL = true;
114    /** Should we index the digest (true unless the indexing mode is MODE_URL). */
115    private boolean indexDigest = true;
116
117    /**
118     * Each instance of this class wraps one Lucene index for writing deduplication information to it.
119     *
120     * @param indexLocation The location of the index (path).
121     * @param indexingMode Index {@link #MODE_URL}, {@link #MODE_HASH} or {@link #MODE_BOTH}.
122     * @param includeNormalizedURL Should a normalized version of the URL be added to the index. See
123     * {@link #stripURL(String)}.
124     * @param includeTimestamp Should a timestamp be included in the index.
125     * @param includeEtag Should an Etag be included in the index.
126     * @param addToExistingIndex Are we opening up an existing index. Setting this to false will cause any index at
127     * <code>indexLocation</code> to be overwritten.
128     * @throws IOException If an error occurs opening the index.
129     */
130    public DigestIndexer(String indexLocation, String indexingMode, boolean includeNormalizedURL,
131            boolean includeTimestamp, boolean includeEtag, boolean addToExistingIndex) throws IOException {
132
133        this.etag = includeEtag;
134        this.equivalent = includeNormalizedURL;
135        this.timestamp = includeTimestamp;
136
137        if (indexingMode.equals(MODE_URL)) {
138            indexDigest = false;
139        } else if (indexingMode.equals(MODE_HASH)) {
140            indexURL = false;
141        }
142
143        // Set up the index writer
144        IndexWriterConfig config = new IndexWriterConfig(Constants.LUCENE_VERSION, new WhitespaceAnalyzer(
145                Constants.LUCENE_VERSION));
146        // TODO Possibly change the default MergePolicy, see NAS-2119
147        if (!addToExistingIndex) {
148            config.setOpenMode(OpenMode.CREATE);
149        } else {
150            config.setOpenMode(OpenMode.CREATE_OR_APPEND);
151        }
152        luceneDirectory = FSDirectory.open(new File(indexLocation));
153        index = new IndexWriter(luceneDirectory, config);
154    }
155
156    /**
157     * Writes the contents of a {@link CrawlDataIterator} to this index.
158     * <p>
159     * This method may be invoked multiple times with different CrawlDataIterators until {@link #close} has been called.
160     *
161     * @param dataIt The CrawlDataIterator that provides the data to index.
162     * @param mimefilter A regular expression that is used as a filter on the mimetypes to include in the index.
163     * @param blacklist If true then the <code>mimefilter</code> is used as a blacklist for mimetypes. If false then the
164     * <code>mimefilter</code> is treated as a whitelist.
165     * @param defaultOrigin If an item is missing an origin, this default value will be assigned to it. Can be null if
166     * no default origin value should be assigned.
167     * @param verbose If true then progress information will be sent to System.out.
168     * @return The number of items added to the index.
169     * @throws IOException If an error occurs writing the index.
170     */
171    public long writeToIndex(CrawlDataIterator dataIt, String mimefilter, boolean blacklist, String defaultOrigin,
172            boolean verbose) throws IOException {
173        return writeToIndex(dataIt, mimefilter, blacklist, defaultOrigin, verbose, false);
174    }
175
176    /**
177     * Writes the contents of a {@link CrawlDataIterator} to this index.
178     * <p>
179     * This method may be invoked multiple times with different CrawlDataIterators until {@link #close} has been called.
180     *
181     * @param dataIt The CrawlDataIterator that provides the data to index.
182     * @param mimefilter A regular expression that is used as a filter on the mimetypes to include in the index.
183     * @param blacklist If true then the <code>mimefilter</code> is used as a blacklist for mimetypes. If false then the
184     * <code>mimefilter</code> is treated as a whitelist.
185     * @param defaultOrigin If an item is missing an origin, this default value will be assigned to it. Can be null if
186     * no default origin value should be assigned.
187     * @param verbose If true then progress information will be sent to System.out.
188     * @param skipDuplicates Do not add URLs that are marked as duplicates to the index
189     * @return The number of items added to the index.
190     * @throws IOException If an error occurs writing the index.
191     */
192    public long writeToIndex(CrawlDataIterator dataIt, String mimefilter, boolean blacklist, String defaultOrigin,
193            boolean verbose, boolean skipDuplicates) throws IOException {
194        int count = 0;
195        int skipped = 0;
196        while (dataIt.hasNext()) {
197            CrawlDataItem item = dataIt.next();
198            if (!(skipDuplicates && item.duplicate) && item.mimetype.matches(mimefilter) != blacklist) {
199                // Ok, we wish to index this URL/Digest
200                count++;
201                if (verbose && count % 10000 == 0) {
202                    System.out.println("Indexed " + count + " - Last URL " + "from " + item.getTimestamp());
203                }
204
205                Document doc = createDocument(item, defaultOrigin);
206                index.addDocument(doc);
207                // needed with new IndexWriter (see line 144)
208                // index.commit();
209            } else {
210                skipped++;
211            }
212        }
213        index.commit();
214        if (verbose) {
215            System.out.println("Indexed " + count + " items (skipped " + skipped + ")");
216        }
217        return count;
218    }
219
220    /**
221     * @param item
222     * @param defaultOrigin
223     * @return
224     */
225    private Document createDocument(CrawlDataItem item, String defaultOrigin) {
226        Document doc = new Document();
227
228        FieldType storedNotIndexed = new FieldType(StringField.TYPE_STORED);
229        storedNotIndexed.setIndexed(false);
230
231        FieldType storedNotAnalyzed = new FieldType(StringField.TYPE_STORED);
232        storedNotAnalyzed.setOmitNorms(false);
233
234        // Add URL to index.
235        if (indexURL) {
236            doc.add(new Field(FIELD_URL, item.getURL(), storedNotAnalyzed));
237            if (equivalent) {
238                doc.add(new Field(FIELD_URL_NORMALIZED, stripURL(item.getURL()), storedNotAnalyzed));
239            }
240        } else {
241            doc.add(new Field(FIELD_URL, item.getURL(), storedNotIndexed));
242            if (equivalent) {
243                doc.add(new Field(FIELD_URL_NORMALIZED, stripURL(item.getURL()), storedNotIndexed));
244            }
245        }
246
247        // Add digest to index
248        if (indexDigest) {
249            doc.add(new Field(FIELD_DIGEST, item.getContentDigest(), storedNotAnalyzed));
250        } else {
251            doc.add(new Field(FIELD_DIGEST, item.getContentDigest(), storedNotIndexed));
252        }
253        // Add timestamp to index
254        if (timestamp) {
255            doc.add(new Field(FIELD_TIMESTAMP, item.getTimestamp(), storedNotIndexed));
256        }
257        // Add etag to index
258        if (etag && item.getEtag() != null) {
259            doc.add(new Field(FIELD_ETAG, item.getEtag(), storedNotIndexed));
260        }
261        if (defaultOrigin != null) {
262            String tmp = item.getOrigin();
263            if (tmp == null) {
264                tmp = defaultOrigin;
265            }
266            doc.add(new Field(FIELD_ORIGIN, tmp, storedNotIndexed));
267        }
268        return doc;
269    }
270
271    /**
272     * Close the index.
273     *
274     * @throws IOException If an error occurs while closing the index.
275     */
276    public void close() throws IOException {
277        index.close(true);
278        luceneDirectory.close();
279    }
280
281    /**
282     * An aggressive URL normalizer. This methods removes any www[0-9]. segments from an URL, along with any trailing
283     * slashes and all parameters.
284     * <p>
285     * Example: <code>http://www.bok.hi.is/?lang=ice</code> would become <code>http://bok.hi.is</code>
286     *
287     * @param url The url to strip
288     * @return A normalized URL.
289     */
290    public static String stripURL(String url) {
291        url = url.replaceAll("www[0-9]*\\.", "");
292        url = url.replaceAll("\\?.*$", "");
293        url = url.replaceAll("/$", "");
294        return url;
295    }
296
297    @SuppressWarnings({"unchecked", "rawtypes"})
298    public static void main(String[] args) throws Exception {
299        CommandLineParser clp = new CommandLineParser(args, new PrintWriter(System.out));
300        long start = System.currentTimeMillis();
301
302        // Set default values for all settings.
303        boolean etag = false;
304        boolean equivalent = false;
305        boolean timestamp = false;
306        String indexMode = MODE_BOTH;
307        boolean addToIndex = false;
308        String mimefilter = "^text/.*";
309        boolean blacklist = true;
310        String iteratorClassName = CrawlLogIterator.class.getName();
311        String origin = null;
312        boolean skipDuplicates = false;
313
314        // Process the options
315        Option[] opts = clp.getCommandLineOptions();
316        for (int i = 0; i < opts.length; i++) {
317            Option opt = opts[i];
318            switch (opt.getId()) {
319            case 'w':
320                blacklist = false;
321                break;
322            case 'a':
323                addToIndex = true;
324                break;
325            case 'e':
326                etag = true;
327                break;
328            case 'h':
329                clp.usage(0);
330                break;
331            case 'i':
332                iteratorClassName = opt.getValue();
333                break;
334            case 'm':
335                mimefilter = opt.getValue();
336                break;
337            case 'o':
338                indexMode = opt.getValue();
339                break;
340            case 's':
341                equivalent = true;
342                break;
343            case 't':
344                timestamp = true;
345                break;
346            case 'r':
347                origin = opt.getValue();
348                break;
349            case 'd':
350                skipDuplicates = true;
351                break;
352            default:
353                System.err.println("Unhandled option id: " + opt.getId());
354            }
355        }
356
357        List cargs = clp.getCommandLineArguments();
358
359        if (cargs.size() != 2) {
360            // Should be exactly two arguments. Source and target!
361            clp.usage(0);
362        }
363
364        // Get the CrawlDataIterator
365        // Get the iterator classname or load default.
366        Class cl = Class.forName(iteratorClassName);
367        Constructor co = cl.getConstructor(new Class[] {String.class});
368        CrawlDataIterator iterator = (CrawlDataIterator) co.newInstance(new Object[] {(String) cargs.get(0)});
369
370        // Print initial stuff
371        System.out.println("Indexing: " + cargs.get(0));
372        System.out.println(" - Mode: " + indexMode);
373        System.out.println(" - Mime filter: " + mimefilter + " (" + (blacklist ? "blacklist" : "whitelist") + ")");
374        System.out.println(" - Includes" + (equivalent ? " <equivalent URL>" : "") + (timestamp ? " <timestamp>" : "")
375                + (etag ? " <etag>" : ""));
376        System.out.println(" - Skip duplicates: " + (skipDuplicates ? "yes" : "no"));
377        System.out.println(" - Iterator: " + iteratorClassName);
378        System.out.println("   - " + iterator.getSourceType());
379        System.out.println("Target: " + cargs.get(1));
380        if (addToIndex) {
381            System.out.println(" - Add to existing index (if any)");
382        } else {
383            System.out.println(" - New index (erases any existing index at " + "that location)");
384        }
385
386        DigestIndexer di = new DigestIndexer((String) cargs.get(1), indexMode, equivalent, timestamp, etag, addToIndex);
387
388        // Create the index
389        di.writeToIndex(iterator, mimefilter, blacklist, origin, true, skipDuplicates);
390
391        // Clean-up
392        di.close();
393
394        System.out.println("Total run time: "
395                + ArchiveUtils.formatMillisecondsToConventional(System.currentTimeMillis() - start));
396    }
397}