/* File: $Id: NetarchiveDigestIndexer.java 11 2007-07-24 10:11:24Z kfc $
 * Revision: $Revision: 11 $
 * Author: $Author: kfc $
 * Date: $Date: 2007-07-24 12:11:24 +0200 (Tue, 24 Jul 2007) $
 *
 * The Netarchive Suite - Software to harvest and preserve websites
 * Copyright 2004-2007 Det Kongelige Bibliotek and Statsbiblioteket, Denmark
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301
 * USA.
 */
package dk.netarkivet.common.distribute.arcrepository;

import java.io.IOException;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;

import is.hi.bok.deduplicator.CrawlDataItem;
import is.hi.bok.deduplicator.CrawlDataItemWithStatuscode;
import is.hi.bok.deduplicator.CrawlDataIterator;
import is.hi.bok.deduplicator.DigestIndexer;

/**
 * A {@link DigestIndexer} that additionally stores the statuscode of each
 * document in the index, when one is available.
 */
public class NetarchiveDigestIndexer extends DigestIndexer {

    /** A field containing meta-data on the status-code of the document. */
    public static final String FIELD_STATUSCODE = "statuscode";

    /**
     * Constructor for the NetarchiveDigestIndexer.
     * @param indexLocation The location of the index (a path).
     * @param indexingMode The indexing mode. Either {@link #MODE_URL},
     * {@link #MODE_HASH} or {@link #MODE_BOTH}.
     * @param includeNormalizedURL Should the normalized URL be included?
     * @param includeTimestamp Should the timestamp be included?
     * @param includeEtag Should the etag be included?
     * @param addToExistingIndex Are we adding to an existing index?
     * @throws IOException If an error occurs opening or creating the index.
     */
    public NetarchiveDigestIndexer(String indexLocation, String indexingMode,
            boolean includeNormalizedURL, boolean includeTimestamp,
            boolean includeEtag, boolean addToExistingIndex)
            throws IOException {
        super(indexLocation, indexingMode, includeNormalizedURL,
                includeTimestamp, includeEtag, addToExistingIndex);
    }

    /**
     * Writes the contents of a {@link CrawlDataIterator} to this index.
     * <p>
     * This method may be invoked multiple times with different
     * CrawlDataIterators until {@link #close(boolean)} has been called.
     *
     * @param dataIt The CrawlDataIterator that provides the data to index.
     * @param mimefilter A regular expression that is used as a filter on the
     *                   mimetypes to include in the index.
     * @param blacklist If true, then the mimefilter is used as a blacklist
     *                  for mimetypes. If false, then the mimefilter is
     *                  treated as a whitelist.
     * @param defaultOrigin If an item is missing an origin, this default
     *                      value will be assigned to it. Can be null if no
     *                      default origin value should be assigned.
     * @param verbose If true, then progress information will be sent to
     *                System.out.
     * @param skipDuplicates Do not add URLs that are marked as duplicates to
     *                       the index.
     * @return The number of items added to the index.
     * @throws IOException If an error occurs writing the index.
     */
    public long writeToIndex(CrawlDataIterator dataIt, String mimefilter,
            boolean blacklist, String defaultOrigin, boolean verbose,
            boolean skipDuplicates) throws IOException {
        int count = 0;
        int skipped = 0;
        while (dataIt.hasNext()) {
            CrawlDataItem item = dataIt.next();
            // Index the item unless it is a duplicate we should skip, or its
            // mimetype is excluded by the filter (comparing the regex match
            // against 'blacklist' flips the meaning of a match).
            if (!(skipDuplicates && item.isDuplicate())
                    && item.getMimeType().matches(mimefilter) != blacklist) {
                // Ok, we wish to index this URL/Digest
                count++;
                if (verbose && count % 10000 == 0) {
                    System.out.println("Indexed " + count + " - Last URL "
                            + "from " + item.getTimestamp());
                }
                Document doc = new Document();

                // Add URL to index.
                Field newField = new Field(
                        FIELD_URL,
                        item.getURL(),
                        Field.Store.YES,
                        (indexURL ? Field.Index.UN_TOKENIZED
                                  : Field.Index.NO));
                newField.setOmitNorms(true);
                doc.add(newField);
                if (equivalent) {
                    newField = new Field(
                            FIELD_URL_NORMALIZED,
                            stripURL(item.getURL()),
                            Field.Store.YES,
                            (indexURL ? Field.Index.UN_TOKENIZED
                                      : Field.Index.NO));
                    newField.setOmitNorms(true);
                    doc.add(newField);
                }

                // Add digest to index.
                newField = new Field(
                        FIELD_DIGEST,
                        item.getContentDigest(),
                        Field.Store.YES,
                        (indexDigest ? Field.Index.UN_TOKENIZED
                                     : Field.Index.NO));
                newField.setOmitNorms(true);
                doc.add(newField);
                if (timestamp) {
                    newField = new Field(
                            FIELD_TIMESTAMP,
                            item.getTimestamp(),
                            Field.Store.YES,
                            Field.Index.NO);
                    newField.setOmitNorms(true);
                    doc.add(newField);
                }
                if (etag && item.getEtag() != null) {
                    newField = new Field(
                            FIELD_ETAG,
                            item.getEtag(),
                            Field.Store.YES,
                            Field.Index.NO);
                    newField.setOmitNorms(true);
                    doc.add(newField);
                }
                if (defaultOrigin != null) {
                    String tmp = item.getOrigin();
                    if (tmp == null) {
                        tmp = defaultOrigin;
                    }
                    newField = new Field(
                            FIELD_ORIGIN,
                            tmp,
                            Field.Store.YES,
                            Field.Index.NO);
                    newField.setOmitNorms(true);
                    doc.add(newField);
                }
                // Investigate whether or not a statuscode is available.
                if (item instanceof CrawlDataItemWithStatuscode) {
                    // Add statuscode information to the index.
                    CrawlDataItemWithStatuscode itemWithStatuscode =
                            (CrawlDataItemWithStatuscode) item;
                    String statuscode = itemWithStatuscode.getStatuscode();
                    newField = new Field(
                            FIELD_STATUSCODE,
                            statuscode,
                            Field.Store.YES,
                            Field.Index.NO);
                    newField.setOmitNorms(true);
                    doc.add(newField);
                }
                index.addDocument(doc);
            } else {
                skipped++;
            }
        }
        if (verbose) {
            System.out.println("Indexed " + count + " items (skipped "
                    + skipped + ")");
        }
        return count;
    }
}
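
/*
 * A minimal usage sketch, not part of the original class: it shows one way
 * this indexer might be driven from a Heritrix crawl.log, and how the stored
 * statuscode field could be read back afterwards. The index and crawl-log
 * paths are placeholders, and CrawlLogIterator is assumed to be the
 * CrawlDataIterator implementation shipped with the deduplicator module;
 * adapt both to the actual deployment.
 */
class NetarchiveDigestIndexerExample {
    public static void main(String[] args) throws Exception {
        // Build a fresh index over both URLs and digests.
        NetarchiveDigestIndexer indexer = new NetarchiveDigestIndexer(
                "/tmp/dedup-index",      // index location (placeholder path)
                DigestIndexer.MODE_BOTH, // index both URL and digest
                false,                   // do not include normalized URLs
                true,                    // include timestamps
                false,                   // do not include etags
                false);                  // create a new index
        CrawlDataIterator crawlData =
                new is.hi.bok.deduplicator.CrawlLogIterator(
                        "/tmp/crawl.log"); // placeholder crawl.log path
        long added = indexer.writeToIndex(
                crawlData,
                "^text/.*",  // mimefilter: regex over mimetypes
                false,       // treat the mimefilter as a whitelist
                null,        // no default origin
                true,        // print progress to System.out
                true);       // skip records marked as duplicates
        indexer.close(true); // close and optimize the index
        System.out.println("Added " + added + " documents.");

        // Read the stored statuscode back. The field is stored but not
        // indexed (Field.Index.NO), so it cannot be searched directly;
        // iterate over the documents instead. Documents indexed from items
        // without a statuscode yield null for that field.
        org.apache.lucene.index.IndexReader reader =
                org.apache.lucene.index.IndexReader.open("/tmp/dedup-index");
        for (int i = 0; i < reader.maxDoc(); i++) {
            if (!reader.isDeleted(i)) {
                Document doc = reader.document(i);
                System.out.println(doc.get(NetarchiveDigestIndexer.FIELD_URL)
                        + " -> "
                        + doc.get(NetarchiveDigestIndexer.FIELD_STATUSCODE));
            }
        }
        reader.close();
    }
}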