/* File: $Id: TestDBConnection.java 11 2007-07-24 10:11:24Z kfc $
 * Revision: $Revision: 11 $
 * Author: $Author: kfc $
 * Date: $Date: 2007-07-24 12:11:24 +0200 (Tue, 24 Jul 2007) $
 *
 * The Netarchive Suite - Software to harvest and preserve websites
 * Copyright 2004-2007 Det Kongelige Bibliotek and Statsbiblioteket, Denmark
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301
 * USA.
 */
package dk.netarkivet.common.distribute.arcrepository;

import java.io.IOException;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;

import is.hi.bok.deduplicator.CrawlDataItem;
import is.hi.bok.deduplicator.CrawlDataItemWithStatuscode;
import is.hi.bok.deduplicator.CrawlDataIterator;
import is.hi.bok.deduplicator.DigestIndexer;

/**
 * A DigestIndexer that adds a statuscode to the index.
 * Note that the statuscode is only added for items that are instances of
 * {@link CrawlDataItemWithStatuscode}; other items are indexed without it.
 */
public class NetarchiveDigestIndexer extends DigestIndexer {

    /** A field containing meta-data on the status-code of the document. */
    public static final String FIELD_STATUSCODE = "statuscode";

    /**
     * Constructor for the NetarchiveDigestIndexer.
     *
     * @param indexLocation The location of the index (a path).
     * @param indexingMode The indexing mode. Either {@link #MODE_URL},
     *                     {@link #MODE_HASH} or {@link #MODE_BOTH}.
     * @param includeNormalizedURL Should the normalized URL be included?
     * @param includeTimestamp Should the timestamp be included?
     * @param includeEtag Should the etag be included?
     * @param addToExistingIndex Are we adding to an existing index?
     * @throws IOException If an error occurs opening or creating the index.
     */
    public NetarchiveDigestIndexer(String indexLocation, String indexingMode,
            boolean includeNormalizedURL, boolean includeTimestamp,
            boolean includeEtag, boolean addToExistingIndex)
            throws IOException {
        super(indexLocation, indexingMode, includeNormalizedURL,
                includeTimestamp, includeEtag, addToExistingIndex);
    }

    /**
     * Writes the contents of a {@link CrawlDataIterator} to this index.
     *
* This method may be invoked multiple times with different
* CrawlDataIterators until {@link #close(boolean)} has been called.
*
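     * <p>
     * A minimal usage sketch is given below. The index location, crawl-log
     * path, and choice of {@code CrawlLogIterator} are illustrative
     * assumptions only:
     * <pre>
     * NetarchiveDigestIndexer indexer = new NetarchiveDigestIndexer(
     *         "/tmp/dedupindex", DigestIndexer.MODE_BOTH,
     *         true, true, false, false);
     * CrawlDataIterator dataIt
     *         = new is.hi.bok.deduplicator.CrawlLogIterator("/tmp/crawl.log");
     * long added = indexer.writeToIndex(
     *         dataIt, "^text/.*", false, null, true, true);
     * indexer.close(true);
     * </pre>
     *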
* @param dataIt The CrawlDataIterator that provides the data to index.
* @param mimefilter A regular expression that is used as a filter on the
* mimetypes to include in the index.
     * @param blacklist If true, then the mimefilter is used as a blacklist
     *                  for mimetypes. If false, then the mimefilter is
     *                  treated as a whitelist.
* @param defaultOrigin If an item is missing an origin, this default value
* will be assigned to it. Can be null if no default
* origin value should be assigned.
* @param verbose If true then progress information will be sent to
* System.out.
* @param skipDuplicates Do not add URLs that are marked as duplicates to
* the index
* @return The number of items added to the index.
* @throws IOException If an error occurs writing the index.
*/
public long writeToIndex(
CrawlDataIterator dataIt,
String mimefilter,
boolean blacklist,
String defaultOrigin,
boolean verbose,
boolean skipDuplicates)
throws IOException {
        long count = 0;
        long skipped = 0;
while (dataIt.hasNext()) {
CrawlDataItem item = dataIt.next();
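            // An item is indexed unless it is a duplicate we were asked to
            // skip. The test "matches(mimefilter) != blacklist" accepts items
            // whose mimetype matches a whitelist, and items whose mimetype
            // does not match a blacklist.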
if(!(skipDuplicates && item.isDuplicate()) &&
item.getMimeType().matches(mimefilter) != blacklist){
// Ok, we wish to index this URL/Digest
count++;
if(verbose && count%10000==0){
System.out.println("Indexed " + count + " - Last URL " +
"from " + item.getTimestamp());
}
Document doc = new Document();
// Add URL to index.
Field newField = new Field(
FIELD_URL,
item.getURL(),
Field.Store.YES,
(indexURL ? Field.Index.UN_TOKENIZED : Field.Index.NO)
);
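                // Norms are only used for relevance scoring, which these
                // exact-match lookup fields never need; omitting them keeps
                // the per-document index overhead down.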
newField.setOmitNorms(true);
doc.add(newField);
if(equivalent){
newField = new Field(
FIELD_URL_NORMALIZED,
stripURL(item.getURL()),
Field.Store.YES,
(indexURL ?
Field.Index.UN_TOKENIZED : Field.Index.NO)
);
newField.setOmitNorms(true);
doc.add(newField);
}
// Add digest to index
newField = new Field(
FIELD_DIGEST,
item.getContentDigest(),
Field.Store.YES,
(indexDigest ?
Field.Index.UN_TOKENIZED : Field.Index.NO)
);
newField.setOmitNorms(true);
doc.add(newField);
if(timestamp){
newField = new Field(
FIELD_TIMESTAMP,
item.getTimestamp(),
Field.Store.YES,
Field.Index.NO
);
newField.setOmitNorms(true);
doc.add(newField);
}
if(etag && item.getEtag()!=null){
newField = new Field(
FIELD_ETAG,
item.getEtag(),
Field.Store.YES,
Field.Index.NO
);
newField.setOmitNorms(true);
doc.add(newField);
}
if(defaultOrigin!=null){
String tmp = item.getOrigin();
if(tmp==null){
tmp = defaultOrigin;
}
newField = new Field(
FIELD_ORIGIN,
tmp,
Field.Store.YES,
Field.Index.NO
);
newField.setOmitNorms(true);
doc.add(newField);
}
                // Check whether a statuscode is available for this item.
                if (item instanceof CrawlDataItemWithStatuscode) {
                    // Add the statuscode information to the index.
CrawlDataItemWithStatuscode itemWithStatuscode = (CrawlDataItemWithStatuscode) item;
String statuscode = itemWithStatuscode.getStatuscode();
newField = new Field(
FIELD_STATUSCODE,
statuscode,
Field.Store.YES,
Field.Index.NO
);
newField.setOmitNorms(true);
doc.add(newField);
}
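                // All requested fields are in place; write the finished
                // document to the underlying Lucene index.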
index.addDocument(doc);
} else {
skipped++;
}
}
if(verbose){
System.out.println("Indexed " + count + " items (skipped " +
skipped + ")");
}
return count;
}
}