/* File: $Id: TestDBConnection.java 11 2007-07-24 10:11:24Z kfc $
 * Revision: $Revision: 11 $
 * Author: $Author: kfc $
 * Date: $Date: 2007-07-24 12:11:24 +0200 (Tue, 24 Jul 2007) $
 *
 * The Netarchive Suite - Software to harvest and preserve websites
 * Copyright 2004-2007 Det Kongelige Bibliotek and Statsbiblioteket, Denmark
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301
 * USA.
 */
package dk.netarkivet.common.distribute.arcrepository;

import java.io.IOException;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;

import is.hi.bok.deduplicator.CrawlDataItem;
import is.hi.bok.deduplicator.CrawlDataItemWithStatuscode;
import is.hi.bok.deduplicator.CrawlDataIterator;
import is.hi.bok.deduplicator.DigestIndexer;

/**
 * A DigestIndexer that adds a statuscode to the index.
 * Note that the statuscode is only added for items that are instances of
 * {@link CrawlDataItemWithStatuscode}; other items are indexed without it.
 */
public class NetarchiveDigestIndexer extends DigestIndexer {

    /** A field containing meta-data on the status-code of the document. */
    public static final String FIELD_STATUSCODE = "statuscode";

    /**
     * Constructor for the NetarchiveDigestIndexer.
     *
     * @param indexLocation The location of the index (a path).
     * @param indexingMode The indexing mode. Either {@link #MODE_URL},
     *                     {@link #MODE_HASH} or {@link #MODE_BOTH}.
     * @param includeNormalizedURL Should the normalized URL be included?
     * @param includeTimestamp Should the timestamp be included?
     * @param includeEtag Should the etag be included?
     * @param addToExistingIndex Are we adding to an existing index?
     * @throws IOException If an error occurs opening or creating the index.
     */
    public NetarchiveDigestIndexer(String indexLocation, String indexingMode,
            boolean includeNormalizedURL, boolean includeTimestamp,
            boolean includeEtag, boolean addToExistingIndex)
            throws IOException {
        super(indexLocation, indexingMode, includeNormalizedURL,
                includeTimestamp, includeEtag, addToExistingIndex);
    }

    /**
     * Writes the contents of a {@link CrawlDataIterator} to this index.
     *
* This method may be invoked multiple times with different
* CrawlDataIterators until {@link #close(boolean)} has been called.
*
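     * <p>
     * A minimal usage sketch is given below. The index location, crawl-log
     * path, and choice of {@code CrawlLogIterator} are illustrative
     * assumptions only:
     * <pre>
     * NetarchiveDigestIndexer indexer = new NetarchiveDigestIndexer(
     *         "/tmp/dedupindex", DigestIndexer.MODE_BOTH,
     *         true, true, false, false);
     * CrawlDataIterator dataIt
     *         = new is.hi.bok.deduplicator.CrawlLogIterator("/tmp/crawl.log");
     * long added = indexer.writeToIndex(
     *         dataIt, "^text/.*", false, null, true, true);
     * indexer.close(true);
     * </pre>
     *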
* @param dataIt The CrawlDataIterator that provides the data to index.
* @param mimefilter A regular expression that is used as a filter on the
* mimetypes to include in the index.
     * @param blacklist If true, then the mimefilter is used as a blacklist
     *                  for mimetypes. If false, then the mimefilter is
     *                  treated as a whitelist.
* @param defaultOrigin If an item is missing an origin, this default value
* will be assigned to it. Can be null if no default
* origin value should be assigned.
* @param verbose If true then progress information will be sent to
* System.out.
* @param skipDuplicates Do not add URLs that are marked as duplicates to
* the index
* @return The number of items added to the index.
* @throws IOException If an error occurs writing the index.
*/
public long writeToIndex(
CrawlDataIterator dataIt,
String mimefilter,
boolean blacklist,
String defaultOrigin,
boolean verbose,
boolean skipDuplicates)
throws IOException {
        long count = 0;
        long skipped = 0;
while (dataIt.hasNext()) {
CrawlDataItem item = dataIt.next();
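            // An item is indexed unless it is a duplicate we were asked to
            // skip. The test "matches(mimefilter) != blacklist" accepts items
            // whose mimetype matches a whitelist, and items whose mimetype
            // does not match a blacklist.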
if(!(skipDuplicates && item.isDuplicate()) &&
item.getMimeType().matches(mimefilter) != blacklist){
// Ok, we wish to index this URL/Digest
count++;
if(verbose && count%10000==0){
System.out.println("Indexed " + count + " - Last URL " +
"from " + item.getTimestamp());
}
Document doc = new Document();
// Add URL to index.
Field newField = new Field(
FIELD_URL,
item.getURL(),
Field.Store.YES,
(indexURL ? Field.Index.UN_TOKENIZED : Field.Index.NO)
);
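                // Norms are only used for relevance scoring, which these
                // exact-match lookup fields never need; omitting them keeps
                // the per-document index overhead down.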
newField.setOmitNorms(true);
doc.add(newField);
if(equivalent){
newField = new Field(
FIELD_URL_NORMALIZED,
stripURL(item.getURL()),
Field.Store.YES,
(indexURL ?
Field.Index.UN_TOKENIZED : Field.Index.NO)
);
newField.setOmitNorms(true);
doc.add(newField);
}
// Add digest to index
newField = new Field(
FIELD_DIGEST,
item.getContentDigest(),
Field.Store.YES,
(indexDigest ?
Field.Index.UN_TOKENIZED : Field.Index.NO)
);
newField.setOmitNorms(true);
doc.add(newField);
if(timestamp){
newField = new Field(
FIELD_TIMESTAMP,
item.getTimestamp(),
Field.Store.YES,
Field.Index.NO
);
newField.setOmitNorms(true);
doc.add(newField);
}
if(etag && item.getEtag()!=null){
newField = new Field(
FIELD_ETAG,
item.getEtag(),
Field.Store.YES,
Field.Index.NO
);
newField.setOmitNorms(true);
doc.add(newField);
}
if(defaultOrigin!=null){
String tmp = item.getOrigin();
if(tmp==null){
tmp = defaultOrigin;
}
newField = new Field(
FIELD_ORIGIN,
tmp,
Field.Store.YES,
Field.Index.NO
);
newField.setOmitNorms(true);
doc.add(newField);
}
                // Check whether a statuscode is available for this item.
                if (item instanceof CrawlDataItemWithStatuscode) {
                    // Add the statuscode information to the index.
CrawlDataItemWithStatuscode itemWithStatuscode = (CrawlDataItemWithStatuscode) item;
String statuscode = itemWithStatuscode.getStatuscode();
newField = new Field(
FIELD_STATUSCODE,
statuscode,
Field.Store.YES,
Field.Index.NO
);
newField.setOmitNorms(true);
doc.add(newField);
}
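                // All requested fields are in place; write the finished
                // document to the underlying Lucene index.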
index.addDocument(doc);
} else {
skipped++;
}
}
if(verbose){
System.out.println("Indexed " + count + " items (skipped " +
skipped + ")");
}
return count;
}
}