Source code

001/*
002 * #%L
003 * Netarchivesuite - harvester
004 * %%
005 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
006 *             the National Library of France and the Austrian National Library.
007 * %%
008 * This program is free software: you can redistribute it and/or modify
009 * it under the terms of the GNU Lesser General Public License as
010 * published by the Free Software Foundation, either version 2.1 of the
011 * License, or (at your option) any later version.
012 * 
013 * This program is distributed in the hope that it will be useful,
014 * but WITHOUT ANY WARRANTY; without even the implied warranty of
015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
016 * GNU General Lesser Public License for more details.
017 * 
018 * You should have received a copy of the GNU General Lesser Public
019 * License along with this program.  If not, see
020 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
021 * #L%
022 */
023package dk.netarkivet.harvester.indexserver;
024
025import java.io.File;
026import java.util.concurrent.Callable;
027
028import org.slf4j.Logger;
029import org.slf4j.LoggerFactory;
030
031import dk.netarkivet.common.exceptions.ArgumentNotValid;
032import is.hi.bok.deduplicator.DigestIndexer;
033
034/**
035 * This worker class handles the indexing of one single crawl-log and associated cdxfile.
036 */
037public class DigestIndexerWorker implements Callable<Boolean> {
038
039    /** The log. */
040    private static final Logger log = LoggerFactory.getLogger(DigestIndexerWorker.class);
041
042    /** The full path to the index. */
043    private String indexlocation;
044    /** The ID of the job which logfiles are being indexed. */
045    private Long jobId;
046    /** The crawllog from the job. */
047    private File crawlLog;
048    /** The cdxfile from the job. */
049    private File cdxfile;
050    /** The options for the indexing process. */
051    private DigestOptions indexingOptions;
052    /** Was this process successful. */
053    private boolean successful = true;
054    /** String defining this task among other tasks. */
055    private String taskID;
056
057    /**
058     * Constructor for the DigestIndexerWorker.
059     *
060     * @param indexpath The full path to the index
061     * @param jobId The ID of the job which logfiles are being indexed
062     * @param crawllogfile The crawllog from the job
063     * @param cdxFile The cdxfile from the job
064     * @param indexingOptions The options for the indexing process.
065     * @param taskID string defining this task
066     */
067    public DigestIndexerWorker(String indexpath, Long jobId, File crawllogfile, File cdxFile,
068            DigestOptions indexingOptions, String taskID) {
069        ArgumentNotValid.checkNotNullOrEmpty(indexpath, "String indexpath");
070        ArgumentNotValid.checkNotNull(crawllogfile, "File crawllogfile");
071        ArgumentNotValid.checkNotNull(cdxFile, "File cdxFile");
072        ArgumentNotValid.checkNotNull(indexingOptions, "DigestOptions indexingOptions");
073        ArgumentNotValid.checkNotNullOrEmpty(taskID, "String taskID");
074        this.indexlocation = indexpath;
075        this.jobId = jobId;
076        this.crawlLog = crawllogfile;
077        this.cdxfile = cdxFile;
078        this.indexingOptions = indexingOptions;
079        this.taskID = taskID;
080    }
081
082    /**
083     * This method does the actual indexing.
084     *
085     * @return true, if the indexing completes successfully; otherwise it returns false
086     */
087    @Override
088    public Boolean call() {
089        try {
090            log.info("Starting subindexing task ({}) of data from job {}", taskID, this.jobId);
091            DigestIndexer localindexer = CrawlLogIndexCache.createStandardIndexer(indexlocation);
092            CrawlLogIndexCache.indexFile(jobId, crawlLog, cdxfile, localindexer, indexingOptions);
093
094            log.info("Completed subindexing task ({}) of data from job {} w/ {} index-entries)", taskID, this.jobId,
095                    localindexer.getIndex().numDocs());
096
097            localindexer.close();
098        } catch (Throwable t) {
099            successful = false;
100            log.warn("Indexing for job w/ id {} failed.", jobId, t);
101        }
102        return successful;
103
104    }
105
106}