/*
 * #%L
 * Netarchivesuite - harvester
 * %%
 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
 *             the National Library of France and the Austrian National Library.
 * %%
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation, either version 2.1 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Lesser Public License for more details.
 *
 * You should have received a copy of the GNU General Lesser Public
 * License along with this program.  If not, see
 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
 * #L%
 */
package dk.netarkivet.harvester.indexserver;

import java.io.File;
import java.util.concurrent.Callable;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import dk.netarkivet.common.exceptions.ArgumentNotValid;
import is.hi.bok.deduplicator.DigestIndexer;

/**
 * This worker class handles the indexing of one single crawl-log and associated cdxfile.
 * <p>
 * The work is performed by {@link #call()}, which reports the outcome as a Boolean and does not
 * propagate failures to the caller.
 */
public class DigestIndexerWorker implements Callable<Boolean> {

    /** The log. */
    private static final Logger log = LoggerFactory.getLogger(DigestIndexerWorker.class);

    /** The full path to the index. */
    private final String indexlocation;
    /** The ID of the job which logfiles are being indexed. */
    private final Long jobId;
    /** The crawllog from the job. */
    private final File crawlLog;
    /** The cdxfile from the job. */
    private final File cdxfile;
    /** The options for the indexing process. */
    private final DigestOptions indexingOptions;
    /** Was this process successful. Starts out true and is set to false on any failure in {@link #call()}. */
    private boolean successful = true;
    /** String defining this task among other tasks. */
    private final String taskID;

    /**
     * Constructor for the DigestIndexerWorker.
     * <p>
     * All arguments except {@code jobId} are validated through {@link ArgumentNotValid}; {@code jobId} is stored
     * as given.
     *
     * @param indexpath The full path to the index; must not be null or empty
     * @param jobId The ID of the job which logfiles are being indexed
     * @param crawllogfile The crawllog from the job; must not be null
     * @param cdxFile The cdxfile from the job; must not be null
     * @param indexingOptions The options for the indexing process; must not be null
     * @param taskID string defining this task; must not be null or empty
     */
    public DigestIndexerWorker(String indexpath, Long jobId, File crawllogfile, File cdxFile,
            DigestOptions indexingOptions, String taskID) {
        ArgumentNotValid.checkNotNullOrEmpty(indexpath, "String indexpath");
        ArgumentNotValid.checkNotNull(crawllogfile, "File crawllogfile");
        ArgumentNotValid.checkNotNull(cdxFile, "File cdxFile");
        ArgumentNotValid.checkNotNull(indexingOptions, "DigestOptions indexingOptions");
        ArgumentNotValid.checkNotNullOrEmpty(taskID, "String taskID");
        this.indexlocation = indexpath;
        this.jobId = jobId;
        this.crawlLog = crawllogfile;
        this.cdxfile = cdxFile;
        this.indexingOptions = indexingOptions;
        this.taskID = taskID;
    }

    /**
     * This method does the actual indexing.
     * <p>
     * An indexer is created for the index location, the crawl-log and cdxfile are indexed with it, and the
     * indexer is closed again. The indexer is closed even when indexing fails, so a failed task does not leave
     * it open. Any {@link Throwable} raised along the way (including one raised while closing) is logged and
     * turned into a {@code false} result rather than being rethrown.
     *
     * @return true, if the indexing completes successfully; otherwise it returns false
     */
    @Override
    public Boolean call() {
        DigestIndexer localindexer = null;
        try {
            log.info("Starting subindexing task ({}) of data from job {}", taskID, this.jobId);
            localindexer = CrawlLogIndexCache.createStandardIndexer(indexlocation);
            CrawlLogIndexCache.indexFile(jobId, crawlLog, cdxfile, localindexer, indexingOptions);

            log.info("Completed subindexing task ({}) of data from job {} w/ {} index-entries)", taskID, this.jobId,
                    localindexer.getIndex().numDocs());
        } catch (Throwable t) {
            // Deliberately broad: this is the task boundary, and the outcome is reported via the return value.
            successful = false;
            log.warn("Indexing for job w/ id {} failed.", jobId, t);
        } finally {
            // Close in finally so the indexer is released even if indexing or the doc count lookup threw.
            if (localindexer != null) {
                try {
                    localindexer.close();
                } catch (Throwable t) {
                    // A failed close is still a failed indexing task, exactly as when close() was inside the try.
                    successful = false;
                    log.warn("Indexing for job w/ id {} failed while closing the indexer.", jobId, t);
                }
            }
        }
        return successful;
    }

}