001/* 002 * #%L 003 * Netarchivesuite - harvester 004 * %% 005 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library, 006 * the National Library of France and the Austrian National Library. 007 * %% 008 * This program is free software: you can redistribute it and/or modify 009 * it under the terms of the GNU Lesser General Public License as 010 * published by the Free Software Foundation, either version 2.1 of the 011 * License, or (at your option) any later version. 012 * 013 * This program is distributed in the hope that it will be useful, 014 * but WITHOUT ANY WARRANTY; without even the implied warranty of 015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 016 * GNU General Lesser Public License for more details. 017 * 018 * You should have received a copy of the GNU General Lesser Public 019 * License along with this program. If not, see 020 * <http://www.gnu.org/licenses/lgpl-2.1.html>. 021 * #L% 022 */ 023 024package dk.netarkivet.harvester.indexserver; 025 026import java.io.File; 027import java.util.HashMap; 028import java.util.Map; 029import java.util.Set; 030 031import dk.netarkivet.common.utils.FileUtils; 032 033/** 034 * This class provides the framework for classes that cache the effort of combining multiple files into one. For 035 * instance, creating a Lucene index out of crawl.log files takes O(nlogn) where n is the number of lines in the files 036 * combined. 037 * <p> 038 * It is based on an underlying cache of single files. It handles the possibility of some of the files in the underlying 039 * cache not being available by telling which files are available rather than by sending an incomplete file. 040 * 041 * @param <T> A comparable instance. Must inherit the java.lang.Comparable interface. 042 */ 043public abstract class CombiningMultiFileBasedCache<T extends Comparable<T>> extends MultiFileBasedCache<T> { 044 045 /** The raw data cache that this cache gets data from. */ 046 protected FileBasedCache<T> rawcache; 047 048 /** 049 * Constructor for a CombiningMultiFileBasedCache. 050 * 051 * @param name The name of the cache 052 * @param rawcache The underlying cache of single files. 053 */ 054 protected CombiningMultiFileBasedCache(String name, FileBasedCache<T> rawcache) { 055 super(name); 056 this.rawcache = rawcache; 057 } 058 059 /** 060 * This is called when an appropriate file for the ids in question has not been found. It is expected to do the 061 * actual operations necessary to get the data. At the outset, the file for the given IDs is expected to be not 062 * present. 063 * 064 * @param ids The set of identifiers for which we want the corresponding data 065 * @return The set of IDs, or subset if data fetching failed for some IDs. If some IDs failed, the file is not 066 * filled, though some data may be cached at a lower level. 067 */ 068 protected Set<T> cacheData(Set<T> ids) { 069 Map<T, File> filesFound = prepareCombine(ids); 070 File resultFile = getCacheFile(ids); 071 if (filesFound.size() == ids.size()) { 072 combine(filesFound); 073 } else { 074 FileUtils.remove(resultFile); 075 } 076 return filesFound.keySet(); 077 } 078 079 /** 080 * Prepare needed data for performing combine(). This should ensure that all data is ready to use, or else the ids 081 * where the data cannot be obtained should be missing in the returned set. 082 * 083 * @param ids Set of job IDs to get ready to combine 084 * @return The map of ID->file of the data we will combine for each ID. If subclasses override this method to ensure 085 * other data is present, jobs with missing IDs should be removed from this map. 086 */ 087 protected Map<T, File> prepareCombine(Set<T> ids) { 088 Map<T, File> rawdata = rawcache.get(ids); 089 // First figure out which files were found 090 Map<T, File> filesFound = new HashMap<T, File>(); 091 for (Map.Entry<T, File> entry : rawdata.entrySet()) { 092 if (entry.getValue() != null) { 093 filesFound.put(entry.getKey(), entry.getValue()); 094 } 095 } 096 return filesFound; 097 } 098 099 /** 100 * Combine a set of files found in the raw data cache to form our kind of file. 101 * 102 * @param filesFound The files that were found for the IDs in the raw data cache. The map must not contain any null 103 * values. 104 */ 105 protected abstract void combine(Map<T, File> filesFound); 106 107}