/*
 * #%L
 * Netarchivesuite - harvester
 * %%
 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
 *             the National Library of France and the Austrian National Library.
 * %%
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation, either version 2.1 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Lesser Public License for more details.
 *
 * You should have received a copy of the GNU General Lesser Public
 * License along with this program.  If not, see
 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
 * #L%
 */

package dk.netarkivet.harvester.indexserver;

import java.io.File;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;

import dk.netarkivet.common.utils.FileUtils;

/**
 * This class provides the framework for classes that cache the effort of combining multiple files into one. For
 * instance, creating a Lucene index out of crawl.log files takes O(n log n) time, where n is the number of lines in
 * the files combined.
 * <p>
 * It is based on an underlying cache of single files. If some of the files in the underlying cache are unavailable,
 * it handles this by reporting which files are available rather than by producing an incomplete combined file.
 *
 * @param <T> The type of cache key. Must implement the java.lang.Comparable interface.
 */
public abstract class CombiningMultiFileBasedCache<T extends Comparable<T>> extends MultiFileBasedCache<T> {

    /** The raw data cache that this cache gets data from. */
    protected FileBasedCache<T> rawcache;

    /**
     * Constructor for a CombiningMultiFileBasedCache.
     *
     * @param name The name of the cache
     * @param rawcache The underlying cache of single files.
     */
    protected CombiningMultiFileBasedCache(String name, FileBasedCache<T> rawcache) {
        super(name);
        this.rawcache = rawcache;
    }

    /**
     * This method is called when an appropriate file for the IDs in question has not been found. It performs the
     * actual operations necessary to get the data. When it is called, the cache file for the given IDs is expected
     * not to exist.
     *
     * @param ids The set of identifiers for which we want the corresponding data.
     * @return The set of IDs for which data was found, or a subset of the requested IDs if data fetching failed for
     * some of them. If data for some IDs was missing, the combined file is not filled, though some data may still be
     * cached at a lower level.
     */
    protected Set<T> cacheData(Set<T> ids) {
        Map<T, File> filesFound = prepareCombine(ids);
        File resultFile = getCacheFile(ids);
        if (filesFound.size() == ids.size()) {
            // Data was found for every requested ID, so combine it to produce the cached file.
            combine(filesFound);
        } else {
            // Some IDs had no data; remove any partial result rather than leave an incomplete file.
            FileUtils.remove(resultFile);
        }
        return filesFound.keySet();
    }

    /**
     * Prepare the data needed to perform combine(). This should ensure that all data is ready to use; IDs for which
     * the data cannot be obtained must be missing from the returned map.
     *
     * @param ids The set of job IDs to get ready to combine.
     * @return A map from ID to the file containing the data that will be combined for that ID. Subclasses that
     * override this method to ensure other data is present should remove IDs with missing data from this map.
     */
    protected Map<T, File> prepareCombine(Set<T> ids) {
        Map<T, File> rawdata = rawcache.get(ids);
        // First figure out which files were found
        Map<T, File> filesFound = new HashMap<T, File>();
        for (Map.Entry<T, File> entry : rawdata.entrySet()) {
            if (entry.getValue() != null) {
                filesFound.put(entry.getKey(), entry.getValue());
            }
        }
        return filesFound;
    }
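
    // A minimal sketch (not part of this class) of how a subclass might override prepareCombine() to enforce an
    // extra precondition before combining. The type parameter Long and the empty-file check are purely hypothetical
    // illustrations; the sketch uses java.util.Iterator.
    //
    //     @Override
    //     protected Map<Long, File> prepareCombine(Set<Long> ids) {
    //         Map<Long, File> filesFound = super.prepareCombine(ids);
    //         // Treat an empty raw file as missing data, so combine() only runs when every ID has real content.
    //         Iterator<Map.Entry<Long, File>> it = filesFound.entrySet().iterator();
    //         while (it.hasNext()) {
    //             if (it.next().getValue().length() == 0) {
    //                 it.remove();
    //             }
    //         }
    //         return filesFound;
    //     }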

    /**
     * Combine a set of files found in the raw data cache to form the kind of file provided by this cache.
     *
     * @param filesFound The files that were found for the IDs in the raw data cache. The map must not contain any
     * null values.
     */
    protected abstract void combine(Map<T, File> filesFound);
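
    // A minimal sketch (hypothetical, for illustration only) of a concrete combine() implementation that simply
    // concatenates the per-ID files, in ID order, into the cache file for the full ID set, using java.nio.file.Files
    // and the codebase's IOFailure exception (assumed here to be dk.netarkivet.common.exceptions.IOFailure). Real
    // subclasses, e.g. one building a Lucene index, would do more elaborate work here.
    //
    //     @Override
    //     protected void combine(Map<Long, File> filesFound) {
    //         File resultFile = getCacheFile(filesFound.keySet());
    //         try {
    //             // Append each raw file to the combined result, in the natural order of the IDs.
    //             for (File f : new TreeMap<Long, File>(filesFound).values()) {
    //                 Files.write(resultFile.toPath(), Files.readAllBytes(f.toPath()),
    //                         StandardOpenOption.CREATE, StandardOpenOption.APPEND);
    //             }
    //         } catch (IOException e) {
    //             throw new IOFailure("Could not combine files into " + resultFile, e);
    //         }
    //     }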

}