001/*
002 * #%L
003 * Netarchivesuite - harvester
004 * %%
005 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
006 *             the National Library of France and the Austrian National Library.
007 * %%
008 * This program is free software: you can redistribute it and/or modify
009 * it under the terms of the GNU Lesser General Public License as
010 * published by the Free Software Foundation, either version 2.1 of the
011 * License, or (at your option) any later version.
012 * 
013 * This program is distributed in the hope that it will be useful,
014 * but WITHOUT ANY WARRANTY; without even the implied warranty of
015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
016 * GNU General Lesser Public License for more details.
017 * 
018 * You should have received a copy of the GNU General Lesser Public
019 * License along with this program.  If not, see
020 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
021 * #L%
022 */
023
024package dk.netarkivet.harvester.indexserver;
025
026import java.io.File;
027import java.io.FileOutputStream;
028import java.io.IOException;
029import java.nio.channels.FileLock;
030import java.nio.channels.OverlappingFileLockException;
031import java.util.HashMap;
032import java.util.Map;
033import java.util.Set;
034
035import org.slf4j.Logger;
036import org.slf4j.LoggerFactory;
037
038import dk.netarkivet.common.CommonSettings;
039import dk.netarkivet.common.distribute.indexserver.Index;
040import dk.netarkivet.common.exceptions.ArgumentNotValid;
041import dk.netarkivet.common.exceptions.IOFailure;
042import dk.netarkivet.common.utils.FileUtils;
043import dk.netarkivet.common.utils.Settings;
044
045/**
046 * A generic cache that stores items in files. This abstract superclass handles placement of the cache directory and
047 * adding/getting files using the subclasses' methods for generating filenames.
048 *
049 * @param <T> The type of cache.
050 */
051public abstract class FileBasedCache<T> {
052
053    /** Logger. */
054    private static final Logger log = LoggerFactory.getLogger(FileBasedCache.class);
055
056    /** Cache directory. */
057    protected File cacheDir;
058
059    /**
060     * Creates a new FileBasedCache object. This creates a directory under the main cache directory holding cached
061     * files.
062     *
063     * @param cacheName Name of this cache (enabling sharing among processes). The directory created in the cachedir
064     * will have this name.
065     */
066    public FileBasedCache(String cacheName) {
067        ArgumentNotValid.checkNotNullOrEmpty(cacheName, "cacheName");
068        this.cacheDir = new File(new File(Settings.get(CommonSettings.CACHE_DIR)), cacheName).getAbsoluteFile();
069        log.info("Metadata cache for '{}' uses directory '{}'", cacheName, getCacheDir().getAbsolutePath());
070        FileUtils.createDir(getCacheDir());
071    }
072
073    /**
074     * Get the directory that the files are cached in. Subclasses should override this to create their own directory
075     * with this directory. The full directory structure will be created if required in the constructor.
076     *
077     * @return A directory that cache files can reside in.
078     */
079    public File getCacheDir() {
080        return cacheDir;
081    }
082
083    /**
084     * Get the file that caches content for the given ID.
085     *
086     * @param id Some sort of id that uniquely identifies the item within the cache.
087     * @return A file (possibly nonexistant or empty) that can cache the data for the id.
088     */
089    public abstract File getCacheFile(T id);
090
091    /**
092     * Fill in actual data in the file in the cache. This is the workhorse method that is allowed to modify the cache.
093     * When this method is called, the cache can assume that getCacheFile(id) does not exist.
094     *
095     * @param id Some identifier for the item to be cached.
096     * @return An id of content actually available. In most cases, this will be the same as id, but for complex I it
097     * could be a subset (or null if the type argument I is a simple type). If the return value is not the same as id,
098     * the file will not contain cached data, and may not even exist.
099     */
100    protected abstract T cacheData(T id);
101
102    /**
103     * Ensure that a file containing the appropriate content exists for the ID. If the content cannot be found, this
104     * method may return null (if I is a simple type) or an appropriate subset (if I is, say, a Set) indicating the data
105     * that is actually available. In the latter case, calling cache on the returned set should always fill the file for
106     * that subset (barring catastrophic failure).
107     * <p>
108     * Locking: If the file is not immediately found, we enter a file-creation state. To avoid corrupted data, we must
109     * ensure that only one cache instance, and only one thread within any instance, creates the file. Thus as long as
110     * somebody else seems to be creating the file, we wait and see if they finish. This is checked by having an
111     * exclusive lock on a ".working" file (we cannot use the result file, as it has to be created to be locked, and we
112     * may end up with a different cached file than we thought, see above). The .working file itself is irrelevant, only
113     * the lock on it matters.
114     *
115     * @param id Some sort of id that uniquely identifies the item within the cache.
116     * @return The id given if it was successfully fetched, otherwise null if the type parameter I does not allow
117     * subsets, or a subset of id if it does. This subset should be immediately cacheable.
118     */
119    public T cache(T id) {
120        ArgumentNotValid.checkNotNull(id, "id");
121        File cachedFile = getCacheFile(id);
122        try {
123            File fileBehindLockFile = new File(cachedFile.getAbsolutePath() + ".working");
124            FileOutputStream lockFile = new FileOutputStream(fileBehindLockFile);
125            FileLock lock = null;
126            // Make sure no other thread tries to create this
127            // FIXME welcome to a memory leak, intern strings are never freed from memory again!
128            log.debug("Waiting to enter synchronization on {}", fileBehindLockFile.getAbsolutePath().intern());
129            // FIXME Potential memory leak. intern() remembers all strings until JVM exits.
130            synchronized (fileBehindLockFile.getAbsolutePath().intern()) {
131                try {
132                    // Make sure no other process tries to create this.
133                    log.debug("locking filechannel for file '{}' (thread = {})", fileBehindLockFile.getAbsolutePath(),
134                            Thread.currentThread().getName());
135                    try {
136                        lock = lockFile.getChannel().lock();
137                    } catch (OverlappingFileLockException e) {
138                        // Exception is logged below
139                        throw new IOException(e.getMessage(), e);
140                    }
141                    // Now we know nobody else touches the file.
142                    // If the file already exists, just return it.
143                    if (cachedFile.exists()) {
144                        return id;
145                    }
146                    return cacheData(id);
147                } finally {
148                    if (lock != null) {
149                        log.debug("release lock on filechannel {}", lockFile.getChannel());
150                        lock.release();
151                    }
152                    lockFile.close();
153                }
154            }
155        } catch (IOException e) {
156            String errMsg = "Error obtaining lock for file '" + cachedFile.getAbsolutePath() + "'.";
157            log.warn(errMsg, e);
158            throw new IOFailure(errMsg, e);
159        }
160    }
161
162    /**
163     * Utility method to get a number of cache entries at a time. Implementations of FileBasedCache may override this to
164     * perform the caching more efficiently, if caching overhead per file is large.
165     *
166     * @param ids List of IDs that uniquely identify a set of items within the cache.
167     * @return A map from ID to the files containing cached data for those IDs. If caching failed, even partially, for
168     * an ID, the entry for the ID doesn't exist.
169     */
170    public Map<T, File> get(Set<T> ids) {
171        ArgumentNotValid.checkNotNull(ids, "Set<I> ids");
172        Map<T, File> result = new HashMap<T, File>(ids.size());
173        for (T id : ids) {
174            if (id.equals(cache(id))) {
175                result.put(id, getCacheFile(id));
176            } else {
177                result.put(id, null);
178            }
179        }
180        return result;
181    }
182
183    /**
184     * Forgiving index generating method, that returns a file with an index, of the greatest possible subset of a given
185     * id, and the subset.
186     * <p>
187     * If the type I for instance is a Set, you may get an index of only a subset. If I is a File, null may be seen as a
188     * subset.
189     *
190     * @param id The requested index.
191     * @return An index over the greatest possible subset, and the subset.
192     * @see #cache for more information.
193     */
194    public Index<T> getIndex(T id) {
195        T response = id;
196        T lastResponse = null;
197        while (response != null && !response.equals(lastResponse)) {
198            if (lastResponse != null) {
199                log.info("Requested index of type '{}' data '{}' not available. Retrying with available subset '{}'",
200                        this.getCacheDir().getName(), lastResponse, response);
201            }
202            lastResponse = response;
203            response = cache(lastResponse);
204        }
205        File cacheFile = getCacheFile(response);
206        log.info("Generated index '{}' of id '{}', request was for '{}'", cacheFile, response, id);
207        return new Index<T>(cacheFile, response);
208    }
209
210}