/*
 * #%L
 * Netarchivesuite - harvester
 * %%
 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
 *             the National Library of France and the Austrian National Library.
 * %%
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation, either version 2.1 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Lesser Public License for more details.
 *
 * You should have received a copy of the GNU General Lesser Public
 * License along with this program.  If not, see
 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
 * #L%
 */

package dk.netarkivet.harvester.indexserver;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.nio.channels.FileLock;
import java.nio.channels.OverlappingFileLockException;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import dk.netarkivet.common.CommonSettings;
import dk.netarkivet.common.distribute.indexserver.Index;
import dk.netarkivet.common.exceptions.ArgumentNotValid;
import dk.netarkivet.common.exceptions.IOFailure;
import dk.netarkivet.common.utils.FileUtils;
import dk.netarkivet.common.utils.Settings;

/**
 * A generic cache that stores items in files. This abstract superclass handles placement of the cache directory and
 * adding/getting files using the subclasses' methods for generating filenames.
 *
 * @param <T> The type of the ids that identify items in the cache.
 */
public abstract class FileBasedCache<T> {

    /** Logger. */
    private static final Logger log = LoggerFactory.getLogger(FileBasedCache.class);

    /** Cache directory. */
    protected File cacheDir;

    /**
     * Creates a new FileBasedCache object. This creates a directory under the main cache directory holding cached
     * files.
     * <p>
     * Note that the directory that is logged and created is the one returned by {@link #getCacheDir()}, so a subclass
     * overriding that method determines which directory is actually created here.
     *
     * @param cacheName Name of this cache (enabling sharing among processes). The directory created in the cachedir
     * will have this name.
     */
    public FileBasedCache(String cacheName) {
        ArgumentNotValid.checkNotNullOrEmpty(cacheName, "cacheName");
        this.cacheDir = new File(new File(Settings.get(CommonSettings.CACHE_DIR)), cacheName).getAbsoluteFile();
        log.info("Metadata cache for '{}' uses directory '{}'", cacheName, getCacheDir().getAbsolutePath());
        FileUtils.createDir(getCacheDir());
    }

    /**
     * Get the directory that the files are cached in. Subclasses should override this to create their own directory
     * within this directory. The full directory structure will be created if required in the constructor.
     *
     * @return A directory that cache files can reside in.
     */
    public File getCacheDir() {
        return cacheDir;
    }

    /**
     * Get the file that caches content for the given ID.
     *
     * @param id Some sort of id that uniquely identifies the item within the cache.
     * @return A file (possibly nonexistent or empty) that can cache the data for the id.
     */
    public abstract File getCacheFile(T id);

    /**
     * Fill in actual data in the file in the cache. This is the workhorse method that is allowed to modify the cache.
     * When this method is called, the cache can assume that getCacheFile(id) does not exist.
     *
     * @param id Some identifier for the item to be cached.
     * @return An id of content actually available. In most cases, this will be the same as id, but for complex T it
     * could be a subset (or null if the type argument T is a simple type). If the return value is not the same as id,
     * the file will not contain cached data, and may not even exist.
     */
    protected abstract T cacheData(T id);

    /**
     * Ensure that a file containing the appropriate content exists for the ID. If the content cannot be found, this
     * method may return null (if T is a simple type) or an appropriate subset (if T is, say, a Set) indicating the data
     * that is actually available. In the latter case, calling cache on the returned set should always fill the file for
     * that subset (barring catastrophic failure).
     * <p>
     * Locking: If the file is not immediately found, we enter a file-creation state. To avoid corrupted data, we must
     * ensure that only one cache instance, and only one thread within any instance, creates the file. Thus as long as
     * somebody else seems to be creating the file, we wait and see if they finish. This is checked by having an
     * exclusive lock on a ".working" file (we cannot use the result file, as it has to be created to be locked, and we
     * may end up with a different cached file than we thought, see above). The .working file itself is irrelevant, only
     * the lock on it matters.
     *
     * @param id Some sort of id that uniquely identifies the item within the cache.
     * @return The id given if it was successfully fetched, otherwise null if the type parameter T does not allow
     * subsets, or a subset of id if it does. This subset should be immediately cacheable.
     * @throws ArgumentNotValid if id is null.
     * @throws IOFailure if the ".working" file cannot be opened, locked, unlocked or closed.
     */
    public T cache(T id) {
        ArgumentNotValid.checkNotNull(id, "id");
        File cachedFile = getCacheFile(id);
        try {
            File fileBehindLockFile = new File(cachedFile.getAbsolutePath() + ".working");
            // The interned path is the in-JVM monitor: equal paths intern to the same String instance, so all
            // threads working on the same cache file synchronize on the same object.
            // FIXME Flagged by the original authors as a potential memory leak: intern() may retain every path
            // that was ever used as a key.
            String monitorKey = fileBehindLockFile.getAbsolutePath().intern();
            FileOutputStream lockFile = new FileOutputStream(fileBehindLockFile);
            FileLock lock = null;
            // Make sure no other thread tries to create this
            log.debug("Waiting to enter synchronization on {}", monitorKey);
            synchronized (monitorKey) {
                try {
                    // Make sure no other process tries to create this.
                    log.debug("locking filechannel for file '{}' (thread = {})", fileBehindLockFile.getAbsolutePath(),
                            Thread.currentThread().getName());
                    try {
                        lock = lockFile.getChannel().lock();
                    } catch (OverlappingFileLockException e) {
                        // Converted so it is logged and wrapped by the IOException handler below.
                        throw new IOException(e.getMessage(), e);
                    }
                    // Now we know nobody else touches the file.
                    // If the file already exists, just return it.
                    if (cachedFile.exists()) {
                        return id;
                    }
                    return cacheData(id);
                } finally {
                    // Always close the stream, even if releasing the lock fails; otherwise the file handle leaks.
                    // NOTE(review): the close is deliberately kept inside the monitor. The FileChannel
                    // documentation warns that on some systems closing a channel releases all locks the JVM holds
                    // on the file, so closing after leaving the monitor could drop another thread's lock.
                    try {
                        if (lock != null) {
                            log.debug("release lock on filechannel {}", lockFile.getChannel());
                            lock.release();
                        }
                    } finally {
                        lockFile.close();
                    }
                }
            }
        } catch (IOException e) {
            String errMsg = "Error obtaining lock for file '" + cachedFile.getAbsolutePath() + "'.";
            log.warn(errMsg, e);
            throw new IOFailure(errMsg, e);
        }
    }

    /**
     * Utility method to get a number of cache entries at a time. Implementations of FileBasedCache may override this to
     * perform the caching more efficiently, if caching overhead per file is large.
     *
     * @param ids Set of IDs that uniquely identify a set of items within the cache.
     * @return A map with one entry per requested ID. An ID that was cached completely maps to the file containing its
     * cached data; an ID for which caching failed, even partially, maps to null.
     */
    public Map<T, File> get(Set<T> ids) {
        ArgumentNotValid.checkNotNull(ids, "Set<I> ids");
        Map<T, File> result = new HashMap<T, File>(ids.size());
        for (T id : ids) {
            // Only a response equal to the request means the complete item is in the cache file.
            if (id.equals(cache(id))) {
                result.put(id, getCacheFile(id));
            } else {
                result.put(id, null);
            }
        }
        return result;
    }

    /**
     * Forgiving index generating method, that returns a file with an index, of the greatest possible subset of a given
     * id, and the subset.
     * <p>
     * If the type T for instance is a Set, you may get an index of only a subset. If T is a File, null may be seen as a
     * subset.
     * <p>
     * NOTE(review): if {@link #cache} ends up returning null, {@link #getCacheFile} is called with null; whether that
     * is supported depends on the subclass implementation.
     *
     * @param id The requested index.
     * @return An index over the greatest possible subset, and the subset.
     * @see #cache for more information.
     */
    public Index<T> getIndex(T id) {
        T response = id;
        T lastResponse = null;
        // Keep retrying with whatever subset cache() reports as available until the answer equals the request
        // (i.e. the subset is fully cached) or nothing is available at all.
        while (response != null && !response.equals(lastResponse)) {
            if (lastResponse != null) {
                log.info("Requested index of type '{}' data '{}' not available. Retrying with available subset '{}'",
                        this.getCacheDir().getName(), lastResponse, response);
            }
            lastResponse = response;
            response = cache(lastResponse);
        }
        File cacheFile = getCacheFile(response);
        log.info("Generated index '{}' of id '{}', request was for '{}'", cacheFile, response, id);
        return new Index<T>(cacheFile, response);
    }

}