/*
 * #%L
 * Netarchivesuite - harvester
 * %%
 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
 * the National Library of France and the Austrian National Library.
 * %%
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation, either version 2.1 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Lesser Public License for more details.
 *
 * You should have received a copy of the GNU General Lesser Public
 * License along with this program. If not, see
 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
 * #L%
 */

package dk.netarkivet.harvester.indexserver;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.Collection;
import java.util.Map;
import java.util.Set;

import dk.netarkivet.common.distribute.indexserver.JobIndexCache;
import dk.netarkivet.common.exceptions.IOFailure;
import dk.netarkivet.common.exceptions.NotImplementedException;
import dk.netarkivet.common.utils.FileUtils;

/**
 * A cache that serves CDX index files for job IDs.
 * <p>
 * Notice that since data for some IDs may not be available, the actual cached file might not correspond in its content
 * to what was asked for. For instance, if asking for data for IDs 2, 3, and 4, and 3 fails, a cached file for IDs 2 and
 * 4 will be returned. There is currently no way to tell if you got everything you asked for.
 * <p>
 * This cache uses the Unix sort(1) command as an external process call, as that one is optimized for handling large,
 * disk-based sorts.
 */
public class CDXIndexCache extends CombiningMultiFileBasedCache<Long> implements JobIndexCache {

    /** Suffix appended to the result file's path to name the temporary work file used by {@link #combine(Map)}. */
    private static final String WORK_SUFFIX = ".unsorted";

    /**
     * Creates a new cache for CDX index files.
     */
    public CDXIndexCache() {
        super("cdxindex", new CDXDataCache());
    }

    /**
     * Combine parts of an index into one big index.
     * <p>
     * The per-job files are first concatenated into the cache file for the given set of IDs. That file is then passed
     * to {@link FileUtils#sortCDX(File, File)} together with a temporary work file, and the work file is finally
     * renamed on top of the cache file. The work file is always removed afterwards, whether or not sorting succeeded.
     *
     * @param filesFound A map of IDs and the files caching their content.
     * @throws IOFailure if the files cannot be concatenated, or if the work file cannot be renamed to the cache file.
     */
    protected void combine(Map<Long, File> filesFound) {
        File resultFile = getCacheFile(filesFound.keySet());
        concatenateFiles(filesFound.values(), resultFile);
        File workFile = new File(resultFile.getAbsolutePath() + WORK_SUFFIX);
        // Safety net in case the JVM exits before the finally clause below has run.
        workFile.deleteOnExit();
        try {
            // NOTE(review): sortCDX is presumed to write the sorted output to workFile (inferred from the
            // rename that follows) -- confirm against FileUtils.
            FileUtils.sortCDX(resultFile, workFile);
            // File.renameTo reports failure through its return value only. Ignoring it would leave the unsorted
            // concatenation in the cache while the work file is deleted below, so fail loudly instead.
            if (!workFile.renameTo(resultFile)) {
                throw new IOFailure("Couldn't rename sorted index '" + workFile + "' to '" + resultFile + "'");
            }
        } finally {
            FileUtils.remove(workFile);
        }
    }

    /**
     * Concatenate a set of files into a single file.
     * <p>
     * The files are copied line by line in the iteration order of the collection. Every line read is written followed
     * by a line separator, so the last line of each input file is terminated even if it was not in the input. Reading
     * and writing both use the platform default character encoding.
     *
     * @param files The files to concatenate.
     * @param resultFile The file where the files are concatenated into.
     * @throws IOFailure if reading any of the input files or writing the result file fails.
     */
    private static void concatenateFiles(Collection<File> files, File resultFile) {
        try {
            BufferedWriter out = null;
            try {
                out = new BufferedWriter(new FileWriter(resultFile));
                for (File f : files) {
                    BufferedReader in = null;
                    try {
                        in = new BufferedReader(new FileReader(f));
                        String s;
                        while ((s = in.readLine()) != null) {
                            out.write(s);
                            out.newLine();
                        }
                    } finally {
                        // Close each input as soon as it has been copied, so only one reader is open at a time.
                        if (in != null) {
                            in.close();
                        }
                    }
                }
            } finally {
                if (out != null) {
                    out.close();
                }
            }
        } catch (IOException e) {
            throw new IOFailure("Couldn't combine indexes for " + files.size() + " jobs into " + resultFile, e);
        }
    }

    /**
     * Requesting an index is not supported by this cache type.
     *
     * @param jobSet The set of job IDs an index is requested for (unused).
     * @param harvestId The ID of the harvest the request is made on behalf of (unused).
     * @throws NotImplementedException always.
     */
    @Override
    public void requestIndex(Set<Long> jobSet, Long harvestId) {
        throw new NotImplementedException("This feature is not implemented for this type of cache");
    }

}