001/*
002 * #%L
003 * Netarchivesuite - harvester
004 * %%
005 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
006 *             the National Library of France and the Austrian National Library.
007 * %%
008 * This program is free software: you can redistribute it and/or modify
009 * it under the terms of the GNU Lesser General Public License as
010 * published by the Free Software Foundation, either version 2.1 of the
011 * License, or (at your option) any later version.
012 * 
013 * This program is distributed in the hope that it will be useful,
014 * but WITHOUT ANY WARRANTY; without even the implied warranty of
015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
016 * GNU General Lesser Public License for more details.
017 * 
018 * You should have received a copy of the GNU General Lesser Public
019 * License along with this program.  If not, see
020 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
021 * #L%
022 */
023
024package dk.netarkivet.harvester.indexserver;
025
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.nio.charset.Charset;
import java.util.Collection;
import java.util.Map;
import java.util.Set;

import dk.netarkivet.common.distribute.indexserver.JobIndexCache;
import dk.netarkivet.common.exceptions.IOFailure;
import dk.netarkivet.common.exceptions.NotImplementedException;
import dk.netarkivet.common.utils.FileUtils;
040
041/**
042 * A cache that serves CDX index files for job IDs.
043 * <p>
044 * Notice that since data for some IDs may not be available, the actual cached file might not correspond in its content
045 * to what was asked for. For instance, if asking for data for IDs 2, 3, and 4, and 3 fails, a cached file for IDs 2 and
046 * 4 will be returned. There is currently no way to tell if you got everything you asked for.
047 * <p>
048 * This cache uses the Unix sort(1) command as an external process call, as that one is optimized for handling large,
049 * disk-based sorts.
050 */
051public class CDXIndexCache extends CombiningMultiFileBasedCache<Long> implements JobIndexCache {
052
053    /** A suffix used by the sortFile method in the sorting process. */
054    private static final String WORK_SUFFIX = ".unsorted";
055
056    /**
057     * Creates a new cache for CDX index files.
058     */
059    public CDXIndexCache() {
060        super("cdxindex", new CDXDataCache());
061    }
062
063    /**
064     * Combine parts of an index into one big index.
065     *
066     * @param filesFound A map of IDs and the files caching their content.
067     */
068    protected void combine(Map<Long, File> filesFound) {
069        File resultFile = getCacheFile(filesFound.keySet());
070        concatenateFiles(filesFound.values(), resultFile);
071        File workFile = new File(resultFile.getAbsolutePath() + WORK_SUFFIX);
072        workFile.deleteOnExit();
073        try {
074            FileUtils.sortCDX(resultFile, workFile);
075            workFile.renameTo(resultFile);
076        } finally {
077            FileUtils.remove(workFile);
078        }
079    }
080
081    /**
082     * Concatenate a set of files into a single file.
083     *
084     * @param files The files to concatenate.
085     * @param resultFile The file where the files are concatenated into.
086     */
087    private static void concatenateFiles(Collection<File> files, File resultFile) {
088        try {
089            BufferedWriter out = null;
090            try {
091                out = new BufferedWriter(new FileWriter(resultFile));
092                for (File f : files) {
093                    BufferedReader in = null;
094                    try {
095                        in = new BufferedReader(new FileReader(f));
096                        String s;
097                        while ((s = in.readLine()) != null) {
098                            out.write(s);
099                            out.newLine();
100                        }
101                    } finally {
102                        if (in != null) {
103                            in.close();
104                        }
105                    }
106                }
107            } finally {
108                if (out != null) {
109                    out.close();
110                }
111            }
112        } catch (IOException e) {
113            throw new IOFailure("Couldn't combine indexes for " + files.size() + " jobs into " + resultFile, e);
114        }
115    }
116
117    @Override
118    public void requestIndex(Set<Long> jobSet, Long harvestId) {
119        throw new NotImplementedException("This feature is not implemented for this type of cache");
120    }
121
122}