001/*
002 * #%L
003 * Netarchivesuite - wayback
004 * %%
005 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
006 *             the National Library of France and the Austrian National Library.
007 * %%
008 * This program is free software: you can redistribute it and/or modify
009 * it under the terms of the GNU Lesser General Public License as
010 * published by the Free Software Foundation, either version 2.1 of the
011 * License, or (at your option) any later version.
012 * 
013 * This program is distributed in the hope that it will be useful,
014 * but WITHOUT ANY WARRANTY; without even the implied warranty of
015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
016 * GNU General Lesser Public License for more details.
017 * 
018 * You should have received a copy of the GNU General Lesser Public
019 * License along with this program.  If not, see
020 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
021 * #L%
022 */
023
024package dk.netarkivet.wayback.aggregator;
025
import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.LinkedList;
import java.util.List;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import dk.netarkivet.common.exceptions.ArgumentNotValid;
import dk.netarkivet.common.utils.Settings;
import dk.netarkivet.wayback.WaybackSettings;
036
037/**
038 * Encapsulates the functionality for sorting and merging index files. Uses the Unix sort cmd for optimized sorting and
039 * file merging. Operations in this class are synchronized to avoid multiple jobs running at the same time (by the same
040 * object at least).
041 */
042public class IndexAggregator {
043    /** The logger for this class. */
044    //private Log log = LogFactory.getLog(getClass().getName());
045    private final Logger log = LoggerFactory.getLogger(IndexAggregator.class);
046
047    /**
048     * Generates a sorted CDX index file based on the set of unsorted CDX input files.
049     * <p>
050     * The operation will not run on a folder which already has a process job running.
051     *
052     * @param files A list of the files to aggregate
053     * @param outputFile Name of the output file. In case of a empty filesNames array no outputFiles will be generated
054     */
055    public void sortAndMergeFiles(File[] files, File outputFile) {
056        processFiles(files, outputFile, null);
057    }
058
059    /**
060     * Takes a list of sorted files and merges them.
061     *
062     * @param files The files to merge.
063     * @param outputFile The resulting file containing total sorted set of index lines found in all the provided index
064     * files
065     */
066
067    public void mergeFiles(File[] files, File outputFile) {
068        List<String> args = new LinkedList<String>();
069        args.add("-m");
070        processFiles(files, outputFile, args);
071    }
072
073    /**
074     * Calls the Unix sort command with the options <code>$filesNames -o
075     * $outputfile -T WaybackSettings#WAYBACK_AGGREGATOR_TEMP_DIR.
076     * <p>
077     * Sets the LC_ALL environment variable before making the call.
078     *
079     * @param files The files to merge and sort
080     * @param outputFile The resulting sorted file
081     * @param additionalArgs A list af extra arguments, which (if different from null) are added to the sort call.<p>
082     * Note: If any of the args contain a whitespace the call will fail.
083     */
084    private void processFiles(File[] files, File outputFile, List<String> additionalArgs) {
085        if (files.length == 0) {
086            // Empty file list will cause sort to wait for further input,
087            // and the call will therefore never return
088            return;
089        }
090
091        Process p = null;
092
093        try {
094            List<String> inputFileList = new LinkedList<String>();
095            for (int i = 0; i < files.length; i++) {
096                if (files[i].exists() && files[i].isFile()) {
097                    inputFileList.add(files[i].getCanonicalPath());
098                } else {
099                    log.warn("File " + files[i] + " doesn't exist or isn't a regular file, "
100                            + "dropping from list of files to " + "sort and merge");
101                }
102            }
103            List<String> cmd = new LinkedList<String>();
104            // Prepare to run the unix sort command, see sort manual page for
105            // details
106            cmd.add("sort");
107            cmd.addAll(inputFileList);
108            cmd.add("-o");
109            cmd.add(outputFile.getCanonicalPath());
110            cmd.add("-T");
111            cmd.add(Settings.get(WaybackSettings.WAYBACK_AGGREGATOR_TEMP_DIR));
112            if (additionalArgs != null && !additionalArgs.isEmpty()) {
113                for (String argument : additionalArgs) {
114                    ArgumentNotValid.checkTrue(argument.indexOf(' ') == -1, "The argument '" + argument
115                            + "' contains spaces, this isn't allowed ");
116                }
117                cmd.addAll(additionalArgs);
118            }
119            ProcessBuilder pb = new ProcessBuilder(cmd);
120            // Reset all locale definitions
121            pb.environment().put("LC_ALL", "C");
122            // Run the command in the user.dir directory
123            pb.directory(new File(System.getProperty("user.dir")));
124            p = pb.start();
125            p.waitFor();
126            if (p.exitValue() != 0) {
127                log.error("Failed to sort index files, sort exited with " + "return code " + p.exitValue());
128            }
129        } catch (Exception e) {
130            log.error("Failed to aggregate indexes ", e);
131        }
132    }
133}