/*
 * #%L
 * Netarchivesuite - wayback
 * %%
 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
 * the National Library of France and the Austrian National Library.
 * %%
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation, either version 2.1 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Lesser Public License for more details.
 *
 * You should have received a copy of the GNU General Lesser Public
 * License along with this program. If not, see
 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
 * #L%
 */

package dk.netarkivet.wayback.aggregator;

import java.io.File;
import java.util.LinkedList;
import java.util.List;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import dk.netarkivet.common.exceptions.ArgumentNotValid;
import dk.netarkivet.common.utils.Settings;
import dk.netarkivet.wayback.WaybackSettings;

/**
 * Encapsulates the functionality for sorting and merging index files. Uses the Unix sort command for optimized
 * sorting and file merging.
 * <p>
 * NOTE(review): none of the methods in this class are declared <code>synchronized</code>, so this class does not by
 * itself prevent several sort jobs from running at the same time. Callers that need that guarantee must coordinate
 * externally.
 */
public class IndexAggregator {
    /** The logger for this class. */
    private static final Logger log = LoggerFactory.getLogger(IndexAggregator.class);

    /**
     * Generates a sorted CDX index file based on the set of unsorted CDX input files.
     * <p>
     * Failures are logged, not thrown; see {@link #processFiles(File[], File, List)}.
     *
     * @param files A list of the files to aggregate
     * @param outputFile Name of the output file. In case of an empty files array no output file will be generated
     */
    public void sortAndMergeFiles(File[] files, File outputFile) {
        processFiles(files, outputFile, null);
    }

    /**
     * Takes a list of sorted files and merges them (runs sort with the <code>-m</code> option).
     *
     * @param files The files to merge.
     * @param outputFile The resulting file containing total sorted set of index lines found in all the provided index
     * files
     */
    public void mergeFiles(File[] files, File outputFile) {
        List<String> args = new LinkedList<String>();
        args.add("-m");
        processFiles(files, outputFile, args);
    }

    /**
     * Calls the Unix sort command with the options <code>$filesNames -o $outputfile -T
     * WaybackSettings#WAYBACK_AGGREGATOR_TEMP_DIR</code>, followed by any additional arguments.
     * <p>
     * Sets the LC_ALL environment variable to <code>C</code> before making the call, and runs the command in the
     * <code>user.dir</code> directory.
     * <p>
     * Input files that do not exist or are not regular files are dropped with a warning. If no usable input file
     * remains, sort is not started at all, since sort without file operands would read from standard input and never
     * return. All failures (including a non-zero exit code from sort) are logged rather than thrown.
     *
     * @param files The files to merge and sort
     * @param outputFile The resulting sorted file
     * @param additionalArgs A list of extra arguments, which (if different from null) are added to the sort call.
     * <p>
     * Note: If any of the args contain a whitespace the call will fail.
     */
    private void processFiles(File[] files, File outputFile, List<String> additionalArgs) {
        if (files.length == 0) {
            // Empty file list will cause sort to wait for further input,
            // and the call will therefore never return
            return;
        }

        Process p = null;

        try {
            List<String> inputFileList = new LinkedList<String>();
            for (File file : files) {
                // isFile() is only true for an existing regular file
                if (file.isFile()) {
                    inputFileList.add(file.getCanonicalPath());
                } else {
                    log.warn("File {} doesn't exist or isn't a regular file, "
                            + "dropping from list of files to sort and merge", file);
                }
            }
            if (inputFileList.isEmpty()) {
                // Same hazard as the empty-array case above: without file operands sort
                // reads from standard input and waitFor() would block forever.
                log.warn("None of the {} given files is an existing regular file, skipping sort into {}",
                        files.length, outputFile);
                return;
            }

            List<String> cmd = new LinkedList<String>();
            // Prepare to run the unix sort command, see sort manual page for
            // details
            cmd.add("sort");
            cmd.addAll(inputFileList);
            cmd.add("-o");
            cmd.add(outputFile.getCanonicalPath());
            cmd.add("-T");
            cmd.add(Settings.get(WaybackSettings.WAYBACK_AGGREGATOR_TEMP_DIR));
            if (additionalArgs != null && !additionalArgs.isEmpty()) {
                for (String argument : additionalArgs) {
                    ArgumentNotValid.checkTrue(argument.indexOf(' ') == -1, "The argument '" + argument
                            + "' contains spaces, this isn't allowed ");
                }
                cmd.addAll(additionalArgs);
            }

            ProcessBuilder pb = new ProcessBuilder(cmd);
            // Reset all locale definitions
            pb.environment().put("LC_ALL", "C");
            // Run the command in the user.dir directory
            pb.directory(new File(System.getProperty("user.dir")));
            p = pb.start();
            int exitCode = p.waitFor();
            if (exitCode != 0) {
                log.error("Failed to sort index files, sort exited with return code {}", exitCode);
            }
        } catch (InterruptedException e) {
            // Restore the interrupt flag so callers further up can react to the interruption
            Thread.currentThread().interrupt();
            log.error("Failed to aggregate indexes ", e);
        } catch (Exception e) {
            log.error("Failed to aggregate indexes ", e);
        } finally {
            if (p != null) {
                // Stops sort if we were interrupted while waiting; for a process that has
                // already exited this is a no-op for the process itself.
                p.destroy();
            }
        }
    }
}