001/*
002 * #%L
003 * Netarchivesuite - common
004 * %%
005 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
006 *             the National Library of France and the Austrian National Library.
007 * %%
008 * This program is free software: you can redistribute it and/or modify
009 * it under the terms of the GNU Lesser General Public License as
010 * published by the Free Software Foundation, either version 2.1 of the
011 * License, or (at your option) any later version.
012 * 
013 * This program is distributed in the hope that it will be useful,
014 * but WITHOUT ANY WARRANTY; without even the implied warranty of
015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
016 * GNU General Lesser Public License for more details.
017 * 
018 * You should have received a copy of the GNU General Lesser Public
019 * License along with this program.  If not, see
020 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
021 * #L%
022 */
023package dk.netarkivet.common.utils;
024
025import java.io.BufferedReader;
026import java.io.File;
027import java.io.FileInputStream;
028import java.io.FileOutputStream;
029import java.io.FileReader;
030import java.io.FileWriter;
031import java.io.FilenameFilter;
032import java.io.IOException;
033import java.io.InputStream;
034import java.io.OutputStream;
035import java.io.PrintWriter;
036import java.io.RandomAccessFile;
037import java.nio.channels.FileChannel;
038import java.text.DecimalFormat;
039import java.text.NumberFormat;
040import java.util.ArrayList;
041import java.util.Collection;
042import java.util.Collections;
043import java.util.List;
044import java.util.Set;
045
046import org.slf4j.Logger;
047import org.slf4j.LoggerFactory;
048
049import dk.netarkivet.common.CommonSettings;
050import dk.netarkivet.common.Constants;
051import dk.netarkivet.common.exceptions.ArgumentNotValid;
052import dk.netarkivet.common.exceptions.IOFailure;
053import dk.netarkivet.common.exceptions.PermissionDenied;
054import dk.netarkivet.common.exceptions.UnknownID;
055
056/**
057 * Misc. handy file utilities.
058 */
059public class FileUtils {
060
    /** The logger for this class. */
    private static final Logger log = LoggerFactory.getLogger(FileUtils.class);

    /** Extension used for CDX files, including the leading '.' separator. */
    public static final String CDX_EXTENSION = ".cdx";

    /** Extension used for ARC files, including the leading '.' separator. */
    public static final String ARC_EXTENSION = ".arc";

    /** Extension used for gzipped ARC files, including the leading '.' separator. */
    public static final String ARC_GZIPPED_EXTENSION = ".arc.gz";

    /** Extension used for WARC files, including the leading '.' separator. */
    public static final String WARC_EXTENSION = ".warc";

    /** Extension used for gzipped WARC files, including the leading '.' separator. */
    public static final String WARC_GZIPPED_EXTENSION = ".warc.gz";

    /**
     * Regular-expression fragment matching the extension of ARC files, including the '.' separator. (?i) switches on
     * case-insensitive matching, (\\.gz)? makes a trailing .gz optional, and $ anchors the match at the end of the
     * input. The fragment only describes the end of a name, so it has to be prefixed with ".*" when used with
     * String.matches (as the filters in this class do). With that prefix it matches file.arc.gz, file.ARC and
     * file.aRc.GZ, but not file.ARC.open.
     */
    public static final String ARC_PATTERN = "(?i)\\.arc(\\.gz)?$";

    /**
     * Regular-expression fragment matching the extension of open ARC files, including the '.' separator. (?i)
     * switches on case-insensitive matching, (\\.gz)? makes .gz optional, and $ anchors the match at the end of the
     * input. Prefixed with ".*" it matches file.arc.gz.open, file.ARC.open and file.arc.GZ.OpEn, but not
     * file.ARC.open.txt.
     */
    public static final String OPEN_ARC_PATTERN = "(?i)\\.arc(\\.gz)?\\.open$";

    /**
     * Regular-expression fragment matching the extension of WARC files, including the '.' separator. (?i) switches
     * on case-insensitive matching, (\\.gz)? makes a trailing .gz optional, and $ anchors the match at the end of
     * the input. Prefixed with ".*" it matches file.warc.gz, file.WARC and file.WaRc.GZ, but not file.WARC.open.
     */
    public static final String WARC_PATTERN = "(?i)\\.warc(\\.gz)?$";

    /**
     * Regular-expression fragment matching the extension of open WARC files, including the '.' separator. (?i)
     * switches on case-insensitive matching, (\\.gz)? makes .gz optional, and $ anchors the match at the end of the
     * input. Prefixed with ".*" it matches file.warc.gz.open, file.WARC.open and file.warc.GZ.OpEn, but not
     * file.wARC.open.txt.
     */
    public static final String OPEN_WARC_PATTERN = "(?i)\\.warc(\\.gz)?\\.open$";

    /**
     * Regular-expression fragment matching the extension of both WARC and ARC files, including the '.' separator.
     * (?i) switches on case-insensitive matching, (w)? makes the leading 'w' optional, (\\.gz)? makes a trailing .gz
     * optional, and $ anchors the match at the end of the input. Prefixed with ".*" it matches file.warc.gz,
     * file.WARC, file.WaRc.GZ, file.arc.gz, file.ARC and file.aRc.GZ, but not file.WARC.open or file.ARC.open.
     */
    public static final String WARC_ARC_PATTERN = "(?i)\\.(w)?arc(\\.gz)?$";
113
114    /**
115     * A FilenameFilter accepting a file if and only if its name (transformed to lower case) ends on ".cdx".
116     */
117    public static final FilenameFilter CDX_FILE_FILTER = new FilenameFilter() {
118        public boolean accept(File directory, String filename) {
119            return filename.toLowerCase().endsWith(CDX_EXTENSION);
120        }
121    };
122
123    /**
124     * A filter that matches files left open by a crashed Heritrix process. Don't work on these files while Heritrix is
125     * still working on them.
126     */
127    public static final FilenameFilter OPEN_ARCS_FILTER = new FilenameFilter() {
128        public boolean accept(File dir, String name) {
129            return name.matches(".*" + OPEN_ARC_PATTERN);
130        }
131    };
132
133    /**
134     * A filter that matches warcfiles left open by a crashed Heritrix process. Don't work on these files while Heritrix
135     * is still working on them.
136     */
137    public static final FilenameFilter OPEN_WARCS_FILTER = new FilenameFilter() {
138        public boolean accept(File dir, String name) {
139            return name.matches(".*" + OPEN_WARC_PATTERN);
140        }
141    };
142
143    /**
144     * A filter that matches arc files, that is any file that ends on .arc or .arc.gz in any case.
145     */
146    public static final FilenameFilter ARCS_FILTER = new FilenameFilter() {
147        public boolean accept(File directory, String filename) {
148            return filename.toLowerCase().matches(".*" + ARC_PATTERN);
149        }
150    };
151
152    /**
153     * A filter that matches warc files, that is any file that ends on .warc or .warc.gz in any case.
154     */
155    public static final FilenameFilter WARCS_FILTER = new FilenameFilter() {
156        public boolean accept(File directory, String filename) {
157            return filename.toLowerCase().matches(".*" + WARC_PATTERN);
158        }
159    };
160
161    /**
162     * A filter that matches warc and arc files, that is any file that ends on .warc, .warc.gz, .arc or .arc.gz in any
163     * case.
164     */
165    public static final FilenameFilter WARCS_ARCS_FILTER = new FilenameFilter() {
166        public boolean accept(File directory, String filename) {
167            return filename.toLowerCase().matches(".*" + WARC_ARC_PATTERN);
168        }
169    };
170
    /** How many times we will retry making a unique directory name. */
    private static final int MAX_RETRIES = 10;

    /** Upper bound on the number of mkdirs() attempts made by {@link #createDir(File)}. */
    private static final int CREATE_DIR_RETRIES = 3;
    /**
     * Maximum number of IDs we will put in a filename. Above this number, a checksum of the IDs is generated instead.
     * This protects us from producing filenames that are too long for the filesystem.
     */
    public static final int MAX_IDS_IN_FILENAME = 4;
181
182    /**
183     * Remove a file and any subfiles in case of directories.
184     *
185     * @param f A file to completely and utterly remove.
186     * @return true if the file did exist, false otherwise.
187     * @throws SecurityException If a security manager exists and its <code>{@link
188     * java.lang.SecurityManager#checkDelete}</code> method denies delete access to the file
189     */
190    public static boolean removeRecursively(File f) {
191        ArgumentNotValid.checkNotNull(f, "File f");
192        if (!f.exists()) {
193            return false;
194        }
195
196        // If the file is a directory, delete all files in this directory,
197        // and its subdirectories
198        if (f.isDirectory()) {
199            File[] subfiles = f.listFiles();
200
201            if (subfiles != null) { // Can be null in case of error
202                for (File subfile : subfiles) {
203                    removeRecursively(subfile);
204                }
205            }
206        }
207        if (!f.delete()) {
208            boolean isDir = f.isDirectory();
209            if (!isDir) {
210                log.debug("Try once more deleting file '{}", f.getAbsolutePath());
211                final boolean success = remove(f);
212                if (!success) {
213                    log.warn("Unable to remove file: '{}'", f.getAbsolutePath());
214                    return false;
215                }
216            } else {
217                log.warn("Problem with deletion of directory: '{}'.", f.getAbsolutePath());
218                return false;
219            }
220        }
221
222        return true;
223    }
224
225    /**
226     * Remove a file.
227     *
228     * @param f A file to completely and utterly remove.
229     * @return true if the file did exist, false otherwise.
230     * @throws ArgumentNotValid if f is null.
231     * @throws SecurityException If a security manager exists and its <code>{@link
232     * java.lang.SecurityManager#checkDelete}</code> method denies delete access to the file
233     */
234    public static boolean remove(File f) {
235        ArgumentNotValid.checkNotNull(f, "f");
236        if (!f.exists()) {
237            return false;
238        }
239        if (f.isDirectory()) {
240            return false; // Do not attempt to delete a directory
241        }
242        if (!f.delete()) {
243            // Hack to remove file on windows! Works only sometimes!
244            File delFile = new File(f.getAbsolutePath());
245            delFile.delete();
246            if (delFile.exists()) {
247                log.warn("Unable to remove file '{}'.", f.getAbsolutePath());
248                return false;
249            }
250        }
251
252        return true;
253    }
254
255    /**
256     * Returns a valid filename for most filesystems. Exchanges the following characters:
257     * <p/>
258     * " " -> "_" ":" -> "_" "+" -> "_"
259     *
260     * @param filename the filename to format correctly
261     * @return a new formatted filename
262     */
263    public static String formatFilename(String filename) {
264        ArgumentNotValid.checkNotNullOrEmpty(filename, "String filename");
265        String formattedFilename = filename;
266
267        // remove spaces
268        formattedFilename = formattedFilename.replace(' ', '_');
269
270        // remove colons
271        formattedFilename = formattedFilename.replace(':', '_');
272
273        // remove add sign
274        formattedFilename = formattedFilename.replace('+', '_');
275
276        return formattedFilename;
277    }
278
279    /**
280     * Retrieves all files whose names ends with 'type' from directory 'dir' and all its subdirectories.
281     *
282     * @param dir Path of base directory
283     * @param files Initially, an empty list (e.g. an ArrayList)
284     * @param type The extension/ending of the files to retrieve (e.g. ".xml", ".ARC")
285     * @return A list of files from directory 'dir' and all its subdirectories
286     */
287    public static List<File> getFilesRecursively(String dir, List<File> files, String type) {
288        ArgumentNotValid.checkNotNullOrEmpty(dir, "String dir");
289        File theDirectory = new File(dir);
290        ArgumentNotValid.checkTrue(theDirectory.isDirectory(), "File '" + theDirectory.getAbsolutePath()
291                + "' does not represent a directory");
292        ArgumentNotValid.checkNotNull(files, "files");
293        ArgumentNotValid.checkNotNull(type, "type");
294
295        File[] top = new File(dir).listFiles();
296        for (File aTop : top) {
297            if (aTop.isDirectory()) {
298                getFilesRecursively(aTop.getAbsolutePath(), files, type);
299            } else if (aTop.isFile() && aTop.getName().endsWith(type)) {
300                files.add(aTop);
301            }
302        }
303
304        return files;
305    }
306
307    /**
308     * Load file content into text string.
309     *
310     * @param file The file to load
311     * @return file content loaded into text string
312     * @throws java.io.IOException If any IO trouble occurs while reading the file, or the file cannot be found.
313     */
314    public static String readFile(File file) throws IOException {
315        ArgumentNotValid.checkNotNull(file, "File file");
316        StringBuffer sb = new StringBuffer();
317
318        BufferedReader br = new BufferedReader(new FileReader(file));
319
320        try {
321            int i;
322
323            while ((i = br.read()) != -1) {
324                sb.append((char) i);
325            }
326        } finally {
327            br.close();
328        }
329
330        return sb.toString();
331    }
332
333    /**
334     * Copy file from one location to another. Will silently overwrite an already existing file.
335     *
336     * @param from original to copy
337     * @param to destination of copy
338     * @throws IOFailure if an io error occurs while copying file, or the original file does not exist.
339     */
340    public static void copyFile(File from, File to) {
341        ArgumentNotValid.checkNotNull(from, "File from");
342        ArgumentNotValid.checkNotNull(to, "File to");
343        if (!from.exists()) {
344            String errMsg = "Original file '" + from.getAbsolutePath() + "' does not exist";
345            log.warn(errMsg);
346            throw new IOFailure(errMsg);
347        }
348        try {
349            FileInputStream inStream = null;
350            FileOutputStream outStream = null;
351            FileChannel in = null;
352            FileChannel out = null;
353            try {
354                inStream = new FileInputStream(from);
355                outStream = new FileOutputStream(to);
356                in = inStream.getChannel();
357                out = outStream.getChannel();
358                long bytesTransferred = 0;
359                do {
360                    // Note: in.size() is called every loop, because if it should
361                    // change size, we might end up in an infinite loop trying to
362                    // copy more bytes than are actually available.
363                    bytesTransferred += in.transferTo(bytesTransferred,
364                            Math.min(Constants.IO_CHUNK_SIZE, in.size() - bytesTransferred), out);
365                } while (bytesTransferred < in.size());
366            } finally {
367                if (inStream != null) {
368                    inStream.close();
369                }
370                if (outStream != null) {
371                    outStream.close();
372                }
373                if (in != null) {
374                    in.close();
375                }
376                if (out != null) {
377                    out.close();
378                }
379            }
380        } catch (IOException e) {
381            final String errMsg = "Error copying file '" + from.getAbsolutePath() + "' to '" + to.getAbsolutePath()
382                    + "'";
383            log.warn(errMsg, e);
384            throw new IOFailure(errMsg, e);
385        }
386    }
387
388    /**
389     * Copy an entire directory from one location to another. Note that this will silently overwrite old files, just
390     * like copyFile().
391     *
392     * @param from Original directory (or file, for that matter) to copy.
393     * @param to Destination directory, i.e. the 'new name' of the copy of the from directory.
394     * @throws IOFailure On IO trouble copying files.
395     */
396    public static void copyDirectory(File from, File to) throws IOFailure {
397        ArgumentNotValid.checkNotNull(from, "File from");
398        ArgumentNotValid.checkNotNull(to, "File to");
399        String errMsg;
400        if (from.isFile()) {
401            try {
402                copyFile(from, to);
403            } catch (Exception e) {
404                errMsg = "Error copying from file '" + from.getAbsolutePath() + "' to file '" + to.getAbsolutePath()
405                        + "'.";
406                log.warn(errMsg, e);
407                throw new IOFailure(errMsg, e);
408            }
409        } else {
410            if (!from.exists()) {
411                errMsg = "Can't find directory '" + from.getAbsolutePath() + "'.";
412                log.warn(errMsg);
413                throw new IOFailure(errMsg);
414            }
415
416            if (!from.isDirectory()) {
417                errMsg = "File '" + from.getAbsolutePath() + "' is not a directory";
418                log.warn(errMsg);
419                throw new IOFailure(errMsg);
420            }
421
422            to.mkdir();
423
424            if (!to.exists()) {
425                errMsg = "Failed to create destination directory '" + to.getAbsolutePath() + "'.";
426                log.warn(errMsg);
427                throw new IOFailure(errMsg);
428            }
429
430            File[] subfiles = from.listFiles();
431
432            for (File subfile : subfiles) {
433                copyDirectory(subfile, new File(to, subfile.getName()));
434            }
435        }
436    }
437
438    /**
439     * Read an entire file, byte by byte, into a byte array, ignoring any locale issues.
440     *
441     * @param file A file to be read.
442     * @return A byte array with the contents of the file.
443     * @throws IOFailure on IO trouble reading the file, or the file does not exist
444     * @throws IndexOutOfBoundsException If the file is too large to be in an array.
445     */
446    public static byte[] readBinaryFile(File file) throws IOFailure, IndexOutOfBoundsException {
447        ArgumentNotValid.checkNotNull(file, "File file");
448        if (!file.exists()) {
449            String errMsg = "File '" + file.getAbsolutePath() + "' does not exist";
450            log.warn(errMsg);
451            throw new IOFailure(errMsg);
452        }
453
454        String errMsg;
455        if (file.length() > Integer.MAX_VALUE) {
456            errMsg = "File '" + file.getAbsolutePath() + "' of size " + file.length()
457                    + " (bytes) is too long to fit in an array";
458            log.warn(errMsg);
459            throw new IndexOutOfBoundsException(errMsg);
460        }
461
462        byte[] result = new byte[(int) file.length()];
463        FileInputStream in = null;
464        try {
465            try {
466                in = new FileInputStream(file);
467                int bytesRead;
468                for (int i = 0; i < result.length && (bytesRead = in.read(result, i, result.length - i)) != -1; i += bytesRead) {
469                }
470            } finally {
471                if (in != null) {
472                    in.close();
473                }
474            }
475        } catch (IOException e) {
476            errMsg = "Error reading file '" + file.getAbsolutePath() + "'";
477            log.warn(errMsg);
478            throw new IOFailure(errMsg, e);
479        }
480
481        return result;
482    }
483
484    /**
485     * Write an entire byte array to a file, ignoring any locale issues.
486     *
487     * @param file The file to write the data to
488     * @param b The byte array to write to the file
489     * @throws IOFailure If an exception occurs during the writing.
490     */
491    public static void writeBinaryFile(File file, byte[] b) {
492        ArgumentNotValid.checkNotNull(file, "File file");
493        ArgumentNotValid.checkNotNull(b, "byte[] b");
494        FileOutputStream out = null;
495        try {
496            try {
497                out = new FileOutputStream(file);
498                out.write(b);
499            } finally {
500                if (out != null) {
501                    out.close();
502                }
503            }
504        } catch (Exception e) {
505            final String errMsg = "writeBinaryFile exception";
506            log.warn(errMsg, e);
507            throw new IOFailure(errMsg, e);
508        }
509    }
510
511    /**
512     * Return a filter that only accepts XML files (ending with .xml), irrespective of their location.
513     *
514     * @return A new filter for XML files.
515     */
516    public static FilenameFilter getXmlFilesFilter() {
517        return new FilenameFilter() {
518            /**
519             * Tests if a specified file should be included in a file list.
520             *
521             * @param dir the directory in which the file was found. Unused in this implementation of accept.
522             * @param name the name of the file.
523             * @return <code>true</code> if and only if the name should be included in the file list; <code>false</code>
524             * otherwise.
525             * @see FilenameFilter#accept(java.io.File, java.lang.String)
526             */
527            public boolean accept(File dir, String name) {
528                return name.endsWith(Constants.XML_EXTENSION);
529            }
530        };
531    }
532
533    /**
534     * Read all lines from a file into a list of strings.
535     *
536     * @param file The file to read from.
537     * @return The list of lines.
538     * @throws IOFailure on trouble reading the file, or if the file does not exist
539     */
540    public static List<String> readListFromFile(File file) {
541        ArgumentNotValid.checkNotNull(file, "File file");
542        if (!file.exists()) {
543            String errMsg = "File '" + file.getAbsolutePath() + "' does not exist";
544            log.warn(errMsg);
545            throw new IOFailure(errMsg);
546        }
547        List<String> lines = new ArrayList<String>();
548        BufferedReader in = null;
549        try {
550            try {
551                in = new BufferedReader(new FileReader(file));
552                String line;
553                while ((line = in.readLine()) != null) {
554                    lines.add(line);
555                }
556            } finally {
557                if (in != null) {
558                    in.close();
559                }
560            }
561        } catch (IOException e) {
562            String msg = "Could not read data from " + file.getAbsolutePath();
563            log.warn(msg, e);
564            throw new IOFailure(msg, e);
565        }
566        return lines;
567    }
568
569    /**
570     * Writes a collection of strings to a file, each string on one line.
571     *
572     * @param file A file to write to. The contents of this file will be overwritten.
573     * @param collection The collection to write. The order it will be written in is unspecified.
574     * @throws IOFailure if any error occurs writing to the file.
575     * @throws ArgumentNotValid if file or collection is null.
576     */
577    public static void writeCollectionToFile(File file, Collection<String> collection) {
578        ArgumentNotValid.checkNotNull(file, "file");
579        ArgumentNotValid.checkNotNull(collection, "collection");
580        try {
581            PrintWriter writer = null;
582            try {
583                writer = new PrintWriter(new FileWriter(file));
584                for (String fileName : collection) {
585                    writer.println(fileName);
586                }
587                writer.flush();
588            } finally {
589                if (writer != null) {
590                    writer.close();
591                }
592            }
593        } catch (IOException e) {
594            String msg = "Error writing collection to file '" + file.getAbsolutePath() + "'";
595            log.warn(msg, e);
596            throw new IOFailure(msg, e);
597        }
598    }
599
600    /**
601     * Sort a file into another. The current implementation slurps all lines into memory. This will not scale forever.
602     *
603     * @param unsortedFile A file to sort
604     * @param sortedOutput The file to sort into
605     */
606    public static void makeSortedFile(File unsortedFile, File sortedOutput) {
607        ArgumentNotValid.checkNotNull(unsortedFile, "File unsortedFile");
608        ArgumentNotValid.checkNotNull(sortedOutput, "File sortedOutput");
609        List<String> lines;
610        lines = readListFromFile(unsortedFile);
611        Collections.sort(lines);
612        writeCollectionToFile(sortedOutput, lines);
613    }
614
615    /**
616     * Remove a line from a given file.
617     *
618     * @param line The full line to remove
619     * @param file The file to remove the line from. This file will be rewritten in full, and the entire contents will
620     * be kept in memory
621     * @throws UnknownID If the file does not exist
622     */
623    public static void removeLineFromFile(String line, File file) {
624        ArgumentNotValid.checkNotNull(line, "String line");
625        ArgumentNotValid.checkNotNull(file, "File file");
626        if (!file.exists()) {
627            String errMsg = "The file '" + file.getAbsolutePath() + "' does not exist.";
628            log.warn(errMsg);
629            throw new UnknownID(errMsg);
630        }
631
632        List<String> lines = readListFromFile(file);
633        lines.remove(line);
634        writeCollectionToFile(file, lines);
635    }
636
637    /**
638     * Check if the directory exists and is writable and create it if needed. The complete path down to the directory is
639     * created. If the directory creation fails a PermissionDenied exception is thrown.
640     *
641     * @param dir The directory to create
642     * @return true if dir created.
643     * @throws ArgumentNotValid If dir is null or its name is the empty string
644     * @throws PermissionDenied If directory cannot be created for any reason, or is not writable.
645     */
646    public static boolean createDir(File dir) throws PermissionDenied {
647        ArgumentNotValid.checkNotNull(dir, "File dir");
648        ArgumentNotValid.checkNotNullOrEmpty(dir.getName(), "File dir");
649        boolean didCreate = false;
650        if (!dir.exists()) {
651            didCreate = true;
652            int i = 0;
653            // retrying creation due to sun bug (race condition)
654            // See http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=4742723
655            while ((i++ < CREATE_DIR_RETRIES) && !(dir.isDirectory() && dir.canWrite())) {
656                dir.mkdirs();
657            }
658            if (!(dir.isDirectory() && dir.canWrite())) {
659                String msg = "Could not create directory '" + dir.getAbsolutePath() + "'";
660                log.warn(msg);
661                throw new PermissionDenied(msg);
662            }
663        } else {
664            if (!dir.isDirectory()) {
665                String msg = "Cannot make directory '" + dir.getAbsolutePath() + "' - a file is in the way";
666                log.warn(msg);
667                throw new PermissionDenied(msg);
668            }
669        }
670        if (!dir.canWrite()) {
671            String msg = "Cannot write to required directory '" + dir.getAbsolutePath() + "'";
672            log.warn(msg);
673            throw new PermissionDenied(msg);
674        }
675        return didCreate;
676    }
677
678    /**
679     * Returns the number of bytes free on the file system calling the FreeSpaceProvider class defined by the setting
680     * CommonSettings.FREESPACE_PROVIDER_CLASS (a.k.a. settings.common.freespaceprovider.class)
681     *
682     * @param f a given file
683     * @return the number of bytes free defined in the settings.xml
684     */
685    public static long getBytesFree(File f) {
686        return FreeSpaceProviderFactory.getInstance().getBytesFree(f);
687    }
688
689    /**
690     * @param theFile A file to make relative
691     * @param theDir A directory
692     * @return the filepath of the theFile relative to theDir. null, if theFile is not relative to theDir. null, if
693     * theDir is not a directory.
694     */
695    public static String relativeTo(File theFile, File theDir) {
696        ArgumentNotValid.checkNotNull(theFile, "File theFile");
697        ArgumentNotValid.checkNotNull(theDir, "File theDir");
698        if (!theDir.isDirectory()) {
699            log.trace("The File '{}' does not represent a directory. Null returned", theDir.getAbsolutePath());
700            return null;
701        }
702
703        List<String> filePathList = new ArrayList<String>();
704        List<String> theDirPath = new ArrayList<String>();
705        File tempFile = theFile.getAbsoluteFile();
706
707        filePathList.add(tempFile.getName());
708        while ((tempFile = tempFile.getParentFile()) != null) {
709            filePathList.add(tempFile.getName());
710        }
711
712        tempFile = theDir.getAbsoluteFile();
713        theDirPath.add(tempFile.getName());
714        while ((tempFile = tempFile.getParentFile()) != null) {
715            theDirPath.add(tempFile.getName());
716        }
717
718        // check, at the path prefix is the same
719        List<String> sublist = filePathList.subList(theDirPath.size() - 2, filePathList.size());
720        if (!theDirPath.equals(sublist)) {
721            log.trace("The file '{}' is not relative to the directory '{}'. Null returned", theFile.getAbsolutePath(),
722                    theDir.getAbsolutePath());
723            return null;
724        }
725
726        List<String> relativeList = filePathList.subList(0, theDirPath.size() - 2);
727
728        StringBuffer sb = new StringBuffer();
729        Collections.reverse(relativeList);
730        for (String aRelativeList : relativeList) {
731            sb.append(aRelativeList);
732            sb.append(File.separatorChar);
733        }
734        sb.deleteCharAt(sb.length() - 1); // remove last separatorChar
735        return sb.toString();
736    }
737
738    /**
739     * Count the number of lines in a file.
740     *
741     * @param file the file to read
742     * @return the number of lines in the file
743     * @throws IOFailure If an error occurred while reading the file
744     */
745    public static long countLines(File file) {
746        ArgumentNotValid.checkNotNull(file, "file");
747        BufferedReader in = null;
748        long count = 0;
749        try {
750            try {
751                in = new BufferedReader(new FileReader(file));
752                while (in.readLine() != null) {
753                    ++count;
754                }
755            } finally {
756                if (in != null) {
757                    in.close();
758                }
759            }
760        } catch (IOException e) {
761            String msg = "Could not check number of lines in '" + file.getAbsolutePath() + "'";
762            log.warn(msg, e);
763            throw new IOFailure(msg, e);
764        }
765        return count;
766    }
767
768    /**
769     * Create an InputStream that reads from a file but removes the file when all data has been read.
770     *
771     * @param file A file to read. This file will be deleted when the inputstream is closed, finalized, reaches
772     * end-of-file, or when the VM closes.
773     * @return An InputStream containing the file's contents.
774     * @throws IOFailure If an error occurs in creating the ephemeral input stream
775     */
776    public static InputStream getEphemeralInputStream(final File file) {
777        ArgumentNotValid.checkNotNull(file, "file");
778        // First make sure we remove the file if the VM dies
779        file.deleteOnExit();
780        try {
781            // Then create an input stream that deletes the file upon exit.
782            // Note that FileInputStream.finalize calls close().
783            return new FileInputStream(file) {
784                public void close() throws IOException {
785                    super.close();
786                    file.delete();
787                }
788            };
789        } catch (IOException e) {
790            String msg = "Error creating ephemeral input stream for " + file;
791            log.warn(msg, e);
792            throw new IOFailure(msg, e);
793        }
794    }
795
796    /**
797     * Makes a valid file from filename passed in String. Ensures that the File object returned is not null, and that
798     * isFile() returns true.
799     *
800     * @param filename The file to create the File object from
801     * @return A valid, non-null File object.
802     * @throws IOFailure if file cannot be created.
803     */
804    public static File makeValidFileFromExisting(String filename) throws IOFailure {
805        ArgumentNotValid.checkNotNullOrEmpty(filename, "String filename");
806
807        File res = new File(filename);
808        if (!res.isFile()) {
809            String errMsg = "Error: File object created from filename '" + filename
810                    + "' is not a proper file, isFile() failed.";
811            log.warn(errMsg);
812            throw new IOFailure(errMsg);
813        }
814        return res;
815    }
816
817    /**
818     * Write the entire contents of a file to a stream.
819     *
820     * @param f A file to write to the stream.
821     * @param out The stream to write to.
822     * @throws IOFailure If any error occurs while writing the file to a stream
823     */
824    public static void writeFileToStream(File f, OutputStream out) {
825        ArgumentNotValid.checkNotNull(f, "File f");
826        ArgumentNotValid.checkNotNull(out, "OutputStream out");
827
828        byte[] buffer = new byte[Constants.IO_BUFFER_SIZE];
829        try {
830            FileInputStream in = new FileInputStream(f);
831            try {
832                int bytesRead;
833                while ((bytesRead = in.read(buffer)) > 0) {
834                    out.write(buffer, 0, bytesRead);
835                }
836            } finally {
837                in.close();
838            }
839        } catch (IOException e) {
840            final String errMsg = "Error writing file '" + f.getAbsolutePath() + "' to stream";
841            log.warn(errMsg, e);
842            throw new IOFailure(errMsg, e);
843        }
844    }
845
846    /**
847     * Write the contents of a stream into a file.
848     *
849     * @param in A stream to read from. This stream is not closed by this method.
850     * @param f The file to write the stream contents into.
851     * @throws IOFailure If any error occurs while writing the stream to a file
852     */
853    public static void writeStreamToFile(InputStream in, File f) {
854        ArgumentNotValid.checkNotNull(f, "File f");
855        ArgumentNotValid.checkNotNull(in, "InputStream in");
856
857        byte[] buffer = new byte[Constants.IO_BUFFER_SIZE];
858        try {
859            FileOutputStream out = new FileOutputStream(f);
860            try {
861                int bytesRead;
862                while ((bytesRead = in.read(buffer)) > 0) {
863                    out.write(buffer, 0, bytesRead);
864                }
865            } finally {
866                out.close();
867            }
868        } catch (IOException e) {
869            final String errMsg = "Error writing stream to file '" + f.getAbsolutePath() + "'.";
870            log.warn(errMsg, e);
871            throw new IOFailure(errMsg, e);
872
873        }
874    }
875
876    /**
877     * Get the location of the standard temporary directory. The existence of this directory should be ensure at the
878     * start of every application.
879     *
880     * @return The directory that should be used for temporary files.
881     */
882    public static File getTempDir() {
883        return new File(Settings.get(CommonSettings.DIR_COMMONTEMPDIR));
884    }
885
886    /**
887     * Attempt to move a file using rename, and if that fails, move the file by copy-and-delete.
888     *
889     * @param fromFile The source
890     * @param toFile The target
891     */
892    public static void moveFile(File fromFile, File toFile) {
893        ArgumentNotValid.checkNotNull(fromFile, "File fromFile");
894        ArgumentNotValid.checkNotNull(toFile, "File toFile");
895
896        if (!fromFile.renameTo(toFile)) {
897            copyFile(fromFile, toFile);
898            remove(fromFile);
899        }
900    }
901
902    /**
903     * Given a set, generate a reasonable file name from the set.
904     *
905     * @param <T> The type of objects, that the Set IDs argument contains.
906     * @param IDs A set of IDs.
907     * @param suffix A suffix. May be empty string.
908     * @return A reasonable file name.
909     */
910    public static <T extends Comparable<T>> String generateFileNameFromSet(Set<T> IDs, String suffix) {
911        ArgumentNotValid.checkNotNull(IDs, "Set<T> IDs");
912        ArgumentNotValid.checkNotNull(suffix, "String suffix");
913
914        if (IDs.isEmpty()) {
915            return "empty" + suffix;
916        }
917
918        List<T> sorted = new ArrayList<T>(IDs);
919        Collections.sort(sorted);
920
921        String allIDsString = StringUtils.conjoin("-", sorted);
922        String fileName;
923        if (sorted.size() > MAX_IDS_IN_FILENAME) {
924            String firstNIDs = StringUtils.conjoin("-", sorted.subList(0, MAX_IDS_IN_FILENAME));
925            fileName = firstNIDs + "-" + ChecksumCalculator.calculateMd5(allIDsString.getBytes()) + suffix;
926        } else {
927            fileName = allIDsString + suffix;
928        }
929        return fileName;
930    }
931
932    /**
933     * Sort a crawl.log file according to the url.
934     *
935     * @param file The file containing the unsorted data.
936     * @param toFile The file that the sorted data can be put into.
937     * @throws IOFailure if there were errors running the sort process, or if the file does not exist.
938     */
939    public static void sortCrawlLog(File file, File toFile) {
940        ArgumentNotValid.checkNotNull(file, "File file");
941        ArgumentNotValid.checkNotNull(toFile, "File toFile");
942        if (!file.exists()) {
943            String errMsg = "The file '" + file.getAbsolutePath() + "' does not exist.";
944            log.warn(errMsg);
945            throw new IOFailure(errMsg);
946        }
947
948        File sortTempDir = null;
949        if (Settings.getBoolean(CommonSettings.UNIX_SORT_USE_COMMON_TEMP_DIR)) {
950            sortTempDir = FileUtils.getTempDir();
951            if (!sortTempDir.isDirectory()) {
952                log.warn("We should be using commontempdir {} in the sort process, but the directory doesn't exist", 
953                                sortTempDir.getAbsolutePath());
954                sortTempDir = null;
955            }
956        }
957        boolean sortLikeCrawllog = true;
958        int error = ProcessUtils.runUnixSort(file, toFile, sortTempDir, sortLikeCrawllog);
959        if (error != 0) {
960            final String errMsg = "Error code " + error + " sorting crawl log '" + file + "'";
961            log.warn(errMsg);
962            throw new IOFailure(errMsg);
963        }
964    }
965
966    /**
967     * Sort a crawl.log file according to the timestamp.
968     *
969     * @param file The file containing the unsorted data.
970     * @param toFile The file that the sorted data can be put into.
971     * @throws IOFailure if there were errors running the sort process, or if the file does not exist.
972     */
973    public static void sortCrawlLogOnTimestamp(File file, File toFile) {
974        ArgumentNotValid.checkNotNull(file, "File file");
975        ArgumentNotValid.checkNotNull(toFile, "File toFile");
976        if (!file.exists()) {
977            String errMsg = "The file '" + file.getAbsolutePath() + "' does not exist.";
978            log.warn(errMsg);
979            throw new IOFailure(errMsg);
980        }
981
982        File sortTempDir = null;
983        if (Settings.getBoolean(CommonSettings.UNIX_SORT_USE_COMMON_TEMP_DIR)) {
984            sortTempDir = FileUtils.getTempDir();
985            if (!sortTempDir.isDirectory()) {
986                log.warn("We should be using commontempdir {} in the sort process, but the directory doesn't exist", 
987                                sortTempDir.getAbsolutePath());
988                sortTempDir = null;
989            }
990        }
991        boolean sortLikeCrawllog = false;
992        int error = ProcessUtils.runUnixSort(file, toFile, sortTempDir, sortLikeCrawllog);
993        if (error != 0) {
994            final String errMsg = "Error code " + error + " sorting crawl log '" + file + "'";
995            log.warn(errMsg);
996            throw new IOFailure(errMsg);
997        }
998    }
999
1000    /**
1001     * Sort a CDX file according to our standard for CDX file sorting. This method depends on the Unix sort() command.
1002     *
1003     * @param file The raw unsorted CDX file.
1004     * @param toFile The file that the result will be put into.
1005     * @throws IOFailure If the file does not exist, or could not be sorted
1006     */
1007    public static void sortCDX(File file, File toFile) {
1008        ArgumentNotValid.checkNotNull(file, "File file");
1009        ArgumentNotValid.checkNotNull(toFile, "File toFile");
1010        if (!file.exists()) {
1011            String errMsg = "The file '" + file.getAbsolutePath() + "' does not exist.";
1012            log.warn(errMsg);
1013            throw new IOFailure(errMsg);
1014        }
1015        boolean sortLikeCrawllog = false;
1016        File sortTempDir = null;
1017        if (Settings.getBoolean(CommonSettings.UNIX_SORT_USE_COMMON_TEMP_DIR)) {
1018            sortTempDir = FileUtils.getTempDir();
1019            if (!sortTempDir.isDirectory()) {
1020                log.warn("We should be using commontempdir {} in the sort process, but the directory doesn't exist", 
1021                                sortTempDir.getAbsolutePath());
1022                sortTempDir = null;
1023            }
1024
1025        }
1026        int error = ProcessUtils.runUnixSort(file, toFile, sortTempDir, sortLikeCrawllog);
1027        if (error != 0) {
1028            final String errMsg = "Error code " + error + " sorting cdx file '" + file.getAbsolutePath() + "'";
1029            log.warn(errMsg);
1030            throw new IOFailure(errMsg);
1031        }
1032    }
1033
1034    /**
1035     * Sort a file using UNIX sort.
1036     *
1037     * @param file the file that you want to sort.
1038     * @param toFile The destination file.
1039     */
1040    public static void sortFile(File file, File toFile) {
1041        sortCDX(file, toFile);
1042    }
1043
1044    /**
1045     * Creates a new temporary directory with a unique name. This directory will be deleted automatically at the end of
1046     * the VM (though behaviour if there are files in it is undefined). This method will try a limited number of times
1047     * to create a directory, using a randomly generated suffix, before giving up.
1048     *
1049     * @param inDir The directory where the temporary directory should be created.
1050     * @param prefix The prefix of the directory name, for identification purposes.
1051     * @return A newly created directory that no other calls to createUniqueDir returns.
1052     * @throws ArgumentNotValid if inDir is not an existing directory that can be written to.
1053     * @throws IOFailure if a free name couldn't be found within a reasonable number of tries.
1054     */
1055    public static File createUniqueTempDir(File inDir, String prefix) {
1056        ArgumentNotValid.checkNotNull(inDir, "File inDir");
1057        ArgumentNotValid.checkNotNullOrEmpty(prefix, "String prefix");
1058        ArgumentNotValid.checkTrue(inDir.isDirectory(), inDir + " must be a directory");
1059        ArgumentNotValid.checkTrue(inDir.canWrite(), inDir + " must be writeable");
1060        for (int tries = 0; tries < MAX_RETRIES; tries++) {
1061            File newDir;
1062            try {
1063                newDir = File.createTempFile(prefix, null, inDir);
1064            } catch (IOException e) {
1065                final String errMsg = "Couldn't create temporary file in '" + inDir.getAbsolutePath()
1066                        + "' with prefix '" + prefix + "'";
1067                log.warn(errMsg, e);
1068                throw new IOFailure(errMsg, e);
1069            }
1070            newDir.delete();
1071            if (newDir.mkdir()) {
1072                newDir.deleteOnExit();
1073                return newDir;
1074            }
1075        }
1076        final String errMsg = "Too many similar files around, cannot create " + "unique dir with prefix " + prefix
1077                + " in '" + inDir.getAbsolutePath() + "'.";
1078        log.warn(errMsg);
1079        throw new IOFailure(errMsg);
1080    }
1081
1082    /**
1083     * Read the last line in a file. Note this method is not UTF-8 safe.
1084     *
1085     * @param file input file to read last line from.
1086     * @return The last line in the file (ending newline is irrelevant), returns an empty string if file is empty.
1087     * @throws ArgumentNotValid on null argument, or file is not a readable file.
1088     * @throws IOFailure on IO trouble reading file.
1089     */
1090    public static String readLastLine(File file) {
1091        ArgumentNotValid.checkNotNull(file, "File file");
1092        if (!file.isFile() || !file.canRead()) {
1093            final String errMsg = "File '" + file.getAbsolutePath() + "' is not a readable file.";
1094            log.warn(errMsg);
1095            throw new ArgumentNotValid(errMsg);
1096        }
1097        if (file.length() == 0) {
1098            return "";
1099        }
1100        RandomAccessFile rafile = null;
1101        try {
1102            rafile = new RandomAccessFile(file, "r");
1103            // seek to byte one before end of file (remember we know the file is
1104            // not empty) - this ensures that an ending newline is not read
1105            rafile.seek(rafile.length() - 2);
1106            // now search to the last linebreak, or beginning of file
1107            while (rafile.getFilePointer() != 0 && rafile.read() != '\n') {
1108                // search back two, because we just searched forward one to find
1109                // newline
1110                rafile.seek(rafile.getFilePointer() - 2);
1111            }
1112            return rafile.readLine();
1113        } catch (IOException e) {
1114            final String errMsg = "Unable to access file '" + file.getAbsolutePath() + "'";
1115            log.warn(errMsg, e);
1116            throw new IOFailure(errMsg, e);
1117        } finally {
1118            try {
1119                if (rafile != null) {
1120                    rafile.close();
1121                }
1122            } catch (IOException e) {
1123                log.debug("Unable to close file '{}' after reading", file.getAbsolutePath(), e);
1124            }
1125        }
1126    }
1127
1128    /**
1129     * Append the given lines to a file. Each lines is terminated by a newline.
1130     *
1131     * @param file A file to append to.
1132     * @param lines The lines to write.
1133     */
1134    public static void appendToFile(File file, String... lines) {
1135        ArgumentNotValid.checkNotNull(file, "File file");
1136        ArgumentNotValid.checkNotNull(lines, "String... lines");
1137
1138        PrintWriter writer = null;
1139        int linesAppended = 0;
1140        try {
1141            boolean appendMode = true;
1142            writer = new PrintWriter(new FileWriter(file, appendMode));
1143            for (String line : lines) {
1144                writer.println(line);
1145                linesAppended++;
1146            }
1147        } catch (IOException e) {
1148            log.warn("Error appending {} lines to file '{}'. Only appended {} lines. ", lines.length,
1149                    file.getAbsolutePath(), linesAppended, e);
1150        } finally {
1151            if (writer != null) {
1152                writer.close();
1153            }
1154        }
1155    }
1156
1157    /**
1158     * Loads an file from the class path (for retrieving a file from '.jar').
1159     *
1160     * @param filePath The path of the file.
1161     * @return The file from the class path.
1162     * @throws IOFailure If resource cannot be retrieved from the class path.
1163     */
1164    public static File getResourceFileFromClassPath(String filePath) throws IOFailure {
1165        ArgumentNotValid.checkNotNullOrEmpty(filePath, "String filePath");
1166        try {
1167            // retrieve the file as a stream from the classpath.
1168            InputStream stream = Thread.currentThread().getContextClassLoader().getResourceAsStream(filePath);
1169
1170            if (stream != null) {
1171                // Make stream into file, and return it.
1172                File tmpFile = File.createTempFile("tmp", "tmp");
1173                StreamUtils.copyInputStreamToOutputStream(stream, new FileOutputStream(tmpFile));
1174                return tmpFile;
1175            } else {
1176                String msg = "The resource was not retrieved correctly from the class path: '" + filePath + "'";
1177                log.trace(msg);
1178                throw new IOFailure(msg);
1179            }
1180        } catch (IOException e) {
1181            String msg = "Problems making stream of resource in class path into a file. Filepath: '" + filePath + "'";
1182            log.warn(msg, e);
1183            throw new IOFailure(msg, e);
1184        }
1185    }
1186
1187    /**
1188     * Get a humanly readable representation of the file size. If the file is a directory, the size is the aggregate of
1189     * the files in the directory except that subdirectories are ignored. The number is given with 2 decimals.
1190     *
1191     * @param aFile a File object
1192     * @return a humanly readable representation of the file size (rounded)
1193     */
1194    public static String getHumanReadableFileSize(File aFile) {
1195        ArgumentNotValid.checkNotNull(aFile, "File aFile");
1196        final long bytesPerOneKilobyte = 1000L;
1197        final long bytesPerOneMegabyte = 1000000L;
1198        final long bytesPerOneGigabyte = 1000000000L;
1199        double filesize = 0L;
1200        if (aFile.isDirectory()) {
1201            for (File f : aFile.listFiles()) {
1202                if (f.isFile()) {
1203                    filesize = filesize + f.length();
1204                }
1205            }
1206
1207        } else {
1208            filesize = aFile.length(); // normal file.
1209        }
1210
1211        NumberFormat decFormat = new DecimalFormat("##.##");
1212        if (filesize < bytesPerOneKilobyte) {
1213            // represent size in bytes without the ".0"
1214            return (long) filesize + " bytes";
1215        } else if (filesize >= bytesPerOneKilobyte && filesize < bytesPerOneMegabyte) {
1216            // represent size in Kbytes
1217            return decFormat.format(filesize / bytesPerOneKilobyte) + " Kbytes";
1218        } else if (filesize >= bytesPerOneMegabyte && filesize < bytesPerOneGigabyte) {
1219            // represent size in Mbytes
1220            return decFormat.format(filesize / bytesPerOneMegabyte) + " Mbytes";
1221        } else {
1222            // represent in Gbytes
1223            return decFormat.format(filesize / bytesPerOneGigabyte) + " Gbytes";
1224        }
1225    }
1226
1227    /**
1228     * @param aDir A directory
1229     * @return true, if the given directory contains files; else returns false
1230     */
1231    public static boolean hasFiles(File aDir) {
1232        ArgumentNotValid.checkExistsDirectory(aDir, "aDir");
1233        return (aDir.listFiles().length > 0);
1234    }
1235
1236}