001/* 002 * #%L 003 * Netarchivesuite - common 004 * %% 005 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library, 006 * the National Library of France and the Austrian National Library. 007 * %% 008 * This program is free software: you can redistribute it and/or modify 009 * it under the terms of the GNU Lesser General Public License as 010 * published by the Free Software Foundation, either version 2.1 of the 011 * License, or (at your option) any later version. 012 * 013 * This program is distributed in the hope that it will be useful, 014 * but WITHOUT ANY WARRANTY; without even the implied warranty of 015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 016 * GNU General Lesser Public License for more details. 017 * 018 * You should have received a copy of the GNU General Lesser Public 019 * License along with this program. If not, see 020 * <http://www.gnu.org/licenses/lgpl-2.1.html>. 021 * #L% 022 */ 023package dk.netarkivet.common.utils; 024 025import java.io.BufferedReader; 026import java.io.File; 027import java.io.FileInputStream; 028import java.io.FileOutputStream; 029import java.io.FileReader; 030import java.io.FileWriter; 031import java.io.FilenameFilter; 032import java.io.IOException; 033import java.io.InputStream; 034import java.io.OutputStream; 035import java.io.PrintWriter; 036import java.io.RandomAccessFile; 037import java.nio.channels.FileChannel; 038import java.text.DecimalFormat; 039import java.text.NumberFormat; 040import java.util.ArrayList; 041import java.util.Collection; 042import java.util.Collections; 043import java.util.List; 044import java.util.Set; 045 046import org.slf4j.Logger; 047import org.slf4j.LoggerFactory; 048 049import dk.netarkivet.common.CommonSettings; 050import dk.netarkivet.common.Constants; 051import dk.netarkivet.common.exceptions.ArgumentNotValid; 052import dk.netarkivet.common.exceptions.IOFailure; 053import dk.netarkivet.common.exceptions.PermissionDenied; 
054import dk.netarkivet.common.exceptions.UnknownID; 055 056/** 057 * Miscellaneous static file utilities: filename patterns and filters for CDX, ARC and WARC files, plus helpers for removing, copying, reading and writing files and directories. 058 */ 059public class FileUtils { 060 061 /** The SLF4J logger for this class. */ 062 private static final Logger log = LoggerFactory.getLogger(FileUtils.class); 063 064 /** Extension used for CDX files, including the leading '.' separator. */ 065 public static final String CDX_EXTENSION = ".cdx"; 066 067 /** Extension used for ARC files, including the leading '.' separator. */ 068 public static final String ARC_EXTENSION = ".arc"; 069 070 /** Extension used for gzipped ARC files, including the leading '.' separator. */ 071 public static final String ARC_GZIPPED_EXTENSION = ".arc.gz"; 072 073 /** Extension used for WARC files, including the leading '.' separator. */ 074 public static final String WARC_EXTENSION = ".warc"; 075 076 /** Extension used for gzipped WARC files, including the leading '.' separator. */ 077 public static final String WARC_GZIPPED_EXTENSION = ".warc.gz"; 078 079 /** 080 * Regular expression matching the suffix of ARC file names, including the '.' separator. It describes the suffix only; the filters in this class prepend ".*" before calling String.matches(). Note: (?i) means case insensitive, (\\.gz)? means .gz is 081 * optionally matched, and $ anchors the match at the end of the name. Thus this pattern will match file.arc.gz, file.ARC, 082 * file.aRc.GZ, but not file.ARC.open 083 */ 084 public static final String ARC_PATTERN = "(?i)\\.arc(\\.gz)?$"; 085 086 /** 087 * Regular expression matching the suffix of open ARC file names, including the '.' separator. It describes the suffix only; the filters in this class prepend ".*" before calling String.matches(). Note: (?i) means case insensitive, (\\.gz)? means .gz is 088 * optionally matched, and $ anchors the match at the end of the name. Thus this pattern will match file.arc.gz.open, 089 * file.ARC.open, file.arc.GZ.OpEn, but not file.ARC.open.txt 090 */ 091 public static final String OPEN_ARC_PATTERN = "(?i)\\.arc(\\.gz)?\\.open$"; 092 093 /** 094 * Regular expression matching the suffix of WARC file names, including the '.' separator. Note: (?i) means case insensitive, (\\.gz)? means .gz is 095 * optionally matched, and $ means matches end-of-line. 
Thus this pattern will match file.warc.gz, file.WARC, 096 * file.WaRc.GZ, but not file.WARC.open 097 */ 098 public static final String WARC_PATTERN = "(?i)\\.warc(\\.gz)?$"; 099 100 /** 101 * Pattern matching open WARC files, including separator . Note: (?i) means case insensitive, (\\.gz)? means .gz is 102 * optionally matched, and $ means matches end-of-line. Thus this pattern will match file.warc.gz.open, 103 * file.WARC.open, file.warc.GZ.OpEn, but not file.wARC.open.txt 104 */ 105 public static final String OPEN_WARC_PATTERN = "(?i)\\.warc(\\.gz)?\\.open$"; 106 107 /** 108 * Pattern matching WARC and ARC files, including separator. Note: (?i) means case insensitive, (\\.gz)? means .gz 109 * is optionally matched, and $ means matches end-of-line. Thus this pattern will match file.warc.gz, file.WARC, 110 * file.WaRc.GZ, file.arc.gz, file.ARC, file.aRc.GZ but not file.WARC.open or file.ARC.open 111 */ 112 public static final String WARC_ARC_PATTERN = "(?i)\\.(w)?arc(\\.gz)?$"; 113 114 /** 115 * A FilenameFilter accepting a file if and only if its name (transformed to lower case) ends on ".cdx". 116 */ 117 public static final FilenameFilter CDX_FILE_FILTER = new FilenameFilter() { 118 public boolean accept(File directory, String filename) { 119 return filename.toLowerCase().endsWith(CDX_EXTENSION); 120 } 121 }; 122 123 /** 124 * A filter that matches files left open by a crashed Heritrix process. Don't work on these files while Heritrix is 125 * still working on them. 126 */ 127 public static final FilenameFilter OPEN_ARCS_FILTER = new FilenameFilter() { 128 public boolean accept(File dir, String name) { 129 return name.matches(".*" + OPEN_ARC_PATTERN); 130 } 131 }; 132 133 /** 134 * A filter that matches warcfiles left open by a crashed Heritrix process. Don't work on these files while Heritrix 135 * is still working on them. 
136 */ 137 public static final FilenameFilter OPEN_WARCS_FILTER = new FilenameFilter() { 138 public boolean accept(File dir, String name) { 139 return name.matches(".*" + OPEN_WARC_PATTERN); 140 } 141 }; 142 143 /** 144 * A filter that matches arc files, that is any file that ends on .arc or .arc.gz in any case. 145 */ 146 public static final FilenameFilter ARCS_FILTER = new FilenameFilter() { 147 public boolean accept(File directory, String filename) { 148 return filename.toLowerCase().matches(".*" + ARC_PATTERN); 149 } 150 }; 151 152 /** 153 * A filter that matches warc files, that is any file that ends on .warc or .warc.gz in any case. 154 */ 155 public static final FilenameFilter WARCS_FILTER = new FilenameFilter() { 156 public boolean accept(File directory, String filename) { 157 return filename.toLowerCase().matches(".*" + WARC_PATTERN); 158 } 159 }; 160 161 /** 162 * A filter that matches warc and arc files, that is any file that ends on .warc, .warc.gz, .arc or .arc.gz in any 163 * case. 164 */ 165 public static final FilenameFilter WARCS_ARCS_FILTER = new FilenameFilter() { 166 public boolean accept(File directory, String filename) { 167 return filename.toLowerCase().matches(".*" + WARC_ARC_PATTERN); 168 } 169 }; 170 171 /** How many times we will retry making a unique directory name. */ 172 private static final int MAX_RETRIES = 10; 173 174 /** How many times we will retry making a directory. */ 175 private static final int CREATE_DIR_RETRIES = 3; 176 /** 177 * Maximum number of IDs we will put in a filename. Above this number, a checksum of the ids is generated instead. 178 * This is done to protect us from getting filenames too long for the filesystem. 179 */ 180 public static final int MAX_IDS_IN_FILENAME = 4; 181 182 /** 183 * Remove a file and any subfiles in case of directories. 184 * 185 * @param f A file to completely and utterly remove. 186 * @return true if the file did exist, false otherwise. 
187 * @throws SecurityException If a security manager exists and its <code>{@link 188 * java.lang.SecurityManager#checkDelete}</code> method denies delete access to the file 189 */ 190 public static boolean removeRecursively(File f) { 191 ArgumentNotValid.checkNotNull(f, "File f"); 192 if (!f.exists()) { 193 return false; 194 } 195 196 // If the file is a directory, delete all files in this directory, 197 // and its subdirectories 198 if (f.isDirectory()) { 199 File[] subfiles = f.listFiles(); 200 201 if (subfiles != null) { // Can be null in case of error 202 for (File subfile : subfiles) { 203 removeRecursively(subfile); 204 } 205 } 206 } 207 if (!f.delete()) { 208 boolean isDir = f.isDirectory(); 209 if (!isDir) { 210 log.debug("Try once more deleting file '{}", f.getAbsolutePath()); 211 final boolean success = remove(f); 212 if (!success) { 213 log.warn("Unable to remove file: '{}'", f.getAbsolutePath()); 214 return false; 215 } 216 } else { 217 log.warn("Problem with deletion of directory: '{}'.", f.getAbsolutePath()); 218 return false; 219 } 220 } 221 222 return true; 223 } 224 225 /** 226 * Remove a file. 227 * 228 * @param f A file to completely and utterly remove. 229 * @return true if the file did exist, false otherwise. 230 * @throws ArgumentNotValid if f is null. 231 * @throws SecurityException If a security manager exists and its <code>{@link 232 * java.lang.SecurityManager#checkDelete}</code> method denies delete access to the file 233 */ 234 public static boolean remove(File f) { 235 ArgumentNotValid.checkNotNull(f, "f"); 236 if (!f.exists()) { 237 return false; 238 } 239 if (f.isDirectory()) { 240 return false; // Do not attempt to delete a directory 241 } 242 if (!f.delete()) { 243 // Hack to remove file on windows! Works only sometimes! 
244 File delFile = new File(f.getAbsolutePath()); 245 delFile.delete(); 246 if (delFile.exists()) { 247 log.warn("Unable to remove file '{}'.", f.getAbsolutePath()); 248 return false; 249 } 250 } 251 252 return true; 253 } 254 255 /** 256 * Returns a valid filename for most filesystems. Exchanges the following characters: 257 * <p/> 258 * " " -> "_" ":" -> "_" "+" -> "_" 259 * 260 * @param filename the filename to format correctly 261 * @return a new formatted filename 262 */ 263 public static String formatFilename(String filename) { 264 ArgumentNotValid.checkNotNullOrEmpty(filename, "String filename"); 265 String formattedFilename = filename; 266 267 // remove spaces 268 formattedFilename = formattedFilename.replace(' ', '_'); 269 270 // remove colons 271 formattedFilename = formattedFilename.replace(':', '_'); 272 273 // remove add sign 274 formattedFilename = formattedFilename.replace('+', '_'); 275 276 return formattedFilename; 277 } 278 279 /** 280 * Retrieves all files whose names ends with 'type' from directory 'dir' and all its subdirectories. 281 * 282 * @param dir Path of base directory 283 * @param files Initially, an empty list (e.g. an ArrayList) 284 * @param type The extension/ending of the files to retrieve (e.g. 
".xml", ".ARC") 285 * @return A list of files from directory 'dir' and all its subdirectories 286 */ 287 public static List<File> getFilesRecursively(String dir, List<File> files, String type) { 288 ArgumentNotValid.checkNotNullOrEmpty(dir, "String dir"); 289 File theDirectory = new File(dir); 290 ArgumentNotValid.checkTrue(theDirectory.isDirectory(), "File '" + theDirectory.getAbsolutePath() 291 + "' does not represent a directory"); 292 ArgumentNotValid.checkNotNull(files, "files"); 293 ArgumentNotValid.checkNotNull(type, "type"); 294 295 File[] top = new File(dir).listFiles(); 296 for (File aTop : top) { 297 if (aTop.isDirectory()) { 298 getFilesRecursively(aTop.getAbsolutePath(), files, type); 299 } else if (aTop.isFile() && aTop.getName().endsWith(type)) { 300 files.add(aTop); 301 } 302 } 303 304 return files; 305 } 306 307 /** 308 * Load file content into text string. 309 * 310 * @param file The file to load 311 * @return file content loaded into text string 312 * @throws java.io.IOException If any IO trouble occurs while reading the file, or the file cannot be found. 313 */ 314 public static String readFile(File file) throws IOException { 315 ArgumentNotValid.checkNotNull(file, "File file"); 316 StringBuffer sb = new StringBuffer(); 317 318 BufferedReader br = new BufferedReader(new FileReader(file)); 319 320 try { 321 int i; 322 323 while ((i = br.read()) != -1) { 324 sb.append((char) i); 325 } 326 } finally { 327 br.close(); 328 } 329 330 return sb.toString(); 331 } 332 333 /** 334 * Copy file from one location to another. Will silently overwrite an already existing file. 335 * 336 * @param from original to copy 337 * @param to destination of copy 338 * @throws IOFailure if an io error occurs while copying file, or the original file does not exist. 
339 */ 340 public static void copyFile(File from, File to) { 341 ArgumentNotValid.checkNotNull(from, "File from"); 342 ArgumentNotValid.checkNotNull(to, "File to"); 343 if (!from.exists()) { 344 String errMsg = "Original file '" + from.getAbsolutePath() + "' does not exist"; 345 log.warn(errMsg); 346 throw new IOFailure(errMsg); 347 } 348 try { 349 FileInputStream inStream = null; 350 FileOutputStream outStream = null; 351 FileChannel in = null; 352 FileChannel out = null; 353 try { 354 inStream = new FileInputStream(from); 355 outStream = new FileOutputStream(to); 356 in = inStream.getChannel(); 357 out = outStream.getChannel(); 358 long bytesTransferred = 0; 359 do { 360 // Note: in.size() is called every loop, because if it should 361 // change size, we might end up in an infinite loop trying to 362 // copy more bytes than are actually available. 363 bytesTransferred += in.transferTo(bytesTransferred, 364 Math.min(Constants.IO_CHUNK_SIZE, in.size() - bytesTransferred), out); 365 } while (bytesTransferred < in.size()); 366 } finally { 367 if (inStream != null) { 368 inStream.close(); 369 } 370 if (outStream != null) { 371 outStream.close(); 372 } 373 if (in != null) { 374 in.close(); 375 } 376 if (out != null) { 377 out.close(); 378 } 379 } 380 } catch (IOException e) { 381 final String errMsg = "Error copying file '" + from.getAbsolutePath() + "' to '" + to.getAbsolutePath() 382 + "'"; 383 log.warn(errMsg, e); 384 throw new IOFailure(errMsg, e); 385 } 386 } 387 388 /** 389 * Copy an entire directory from one location to another. Note that this will silently overwrite old files, just 390 * like copyFile(). 391 * 392 * @param from Original directory (or file, for that matter) to copy. 393 * @param to Destination directory, i.e. the 'new name' of the copy of the from directory. 394 * @throws IOFailure On IO trouble copying files. 
395 */ 396 public static void copyDirectory(File from, File to) throws IOFailure { 397 ArgumentNotValid.checkNotNull(from, "File from"); 398 ArgumentNotValid.checkNotNull(to, "File to"); 399 String errMsg; 400 if (from.isFile()) { 401 try { 402 copyFile(from, to); 403 } catch (Exception e) { 404 errMsg = "Error copying from file '" + from.getAbsolutePath() + "' to file '" + to.getAbsolutePath() 405 + "'."; 406 log.warn(errMsg, e); 407 throw new IOFailure(errMsg, e); 408 } 409 } else { 410 if (!from.exists()) { 411 errMsg = "Can't find directory '" + from.getAbsolutePath() + "'."; 412 log.warn(errMsg); 413 throw new IOFailure(errMsg); 414 } 415 416 if (!from.isDirectory()) { 417 errMsg = "File '" + from.getAbsolutePath() + "' is not a directory"; 418 log.warn(errMsg); 419 throw new IOFailure(errMsg); 420 } 421 422 to.mkdir(); 423 424 if (!to.exists()) { 425 errMsg = "Failed to create destination directory '" + to.getAbsolutePath() + "'."; 426 log.warn(errMsg); 427 throw new IOFailure(errMsg); 428 } 429 430 File[] subfiles = from.listFiles(); 431 432 for (File subfile : subfiles) { 433 copyDirectory(subfile, new File(to, subfile.getName())); 434 } 435 } 436 } 437 438 /** 439 * Read an entire file, byte by byte, into a byte array, ignoring any locale issues. 440 * 441 * @param file A file to be read. 442 * @return A byte array with the contents of the file. 443 * @throws IOFailure on IO trouble reading the file, or the file does not exist 444 * @throws IndexOutOfBoundsException If the file is too large to be in an array. 
445 */ 446 public static byte[] readBinaryFile(File file) throws IOFailure, IndexOutOfBoundsException { 447 ArgumentNotValid.checkNotNull(file, "File file"); 448 if (!file.exists()) { 449 String errMsg = "File '" + file.getAbsolutePath() + "' does not exist"; 450 log.warn(errMsg); 451 throw new IOFailure(errMsg); 452 } 453 454 String errMsg; 455 if (file.length() > Integer.MAX_VALUE) { 456 errMsg = "File '" + file.getAbsolutePath() + "' of size " + file.length() 457 + " (bytes) is too long to fit in an array"; 458 log.warn(errMsg); 459 throw new IndexOutOfBoundsException(errMsg); 460 } 461 462 byte[] result = new byte[(int) file.length()]; 463 FileInputStream in = null; 464 try { 465 try { 466 in = new FileInputStream(file); 467 int bytesRead; 468 for (int i = 0; i < result.length && (bytesRead = in.read(result, i, result.length - i)) != -1; i += bytesRead) { 469 } 470 } finally { 471 if (in != null) { 472 in.close(); 473 } 474 } 475 } catch (IOException e) { 476 errMsg = "Error reading file '" + file.getAbsolutePath() + "'"; 477 log.warn(errMsg); 478 throw new IOFailure(errMsg, e); 479 } 480 481 return result; 482 } 483 484 /** 485 * Write an entire byte array to a file, ignoring any locale issues. 486 * 487 * @param file The file to write the data to 488 * @param b The byte array to write to the file 489 * @throws IOFailure If an exception occurs during the writing. 
490 */ 491 public static void writeBinaryFile(File file, byte[] b) { 492 ArgumentNotValid.checkNotNull(file, "File file"); 493 ArgumentNotValid.checkNotNull(b, "byte[] b"); 494 FileOutputStream out = null; 495 try { 496 try { 497 out = new FileOutputStream(file); 498 out.write(b); 499 } finally { 500 if (out != null) { 501 out.close(); 502 } 503 } 504 } catch (Exception e) { 505 final String errMsg = "writeBinaryFile exception"; 506 log.warn(errMsg, e); 507 throw new IOFailure(errMsg, e); 508 } 509 } 510 511 /** 512 * Return a filter that only accepts XML files (ending with .xml), irrespective of their location. 513 * 514 * @return A new filter for XML files. 515 */ 516 public static FilenameFilter getXmlFilesFilter() { 517 return new FilenameFilter() { 518 /** 519 * Tests if a specified file should be included in a file list. 520 * 521 * @param dir the directory in which the file was found. Unused in this implementation of accept. 522 * @param name the name of the file. 523 * @return <code>true</code> if and only if the name should be included in the file list; <code>false</code> 524 * otherwise. 525 * @see FilenameFilter#accept(java.io.File, java.lang.String) 526 */ 527 public boolean accept(File dir, String name) { 528 return name.endsWith(Constants.XML_EXTENSION); 529 } 530 }; 531 } 532 533 /** 534 * Read all lines from a file into a list of strings. 535 * 536 * @param file The file to read from. 537 * @return The list of lines. 
538 * @throws IOFailure on trouble reading the file, or if the file does not exist 539 */ 540 public static List<String> readListFromFile(File file) { 541 ArgumentNotValid.checkNotNull(file, "File file"); 542 if (!file.exists()) { 543 String errMsg = "File '" + file.getAbsolutePath() + "' does not exist"; 544 log.warn(errMsg); 545 throw new IOFailure(errMsg); 546 } 547 List<String> lines = new ArrayList<String>(); 548 BufferedReader in = null; 549 try { 550 try { 551 in = new BufferedReader(new FileReader(file)); 552 String line; 553 while ((line = in.readLine()) != null) { 554 lines.add(line); 555 } 556 } finally { 557 if (in != null) { 558 in.close(); 559 } 560 } 561 } catch (IOException e) { 562 String msg = "Could not read data from " + file.getAbsolutePath(); 563 log.warn(msg, e); 564 throw new IOFailure(msg, e); 565 } 566 return lines; 567 } 568 569 /** 570 * Writes a collection of strings to a file, each string on one line. 571 * 572 * @param file A file to write to. The contents of this file will be overwritten. 573 * @param collection The collection to write. The order it will be written in is unspecified. 574 * @throws IOFailure if any error occurs writing to the file. 575 * @throws ArgumentNotValid if file or collection is null. 576 */ 577 public static void writeCollectionToFile(File file, Collection<String> collection) { 578 ArgumentNotValid.checkNotNull(file, "file"); 579 ArgumentNotValid.checkNotNull(collection, "collection"); 580 try { 581 PrintWriter writer = null; 582 try { 583 writer = new PrintWriter(new FileWriter(file)); 584 for (String fileName : collection) { 585 writer.println(fileName); 586 } 587 writer.flush(); 588 } finally { 589 if (writer != null) { 590 writer.close(); 591 } 592 } 593 } catch (IOException e) { 594 String msg = "Error writing collection to file '" + file.getAbsolutePath() + "'"; 595 log.warn(msg, e); 596 throw new IOFailure(msg, e); 597 } 598 } 599 600 /** 601 * Sort a file into another. 
The current implementation slurps all lines into memory. This will not scale forever. 602 * 603 * @param unsortedFile A file to sort 604 * @param sortedOutput The file to sort into 605 */ 606 public static void makeSortedFile(File unsortedFile, File sortedOutput) { 607 ArgumentNotValid.checkNotNull(unsortedFile, "File unsortedFile"); 608 ArgumentNotValid.checkNotNull(sortedOutput, "File sortedOutput"); 609 List<String> lines; 610 lines = readListFromFile(unsortedFile); 611 Collections.sort(lines); 612 writeCollectionToFile(sortedOutput, lines); 613 } 614 615 /** 616 * Remove a line from a given file. 617 * 618 * @param line The full line to remove 619 * @param file The file to remove the line from. This file will be rewritten in full, and the entire contents will 620 * be kept in memory 621 * @throws UnknownID If the file does not exist 622 */ 623 public static void removeLineFromFile(String line, File file) { 624 ArgumentNotValid.checkNotNull(line, "String line"); 625 ArgumentNotValid.checkNotNull(file, "File file"); 626 if (!file.exists()) { 627 String errMsg = "The file '" + file.getAbsolutePath() + "' does not exist."; 628 log.warn(errMsg); 629 throw new UnknownID(errMsg); 630 } 631 632 List<String> lines = readListFromFile(file); 633 lines.remove(line); 634 writeCollectionToFile(file, lines); 635 } 636 637 /** 638 * Check if the directory exists and is writable and create it if needed. The complete path down to the directory is 639 * created. If the directory creation fails a PermissionDenied exception is thrown. 640 * 641 * @param dir The directory to create 642 * @return true if dir created. 643 * @throws ArgumentNotValid If dir is null or its name is the empty string 644 * @throws PermissionDenied If directory cannot be created for any reason, or is not writable. 
645 */ 646 public static boolean createDir(File dir) throws PermissionDenied { 647 ArgumentNotValid.checkNotNull(dir, "File dir"); 648 ArgumentNotValid.checkNotNullOrEmpty(dir.getName(), "File dir"); 649 boolean didCreate = false; 650 if (!dir.exists()) { 651 didCreate = true; 652 int i = 0; 653 // retrying creation due to sun bug (race condition) 654 // See http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=4742723 655 while ((i++ < CREATE_DIR_RETRIES) && !(dir.isDirectory() && dir.canWrite())) { 656 dir.mkdirs(); 657 } 658 if (!(dir.isDirectory() && dir.canWrite())) { 659 String msg = "Could not create directory '" + dir.getAbsolutePath() + "'"; 660 log.warn(msg); 661 throw new PermissionDenied(msg); 662 } 663 } else { 664 if (!dir.isDirectory()) { 665 String msg = "Cannot make directory '" + dir.getAbsolutePath() + "' - a file is in the way"; 666 log.warn(msg); 667 throw new PermissionDenied(msg); 668 } 669 } 670 if (!dir.canWrite()) { 671 String msg = "Cannot write to required directory '" + dir.getAbsolutePath() + "'"; 672 log.warn(msg); 673 throw new PermissionDenied(msg); 674 } 675 return didCreate; 676 } 677 678 /** 679 * Returns the number of bytes free on the file system calling the FreeSpaceProvider class defined by the setting 680 * CommonSettings.FREESPACE_PROVIDER_CLASS (a.k.a. settings.common.freespaceprovider.class) 681 * 682 * @param f a given file 683 * @return the number of bytes free defined in the settings.xml 684 */ 685 public static long getBytesFree(File f) { 686 return FreeSpaceProviderFactory.getInstance().getBytesFree(f); 687 } 688 689 /** 690 * @param theFile A file to make relative 691 * @param theDir A directory 692 * @return the filepath of the theFile relative to theDir. null, if theFile is not relative to theDir. null, if 693 * theDir is not a directory. 
694 */ 695 public static String relativeTo(File theFile, File theDir) { 696 ArgumentNotValid.checkNotNull(theFile, "File theFile"); 697 ArgumentNotValid.checkNotNull(theDir, "File theDir"); 698 if (!theDir.isDirectory()) { 699 log.trace("The File '{}' does not represent a directory. Null returned", theDir.getAbsolutePath()); 700 return null; 701 } 702 703 List<String> filePathList = new ArrayList<String>(); 704 List<String> theDirPath = new ArrayList<String>(); 705 File tempFile = theFile.getAbsoluteFile(); 706 707 filePathList.add(tempFile.getName()); 708 while ((tempFile = tempFile.getParentFile()) != null) { 709 filePathList.add(tempFile.getName()); 710 } 711 712 tempFile = theDir.getAbsoluteFile(); 713 theDirPath.add(tempFile.getName()); 714 while ((tempFile = tempFile.getParentFile()) != null) { 715 theDirPath.add(tempFile.getName()); 716 } 717 718 // check, at the path prefix is the same 719 List<String> sublist = filePathList.subList(theDirPath.size() - 2, filePathList.size()); 720 if (!theDirPath.equals(sublist)) { 721 log.trace("The file '{}' is not relative to the directory '{}'. Null returned", theFile.getAbsolutePath(), 722 theDir.getAbsolutePath()); 723 return null; 724 } 725 726 List<String> relativeList = filePathList.subList(0, theDirPath.size() - 2); 727 728 StringBuffer sb = new StringBuffer(); 729 Collections.reverse(relativeList); 730 for (String aRelativeList : relativeList) { 731 sb.append(aRelativeList); 732 sb.append(File.separatorChar); 733 } 734 sb.deleteCharAt(sb.length() - 1); // remove last separatorChar 735 return sb.toString(); 736 } 737 738 /** 739 * Count the number of lines in a file. 
740 * 741 * @param file the file to read 742 * @return the number of lines in the file 743 * @throws IOFailure If an error occurred while reading the file 744 */ 745 public static long countLines(File file) { 746 ArgumentNotValid.checkNotNull(file, "file"); 747 BufferedReader in = null; 748 long count = 0; 749 try { 750 try { 751 in = new BufferedReader(new FileReader(file)); 752 while (in.readLine() != null) { 753 ++count; 754 } 755 } finally { 756 if (in != null) { 757 in.close(); 758 } 759 } 760 } catch (IOException e) { 761 String msg = "Could not check number of lines in '" + file.getAbsolutePath() + "'"; 762 log.warn(msg, e); 763 throw new IOFailure(msg, e); 764 } 765 return count; 766 } 767 768 /** 769 * Create an InputStream that reads from a file but removes the file when all data has been read. 770 * 771 * @param file A file to read. This file will be deleted when the inputstream is closed, finalized, reaches 772 * end-of-file, or when the VM closes. 773 * @return An InputStream containing the file's contents. 774 * @throws IOFailure If an error occurs in creating the ephemeral input stream 775 */ 776 public static InputStream getEphemeralInputStream(final File file) { 777 ArgumentNotValid.checkNotNull(file, "file"); 778 // First make sure we remove the file if the VM dies 779 file.deleteOnExit(); 780 try { 781 // Then create an input stream that deletes the file upon exit. 782 // Note that FileInputStream.finalize calls close(). 783 return new FileInputStream(file) { 784 public void close() throws IOException { 785 super.close(); 786 file.delete(); 787 } 788 }; 789 } catch (IOException e) { 790 String msg = "Error creating ephemeral input stream for " + file; 791 log.warn(msg, e); 792 throw new IOFailure(msg, e); 793 } 794 } 795 796 /** 797 * Makes a valid file from filename passed in String. Ensures that the File object returned is not null, and that 798 * isFile() returns true. 
799 * 800 * @param filename The file to create the File object from 801 * @return A valid, non-null File object. 802 * @throws IOFailure if file cannot be created. 803 */ 804 public static File makeValidFileFromExisting(String filename) throws IOFailure { 805 ArgumentNotValid.checkNotNullOrEmpty(filename, "String filename"); 806 807 File res = new File(filename); 808 if (!res.isFile()) { 809 String errMsg = "Error: File object created from filename '" + filename 810 + "' is not a proper file, isFile() failed."; 811 log.warn(errMsg); 812 throw new IOFailure(errMsg); 813 } 814 return res; 815 } 816 817 /** 818 * Write the entire contents of a file to a stream. 819 * 820 * @param f A file to write to the stream. 821 * @param out The stream to write to. 822 * @throws IOFailure If any error occurs while writing the file to a stream 823 */ 824 public static void writeFileToStream(File f, OutputStream out) { 825 ArgumentNotValid.checkNotNull(f, "File f"); 826 ArgumentNotValid.checkNotNull(out, "OutputStream out"); 827 828 byte[] buffer = new byte[Constants.IO_BUFFER_SIZE]; 829 try { 830 FileInputStream in = new FileInputStream(f); 831 try { 832 int bytesRead; 833 while ((bytesRead = in.read(buffer)) > 0) { 834 out.write(buffer, 0, bytesRead); 835 } 836 } finally { 837 in.close(); 838 } 839 } catch (IOException e) { 840 final String errMsg = "Error writing file '" + f.getAbsolutePath() + "' to stream"; 841 log.warn(errMsg, e); 842 throw new IOFailure(errMsg, e); 843 } 844 } 845 846 /** 847 * Write the contents of a stream into a file. 848 * 849 * @param in A stream to read from. This stream is not closed by this method. 850 * @param f The file to write the stream contents into. 
851 * @throws IOFailure If any error occurs while writing the stream to a file 852 */ 853 public static void writeStreamToFile(InputStream in, File f) { 854 ArgumentNotValid.checkNotNull(f, "File f"); 855 ArgumentNotValid.checkNotNull(in, "InputStream in"); 856 857 byte[] buffer = new byte[Constants.IO_BUFFER_SIZE]; 858 try { 859 FileOutputStream out = new FileOutputStream(f); 860 try { 861 int bytesRead; 862 while ((bytesRead = in.read(buffer)) > 0) { 863 out.write(buffer, 0, bytesRead); 864 } 865 } finally { 866 out.close(); 867 } 868 } catch (IOException e) { 869 final String errMsg = "Error writing stream to file '" + f.getAbsolutePath() + "'."; 870 log.warn(errMsg, e); 871 throw new IOFailure(errMsg, e); 872 873 } 874 } 875 876 /** 877 * Get the location of the standard temporary directory. The existence of this directory should be ensure at the 878 * start of every application. 879 * 880 * @return The directory that should be used for temporary files. 881 */ 882 public static File getTempDir() { 883 return new File(Settings.get(CommonSettings.DIR_COMMONTEMPDIR)); 884 } 885 886 /** 887 * Attempt to move a file using rename, and if that fails, move the file by copy-and-delete. 888 * 889 * @param fromFile The source 890 * @param toFile The target 891 */ 892 public static void moveFile(File fromFile, File toFile) { 893 ArgumentNotValid.checkNotNull(fromFile, "File fromFile"); 894 ArgumentNotValid.checkNotNull(toFile, "File toFile"); 895 896 if (!fromFile.renameTo(toFile)) { 897 copyFile(fromFile, toFile); 898 remove(fromFile); 899 } 900 } 901 902 /** 903 * Given a set, generate a reasonable file name from the set. 904 * 905 * @param <T> The type of objects, that the Set IDs argument contains. 906 * @param IDs A set of IDs. 907 * @param suffix A suffix. May be empty string. 908 * @return A reasonable file name. 
909 */ 910 public static <T extends Comparable<T>> String generateFileNameFromSet(Set<T> IDs, String suffix) { 911 ArgumentNotValid.checkNotNull(IDs, "Set<T> IDs"); 912 ArgumentNotValid.checkNotNull(suffix, "String suffix"); 913 914 if (IDs.isEmpty()) { 915 return "empty" + suffix; 916 } 917 918 List<T> sorted = new ArrayList<T>(IDs); 919 Collections.sort(sorted); 920 921 String allIDsString = StringUtils.conjoin("-", sorted); 922 String fileName; 923 if (sorted.size() > MAX_IDS_IN_FILENAME) { 924 String firstNIDs = StringUtils.conjoin("-", sorted.subList(0, MAX_IDS_IN_FILENAME)); 925 fileName = firstNIDs + "-" + ChecksumCalculator.calculateMd5(allIDsString.getBytes()) + suffix; 926 } else { 927 fileName = allIDsString + suffix; 928 } 929 return fileName; 930 } 931 932 /** 933 * Sort a crawl.log file according to the url. 934 * 935 * @param file The file containing the unsorted data. 936 * @param toFile The file that the sorted data can be put into. 937 * @throws IOFailure if there were errors running the sort process, or if the file does not exist. 
938 */ 939 public static void sortCrawlLog(File file, File toFile) { 940 ArgumentNotValid.checkNotNull(file, "File file"); 941 ArgumentNotValid.checkNotNull(toFile, "File toFile"); 942 if (!file.exists()) { 943 String errMsg = "The file '" + file.getAbsolutePath() + "' does not exist."; 944 log.warn(errMsg); 945 throw new IOFailure(errMsg); 946 } 947 948 File sortTempDir = null; 949 if (Settings.getBoolean(CommonSettings.UNIX_SORT_USE_COMMON_TEMP_DIR)) { 950 sortTempDir = FileUtils.getTempDir(); 951 if (!sortTempDir.isDirectory()) { 952 log.warn("We should be using commontempdir {} in the sort process, but the directory doesn't exist", 953 sortTempDir.getAbsolutePath()); 954 sortTempDir = null; 955 } 956 } 957 boolean sortLikeCrawllog = true; 958 int error = ProcessUtils.runUnixSort(file, toFile, sortTempDir, sortLikeCrawllog); 959 if (error != 0) { 960 final String errMsg = "Error code " + error + " sorting crawl log '" + file + "'"; 961 log.warn(errMsg); 962 throw new IOFailure(errMsg); 963 } 964 } 965 966 /** 967 * Sort a crawl.log file according to the timestamp. 968 * 969 * @param file The file containing the unsorted data. 970 * @param toFile The file that the sorted data can be put into. 971 * @throws IOFailure if there were errors running the sort process, or if the file does not exist. 
972 */ 973 public static void sortCrawlLogOnTimestamp(File file, File toFile) { 974 ArgumentNotValid.checkNotNull(file, "File file"); 975 ArgumentNotValid.checkNotNull(toFile, "File toFile"); 976 if (!file.exists()) { 977 String errMsg = "The file '" + file.getAbsolutePath() + "' does not exist."; 978 log.warn(errMsg); 979 throw new IOFailure(errMsg); 980 } 981 982 File sortTempDir = null; 983 if (Settings.getBoolean(CommonSettings.UNIX_SORT_USE_COMMON_TEMP_DIR)) { 984 sortTempDir = FileUtils.getTempDir(); 985 if (!sortTempDir.isDirectory()) { 986 log.warn("We should be using commontempdir {} in the sort process, but the directory doesn't exist", 987 sortTempDir.getAbsolutePath()); 988 sortTempDir = null; 989 } 990 } 991 boolean sortLikeCrawllog = false; 992 int error = ProcessUtils.runUnixSort(file, toFile, sortTempDir, sortLikeCrawllog); 993 if (error != 0) { 994 final String errMsg = "Error code " + error + " sorting crawl log '" + file + "'"; 995 log.warn(errMsg); 996 throw new IOFailure(errMsg); 997 } 998 } 999 1000 /** 1001 * Sort a CDX file according to our standard for CDX file sorting. This method depends on the Unix sort() command. 1002 * 1003 * @param file The raw unsorted CDX file. 1004 * @param toFile The file that the result will be put into. 
1005 * @throws IOFailure If the file does not exist, or could not be sorted 1006 */ 1007 public static void sortCDX(File file, File toFile) { 1008 ArgumentNotValid.checkNotNull(file, "File file"); 1009 ArgumentNotValid.checkNotNull(toFile, "File toFile"); 1010 if (!file.exists()) { 1011 String errMsg = "The file '" + file.getAbsolutePath() + "' does not exist."; 1012 log.warn(errMsg); 1013 throw new IOFailure(errMsg); 1014 } 1015 boolean sortLikeCrawllog = false; 1016 File sortTempDir = null; 1017 if (Settings.getBoolean(CommonSettings.UNIX_SORT_USE_COMMON_TEMP_DIR)) { 1018 sortTempDir = FileUtils.getTempDir(); 1019 if (!sortTempDir.isDirectory()) { 1020 log.warn("We should be using commontempdir {} in the sort process, but the directory doesn't exist", 1021 sortTempDir.getAbsolutePath()); 1022 sortTempDir = null; 1023 } 1024 1025 } 1026 int error = ProcessUtils.runUnixSort(file, toFile, sortTempDir, sortLikeCrawllog); 1027 if (error != 0) { 1028 final String errMsg = "Error code " + error + " sorting cdx file '" + file.getAbsolutePath() + "'"; 1029 log.warn(errMsg); 1030 throw new IOFailure(errMsg); 1031 } 1032 } 1033 1034 /** 1035 * Sort a file using UNIX sort. 1036 * 1037 * @param file the file that you want to sort. 1038 * @param toFile The destination file. 1039 */ 1040 public static void sortFile(File file, File toFile) { 1041 sortCDX(file, toFile); 1042 } 1043 1044 /** 1045 * Creates a new temporary directory with a unique name. This directory will be deleted automatically at the end of 1046 * the VM (though behaviour if there are files in it is undefined). This method will try a limited number of times 1047 * to create a directory, using a randomly generated suffix, before giving up. 1048 * 1049 * @param inDir The directory where the temporary directory should be created. 1050 * @param prefix The prefix of the directory name, for identification purposes. 1051 * @return A newly created directory that no other calls to createUniqueDir returns. 
1052 * @throws ArgumentNotValid if inDir is not an existing directory that can be written to. 1053 * @throws IOFailure if a free name couldn't be found within a reasonable number of tries. 1054 */ 1055 public static File createUniqueTempDir(File inDir, String prefix) { 1056 ArgumentNotValid.checkNotNull(inDir, "File inDir"); 1057 ArgumentNotValid.checkNotNullOrEmpty(prefix, "String prefix"); 1058 ArgumentNotValid.checkTrue(inDir.isDirectory(), inDir + " must be a directory"); 1059 ArgumentNotValid.checkTrue(inDir.canWrite(), inDir + " must be writeable"); 1060 for (int tries = 0; tries < MAX_RETRIES; tries++) { 1061 File newDir; 1062 try { 1063 newDir = File.createTempFile(prefix, null, inDir); 1064 } catch (IOException e) { 1065 final String errMsg = "Couldn't create temporary file in '" + inDir.getAbsolutePath() 1066 + "' with prefix '" + prefix + "'"; 1067 log.warn(errMsg, e); 1068 throw new IOFailure(errMsg, e); 1069 } 1070 newDir.delete(); 1071 if (newDir.mkdir()) { 1072 newDir.deleteOnExit(); 1073 return newDir; 1074 } 1075 } 1076 final String errMsg = "Too many similar files around, cannot create " + "unique dir with prefix " + prefix 1077 + " in '" + inDir.getAbsolutePath() + "'."; 1078 log.warn(errMsg); 1079 throw new IOFailure(errMsg); 1080 } 1081 1082 /** 1083 * Read the last line in a file. Note this method is not UTF-8 safe. 1084 * 1085 * @param file input file to read last line from. 1086 * @return The last line in the file (ending newline is irrelevant), returns an empty string if file is empty. 1087 * @throws ArgumentNotValid on null argument, or file is not a readable file. 1088 * @throws IOFailure on IO trouble reading file. 
1089 */ 1090 public static String readLastLine(File file) { 1091 ArgumentNotValid.checkNotNull(file, "File file"); 1092 if (!file.isFile() || !file.canRead()) { 1093 final String errMsg = "File '" + file.getAbsolutePath() + "' is not a readable file."; 1094 log.warn(errMsg); 1095 throw new ArgumentNotValid(errMsg); 1096 } 1097 if (file.length() == 0) { 1098 return ""; 1099 } 1100 RandomAccessFile rafile = null; 1101 try { 1102 rafile = new RandomAccessFile(file, "r"); 1103 // seek to byte one before end of file (remember we know the file is 1104 // not empty) - this ensures that an ending newline is not read 1105 rafile.seek(rafile.length() - 2); 1106 // now search to the last linebreak, or beginning of file 1107 while (rafile.getFilePointer() != 0 && rafile.read() != '\n') { 1108 // search back two, because we just searched forward one to find 1109 // newline 1110 rafile.seek(rafile.getFilePointer() - 2); 1111 } 1112 return rafile.readLine(); 1113 } catch (IOException e) { 1114 final String errMsg = "Unable to access file '" + file.getAbsolutePath() + "'"; 1115 log.warn(errMsg, e); 1116 throw new IOFailure(errMsg, e); 1117 } finally { 1118 try { 1119 if (rafile != null) { 1120 rafile.close(); 1121 } 1122 } catch (IOException e) { 1123 log.debug("Unable to close file '{}' after reading", file.getAbsolutePath(), e); 1124 } 1125 } 1126 } 1127 1128 /** 1129 * Append the given lines to a file. Each lines is terminated by a newline. 1130 * 1131 * @param file A file to append to. 1132 * @param lines The lines to write. 1133 */ 1134 public static void appendToFile(File file, String... lines) { 1135 ArgumentNotValid.checkNotNull(file, "File file"); 1136 ArgumentNotValid.checkNotNull(lines, "String... 
lines"); 1137 1138 PrintWriter writer = null; 1139 int linesAppended = 0; 1140 try { 1141 boolean appendMode = true; 1142 writer = new PrintWriter(new FileWriter(file, appendMode)); 1143 for (String line : lines) { 1144 writer.println(line); 1145 linesAppended++; 1146 } 1147 } catch (IOException e) { 1148 log.warn("Error appending {} lines to file '{}'. Only appended {} lines. ", lines.length, 1149 file.getAbsolutePath(), linesAppended, e); 1150 } finally { 1151 if (writer != null) { 1152 writer.close(); 1153 } 1154 } 1155 } 1156 1157 /** 1158 * Loads an file from the class path (for retrieving a file from '.jar'). 1159 * 1160 * @param filePath The path of the file. 1161 * @return The file from the class path. 1162 * @throws IOFailure If resource cannot be retrieved from the class path. 1163 */ 1164 public static File getResourceFileFromClassPath(String filePath) throws IOFailure { 1165 ArgumentNotValid.checkNotNullOrEmpty(filePath, "String filePath"); 1166 try { 1167 // retrieve the file as a stream from the classpath. 1168 InputStream stream = Thread.currentThread().getContextClassLoader().getResourceAsStream(filePath); 1169 1170 if (stream != null) { 1171 // Make stream into file, and return it. 1172 File tmpFile = File.createTempFile("tmp", "tmp"); 1173 StreamUtils.copyInputStreamToOutputStream(stream, new FileOutputStream(tmpFile)); 1174 return tmpFile; 1175 } else { 1176 String msg = "The resource was not retrieved correctly from the class path: '" + filePath + "'"; 1177 log.trace(msg); 1178 throw new IOFailure(msg); 1179 } 1180 } catch (IOException e) { 1181 String msg = "Problems making stream of resource in class path into a file. Filepath: '" + filePath + "'"; 1182 log.warn(msg, e); 1183 throw new IOFailure(msg, e); 1184 } 1185 } 1186 1187 /** 1188 * Get a humanly readable representation of the file size. If the file is a directory, the size is the aggregate of 1189 * the files in the directory except that subdirectories are ignored. 
The number is given with 2 decimals. 1190 * 1191 * @param aFile a File object 1192 * @return a humanly readable representation of the file size (rounded) 1193 */ 1194 public static String getHumanReadableFileSize(File aFile) { 1195 ArgumentNotValid.checkNotNull(aFile, "File aFile"); 1196 final long bytesPerOneKilobyte = 1000L; 1197 final long bytesPerOneMegabyte = 1000000L; 1198 final long bytesPerOneGigabyte = 1000000000L; 1199 double filesize = 0L; 1200 if (aFile.isDirectory()) { 1201 for (File f : aFile.listFiles()) { 1202 if (f.isFile()) { 1203 filesize = filesize + f.length(); 1204 } 1205 } 1206 1207 } else { 1208 filesize = aFile.length(); // normal file. 1209 } 1210 1211 NumberFormat decFormat = new DecimalFormat("##.##"); 1212 if (filesize < bytesPerOneKilobyte) { 1213 // represent size in bytes without the ".0" 1214 return (long) filesize + " bytes"; 1215 } else if (filesize >= bytesPerOneKilobyte && filesize < bytesPerOneMegabyte) { 1216 // represent size in Kbytes 1217 return decFormat.format(filesize / bytesPerOneKilobyte) + " Kbytes"; 1218 } else if (filesize >= bytesPerOneMegabyte && filesize < bytesPerOneGigabyte) { 1219 // represent size in Mbytes 1220 return decFormat.format(filesize / bytesPerOneMegabyte) + " Mbytes"; 1221 } else { 1222 // represent in Gbytes 1223 return decFormat.format(filesize / bytesPerOneGigabyte) + " Gbytes"; 1224 } 1225 } 1226 1227 /** 1228 * @param aDir A directory 1229 * @return true, if the given directory contains files; else returns false 1230 */ 1231 public static boolean hasFiles(File aDir) { 1232 ArgumentNotValid.checkExistsDirectory(aDir, "aDir"); 1233 return (aDir.listFiles().length > 0); 1234 } 1235 1236}