001/* 002 * #%L 003 * Netarchivesuite - archive 004 * %% 005 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library, 006 * the National Library of France and the Austrian National Library. 007 * %% 008 * This program is free software: you can redistribute it and/or modify 009 * it under the terms of the GNU Lesser General Public License as 010 * published by the Free Software Foundation, either version 2.1 of the 011 * License, or (at your option) any later version. 012 * 013 * This program is distributed in the hope that it will be useful, 014 * but WITHOUT ANY WARRANTY; without even the implied warranty of 015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 016 * GNU General Lesser Public License for more details. 017 * 018 * You should have received a copy of the GNU General Lesser Public 019 * License along with this program. If not, see 020 * <http://www.gnu.org/licenses/lgpl-2.1.html>. 021 * #L% 022 */ 023package dk.netarkivet.archive.arcrepository.bitpreservation; 024 025import java.io.File; 026import java.util.ArrayList; 027import java.util.Collections; 028import java.util.Date; 029import java.util.HashMap; 030import java.util.HashSet; 031import java.util.List; 032import java.util.Map; 033import java.util.Set; 034 035import org.slf4j.Logger; 036import org.slf4j.LoggerFactory; 037 038import dk.netarkivet.archive.arcrepositoryadmin.AdminData; 039import dk.netarkivet.archive.arcrepositoryadmin.ArcRepositoryEntry; 040import dk.netarkivet.archive.arcrepositoryadmin.ReadOnlyAdminData; 041import dk.netarkivet.common.distribute.arcrepository.ArcRepositoryClientFactory; 042import dk.netarkivet.common.distribute.arcrepository.PreservationArcRepositoryClient; 043import dk.netarkivet.common.distribute.arcrepository.Replica; 044import dk.netarkivet.common.distribute.arcrepository.ReplicaStoreState; 045import dk.netarkivet.common.exceptions.ArgumentNotValid; 046import dk.netarkivet.common.exceptions.IOFailure; 047import dk.netarkivet.common.exceptions.IllegalState; 048import dk.netarkivet.common.exceptions.NetarkivetException; 049import dk.netarkivet.common.exceptions.NotImplementedException; 050import dk.netarkivet.common.exceptions.PermissionDenied; 051import dk.netarkivet.common.exceptions.UnknownID; 052import dk.netarkivet.common.utils.CleanupHook; 053import dk.netarkivet.common.utils.CleanupIF; 054import dk.netarkivet.common.utils.FileUtils; 055import dk.netarkivet.common.utils.StringUtils; 056import dk.netarkivet.common.utils.batch.ChecksumJob; 057 058/** 059 * Class handling integrity check of the arcrepository. 060 * <p> 061 * This class must run on the same machine as the arcrepository, as it uses the same admin data file (read-only). 062 * However, it still talks JMS with the arcrepository. 063 * 064 * @deprecated Use the DatabaseBasedActiveBitPreservation instead (define in the setting: 065 * <b>settings.archive.admin.class</b>). 066 */ 067@Deprecated 068public class FileBasedActiveBitPreservation implements ActiveBitPreservation, CleanupIF { 069 070 /** The class log. */ 071 private static final Logger log = LoggerFactory.getLogger(FileBasedActiveBitPreservation.class); 072 073 /** 074 * When replacing a broken file, the broken file is downloaded and stored in a temporary directory under 075 * Settings.COMMON_TEMP_DIR with this name. It can then be inspected at your leisure. 076 */ 077 private static final String REMOVED_FILES = "bitpreservation"; 078 079 /** 080 * The maximum size of logged collections. This is used either when a subcollection is extracted, or when objects 081 * are concatenated. Default value = 10. 082 */ 083 private static final int MAX_LIST_SIZE = 10; 084 085 /** 086 * This should be updated at the entrance of each major use block, to ensure it is reasonably in sync with the file. 087 * We cannot, however, guarantee total sync, as the file can change at any time. We consider it good enough that it 088 * is updated every time there is user interaction. 089 */ 090 private ReadOnlyAdminData admin; 091 092 /** 093 * File preservation is done in a singleton, which means that any user using the file preservation interface will 094 * update the same state. 095 * <p> 096 * Nothing breaks by two users simultaneously do bit preservation actions, but it may have undesirable consequences, 097 * such as two users simultaneously starting checksum jobs of the full archive. 098 */ 099 private static FileBasedActiveBitPreservation instance; 100 101 /** Hook to close down application. */ 102 private CleanupHook closeHook; 103 104 /** Initializes a FileBasedActiveBitPreservation instance. */ 105 protected FileBasedActiveBitPreservation() { 106 this.admin = AdminData.getReadOnlyInstance(); 107 this.closeHook = new CleanupHook(this); 108 Runtime.getRuntime().addShutdownHook(closeHook); 109 } 110 111 /** 112 * Get singleton instance. 113 * 114 * @return the singleton. 115 */ 116 public static synchronized FileBasedActiveBitPreservation getInstance() { 117 if (instance == null) { 118 instance = new FileBasedActiveBitPreservation(); 119 } 120 return instance; 121 } 122 123 /** 124 * Retrieve the preservation status for the files with the given filenames. This will ask for a fresh checksum from 125 * the bitarchives and admin data. 126 * 127 * @param filenames List of filenames 128 * @return a map ([filename]-> [FilePreservationState]) of the preservation status for the given files. The 129 * preservationstate is null, if the file named does not exist in admin data. 130 * @throws ArgumentNotValid If the list of filenames is null or contains a null. 131 */ 132 public Map<String, PreservationState> getPreservationStateMap(String... filenames) throws ArgumentNotValid { 133 ArgumentNotValid.checkNotNull(filenames, "String... filenames"); 134 // check, that the files are not empty strings 135 for (String file : filenames) { 136 ArgumentNotValid.checkNotNullOrEmpty(file, "String file"); 137 } 138 // Start by retrieving the admin status 139 admin.synchronize(); 140 141 // Temporary datastructures: 142 // adminInfo: A map ([filename]->[ArcRepositoryEntry]) to hold admindata 143 // info. Holds one entry for each of the files 144 // known by admin data. 145 // missingInAdminData: Contains the names of files that admindata just 146 // don't know. 147 Map<String, ArcRepositoryEntry> adminInfo = new HashMap<String, ArcRepositoryEntry>(); 148 Set<String> missingInAdmindata = new HashSet<String>(); 149 150 for (String filename : filenames) { 151 ArcRepositoryEntry ae = admin.getEntry(filename); 152 if (ae != null) { 153 adminInfo.put(filename, ae); 154 } else { 155 missingInAdmindata.add(filename); 156 } 157 } 158 159 if (missingInAdmindata.size() > 0) { 160 log.warn( 161 "The following {} files are unknown to admindata: {}", 162 missingInAdmindata.size(), 163 StringUtils.conjoin( 164 ",", 165 new ArrayList<String>(missingInAdmindata).subList(0, 166 Math.min(missingInAdmindata.size(), MAX_LIST_SIZE)))); 167 } 168 169 // filepreservationStates: map ([filename] -> [filepreservationstate]) 170 // This is the datastructure returned from this method 171 Map<String, PreservationState> filepreservationStates = new HashMap<String, PreservationState>(); 172 173 // Phase 1: Add null FilePreservationState entries for the files 174 // absent from admindata. 175 for (String missing : missingInAdmindata) { 176 filepreservationStates.put(missing, (FilePreservationState) null); 177 } 178 // Phase 2: For every filename present in admin data, 179 // construct a map ([replica] -> [list of checksums]). 180 // The resulting map: 181 // map ([filename] -> map ([replica] -> [list of checksums])). 182 // This takes a long time, as two batchjobs will be sent out to 183 // to the bitarchives to compute checksums for the files with these 184 // filenames. 185 Map<String, Map<Replica, List<String>>> checksumMaps = getChecksumMaps(adminInfo.keySet()); 186 187 // Phase 3: construct FilePreservationState objects for subset of 188 // filenames known by admin data. The rest of the filenames are 189 // represented with a null FilePreservationState object. 190 for (Map.Entry<String, ArcRepositoryEntry> entry : adminInfo.entrySet()) { 191 String filename = entry.getKey(); 192 ArcRepositoryEntry adminFileInfo = entry.getValue(); 193 filepreservationStates.put(filename, 194 new FilePreservationState(filename, adminFileInfo, checksumMaps.get(filename))); 195 } 196 return filepreservationStates; 197 } 198 199 /** 200 * Get the details of the state of the given file in the bitarchives and admin data. 201 * 202 * @param filename A given file 203 * @return the FilePreservationState for the given file. This will be null, if the filename is not found in admin 204 * data. 205 */ 206 public PreservationState getPreservationState(String filename) { 207 ArgumentNotValid.checkNotNullOrEmpty(filename, "String filename"); 208 Map<String, PreservationState> filepreservationStates = getPreservationStateMap(filename); 209 210 return filepreservationStates.get(filename); 211 } 212 213 /** 214 * Generate a map of checksums for these filenames in the bitarchives ( map ([filename] -> map ([replica] -> [list 215 * of checksums]))). This takes a long time, as a batchjob will be sent out to all the bitarchives to compute 216 * checksums for the files with these filenames. 217 * 218 * @param filenames The filenames to get the checksums for. 219 * @return Map containing the output of checksum jobs from the bitarchives. 220 */ 221 private Map<String, Map<Replica, List<String>>> getChecksumMaps(Set<String> filenames) { 222 223 // checksummaps: map ([filename] -> map ([replica] 224 // -> [list of checksums])). 225 // This datastructure will contain for each filename the computed 226 // checksums for the file with this filename on all replicas 227 // (bitarchives). 228 Map<String, Map<Replica, List<String>>> checksummaps = new HashMap<String, Map<Replica, List<String>>>(); 229 230 // Only make one checksum job for each replica 231 for (Replica rep : Replica.getKnown()) { 232 // Get the checksum information from Replica 'rep' as 233 // a map ([filename]->[list of checksums]). 234 Map<String, List<String>> checksums = getChecksums(rep, filenames); 235 log.debug("Adding checksums for replica '{}' for filenames: {}", rep, 236 StringUtils.conjoin(",", filenames, MAX_LIST_SIZE)); 237 238 for (String filename : filenames) { 239 // Update 'checksummaps' datastructure with the checksums 240 // received from Replica 'rep'. 241 242 // replicaMap: map ([replica] 243 // -> [list of checksums for one filename]). 244 Map<Replica, List<String>> replicaMap; 245 // Get current map in 'checksummaps' datastructure for filename, 246 // if it exists. Otherwise a new one is created, and 247 // stored. 248 if (checksummaps.containsKey(filename)) { 249 replicaMap = checksummaps.get(filename); 250 } else { 251 replicaMap = new HashMap<Replica, List<String>>(); 252 checksummaps.put(filename, replicaMap); 253 } 254 // Extract the list of checksums for the given filename from 255 // the 'checksums' datastructure. 256 List<String> checksumsForFileOnRep = checksums.get(filename); 257 if (checksumsForFileOnRep == null) { 258 // If no checksums for file was available on replica 'ba' 259 // just add an empty list of checksums. 260 checksumsForFileOnRep = new ArrayList<String>(); 261 } 262 // Add the list of checksums for the given file 263 // on replica 'rep' to datastructure 'replicaMap'. 264 replicaMap.put(rep, checksumsForFileOnRep); 265 } 266 } 267 return checksummaps; 268 } 269 270 /** 271 * Get the checksum of a list of files in a replica (map ([filename] -> map ([replica] -> [list of checksums])). 272 * <p> 273 * Note that this method runs a batch job on the bitarchives, and therefore may take a long time, depending on 274 * network delays. 275 * 276 * @param rep The replica to ask for checksums. 277 * @param filenames The names of the files to ask for checksums for. 278 * @return The MD5 checksums of the files, or the empty string if the file was not in the replica. 279 * @see ChecksumJob#parseLine(String) 280 */ 281 private Map<String, List<String>> getChecksums(Replica rep, Set<String> filenames) { 282 // initialise the resulting map. 283 Map<String, List<String>> res = new HashMap<String, List<String>>(); 284 285 try { 286 PreservationArcRepositoryClient arcClient = ArcRepositoryClientFactory.getPreservationInstance(); 287 // for each file extract the checksum through a checksum message 288 // and then put it into the resulting map. 289 for (String file : filenames) { 290 // retrieve the checksum from the replica. 291 String checksum = arcClient.getChecksum(rep.getId(), file); 292 293 // put the checksum into a list, or make empty list if the 294 // checksum was not retrieved. 295 List<String> csList; 296 if (checksum == null || checksum.isEmpty()) { 297 log.warn("The checksum for file '{}' from replica '{}' was invalid. Empty list returned", file, rep); 298 csList = Collections.<String>emptyList(); 299 } else { 300 csList = new ArrayList<String>(); 301 csList.add(checksum); 302 } 303 304 // put the filename and list into the map. 305 res.put(file, csList); 306 } 307 308 log.debug("The map from a checksum archive: " + res.toString()); 309 } catch (NetarkivetException e) { 310 // This is not critical. Log and continue. 311 log.warn("The retrieval of checksums from a checksum archive was not successful.", e); 312 } 313 314 return res; 315 } 316 317 /** 318 * Get a list of missing files in a given replica. 319 * 320 * @param replica A given replica. 321 * @return A list of missing files in a given replica. 322 * @throws IllegalState if the file with the list cannot be found. 323 * @throws ArgumentNotValid If the replica is null. 324 */ 325 public Iterable<String> getMissingFiles(Replica replica) throws IllegalState, ArgumentNotValid { 326 ArgumentNotValid.checkNotNull(replica, "Replica replica"); 327 File missingOutput = WorkFiles.getFile(replica, WorkFiles.MISSING_FILES_BA); 328 if (!missingOutput.exists()) { 329 throw new IllegalState("Could not find the file: " + missingOutput.getAbsolutePath()); 330 } 331 return FileUtils.readListFromFile(missingOutput); 332 } 333 334 /** 335 * This method takes as input the name of a replica for which we wish to retrieve the list of files, either through 336 * a FileListJob or a GetAllFilenamesMessage. It also reads in the known files in the arcrepository from the 337 * AdminData directory specified in the Setting DIRS_ARCREPOSITORY_ADMIN. The two file lists are compared and a 338 * subdirectory missingFiles is created with two unsorted files: 'missingba.txt' containing missing files, ie those 339 * registered in the admin data, but not found in the replica, and 'missingadmindata.txt' containing extra files, 340 * ie. those found in the replica but not in the arcrepository admin data. 341 * <p> 342 * TODO The second file is never used on the current implementation. 343 * <p> 344 * FIXME: It is unclear if the decision if which files are missing isn't better suited to be in getMissingFiles, so 345 * this method only runs the batch job. 346 * 347 * @param replica the replica to search for missing files 348 * @throws ArgumentNotValid If the given directory does not contain a file filelistOutput/sorted.txt, or the 349 * argument replica is null. 350 * @throws PermissionDenied If the output directory cannot be created. 351 */ 352 public void findMissingFiles(Replica replica) throws ArgumentNotValid, PermissionDenied { 353 ArgumentNotValid.checkNotNull(replica, "Replica replica"); 354 runFileListJob(replica); 355 log.trace("Finding missing files in directory '" + WorkFiles.getPreservationDir(replica) + "'"); 356 admin.synchronize(); 357 358 // Create set of file names from replica data 359 Set<String> filesInReplica = new HashSet<String>(WorkFiles.getLines(replica, WorkFiles.FILES_ON_BA)); 360 361 // Get set of files in arcrepository 362 Set<String> arcrepNameSet = admin.getAllFileNames(); 363 364 // Find difference set 1 (the files missing from the replica). 365 Set<String> extraFilesInAdminData = new HashSet<String>(arcrepNameSet); 366 extraFilesInAdminData.removeAll(filesInReplica); 367 368 // Log result 369 if (extraFilesInAdminData.size() > 0) { 370 log.warn("The " 371 + extraFilesInAdminData.size() 372 + " files '" 373 + new ArrayList<String>(extraFilesInAdminData).subList(0, 374 Math.min(extraFilesInAdminData.size(), MAX_LIST_SIZE)) 375 + "' are not present in the replica listing in '" 376 + WorkFiles.getPreservationDir(replica).getAbsolutePath() + "'"); 377 } 378 379 // Write output data 380 WorkFiles.write(replica, WorkFiles.MISSING_FILES_BA, extraFilesInAdminData); 381 382 // Find difference set 2 (the files missing in admin.data). 383 Set<String> extraFilesInRep = new HashSet<String>(filesInReplica); 384 extraFilesInRep.removeAll(arcrepNameSet); 385 386 // Log result 387 if (extraFilesInRep.size() > 0) { 388 log.warn("The " 389 + extraFilesInRep.size() 390 + " files '" 391 + new ArrayList<String>(extraFilesInRep).subList(0, Math.min(extraFilesInRep.size(), MAX_LIST_SIZE)) 392 + "' have been found in the replica listing in '" 393 + WorkFiles.getPreservationDir(replica).getAbsolutePath() + "' though they are not known by the " 394 + "system."); 395 } 396 397 // Write output data 398 WorkFiles.write(replica, WorkFiles.MISSING_FILES_ADMINDATA, extraFilesInRep); 399 log.trace("Finished finding missing files."); 400 } 401 402 /** 403 * Method to get a list of all files in a given bitarchive. The result is stored (unsorted) in the area specified by 404 * WorkFiles.FILES_ON_BA. 405 * 406 * @param replica the replica where the given bitarchive lies 407 * @throws PermissionDenied if the output directories cannot be created 408 * @throws IOFailure if there is a problem writing the output file, or if the job fails for some reason 409 * @throws UnknownID If the replica has an unknown replicaType. 410 */ 411 private void runFileListJob(Replica replica) throws IOFailure, UnknownID, PermissionDenied { 412 // Pick the right directory to output to 413 File batchOutputFile = WorkFiles.getFile(replica, WorkFiles.FILES_ON_BA); 414 log.trace("runFileListJob for replica '{}', output file '{}'", replica, batchOutputFile); 415 416 // Retrieve a file containing all the filenames of the replica through 417 // a GetAllFilenamesMessage 418 File filenames = ArcRepositoryClientFactory.getPreservationInstance().getAllFilenames(replica.getId()); 419 420 // copy the list of filenames to the output file. 421 FileUtils.copyFile(filenames, batchOutputFile); 422 } 423 424 /** 425 * Get a list of corrupt files in a given bitarchive. 426 * 427 * @param bitarchive a bitarchive 428 * @return a list of wrong files in a given bitarchive. 429 * @throws IllegalState if the file with the list cannot be found. 430 */ 431 public Iterable<String> getChangedFiles(Replica bitarchive) throws IllegalState { 432 ArgumentNotValid.checkNotNull(bitarchive, "Replica bitarchive"); 433 File wrongFilesOutput = WorkFiles.getFile(bitarchive, WorkFiles.WRONG_FILES); 434 435 if (!wrongFilesOutput.exists()) { 436 throw new IllegalState("Could not find the file: " + wrongFilesOutput.getAbsolutePath()); 437 } 438 439 // Create set of file names from bitarchive data 440 return FileUtils.readListFromFile(wrongFilesOutput); 441 } 442 443 /** 444 * This method finds out which files in a given bitarchive are misrepresented in the admin data: Either having the 445 * wrong checksum or not being marked as uploaded when it actually is. 446 * <p> 447 * It uses the admindata file from the DIRS_ARCREPOSITORY_ADMIN directory, as well as the files output by a 448 * runChecksumJob. The erroneous files are stored in files. 449 * <p> 450 * FIXME: It is unclear if the decision if which files are changed isn't better suited to be in getChangedFiles, so 451 * this method only runs the batch job. 452 * 453 * @param replica the bitarchive replica the checksumjob came from 454 * @throws IOFailure On file or network trouble. 455 * @throws PermissionDenied if the output directory cannot be created 456 * @throws ArgumentNotValid if argument replica is null 457 */ 458 public void findChangedFiles(Replica replica) throws IOFailure, PermissionDenied, ArgumentNotValid { 459 ArgumentNotValid.checkNotNull(replica, "Replica replica"); 460 runChecksumJob(replica); 461 admin.synchronize(); 462 463 // Create set of checksums from bitarchive data 464 Set<String> replicaChecksumSet = new HashSet<String>(WorkFiles.getLines(replica, WorkFiles.CHECKSUMS_ON_BA)); 465 466 // Get set of files in arcrepository 467 Set<String> arcrepChecksumSet = new HashSet<String>(); 468 for (String fileName : admin.getAllFileNames()) { 469 arcrepChecksumSet.add(ChecksumJob.makeLine(fileName, admin.getCheckSum(fileName))); 470 } 471 472 // Get set of completed files in arcrepository 473 // Note that these files use the format <filename>##<checksum> to 474 // conform to the checksum output. 475 Set<String> arcrepCompletedChecksumSet = new HashSet<String>(); 476 for (String fileName : admin.getAllFileNames(replica, ReplicaStoreState.UPLOAD_COMPLETED)) { 477 arcrepCompletedChecksumSet.add(ChecksumJob.makeLine(fileName, admin.getCheckSum(fileName))); 478 } 479 480 // Find files where checksums differ 481 Set<String> wrongChecksums = new HashSet<String>(replicaChecksumSet); 482 wrongChecksums.removeAll(arcrepChecksumSet); 483 484 // Find files where state is wrong 485 Set<String> wrongStates = new HashSet<String>(replicaChecksumSet); 486 wrongStates.removeAll(wrongChecksums); 487 wrongStates.removeAll(arcrepCompletedChecksumSet); 488 489 // Remove files unknown in admin data (note - these are not ignored, 490 // they will be handled by missing files operations) 491 for (String checksum : new ArrayList<String>(wrongChecksums)) { 492 Map.Entry<String, String> entry = ChecksumJob.parseLine(checksum); 493 if (!admin.hasEntry(entry.getKey())) { 494 wrongChecksums.remove(checksum); 495 wrongStates.remove(checksum); 496 } 497 } 498 499 // Log result 500 if (wrongChecksums.size() > 0) { 501 log.warn("The " + wrongChecksums.size() + " files '" 502 + new ArrayList<String>(wrongChecksums).subList(0, Math.min(wrongChecksums.size(), MAX_LIST_SIZE)) 503 + "' have wrong checksum in the bitarchive listing in '" 504 + WorkFiles.getPreservationDir(replica).getAbsolutePath() + "'"); 505 } 506 if (wrongStates.size() > 0) { 507 log.warn("The " + wrongStates.size() + " files '" 508 + new ArrayList<String>(wrongStates).subList(0, Math.min(wrongStates.size(), MAX_LIST_SIZE)) 509 + "' have wrong states in the bitarchive listing in '" 510 + WorkFiles.getPreservationDir(replica).getAbsolutePath() + "'"); 511 } 512 513 // Collect all names of files with the wrong checksum 514 Set<String> wrongChecksumFilenames = new HashSet<String>(); 515 for (String checksum : wrongChecksums) { 516 Map.Entry<String, String> entry = ChecksumJob.parseLine(checksum); 517 wrongChecksumFilenames.add(entry.getKey()); 518 } 519 520 // Collect all names of files with the wrong state 521 Set<String> wrongStateFilenames = new HashSet<String>(); 522 for (String checksum : wrongStates) { 523 Map.Entry<String, String> entry = ChecksumJob.parseLine(checksum); 524 wrongStateFilenames.add(entry.getKey()); 525 } 526 527 // Write output data to the files. 528 WorkFiles.write(replica, WorkFiles.WRONG_FILES, wrongChecksumFilenames); 529 WorkFiles.write(replica, WorkFiles.WRONG_STATES, wrongStateFilenames); 530 } 531 532 /** 533 * Runs a checksum job on if the replica is a bitarchive replica and sends a GetAllChecksumsMessage if the replica 534 * is a checksum replica. Output is written to file returned by WorkFiles.getChecksumOutputFile(replica). 535 * 536 * @param replica One of the bitarchive replicas. 537 * @throws IOFailure If unable to create output dirs or if unable to write/read output to files. 538 */ 539 private void runChecksumJob(Replica replica) throws IOFailure { 540 // Create directories for output 541 File outputFile = WorkFiles.getFile(replica, WorkFiles.CHECKSUMS_ON_BA); 542 543 // Retrieve a file containing the checksums of the replica through a 544 // GetAllChecksumsMessage. 545 File checksumFile = ArcRepositoryClientFactory.getPreservationInstance().getAllChecksums(replica.getId()); 546 547 // copy the resulting file to the output file. 548 FileUtils.copyFile(checksumFile, outputFile); 549 } 550 551 /** 552 * Return the number of files found in the replica. If nothing is known about the replica, -1 is returned. 553 * 554 * @param replica the bitarchive to check 555 * @return the number of files found in the bitarchive. If nothing is known about the bitarchive replica, -1 is 556 * returned. 557 * @throws ArgumentNotValid If the replica is null. 558 */ 559 public long getNumberOfFiles(Replica replica) throws ArgumentNotValid { 560 ArgumentNotValid.checkNotNull(replica, "Replica replica"); 561 File unsortedOutput = WorkFiles.getFile(replica, WorkFiles.FILES_ON_BA); 562 563 if (!unsortedOutput.exists()) { 564 return -1; 565 } 566 567 return FileUtils.countLines(unsortedOutput); 568 } 569 570 /** 571 * Get the number of missing files in a given replica. If nothing is known about the replica, -1 is returned. 572 * 573 * @param replica a given replica. 574 * @return the number of missing files in the given replica. If nothing is known about the replica, -1 is returned. 575 * @throws ArgumentNotValid If the replica is null. 576 */ 577 public long getNumberOfMissingFiles(Replica replica) throws ArgumentNotValid { 578 ArgumentNotValid.checkNotNull(replica, "Replica replica"); 579 580 File missingOutput = WorkFiles.getFile(replica, WorkFiles.MISSING_FILES_BA); 581 if (!missingOutput.exists()) { 582 return -1; 583 } 584 585 return FileUtils.countLines(missingOutput); 586 } 587 588 /** 589 * Get the number of wrong files for a replica. If nothing is known about the replica, -1 is returned. 590 * 591 * @param replica a replica. 592 * @return the number of wrong files for the replica. If nothing is known about the replica, -1 is returned. 593 * @throws ArgumentNotValid If the replica is null. 594 */ 595 public long getNumberOfChangedFiles(Replica replica) throws ArgumentNotValid { 596 ArgumentNotValid.checkNotNull(replica, "Replica bitarchive"); 597 File wrongFileOutput = WorkFiles.getFile(replica, WorkFiles.WRONG_FILES); 598 599 if (!wrongFileOutput.exists()) { 600 return -1; 601 } 602 603 return FileUtils.countLines(wrongFileOutput); 604 } 605 606 /** 607 * Get the date for last time the checksum information was updated for this replica. 608 * 609 * @param replica The replica to check last time for. 610 * @return The date for last check. Will return 1970-01-01 for never. 611 * @throws ArgumentNotValid If the replica is null. 612 */ 613 public Date getDateForChangedFiles(Replica replica) throws ArgumentNotValid { 614 ArgumentNotValid.checkNotNull(replica, "Replica replica"); 615 return WorkFiles.getLastUpdate(replica, WorkFiles.WRONG_FILES); 616 } 617 618 /** 619 * Get the date for last time the missing files information was updated for this replica. 620 * 621 * @param replica The replica to check last time for. 622 * @return The date for last check. Will return 1970-01-01 for never. 623 * @throws ArgumentNotValid If the replica is null. 624 */ 625 public Date getDateForMissingFiles(Replica replica) throws ArgumentNotValid { 626 ArgumentNotValid.checkNotNull(replica, "Replica replica"); 627 return WorkFiles.getLastUpdate(replica, WorkFiles.FILES_ON_BA); 628 } 629 630 /** 631 * Check that the files we want to restore are indeed missing on the replica, and present in admin data and the 632 * reference bitarchive. If so, upload missing files from reference replica to this replica. 633 * 634 * @param replica The replica to restore files to 635 * @param filenames The names of the files. 636 * @throws IllegalState If one of the files is unknown (For all known files, there will be an attempt at udpload) 637 * @throws IOFailure If some file cannot be reestablished. All files will be attempted, though. 638 * @throws ArgumentNotValid If the replica or the list of filenames are null. 639 */ 640 public void uploadMissingFiles(Replica replica, String... filenames) throws IOFailure, IllegalState, 641 ArgumentNotValid { 642 ArgumentNotValid.checkNotNull(replica, "Replica replica"); 643 ArgumentNotValid.checkNotNull(filenames, "String... filenames"); 644 645 // Contains all files that we couldn't reestablish 646 List<String> troubleNames = new ArrayList<String>(); 647 648 // preservationStates: map [filename]->[filepreservationstate] 649 // Initialized here to contain an entry for each filename in vargargs 650 // 'filenames'. 651 Map<String, PreservationState> preservationStates = getPreservationStateMap(filenames); 652 653 // For each given filename, try to reestablish it on 654 // Replica 'replica' 655 for (String fn : filenames) { 656 PreservationState fps = preservationStates.get(fn); 657 try { 658 if (fps == null) { 659 throw new IllegalState("No state known about '" + fn + "'"); 660 } 661 if (!fps.isAdminDataOk()) { 662 setAdminDataFailed(fn, replica); 663 admin.synchronize(); 664 fps = getPreservationState(fn); 665 if (fps == null) { 666 throw new IllegalState("No state known about '" + fn + "'"); 667 } 668 } 669 reestablishMissingFile(fn, replica, fps); 670 } catch (Exception e) { 671 log.warn("Trouble reestablishing file '{}' on replica {}", fn, replica.getName(), e); 672 troubleNames.add(fn); 673 } 674 } 675 if (troubleNames.size() > 0) { 676 throw new IOFailure("Could not reestablish all files. The following files were not reestablished: " 677 + troubleNames); 678 } 679 } 680 681 /** 682 * Reestablish a file missing in a replica. The following pre-conditions for reestablishing the file are checked 683 * before changing anything: 684 * <p> 685 * 1) the file is registered correctly in AdminData. <br> 686 * 2) the file is missing in the given replica. <br> 687 * 3) the file is present in another replica, which must be a bitarchive replica (the reference archive).<br> 688 * 4) admin data and the reference archive agree on the checksum of the file. 689 * 690 * @param fileName Name of the file to reestablish. 691 * @param damagedReplica Name of the replica missing the file. 692 * @param fps The FilePreservationStatus of the file to fix. 693 * @throws IOFailure On trouble updating the file. 694 */ 695 private void reestablishMissingFile(String fileName, Replica damagedReplica, PreservationState fps) 696 throws IOFailure { 697 log.debug("Reestablishing missing file '{}' in replica '{}'.", fileName, damagedReplica); 698 if (!satisfiesMissingFileConditions(fps, damagedReplica, fileName)) { 699 throw new IOFailure("Unable to reestablish missing file. '" + fileName + "'. It is not in the right state."); 700 } 701 // Retrieve the file from the reference archive (must be a bitarchive) 702 Replica referenceArchive = fps.getReferenceBitarchive(); 703 try { 704 PreservationArcRepositoryClient arcrep = ArcRepositoryClientFactory.getPreservationInstance(); 705 File tmpDir = FileUtils.createUniqueTempDir(FileUtils.getTempDir(), REMOVED_FILES); 706 File missingFile = new File(tmpDir, fileName); 707 arcrep.getFile(fileName, referenceArchive, missingFile); 708 arcrep.store(missingFile); 709 tmpDir.delete(); 710 } catch (IOFailure e) { 711 String errmsg = "Failed to reestablish '" + fileName + "' in '" + damagedReplica.getName() 712 + "' with copy from '" + referenceArchive + "'"; 713 log.warn(errmsg, e); 714 throw new IOFailure(errmsg, e); 715 } 716 log.info("Reestablished {} in {} with copy from {}", fileName, damagedReplica.getName(), 717 referenceArchive.getName()); 718 FileUtils.removeLineFromFile(fileName, WorkFiles.getFile(damagedReplica, WorkFiles.MISSING_FILES_BA)); 719 FileUtils.appendToFile(WorkFiles.getFile(damagedReplica, WorkFiles.FILES_ON_BA), fileName); 720 } 721 722 /** 723 * Checks the conditions that must be true before reestablishing a missing file. Returns true if and only if all of 724 * the below are true; returns false otherwise. 725 * <p> 726 * <p> 727 * 1) the file is registered correctly in AdminData.<br/> 728 * 2) the file is missing in the given bitarchive.<br/> 729 * 3) the file is present in another bitarchive (the reference archive). <br/> 730 * 4) admin data and the reference archive agree on the checksum. 731 * 732 * @param state the status for one file in the bitarchives. 733 * @param damagedReplica the replica where the file is corrupt or missing. 734 * @param fileName the name of the file being considered. 735 * @return true if all conditions are true, false otherwise. 736 */ 737 private boolean satisfiesMissingFileConditions(PreservationState state, Replica damagedReplica, String fileName) { 738 // condition 1 739 if (!state.isAdminDataOk()) { 740 log.warn("Admin.data is not consistent regarding file '{}'", fileName); 741 return false; 742 } 743 // condition 2 744 if (!state.fileIsMissing(damagedReplica)) { 745 log.warn("File '{}' is not missing in bitarchive on replica '{}'.", fileName, damagedReplica.getName()); 746 return false; 747 } 748 // conditions 3 and 4 749 Replica referenceArchive = state.getReferenceBitarchive(); 750 if (referenceArchive == null) { 751 log.warn("No correct version of file '{}' exists in any archive", fileName); 752 return false; 753 } 754 return true; 755 } 756 757 /** 758 * Calls upon the arcrepository to change the known state for the given file in one replica. This method uses JMS 759 * and blocks until a reply is sent. We don't wait for an acknowledgement that admin data indeed has been updated. 760 * 761 * @param filename The file to change state for 762 * @param rep The replica to change state for the file for. 763 * @throws ArgumentNotValid if arguments are null or empty strings 764 */ 765 private void setAdminDataFailed(String filename, Replica rep) throws ArgumentNotValid { 766 ArgumentNotValid.checkNotNullOrEmpty(filename, "String filename"); 767 ArgumentNotValid.checkNotNull(rep, "Replica rep"); 768 769 ArcRepositoryClientFactory.getPreservationInstance().updateAdminData(filename, rep.getId(), 770 ReplicaStoreState.UPLOAD_FAILED); 771 } 772 773 /** 774 * Check that file checksum is indeed different to admin data and reference replica. If so, remove missing file and 775 * upload it from reference replica to this replica. 776 * 777 * @param replica The replica to restore file to 778 * @param filename The name of the file. 779 * @param credentials The credentials used to perform this replace operation 780 * @param checksum The expected checksum. 781 * @throws IOFailure if the file cannot be reestablished 782 * @throws PermissionDenied if the file is not in correct state 783 * @throws ArgumentNotValid If the filename, the credentials or the checksum either are null or contain the empty 784 * string, or if the replica is null. 785 */ 786 public void replaceChangedFile(Replica replica, String filename, String credentials, String checksum) 787 throws ArgumentNotValid, IOFailure, PermissionDenied { 788 ArgumentNotValid.checkNotNull(replica, "Replica replica"); 789 ArgumentNotValid.checkNotNullOrEmpty(filename, "String filename"); 790 ArgumentNotValid.checkNotNullOrEmpty(checksum, "String checksum"); 791 ArgumentNotValid.checkNotNullOrEmpty(credentials, "String credentials"); 792 793 // Send a correct message to the archive. 794 correctArchiveEntry(replica, filename, checksum, credentials); 795 } 796 797 /** 798 * Method for correcting a corrupt entry in an archive. This message is handled different for the different replicas 799 * 800 * @param replica The replica which contains the bad entry. 801 * @param filename The name of the file. 802 * @param checksum The checksum of the bad entry. 803 * @param credentials The credentials for correcting the bad entry. 804 */ 805 private void correctArchiveEntry(Replica replica, String filename, String checksum, String credentials) { 806 // get the preservation state. 807 Map<String, PreservationState> preservationStates = getPreservationStateMap(filename); 808 PreservationState fps = preservationStates.get(filename); 809 810 // Use the preservation state to find a reference archive (bitarchive). 811 Replica referenceArchive = fps.getReferenceBitarchive(); 812 813 // Get the arc repository client and a temporary file 814 PreservationArcRepositoryClient arcrepClient = ArcRepositoryClientFactory.getPreservationInstance(); 815 File tmpDir = FileUtils.createUniqueTempDir(FileUtils.getTempDir(), REMOVED_FILES); 816 File missingFile = new File(tmpDir, filename); 817 818 // retrieve a good copy of the file 819 arcrepClient.getFile(filename, referenceArchive, missingFile); 820 821 // correct the bad entry in the archive with the retrieved good copy. 822 arcrepClient.correct(replica.getId(), checksum, missingFile, credentials); 823 824 // cleanup afterwards. 825 tmpDir.delete(); 826 } 827 828 /** 829 * Return a list of files present in bitarchive but missing in AdminData. 830 * 831 * @return A list of missing files. 832 * @throws NotImplementedException Always, since this will not been implemented. 833 */ 834 public Iterable<String> getMissingFilesForAdminData() throws NotImplementedException { 835 throw new NotImplementedException("Not to be implemented"); 836 } 837 838 /** 839 * Return a list of files with wrong checksum or status in admin data. 840 * 841 * @return A list of files with wrong checksum or status. 842 * @throws NotImplementedException Always, since this will not been implemented. 843 */ 844 public Iterable<String> getChangedFilesForAdminData() throws NotImplementedException { 845 throw new NotImplementedException("Not to be implemented"); 846 } 847 848 /** 849 * Reestablish admin data to match bitarchive states for files. 850 * 851 * @param filenames The files to reestablish state for. 852 * @throws NotImplementedException Always, since this will not been implemented. 853 * @throws ArgumentNotValid If the list of filenames are null. 854 */ 855 public void addMissingFilesToAdminData(String... filenames) throws NotImplementedException, ArgumentNotValid { 856 ArgumentNotValid.checkNotNull(filenames, "String... filenames"); 857 // TODO implement method 858 throw new NotImplementedException("Not to be implemented"); 859 } 860 861 /** 862 * Reestablish admin data to match replica states for file. 863 * 864 * @param filename The file to reestablish state for. 865 * @throws PermissionDenied if the file is not in correct state 866 * @throws ArgumentNotValid If the filename is null or empty. 867 */ 868 public void changeStateForAdminData(String filename) throws PermissionDenied, ArgumentNotValid { 869 ArgumentNotValid.checkNotNullOrEmpty(filename, "String filename"); 870 admin.synchronize(); 871 PreservationState fps = getPreservationState(filename); 872 String checksum = fps.getReferenceCheckSum(); 873 if (checksum == null || checksum.isEmpty()) { 874 throw new PermissionDenied("No correct checksum for '" + filename + "'"); 875 } 876 if (!admin.getCheckSum(filename).equals(checksum)) { 877 ArcRepositoryClientFactory.getPreservationInstance().updateAdminChecksum(filename, checksum); 878 } 879 for (Replica rep : Replica.getKnown()) { 880 if (fps.getUniqueChecksum(rep).equals(admin.getCheckSum(filename))) { 881 FileUtils.removeLineFromFile(filename, WorkFiles.getFile(rep, WorkFiles.WRONG_FILES)); 882 } 883 } 884 } 885 886 /** Shut down cleanly. */ 887 public void close() { 888 if (closeHook != null) { 889 Runtime.getRuntime().removeShutdownHook(closeHook); 890 } 891 closeHook = null; 892 cleanup(); 893 } 894 895 /** @see CleanupIF#cleanup() */ 896 public void cleanup() { 897 // In case a listener was set up, remove it. 898 ArcRepositoryClientFactory.getPreservationInstance().close(); 899 instance = null; 900 } 901 902}