001/*
002 * #%L
003 * Netarchivesuite - archive
004 * %%
005 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
006 *             the National Library of France and the Austrian National Library.
007 * %%
008 * This program is free software: you can redistribute it and/or modify
009 * it under the terms of the GNU Lesser General Public License as
010 * published by the Free Software Foundation, either version 2.1 of the
011 * License, or (at your option) any later version.
012 * 
013 * This program is distributed in the hope that it will be useful,
014 * but WITHOUT ANY WARRANTY; without even the implied warranty of
015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
016 * GNU General Lesser Public License for more details.
017 * 
018 * You should have received a copy of the GNU General Lesser Public
019 * License along with this program.  If not, see
020 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
021 * #L%
022 */
023package dk.netarkivet.archive.arcrepository.bitpreservation;
024
025import java.io.File;
026import java.util.ArrayList;
027import java.util.Collections;
028import java.util.Date;
029import java.util.HashMap;
030import java.util.HashSet;
031import java.util.List;
032import java.util.Map;
033import java.util.Set;
034
035import org.slf4j.Logger;
036import org.slf4j.LoggerFactory;
037
038import dk.netarkivet.archive.arcrepositoryadmin.AdminData;
039import dk.netarkivet.archive.arcrepositoryadmin.ArcRepositoryEntry;
040import dk.netarkivet.archive.arcrepositoryadmin.ReadOnlyAdminData;
041import dk.netarkivet.common.distribute.arcrepository.ArcRepositoryClientFactory;
042import dk.netarkivet.common.distribute.arcrepository.PreservationArcRepositoryClient;
043import dk.netarkivet.common.distribute.arcrepository.Replica;
044import dk.netarkivet.common.distribute.arcrepository.ReplicaStoreState;
045import dk.netarkivet.common.exceptions.ArgumentNotValid;
046import dk.netarkivet.common.exceptions.IOFailure;
047import dk.netarkivet.common.exceptions.IllegalState;
048import dk.netarkivet.common.exceptions.NetarkivetException;
049import dk.netarkivet.common.exceptions.NotImplementedException;
050import dk.netarkivet.common.exceptions.PermissionDenied;
051import dk.netarkivet.common.exceptions.UnknownID;
052import dk.netarkivet.common.utils.CleanupHook;
053import dk.netarkivet.common.utils.CleanupIF;
054import dk.netarkivet.common.utils.FileUtils;
055import dk.netarkivet.common.utils.StringUtils;
056import dk.netarkivet.common.utils.batch.ChecksumJob;
057
058/**
059 * Class handling integrity check of the arcrepository.
060 * <p>
061 * This class must run on the same machine as the arcrepository, as it uses the same admin data file (read-only).
062 * However, it still talks JMS with the arcrepository.
063 *
064 * @deprecated Use the DatabaseBasedActiveBitPreservation instead (define in the setting:
065 * <b>settings.archive.admin.class</b>).
066 */
067@Deprecated
068public class FileBasedActiveBitPreservation implements ActiveBitPreservation, CleanupIF {
069
070    /** The class log. */
071    private static final Logger log = LoggerFactory.getLogger(FileBasedActiveBitPreservation.class);
072
073    /**
074     * When replacing a broken file, the broken file is downloaded and stored in a temporary directory under
075     * Settings.COMMON_TEMP_DIR with this name. It can then be inspected at your leisure.
076     */
077    private static final String REMOVED_FILES = "bitpreservation";
078
079    /**
080     * The maximum size of logged collections. This is used either when a subcollection is extracted, or when objects
081     * are concatenated. Default value = 10.
082     */
083    private static final int MAX_LIST_SIZE = 10;
084
085    /**
086     * This should be updated at the entrance of each major use block, to ensure it is reasonably in sync with the file.
087     * We cannot, however, guarantee total sync, as the file can change at any time. We consider it good enough that it
088     * is updated every time there is user interaction.
089     */
090    private ReadOnlyAdminData admin;
091
092    /**
093     * File preservation is done in a singleton, which means that any user using the file preservation interface will
094     * update the same state.
095     * <p>
     * Nothing breaks if two users simultaneously perform bit preservation actions, but it may have undesirable
     * consequences, such as two users simultaneously starting checksum jobs of the full archive.
098     */
099    private static FileBasedActiveBitPreservation instance;
100
101    /** Hook to close down application. */
102    private CleanupHook closeHook;
103
104    /** Initializes a FileBasedActiveBitPreservation instance. */
105    protected FileBasedActiveBitPreservation() {
106        this.admin = AdminData.getReadOnlyInstance();
107        this.closeHook = new CleanupHook(this);
108        Runtime.getRuntime().addShutdownHook(closeHook);
109    }
110
111    /**
112     * Get singleton instance.
113     *
114     * @return the singleton.
115     */
116    public static synchronized FileBasedActiveBitPreservation getInstance() {
117        if (instance == null) {
118            instance = new FileBasedActiveBitPreservation();
119        }
120        return instance;
121    }
122
123    /**
124     * Retrieve the preservation status for the files with the given filenames. This will ask for a fresh checksum from
125     * the bitarchives and admin data.
126     *
127     * @param filenames List of filenames
128     * @return a map ([filename]-> [FilePreservationState]) of the preservation status for the given files. The
129     * preservationstate is null, if the file named does not exist in admin data.
130     * @throws ArgumentNotValid If the list of filenames is null or contains a null.
131     */
132    public Map<String, PreservationState> getPreservationStateMap(String... filenames) throws ArgumentNotValid {
133        ArgumentNotValid.checkNotNull(filenames, "String... filenames");
134        // check, that the files are not empty strings
135        for (String file : filenames) {
136            ArgumentNotValid.checkNotNullOrEmpty(file, "String file");
137        }
138        // Start by retrieving the admin status
139        admin.synchronize();
140
141        // Temporary datastructures:
142        // adminInfo: A map ([filename]->[ArcRepositoryEntry]) to hold admindata
143        // info. Holds one entry for each of the files
144        // known by admin data.
145        // missingInAdminData: Contains the names of files that admindata just
146        // don't know.
147        Map<String, ArcRepositoryEntry> adminInfo = new HashMap<String, ArcRepositoryEntry>();
148        Set<String> missingInAdmindata = new HashSet<String>();
149
150        for (String filename : filenames) {
151            ArcRepositoryEntry ae = admin.getEntry(filename);
152            if (ae != null) {
153                adminInfo.put(filename, ae);
154            } else {
155                missingInAdmindata.add(filename);
156            }
157        }
158
159        if (missingInAdmindata.size() > 0) {
160            log.warn(
161                    "The following {} files are unknown to admindata: {}",
162                    missingInAdmindata.size(),
163                    StringUtils.conjoin(
164                            ",",
165                            new ArrayList<String>(missingInAdmindata).subList(0,
166                                    Math.min(missingInAdmindata.size(), MAX_LIST_SIZE))));
167        }
168
169        // filepreservationStates: map ([filename] -> [filepreservationstate])
170        // This is the datastructure returned from this method
171        Map<String, PreservationState> filepreservationStates = new HashMap<String, PreservationState>();
172
173        // Phase 1: Add null FilePreservationState entries for the files
174        // absent from admindata.
175        for (String missing : missingInAdmindata) {
176            filepreservationStates.put(missing, (FilePreservationState) null);
177        }
178        // Phase 2: For every filename present in admin data,
179        // construct a map ([replica] -> [list of checksums]).
180        // The resulting map:
181        // map ([filename] -> map ([replica] -> [list of checksums])).
182        // This takes a long time, as two batchjobs will be sent out to
183        // to the bitarchives to compute checksums for the files with these
184        // filenames.
185        Map<String, Map<Replica, List<String>>> checksumMaps = getChecksumMaps(adminInfo.keySet());
186
187        // Phase 3: construct FilePreservationState objects for subset of
188        // filenames known by admin data. The rest of the filenames are
189        // represented with a null FilePreservationState object.
190        for (Map.Entry<String, ArcRepositoryEntry> entry : adminInfo.entrySet()) {
191            String filename = entry.getKey();
192            ArcRepositoryEntry adminFileInfo = entry.getValue();
193            filepreservationStates.put(filename,
194                    new FilePreservationState(filename, adminFileInfo, checksumMaps.get(filename)));
195        }
196        return filepreservationStates;
197    }
198
199    /**
200     * Get the details of the state of the given file in the bitarchives and admin data.
201     *
202     * @param filename A given file
203     * @return the FilePreservationState for the given file. This will be null, if the filename is not found in admin
204     * data.
205     */
206    public PreservationState getPreservationState(String filename) {
207        ArgumentNotValid.checkNotNullOrEmpty(filename, "String filename");
208        Map<String, PreservationState> filepreservationStates = getPreservationStateMap(filename);
209
210        return filepreservationStates.get(filename);
211    }
212
213    /**
214     * Generate a map of checksums for these filenames in the bitarchives ( map ([filename] -> map ([replica] -> [list
215     * of checksums]))). This takes a long time, as a batchjob will be sent out to all the bitarchives to compute
216     * checksums for the files with these filenames.
217     *
218     * @param filenames The filenames to get the checksums for.
219     * @return Map containing the output of checksum jobs from the bitarchives.
220     */
221    private Map<String, Map<Replica, List<String>>> getChecksumMaps(Set<String> filenames) {
222
223        // checksummaps: map ([filename] -> map ([replica]
224        // -> [list of checksums])).
225        // This datastructure will contain for each filename the computed
226        // checksums for the file with this filename on all replicas
227        // (bitarchives).
228        Map<String, Map<Replica, List<String>>> checksummaps = new HashMap<String, Map<Replica, List<String>>>();
229
230        // Only make one checksum job for each replica
231        for (Replica rep : Replica.getKnown()) {
232            // Get the checksum information from Replica 'rep' as
233            // a map ([filename]->[list of checksums]).
234            Map<String, List<String>> checksums = getChecksums(rep, filenames);
235            log.debug("Adding checksums for replica '{}' for filenames: {}", rep,
236                    StringUtils.conjoin(",", filenames, MAX_LIST_SIZE));
237
238            for (String filename : filenames) {
239                // Update 'checksummaps' datastructure with the checksums
240                // received from Replica 'rep'.
241
242                // replicaMap: map ([replica]
243                // -> [list of checksums for one filename]).
244                Map<Replica, List<String>> replicaMap;
245                // Get current map in 'checksummaps' datastructure for filename,
246                // if it exists. Otherwise a new one is created, and
247                // stored.
248                if (checksummaps.containsKey(filename)) {
249                    replicaMap = checksummaps.get(filename);
250                } else {
251                    replicaMap = new HashMap<Replica, List<String>>();
252                    checksummaps.put(filename, replicaMap);
253                }
254                // Extract the list of checksums for the given filename from
255                // the 'checksums' datastructure.
256                List<String> checksumsForFileOnRep = checksums.get(filename);
257                if (checksumsForFileOnRep == null) {
258                    // If no checksums for file was available on replica 'ba'
259                    // just add an empty list of checksums.
260                    checksumsForFileOnRep = new ArrayList<String>();
261                }
262                // Add the list of checksums for the given file
263                // on replica 'rep' to datastructure 'replicaMap'.
264                replicaMap.put(rep, checksumsForFileOnRep);
265            }
266        }
267        return checksummaps;
268    }
269
270    /**
271     * Get the checksum of a list of files in a replica (map ([filename] -> map ([replica] -> [list of checksums])).
272     * <p>
273     * Note that this method runs a batch job on the bitarchives, and therefore may take a long time, depending on
274     * network delays.
275     *
276     * @param rep The replica to ask for checksums.
277     * @param filenames The names of the files to ask for checksums for.
278     * @return The MD5 checksums of the files, or the empty string if the file was not in the replica.
279     * @see ChecksumJob#parseLine(String)
280     */
281    private Map<String, List<String>> getChecksums(Replica rep, Set<String> filenames) {
282        // initialise the resulting map.
283        Map<String, List<String>> res = new HashMap<String, List<String>>();
284
285        try {
286            PreservationArcRepositoryClient arcClient = ArcRepositoryClientFactory.getPreservationInstance();
287            // for each file extract the checksum through a checksum message
288            // and then put it into the resulting map.
289            for (String file : filenames) {
290                // retrieve the checksum from the replica.
291                String checksum = arcClient.getChecksum(rep.getId(), file);
292
293                // put the checksum into a list, or make empty list if the
294                // checksum was not retrieved.
295                List<String> csList;
296                if (checksum == null || checksum.isEmpty()) {
297                    log.warn("The checksum for file '{}' from replica '{}' was invalid. Empty list returned", file, rep);
298                    csList = Collections.<String>emptyList();
299                } else {
300                    csList = new ArrayList<String>();
301                    csList.add(checksum);
302                }
303
304                // put the filename and list into the map.
305                res.put(file, csList);
306            }
307
308            log.debug("The map from a checksum archive: " + res.toString());
309        } catch (NetarkivetException e) {
310            // This is not critical. Log and continue.
311            log.warn("The retrieval of checksums from a checksum archive was not successful.", e);
312        }
313
314        return res;
315    }
316
317    /**
318     * Get a list of missing files in a given replica.
319     *
320     * @param replica A given replica.
321     * @return A list of missing files in a given replica.
322     * @throws IllegalState if the file with the list cannot be found.
323     * @throws ArgumentNotValid If the replica is null.
324     */
325    public Iterable<String> getMissingFiles(Replica replica) throws IllegalState, ArgumentNotValid {
326        ArgumentNotValid.checkNotNull(replica, "Replica replica");
327        File missingOutput = WorkFiles.getFile(replica, WorkFiles.MISSING_FILES_BA);
328        if (!missingOutput.exists()) {
329            throw new IllegalState("Could not find the file: " + missingOutput.getAbsolutePath());
330        }
331        return FileUtils.readListFromFile(missingOutput);
332    }
333
334    /**
335     * This method takes as input the name of a replica for which we wish to retrieve the list of files, either through
336     * a FileListJob or a GetAllFilenamesMessage. It also reads in the known files in the arcrepository from the
337     * AdminData directory specified in the Setting DIRS_ARCREPOSITORY_ADMIN. The two file lists are compared and a
338     * subdirectory missingFiles is created with two unsorted files: 'missingba.txt' containing missing files, ie those
339     * registered in the admin data, but not found in the replica, and 'missingadmindata.txt' containing extra files,
340     * ie. those found in the replica but not in the arcrepository admin data.
341     * <p>
342     * TODO The second file is never used on the current implementation.
343     * <p>
344     * FIXME: It is unclear if the decision if which files are missing isn't better suited to be in getMissingFiles, so
345     * this method only runs the batch job.
346     *
347     * @param replica the replica to search for missing files
348     * @throws ArgumentNotValid If the given directory does not contain a file filelistOutput/sorted.txt, or the
349     * argument replica is null.
350     * @throws PermissionDenied If the output directory cannot be created.
351     */
352    public void findMissingFiles(Replica replica) throws ArgumentNotValid, PermissionDenied {
353        ArgumentNotValid.checkNotNull(replica, "Replica replica");
354        runFileListJob(replica);
355        log.trace("Finding missing files in directory '" + WorkFiles.getPreservationDir(replica) + "'");
356        admin.synchronize();
357
358        // Create set of file names from replica data
359        Set<String> filesInReplica = new HashSet<String>(WorkFiles.getLines(replica, WorkFiles.FILES_ON_BA));
360
361        // Get set of files in arcrepository
362        Set<String> arcrepNameSet = admin.getAllFileNames();
363
364        // Find difference set 1 (the files missing from the replica).
365        Set<String> extraFilesInAdminData = new HashSet<String>(arcrepNameSet);
366        extraFilesInAdminData.removeAll(filesInReplica);
367
368        // Log result
369        if (extraFilesInAdminData.size() > 0) {
370            log.warn("The "
371                    + extraFilesInAdminData.size()
372                    + " files '"
373                    + new ArrayList<String>(extraFilesInAdminData).subList(0,
374                            Math.min(extraFilesInAdminData.size(), MAX_LIST_SIZE))
375                    + "' are not present in the replica listing in '"
376                    + WorkFiles.getPreservationDir(replica).getAbsolutePath() + "'");
377        }
378
379        // Write output data
380        WorkFiles.write(replica, WorkFiles.MISSING_FILES_BA, extraFilesInAdminData);
381
382        // Find difference set 2 (the files missing in admin.data).
383        Set<String> extraFilesInRep = new HashSet<String>(filesInReplica);
384        extraFilesInRep.removeAll(arcrepNameSet);
385
386        // Log result
387        if (extraFilesInRep.size() > 0) {
388            log.warn("The "
389                    + extraFilesInRep.size()
390                    + " files '"
391                    + new ArrayList<String>(extraFilesInRep).subList(0, Math.min(extraFilesInRep.size(), MAX_LIST_SIZE))
392                    + "' have been found in the replica listing in '"
393                    + WorkFiles.getPreservationDir(replica).getAbsolutePath() + "' though they are not known by the "
394                    + "system.");
395        }
396
397        // Write output data
398        WorkFiles.write(replica, WorkFiles.MISSING_FILES_ADMINDATA, extraFilesInRep);
399        log.trace("Finished finding missing files.");
400    }
401
402    /**
403     * Method to get a list of all files in a given bitarchive. The result is stored (unsorted) in the area specified by
404     * WorkFiles.FILES_ON_BA.
405     *
406     * @param replica the replica where the given bitarchive lies
407     * @throws PermissionDenied if the output directories cannot be created
408     * @throws IOFailure if there is a problem writing the output file, or if the job fails for some reason
409     * @throws UnknownID If the replica has an unknown replicaType.
410     */
411    private void runFileListJob(Replica replica) throws IOFailure, UnknownID, PermissionDenied {
412        // Pick the right directory to output to
413        File batchOutputFile = WorkFiles.getFile(replica, WorkFiles.FILES_ON_BA);
414        log.trace("runFileListJob for replica '{}', output file '{}'", replica, batchOutputFile);
415
416        // Retrieve a file containing all the filenames of the replica through
417        // a GetAllFilenamesMessage
418        File filenames = ArcRepositoryClientFactory.getPreservationInstance().getAllFilenames(replica.getId());
419
420        // copy the list of filenames to the output file.
421        FileUtils.copyFile(filenames, batchOutputFile);
422    }
423
424    /**
425     * Get a list of corrupt files in a given bitarchive.
426     *
427     * @param bitarchive a bitarchive
428     * @return a list of wrong files in a given bitarchive.
429     * @throws IllegalState if the file with the list cannot be found.
430     */
431    public Iterable<String> getChangedFiles(Replica bitarchive) throws IllegalState {
432        ArgumentNotValid.checkNotNull(bitarchive, "Replica bitarchive");
433        File wrongFilesOutput = WorkFiles.getFile(bitarchive, WorkFiles.WRONG_FILES);
434
435        if (!wrongFilesOutput.exists()) {
436            throw new IllegalState("Could not find the file: " + wrongFilesOutput.getAbsolutePath());
437        }
438
439        // Create set of file names from bitarchive data
440        return FileUtils.readListFromFile(wrongFilesOutput);
441    }
442
443    /**
444     * This method finds out which files in a given bitarchive are misrepresented in the admin data: Either having the
445     * wrong checksum or not being marked as uploaded when it actually is.
446     * <p>
447     * It uses the admindata file from the DIRS_ARCREPOSITORY_ADMIN directory, as well as the files output by a
448     * runChecksumJob. The erroneous files are stored in files.
449     * <p>
450     * FIXME: It is unclear if the decision if which files are changed isn't better suited to be in getChangedFiles, so
451     * this method only runs the batch job.
452     *
453     * @param replica the bitarchive replica the checksumjob came from
454     * @throws IOFailure On file or network trouble.
455     * @throws PermissionDenied if the output directory cannot be created
456     * @throws ArgumentNotValid if argument replica is null
457     */
458    public void findChangedFiles(Replica replica) throws IOFailure, PermissionDenied, ArgumentNotValid {
459        ArgumentNotValid.checkNotNull(replica, "Replica replica");
460        runChecksumJob(replica);
461        admin.synchronize();
462
463        // Create set of checksums from bitarchive data
464        Set<String> replicaChecksumSet = new HashSet<String>(WorkFiles.getLines(replica, WorkFiles.CHECKSUMS_ON_BA));
465
466        // Get set of files in arcrepository
467        Set<String> arcrepChecksumSet = new HashSet<String>();
468        for (String fileName : admin.getAllFileNames()) {
469            arcrepChecksumSet.add(ChecksumJob.makeLine(fileName, admin.getCheckSum(fileName)));
470        }
471
472        // Get set of completed files in arcrepository
473        // Note that these files use the format <filename>##<checksum> to
474        // conform to the checksum output.
475        Set<String> arcrepCompletedChecksumSet = new HashSet<String>();
476        for (String fileName : admin.getAllFileNames(replica, ReplicaStoreState.UPLOAD_COMPLETED)) {
477            arcrepCompletedChecksumSet.add(ChecksumJob.makeLine(fileName, admin.getCheckSum(fileName)));
478        }
479
480        // Find files where checksums differ
481        Set<String> wrongChecksums = new HashSet<String>(replicaChecksumSet);
482        wrongChecksums.removeAll(arcrepChecksumSet);
483
484        // Find files where state is wrong
485        Set<String> wrongStates = new HashSet<String>(replicaChecksumSet);
486        wrongStates.removeAll(wrongChecksums);
487        wrongStates.removeAll(arcrepCompletedChecksumSet);
488
489        // Remove files unknown in admin data (note - these are not ignored,
490        // they will be handled by missing files operations)
491        for (String checksum : new ArrayList<String>(wrongChecksums)) {
492            Map.Entry<String, String> entry = ChecksumJob.parseLine(checksum);
493            if (!admin.hasEntry(entry.getKey())) {
494                wrongChecksums.remove(checksum);
495                wrongStates.remove(checksum);
496            }
497        }
498
499        // Log result
500        if (wrongChecksums.size() > 0) {
501            log.warn("The " + wrongChecksums.size() + " files '"
502                    + new ArrayList<String>(wrongChecksums).subList(0, Math.min(wrongChecksums.size(), MAX_LIST_SIZE))
503                    + "' have wrong checksum in the bitarchive listing in '"
504                    + WorkFiles.getPreservationDir(replica).getAbsolutePath() + "'");
505        }
506        if (wrongStates.size() > 0) {
507            log.warn("The " + wrongStates.size() + " files '"
508                    + new ArrayList<String>(wrongStates).subList(0, Math.min(wrongStates.size(), MAX_LIST_SIZE))
509                    + "' have wrong states in the bitarchive listing in '"
510                    + WorkFiles.getPreservationDir(replica).getAbsolutePath() + "'");
511        }
512
513        // Collect all names of files with the wrong checksum
514        Set<String> wrongChecksumFilenames = new HashSet<String>();
515        for (String checksum : wrongChecksums) {
516            Map.Entry<String, String> entry = ChecksumJob.parseLine(checksum);
517            wrongChecksumFilenames.add(entry.getKey());
518        }
519
520        // Collect all names of files with the wrong state
521        Set<String> wrongStateFilenames = new HashSet<String>();
522        for (String checksum : wrongStates) {
523            Map.Entry<String, String> entry = ChecksumJob.parseLine(checksum);
524            wrongStateFilenames.add(entry.getKey());
525        }
526
527        // Write output data to the files.
528        WorkFiles.write(replica, WorkFiles.WRONG_FILES, wrongChecksumFilenames);
529        WorkFiles.write(replica, WorkFiles.WRONG_STATES, wrongStateFilenames);
530    }
531
532    /**
533     * Runs a checksum job on if the replica is a bitarchive replica and sends a GetAllChecksumsMessage if the replica
534     * is a checksum replica. Output is written to file returned by WorkFiles.getChecksumOutputFile(replica).
535     *
536     * @param replica One of the bitarchive replicas.
537     * @throws IOFailure If unable to create output dirs or if unable to write/read output to files.
538     */
539    private void runChecksumJob(Replica replica) throws IOFailure {
540        // Create directories for output
541        File outputFile = WorkFiles.getFile(replica, WorkFiles.CHECKSUMS_ON_BA);
542
543        // Retrieve a file containing the checksums of the replica through a
544        // GetAllChecksumsMessage.
545        File checksumFile = ArcRepositoryClientFactory.getPreservationInstance().getAllChecksums(replica.getId());
546
547        // copy the resulting file to the output file.
548        FileUtils.copyFile(checksumFile, outputFile);
549    }
550
551    /**
552     * Return the number of files found in the replica. If nothing is known about the replica, -1 is returned.
553     *
554     * @param replica the bitarchive to check
555     * @return the number of files found in the bitarchive. If nothing is known about the bitarchive replica, -1 is
556     * returned.
557     * @throws ArgumentNotValid If the replica is null.
558     */
559    public long getNumberOfFiles(Replica replica) throws ArgumentNotValid {
560        ArgumentNotValid.checkNotNull(replica, "Replica replica");
561        File unsortedOutput = WorkFiles.getFile(replica, WorkFiles.FILES_ON_BA);
562
563        if (!unsortedOutput.exists()) {
564            return -1;
565        }
566
567        return FileUtils.countLines(unsortedOutput);
568    }
569
570    /**
571     * Get the number of missing files in a given replica. If nothing is known about the replica, -1 is returned.
572     *
573     * @param replica a given replica.
574     * @return the number of missing files in the given replica. If nothing is known about the replica, -1 is returned.
575     * @throws ArgumentNotValid If the replica is null.
576     */
577    public long getNumberOfMissingFiles(Replica replica) throws ArgumentNotValid {
578        ArgumentNotValid.checkNotNull(replica, "Replica replica");
579
580        File missingOutput = WorkFiles.getFile(replica, WorkFiles.MISSING_FILES_BA);
581        if (!missingOutput.exists()) {
582            return -1;
583        }
584
585        return FileUtils.countLines(missingOutput);
586    }
587
588    /**
589     * Get the number of wrong files for a replica. If nothing is known about the replica, -1 is returned.
590     *
591     * @param replica a replica.
592     * @return the number of wrong files for the replica. If nothing is known about the replica, -1 is returned.
593     * @throws ArgumentNotValid If the replica is null.
594     */
595    public long getNumberOfChangedFiles(Replica replica) throws ArgumentNotValid {
596        ArgumentNotValid.checkNotNull(replica, "Replica bitarchive");
597        File wrongFileOutput = WorkFiles.getFile(replica, WorkFiles.WRONG_FILES);
598
599        if (!wrongFileOutput.exists()) {
600            return -1;
601        }
602
603        return FileUtils.countLines(wrongFileOutput);
604    }
605
606    /**
607     * Get the date for last time the checksum information was updated for this replica.
608     *
609     * @param replica The replica to check last time for.
610     * @return The date for last check. Will return 1970-01-01 for never.
611     * @throws ArgumentNotValid If the replica is null.
612     */
613    public Date getDateForChangedFiles(Replica replica) throws ArgumentNotValid {
614        ArgumentNotValid.checkNotNull(replica, "Replica replica");
615        return WorkFiles.getLastUpdate(replica, WorkFiles.WRONG_FILES);
616    }
617
618    /**
619     * Get the date for last time the missing files information was updated for this replica.
620     *
621     * @param replica The replica to check last time for.
622     * @return The date for last check. Will return 1970-01-01 for never.
623     * @throws ArgumentNotValid If the replica is null.
624     */
625    public Date getDateForMissingFiles(Replica replica) throws ArgumentNotValid {
626        ArgumentNotValid.checkNotNull(replica, "Replica replica");
627        return WorkFiles.getLastUpdate(replica, WorkFiles.FILES_ON_BA);
628    }
629
630    /**
631     * Check that the files we want to restore are indeed missing on the replica, and present in admin data and the
632     * reference bitarchive. If so, upload missing files from reference replica to this replica.
633     *
634     * @param replica The replica to restore files to
635     * @param filenames The names of the files.
636     * @throws IllegalState If one of the files is unknown (For all known files, there will be an attempt at udpload)
637     * @throws IOFailure If some file cannot be reestablished. All files will be attempted, though.
638     * @throws ArgumentNotValid If the replica or the list of filenames are null.
639     */
640    public void uploadMissingFiles(Replica replica, String... filenames) throws IOFailure, IllegalState,
641            ArgumentNotValid {
642        ArgumentNotValid.checkNotNull(replica, "Replica replica");
643        ArgumentNotValid.checkNotNull(filenames, "String... filenames");
644
645        // Contains all files that we couldn't reestablish
646        List<String> troubleNames = new ArrayList<String>();
647
648        // preservationStates: map [filename]->[filepreservationstate]
649        // Initialized here to contain an entry for each filename in vargargs
650        // 'filenames'.
651        Map<String, PreservationState> preservationStates = getPreservationStateMap(filenames);
652
653        // For each given filename, try to reestablish it on
654        // Replica 'replica'
655        for (String fn : filenames) {
656            PreservationState fps = preservationStates.get(fn);
657            try {
658                if (fps == null) {
659                    throw new IllegalState("No state known about '" + fn + "'");
660                }
661                if (!fps.isAdminDataOk()) {
662                    setAdminDataFailed(fn, replica);
663                    admin.synchronize();
664                    fps = getPreservationState(fn);
665                    if (fps == null) {
666                        throw new IllegalState("No state known about '" + fn + "'");
667                    }
668                }
669                reestablishMissingFile(fn, replica, fps);
670            } catch (Exception e) {
671                log.warn("Trouble reestablishing file '{}' on replica {}", fn, replica.getName(), e);
672                troubleNames.add(fn);
673            }
674        }
675        if (troubleNames.size() > 0) {
676            throw new IOFailure("Could not reestablish all files. The following files were not reestablished: "
677                    + troubleNames);
678        }
679    }
680
681    /**
682     * Reestablish a file missing in a replica. The following pre-conditions for reestablishing the file are checked
683     * before changing anything:
684     * <p>
685     * 1) the file is registered correctly in AdminData. <br>
686     * 2) the file is missing in the given replica. <br>
687     * 3) the file is present in another replica, which must be a bitarchive replica (the reference archive).<br>
688     * 4) admin data and the reference archive agree on the checksum of the file.
689     *
690     * @param fileName Name of the file to reestablish.
691     * @param damagedReplica Name of the replica missing the file.
692     * @param fps The FilePreservationStatus of the file to fix.
693     * @throws IOFailure On trouble updating the file.
694     */
695    private void reestablishMissingFile(String fileName, Replica damagedReplica, PreservationState fps)
696            throws IOFailure {
697        log.debug("Reestablishing missing file '{}' in replica '{}'.", fileName, damagedReplica);
698        if (!satisfiesMissingFileConditions(fps, damagedReplica, fileName)) {
699            throw new IOFailure("Unable to reestablish missing file. '" + fileName + "'. It is not in the right state.");
700        }
701        // Retrieve the file from the reference archive (must be a bitarchive)
702        Replica referenceArchive = fps.getReferenceBitarchive();
703        try {
704            PreservationArcRepositoryClient arcrep = ArcRepositoryClientFactory.getPreservationInstance();
705            File tmpDir = FileUtils.createUniqueTempDir(FileUtils.getTempDir(), REMOVED_FILES);
706            File missingFile = new File(tmpDir, fileName);
707            arcrep.getFile(fileName, referenceArchive, missingFile);
708            arcrep.store(missingFile);
709            tmpDir.delete();
710        } catch (IOFailure e) {
711            String errmsg = "Failed to reestablish '" + fileName + "' in '" + damagedReplica.getName()
712                    + "' with copy from '" + referenceArchive + "'";
713            log.warn(errmsg, e);
714            throw new IOFailure(errmsg, e);
715        }
716        log.info("Reestablished {} in {} with copy from {}", fileName, damagedReplica.getName(),
717                referenceArchive.getName());
718        FileUtils.removeLineFromFile(fileName, WorkFiles.getFile(damagedReplica, WorkFiles.MISSING_FILES_BA));
719        FileUtils.appendToFile(WorkFiles.getFile(damagedReplica, WorkFiles.FILES_ON_BA), fileName);
720    }
721
722    /**
723     * Checks the conditions that must be true before reestablishing a missing file. Returns true if and only if all of
724     * the below are true; returns false otherwise.
725     * <p>
726     * <p>
727     * 1) the file is registered correctly in AdminData.<br/>
728     * 2) the file is missing in the given bitarchive.<br/>
729     * 3) the file is present in another bitarchive (the reference archive). <br/>
730     * 4) admin data and the reference archive agree on the checksum.
731     *
732     * @param state the status for one file in the bitarchives.
733     * @param damagedReplica the replica where the file is corrupt or missing.
734     * @param fileName the name of the file being considered.
735     * @return true if all conditions are true, false otherwise.
736     */
737    private boolean satisfiesMissingFileConditions(PreservationState state, Replica damagedReplica, String fileName) {
738        // condition 1
739        if (!state.isAdminDataOk()) {
740            log.warn("Admin.data is not consistent regarding file '{}'", fileName);
741            return false;
742        }
743        // condition 2
744        if (!state.fileIsMissing(damagedReplica)) {
745            log.warn("File '{}' is not missing in bitarchive on replica '{}'.", fileName, damagedReplica.getName());
746            return false;
747        }
748        // conditions 3 and 4
749        Replica referenceArchive = state.getReferenceBitarchive();
750        if (referenceArchive == null) {
751            log.warn("No correct version of file '{}' exists in any archive", fileName);
752            return false;
753        }
754        return true;
755    }
756
757    /**
758     * Calls upon the arcrepository to change the known state for the given file in one replica. This method uses JMS
759     * and blocks until a reply is sent. We don't wait for an acknowledgement that admin data indeed has been updated.
760     *
761     * @param filename The file to change state for
762     * @param rep The replica to change state for the file for.
763     * @throws ArgumentNotValid if arguments are null or empty strings
764     */
765    private void setAdminDataFailed(String filename, Replica rep) throws ArgumentNotValid {
766        ArgumentNotValid.checkNotNullOrEmpty(filename, "String filename");
767        ArgumentNotValid.checkNotNull(rep, "Replica rep");
768
769        ArcRepositoryClientFactory.getPreservationInstance().updateAdminData(filename, rep.getId(),
770                ReplicaStoreState.UPLOAD_FAILED);
771    }
772
773    /**
774     * Check that file checksum is indeed different to admin data and reference replica. If so, remove missing file and
775     * upload it from reference replica to this replica.
776     *
777     * @param replica The replica to restore file to
778     * @param filename The name of the file.
779     * @param credentials The credentials used to perform this replace operation
780     * @param checksum The expected checksum.
781     * @throws IOFailure if the file cannot be reestablished
782     * @throws PermissionDenied if the file is not in correct state
783     * @throws ArgumentNotValid If the filename, the credentials or the checksum either are null or contain the empty
784     * string, or if the replica is null.
785     */
786    public void replaceChangedFile(Replica replica, String filename, String credentials, String checksum)
787            throws ArgumentNotValid, IOFailure, PermissionDenied {
788        ArgumentNotValid.checkNotNull(replica, "Replica replica");
789        ArgumentNotValid.checkNotNullOrEmpty(filename, "String filename");
790        ArgumentNotValid.checkNotNullOrEmpty(checksum, "String checksum");
791        ArgumentNotValid.checkNotNullOrEmpty(credentials, "String credentials");
792
793        // Send a correct message to the archive.
794        correctArchiveEntry(replica, filename, checksum, credentials);
795    }
796
797    /**
798     * Method for correcting a corrupt entry in an archive. This message is handled different for the different replicas
799     *
800     * @param replica The replica which contains the bad entry.
801     * @param filename The name of the file.
802     * @param checksum The checksum of the bad entry.
803     * @param credentials The credentials for correcting the bad entry.
804     */
805    private void correctArchiveEntry(Replica replica, String filename, String checksum, String credentials) {
806        // get the preservation state.
807        Map<String, PreservationState> preservationStates = getPreservationStateMap(filename);
808        PreservationState fps = preservationStates.get(filename);
809
810        // Use the preservation state to find a reference archive (bitarchive).
811        Replica referenceArchive = fps.getReferenceBitarchive();
812
813        // Get the arc repository client and a temporary file
814        PreservationArcRepositoryClient arcrepClient = ArcRepositoryClientFactory.getPreservationInstance();
815        File tmpDir = FileUtils.createUniqueTempDir(FileUtils.getTempDir(), REMOVED_FILES);
816        File missingFile = new File(tmpDir, filename);
817
818        // retrieve a good copy of the file
819        arcrepClient.getFile(filename, referenceArchive, missingFile);
820
821        // correct the bad entry in the archive with the retrieved good copy.
822        arcrepClient.correct(replica.getId(), checksum, missingFile, credentials);
823
824        // cleanup afterwards.
825        tmpDir.delete();
826    }
827
828    /**
829     * Return a list of files present in bitarchive but missing in AdminData.
830     *
831     * @return A list of missing files.
832     * @throws NotImplementedException Always, since this will not been implemented.
833     */
834    public Iterable<String> getMissingFilesForAdminData() throws NotImplementedException {
835        throw new NotImplementedException("Not to be implemented");
836    }
837
838    /**
839     * Return a list of files with wrong checksum or status in admin data.
840     *
841     * @return A list of files with wrong checksum or status.
842     * @throws NotImplementedException Always, since this will not been implemented.
843     */
844    public Iterable<String> getChangedFilesForAdminData() throws NotImplementedException {
845        throw new NotImplementedException("Not to be implemented");
846    }
847
848    /**
849     * Reestablish admin data to match bitarchive states for files.
850     *
851     * @param filenames The files to reestablish state for.
852     * @throws NotImplementedException Always, since this will not been implemented.
853     * @throws ArgumentNotValid If the list of filenames are null.
854     */
855    public void addMissingFilesToAdminData(String... filenames) throws NotImplementedException, ArgumentNotValid {
856        ArgumentNotValid.checkNotNull(filenames, "String... filenames");
857        // TODO implement method
858        throw new NotImplementedException("Not to be implemented");
859    }
860
861    /**
862     * Reestablish admin data to match replica states for file.
863     *
864     * @param filename The file to reestablish state for.
865     * @throws PermissionDenied if the file is not in correct state
866     * @throws ArgumentNotValid If the filename is null or empty.
867     */
868    public void changeStateForAdminData(String filename) throws PermissionDenied, ArgumentNotValid {
869        ArgumentNotValid.checkNotNullOrEmpty(filename, "String filename");
870        admin.synchronize();
871        PreservationState fps = getPreservationState(filename);
872        String checksum = fps.getReferenceCheckSum();
873        if (checksum == null || checksum.isEmpty()) {
874            throw new PermissionDenied("No correct checksum for '" + filename + "'");
875        }
876        if (!admin.getCheckSum(filename).equals(checksum)) {
877            ArcRepositoryClientFactory.getPreservationInstance().updateAdminChecksum(filename, checksum);
878        }
879        for (Replica rep : Replica.getKnown()) {
880            if (fps.getUniqueChecksum(rep).equals(admin.getCheckSum(filename))) {
881                FileUtils.removeLineFromFile(filename, WorkFiles.getFile(rep, WorkFiles.WRONG_FILES));
882            }
883        }
884    }
885
886    /** Shut down cleanly. */
887    public void close() {
888        if (closeHook != null) {
889            Runtime.getRuntime().removeShutdownHook(closeHook);
890        }
891        closeHook = null;
892        cleanup();
893    }
894
895    /** @see CleanupIF#cleanup() */
896    public void cleanup() {
897        // In case a listener was set up, remove it.
898        ArcRepositoryClientFactory.getPreservationInstance().close();
899        instance = null;
900    }
901
902}