package dk.netarkivet.harvester.heritrix3;

import dk.netarkivet.common.Constants;
import dk.netarkivet.common.exceptions.ArgumentNotValid;
import dk.netarkivet.common.exceptions.IOFailure;
import dk.netarkivet.common.exceptions.PermissionDenied;
import dk.netarkivet.common.utils.FileUtils;
import dk.netarkivet.common.utils.Settings;
import dk.netarkivet.common.utils.SystemUtils;
import dk.netarkivet.common.utils.archive.ArchiveProfile;
import dk.netarkivet.common.utils.cdx.CDXUtils;
import dk.netarkivet.harvester.harvesting.PersistentJobData;
import dk.netarkivet.harvester.harvesting.metadata.MetadataEntry;
import dk.netarkivet.harvester.harvesting.metadata.MetadataFile;
import dk.netarkivet.harvester.harvesting.metadata.MetadataFileWriter;
import dk.netarkivet.harvester.harvesting.metadata.MetadataFileWriterWarc;
import java.io.File;
import java.io.FileFilter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Date;
import java.util.Iterator;
import java.util.List;
import java.util.TreeSet;
import org.jwat.common.ANVLRecord;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:dk/netarkivet/harvester/heritrix3/HarvestDocumentation.class */
/**
 * Static helpers that document a finished harvest job by writing information about it into the job's
 * metadata archive file: stored pre-harvest metadata entries, Heritrix files and logs found on disk,
 * and CDX files generated over the harvested ARC/WARC files.
 */
public class HarvestDocumentation {
    private static final Logger log = LoggerFactory.getLogger(HarvestDocumentation.class);

    /**
     * Writes the metadata archive file for the job described by {@code ingestableFiles}.
     * <p>
     * If the metadata file is already reported as ready, a warning is logged and nothing else happens.
     * Otherwise the stored metadata entries, the Heritrix files and CDX files for the ARC and WARC
     * directories are written through the metadata writer. Whether this method returns normally or
     * throws, the outcome is settled afterwards: when the metadata is ready, the files that were copied
     * into the metadata file are removed and {@code ingestableFiles.cleanup()} is called; otherwise
     * metadata generation is flagged as failed.
     *
     * @param ingestableFiles accessor for the job's directories, identifiers and metadata writer;
     *        must not be null
     * @throws ArgumentNotValid if {@code ingestableFiles} is null
     * @throws IOFailure declared by the original signature; presumably raised by the helpers used
     *         here when files cannot be read or written — TODO confirm
     */
    public static void documentHarvest(IngestableFiles ingestableFiles) throws IOFailure {
        ArgumentNotValid.checkNotNull(ingestableFiles, "ingestables");
        File crawlDir = ingestableFiles.getCrawlDir();
        long jobId = ingestableFiles.getJobId();
        long harvestId = ingestableFiles.getHarvestID();
        if (ingestableFiles.isMetadataReady()) {
            log.warn("The metadata-file '{}' already exists, so we don't make another one!", ingestableFiles.getMetadataFile().getAbsolutePath());
            return;
        }
        // Never null, so the finally-block can iterate it even when we fail before any file was added.
        List<File> deletableFiles = new ArrayList<File>();
        try {
            MetadataFileWriter metadataWriter = ingestableFiles.getMetadataWriter();
            if (metadataWriter instanceof MetadataFileWriterWarc) {
                // Only the WARC flavour of the writer accepts an info record.
                insertWarcInfoRecord((MetadataFileWriterWarc) metadataWriter, crawlDir);
            }
            try {
                for (MetadataEntry entry : getStoredMetadata(crawlDir)) {
                    metadataWriter.write(entry.getURL(), entry.getMimeType(), SystemUtils.getLocalIP(), System.currentTimeMillis(), entry.getData());
                }
            } catch (IOException e) {
                // Best effort: a failure here does not abort the rest of the documentation.
                log.warn("Unable to write pre-metadata to metadata archivefile", e);
            }
            deletableFiles = writeHarvestDetails(jobId, harvestId, ingestableFiles, metadataWriter, Constants.getHeritrix3VersionString());
            // These three files stay on disk even though they were added to the metadata file
            // (presumably they are still needed by later processing — confirm).
            Iterator<File> candidates = deletableFiles.iterator();
            while (candidates.hasNext()) {
                String name = candidates.next().getName();
                if (name.equals("crawl.log") || name.equals("harvestInfo.xml") || name.equals("progress-statistics.log")) {
                    candidates.remove();
                }
            }
            boolean cdxGenerated = false;
            File arcsDir = ingestableFiles.getArcsDir();
            File warcsDir = ingestableFiles.getWarcsDir();
            if (arcsDir.isDirectory() && FileUtils.hasFiles(arcsDir)) {
                addCDXes(ingestableFiles, arcsDir, metadataWriter, ArchiveProfile.ARC_PROFILE);
                cdxGenerated = true;
            }
            if (warcsDir.isDirectory() && FileUtils.hasFiles(warcsDir)) {
                addCDXes(ingestableFiles, warcsDir, metadataWriter, ArchiveProfile.WARC_PROFILE);
                cdxGenerated = true;
            }
            if (cdxGenerated) {
                ingestableFiles.setMetadataGenerationSucceeded(true);
            } else {
                log.warn("Found no archive directory with ARC og WARC files. Looked for dirs '{}' and '{}'.", arcsDir.getAbsolutePath(), warcsDir.getAbsolutePath());
            }
        } finally {
            // Runs on both normal and exceptional exit; an exception thrown above still propagates.
            if (ingestableFiles.isMetadataReady()) {
                for (File deletable : deletableFiles) {
                    FileUtils.remove(deletable);
                }
                ingestableFiles.cleanup();
            } else {
                ingestableFiles.setMetadataGenerationSucceeded(false);
            }
        }
    }

    /**
     * Builds the info record (software, ip, hostname, conformsTo, isPartOf) and inserts it through
     * the given WARC metadata writer.
     *
     * @param warcWriter the WARC metadata writer receiving the record
     * @param crawlDir the crawl directory from which the persistent job data is read
     */
    private static void insertWarcInfoRecord(MetadataFileWriterWarc warcWriter, File crawlDir) {
        ANVLRecord infoPayload = new ANVLRecord();
        infoPayload.addLabelValue("software", "NetarchiveSuite/" + Constants.getVersionString() + "/https://sbforge.org/display/NAS");
        infoPayload.addLabelValue("ip", SystemUtils.getLocalIP());
        infoPayload.addLabelValue("hostname", SystemUtils.getLocalHostName());
        infoPayload.addLabelValue("conformsTo", "http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf");
        infoPayload.addLabelValue("isPartOf", "" + new PersistentJobData(crawlDir).getJobID());
        warcWriter.insertInfoRecord(infoPayload);
    }

    /**
     * Generates CDX files for the archive files in {@code archiveDir} and inserts them into the
     * metadata file. Files whose name does not start with the harvest name prefix are moved away
     * first.
     *
     * @param ingestableFiles accessor for the job's temp directory and identifiers
     * @param archiveDir the directory holding the archive files
     * @param metadataFileWriter the writer the CDX files are inserted through
     * @param archiveProfile the profile (ARC or WARC) used to select files and generate the CDX
     */
    private static void addCDXes(IngestableFiles ingestableFiles, File archiveDir, MetadataFileWriter metadataFileWriter, ArchiveProfile archiveProfile) {
        moveAwayForeignFiles(archiveProfile, archiveDir, ingestableFiles);
        File cdxDir = FileUtils.createUniqueTempDir(ingestableFiles.getTmpMetadataDir(), "cdx");
        CDXUtils.generateCDX(archiveProfile, archiveDir, cdxDir);
        metadataFileWriter.insertFiles(cdxDir, FileUtils.CDX_FILE_FILTER, "application/x-cdx", ingestableFiles.getHarvestID(), ingestableFiles.getJobId());
    }

    /**
     * Reads the metadata entries stored in the {@code metadata} subdirectory of the crawl directory.
     *
     * @param crawlDir the crawl directory
     * @return the entries read from disk, or an empty list (after logging a warning) when the
     *         {@code metadata} directory does not exist
     */
    private static List<MetadataEntry> getStoredMetadata(File crawlDir) {
        File metadataDir = new File(crawlDir, "metadata");
        if (metadataDir.isDirectory()) {
            return MetadataEntry.getMetadataFromDisk(metadataDir);
        }
        log.warn("Should have an metadata directory '{}' but there wasn't", metadataDir.getAbsolutePath());
        return new ArrayList<MetadataEntry>();
    }

    /**
     * Moves archive files whose name does not start with the harvest name prefix out of
     * {@code archiveDir} into a fresh {@code lost-files-<timestamp>} directory below the configured
     * old-jobs directory. Only files that were actually moved are reported in the final warning.
     *
     * @param archiveProfile supplies the filename filter selecting the archive files to inspect
     * @param archiveDir the directory to scan
     * @param ingestableFiles supplies the harvest name prefix and the job id
     */
    private static void moveAwayForeignFiles(ArchiveProfile archiveProfile, File archiveDir, IngestableFiles ingestableFiles) {
        File[] archiveFiles = archiveDir.listFiles(archiveProfile.filename_filter);
        if (archiveFiles == null) {
            // listFiles returns null when the path is not a directory or an I/O error occurs.
            log.warn("Unable to list the files in directory '{}'", archiveDir.getAbsolutePath());
            return;
        }
        File lostFilesDir = new File(new File(Settings.get(Heritrix3Settings.HARVEST_CONTROLLER_OLDJOBSDIR)), "lost-files-" + System.currentTimeMillis());
        List<File> movedFiles = new ArrayList<File>();
        String harvestPrefix = ingestableFiles.getHarvestnamePrefix();
        log.info("Looking for files not having harvestprefix '{}'", harvestPrefix);
        for (File archiveFile : archiveFiles) {
            if (archiveFile.getName().startsWith(harvestPrefix)) {
                continue;
            }
            log.info("removing unidentified file {}", archiveFile.getAbsolutePath());
            try {
                if (!lostFilesDir.exists()) {
                    FileUtils.createDir(lostFilesDir);
                }
                File destination = new File(lostFilesDir, archiveFile.getName());
                // renameTo signals failure through its return value, not an exception.
                if (archiveFile.renameTo(destination)) {
                    movedFiles.add(destination);
                } else {
                    log.warn("Unable to move unidentified file '{}' to '{}'", archiveFile.getAbsolutePath(), destination.getAbsolutePath());
                }
            } catch (PermissionDenied e) {
                log.warn("Not allowed to make oldjobs dir '{}'", lostFilesDir.getAbsolutePath(), e);
            }
        }
        if (movedFiles.isEmpty()) {
            return;
        }
        log.warn("Found files not belonging to job {}, the following files have been stored for later: {}", ingestableFiles.getJobId(), movedFiles);
    }

    /**
     * Collects Heritrix files from the crawl directory, the Heritrix3 job directory, the reports
     * directory and the job directory's {@code logs} subdirectory, and writes each of them into the
     * metadata file.
     *
     * @param jobId the job id recorded for each file
     * @param harvestId the harvest id recorded for each file
     * @param ingestableFiles supplies the directories to scan
     * @param metadataFileWriter the writer the files are written through
     * @param heritrixVersion the Heritrix version string recorded for each file
     * @return the files that were successfully written to the metadata file
     */
    private static List<File> writeHarvestDetails(long jobId, long harvestId, IngestableFiles ingestableFiles, MetadataFileWriter metadataFileWriter, String heritrixVersion) {
        List<File> writtenFiles = new ArrayList<File>();
        // A sorted set: the files are written in the order defined by MetadataFile itself.
        TreeSet<MetadataFile> metadataFiles = new TreeSet<MetadataFile>();
        File crawlDir = ingestableFiles.getCrawlDir();
        File heritrix3JobDir = ingestableFiles.getHeritrix3JobDir();
        File reportsDir = ingestableFiles.getReportsDir();
        log.info("Looking for heritrix files in the following directories: {},{}, {}", new Object[]{crawlDir.getAbsolutePath(), heritrix3JobDir.getAbsolutePath(), reportsDir.getAbsolutePath()});
        for (File heritrixFile : listMatchingFiles(crawlDir, MetadataFile.HERITRIX_FILE_PATTERN)) {
            metadataFiles.add(new MetadataFile(heritrixFile, harvestId, jobId, heritrixVersion));
        }
        if (heritrix3JobDir.exists()) {
            for (File heritrixFile : listMatchingFiles(heritrix3JobDir, MetadataFile.HERITRIX_FILE_PATTERN)) {
                metadataFiles.add(new MetadataFile(heritrixFile, harvestId, jobId, heritrixVersion));
            }
        } else {
            log.warn("The directory {} does not exist", heritrix3JobDir.getAbsolutePath());
        }
        if (reportsDir.exists()) {
            for (File reportFile : listMatchingFiles(reportsDir, MetadataFile.HERITRIX_FILE_PATTERN)) {
                metadataFiles.add(new MetadataFile(reportFile, harvestId, jobId, heritrixVersion));
            }
        } else {
            log.warn("The directory {} does not exist", reportsDir.getAbsolutePath());
        }
        if (Settings.getBoolean(Heritrix3Settings.METADATA_GENERATE_ARCHIVE_FILES_REPORT)) {
            log.debug("Arcfiles-report.txt generation Not currently supported by Heritrix3");
        } else {
            log.debug("Creation of the arcfiles-report.txt has been disabled by the setting '{}'!", Heritrix3Settings.METADATA_GENERATE_ARCHIVE_FILES_REPORT);
        }
        File logsDir = new File(heritrix3JobDir, "logs");
        if (logsDir.exists()) {
            for (File logFile : listMatchingFiles(logsDir, MetadataFile.LOG_FILE_PATTERN)) {
                metadataFiles.add(new MetadataFile(logFile, harvestId, jobId, heritrixVersion));
                log.info("Found Heritrix log file {}", logFile.getName());
            }
        } else {
            // The logs dir is looked up below the Heritrix3 job dir, so that is the path reported.
            log.debug("No logs dir found in crawldir: {}", heritrix3JobDir.getAbsolutePath());
        }
        for (MetadataFile metadataFile : metadataFiles) {
            File heritrixFile = metadataFile.getHeritrixFile();
            String mimeType = heritrixFile.getName().endsWith(".xml") ? "text/xml" : "text/plain";
            if (metadataFileWriter.writeTo(heritrixFile, metadataFile.getUrl(), mimeType)) {
                writtenFiles.add(heritrixFile);
            } else {
                log.warn("The Heritrix file '{}' was not included in the metadata archivefile due to some error.", heritrixFile.getAbsolutePath());
            }
        }
        return writtenFiles;
    }

    /**
     * Lists the regular files directly inside {@code dir} whose name fully matches
     * {@code namePattern}.
     *
     * @param dir the directory to scan
     * @param namePattern regular expression the whole file name must match
     * @return the matching files; an empty array (after logging a warning) when the directory
     *         cannot be listed
     */
    private static File[] listMatchingFiles(File dir, final String namePattern) {
        File[] matches = dir.listFiles(new FileFilter() {
            @Override
            public boolean accept(File candidate) {
                return candidate.isFile() && candidate.getName().matches(namePattern);
            }
        });
        if (matches == null) {
            // listFiles returns null when the path is not a directory or an I/O error occurs.
            log.warn("Unable to list the files in directory '{}'", dir.getAbsolutePath());
            return new File[0];
        }
        return matches;
    }
}
