package dk.netarkivet.wayback.indexer;

import dk.netarkivet.common.CommonSettings;
import dk.netarkivet.common.distribute.arcrepository.ArcRepositoryClientFactory;
import dk.netarkivet.common.distribute.arcrepository.BatchStatus;
import dk.netarkivet.common.distribute.arcrepository.PreservationArcRepositoryClient;
import dk.netarkivet.common.exceptions.IllegalState;
import dk.netarkivet.common.utils.FileUtils;
import dk.netarkivet.common.utils.Settings;
import dk.netarkivet.common.utils.SettingsFactory;
import dk.netarkivet.common.utils.arc.ARCUtils;
import dk.netarkivet.common.utils.batch.FileBatchJob;
import dk.netarkivet.common.utils.hadoop.HadoopFileUtils;
import dk.netarkivet.common.utils.hadoop.HadoopJobTool;
import dk.netarkivet.common.utils.hadoop.HadoopJobUtils;
import dk.netarkivet.common.utils.service.FileResolver;
import dk.netarkivet.common.utils.service.SimpleFileResolver;
import dk.netarkivet.common.utils.warc.WARCUtils;
import dk.netarkivet.wayback.WaybackSettings;
import dk.netarkivet.wayback.batch.DeduplicationCDXExtractionBatchJob;
import dk.netarkivet.wayback.batch.WaybackCDXExtractionARCBatchJob;
import dk.netarkivet.wayback.batch.WaybackCDXExtractionWARCBatchJob;
import dk.netarkivet.wayback.hadoop.CDXMapper;
import java.io.File;
import java.io.IOException;
import java.io.UncheckedIOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.OpenOption;
import java.nio.file.Paths;
import java.nio.file.StandardCopyOption;
import java.nio.file.attribute.FileAttribute;
import java.util.Date;
import java.util.Iterator;
import java.util.Objects;
import java.util.UUID;
import javax.persistence.Entity;
import javax.persistence.Id;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.util.ToolRunner;
import org.apache.tools.ant.taskdefs.XSLTLiaison;
import org.archive.url.UsableURIFactory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

@Entity
/* loaded from: input_file:dk/netarkivet/wayback/indexer/ArchiveFile.class */
public class ArchiveFile {
    private static final Logger log = LoggerFactory.getLogger((Class<?>) ArchiveFile.class);
    private String filename;
    private String originalIndexFileName;
    private int indexingFailedAttempts;
    private boolean isIndexed = false;
    private Date indexedDate = null;

    public String getOriginalIndexFileName() {
        return this.originalIndexFileName;
    }

    public void setOriginalIndexFileName(String str) {
        this.originalIndexFileName = str;
    }

    public Date getIndexedDate() {
        return this.indexedDate;
    }

    public void setIndexedDate(Date date) {
        this.indexedDate = date;
    }

    @Id
    public String getFilename() {
        return this.filename;
    }

    public void setFilename(String str) {
        this.filename = str;
    }

    public boolean isIndexed() {
        return this.isIndexed;
    }

    public void setIndexed(boolean z) {
        this.isIndexed = z;
    }

    public int getIndexingFailedAttempts() {
        return this.indexingFailedAttempts;
    }

    public void setIndexingFailedAttempts(int i) {
        this.indexingFailedAttempts = i;
    }

    public void index() throws IllegalState {
        log.info("Indexing {}", getFilename());
        if (this.isIndexed) {
            throw new IllegalState("Attempted to index file '" + this.filename + "' which is already indexed");
        }
        if (Settings.getBoolean(CommonSettings.USE_BITMAG_HADOOP_BACKEND)) {
            hadoopIndex();
        } else {
            batchIndex();
        }
    }

    private void hadoopIndex() {
        if (!(ARCUtils.isARC(this.filename) || WARCUtils.isWarc(this.filename))) {
            log.warn("Skipping indexing of file with filename '{}'", this.filename);
            return;
        }
        Configuration conf = HadoopJobUtils.getConf();
        UUID randomUUID = UUID.randomUUID();
        log.info("File {} indexed with job uuid for i/o {}.", this.filename, randomUUID);
        try {
            FileSystem newInstance = FileSystem.newInstance(conf);
            try {
                String str = Settings.get(CommonSettings.HADOOP_MAPRED_CDXJOB_INPUT_DIR);
                if (str == null) {
                    log.error("Parent input dir specified by {} must not be null.", CommonSettings.HADOOP_MAPRED_CDXJOB_INPUT_DIR);
                    if (newInstance != null) {
                        newInstance.close();
                        return;
                    }
                    return;
                }
                try {
                    HadoopFileUtils.initDir(newInstance, str);
                    Path path = new Path(str, randomUUID.toString());
                    log.info("Hadoop input file will be {}", path);
                    String str2 = Settings.get(CommonSettings.HADOOP_MAPRED_CDXJOB_OUTPUT_DIR);
                    if (str2 == null) {
                        log.error("Parent output dir specified by {} must not be null.", CommonSettings.HADOOP_MAPRED_CDXJOB_OUTPUT_DIR);
                        if (newInstance != null) {
                            newInstance.close();
                            return;
                        }
                        return;
                    }
                    try {
                        HadoopFileUtils.initDir(newInstance, str2);
                        Path path2 = new Path(str2, randomUUID.toString());
                        log.info("Output directory for job is {}", path2);
                        java.nio.file.Path createTempFile = Files.createTempFile(null, null, new FileAttribute[0]);
                        FileResolver fileResolver = (FileResolver) SettingsFactory.getInstance(CommonSettings.FILE_RESOLVER_CLASS, new Object[0]);
                        if (fileResolver instanceof SimpleFileResolver) {
                            ((SimpleFileResolver) fileResolver).setDirectory(Paths.get(Settings.get(CommonSettings.HADOOP_MAPRED_INPUT_FILES_PARENT_DIR), new String[0]));
                        }
                        String str3 = XSLTLiaison.FILE_PROTOCOL_PREFIX + fileResolver.getPath(this.filename).toString();
                        log.info("Inserting {} in {}.", str3, createTempFile);
                        Files.write(createTempFile, str3.getBytes(), new OpenOption[0]);
                        log.info("Copying file with input paths {} to hdfs {}.", createTempFile, path);
                        newInstance.copyFromLocalFile(false, new Path(createTempFile.toAbsolutePath().toString()), path);
                        log.info("Starting CDX job on file '{}'", this.filename);
                        try {
                            log.info("Starting hadoop job with input {} and output {}.", path, path2);
                            int run = ToolRunner.run(new HadoopJobTool(conf, new CDXMapper()), new String[]{path.toString(), path2.toString()});
                            if (run == 0) {
                                log.info("CDX job for file {} was a success!", this.filename);
                                collectHadoopResults(newInstance, path2);
                            } else {
                                log.warn("Hadoop job failed with exit code '{}'", Integer.valueOf(run));
                            }
                        } catch (Exception e) {
                            log.error("Hadoop indexing job failed to run normally.", (Throwable) e);
                        }
                        if (newInstance != null) {
                            newInstance.close();
                        }
                    } catch (IOException e2) {
                        log.error("Failed to init output dir {}", str2, e2);
                        if (newInstance != null) {
                            newInstance.close();
                        }
                    }
                } catch (IOException e3) {
                    log.error("Failed to init input dir {}", str, e3);
                    if (newInstance != null) {
                        newInstance.close();
                    }
                }
            } finally {
            }
        } catch (IOException e4) {
            log.error("Error on hadoop filesystem.", (Throwable) e4);
        }
    }

    private void collectHadoopResults(FileSystem fileSystem, Path path) {
        File makeNewFileInWaybackTempDir = makeNewFileInWaybackTempDir();
        log.info("Collecting index for '{}' from parts in '{}' to '{}'", getFilename(), path, makeNewFileInWaybackTempDir.getAbsolutePath());
        try {
            FileUtils.writeCollectionToFile(makeNewFileInWaybackTempDir, HadoopJobUtils.collectOutputLines(fileSystem, path));
            log.info("Finished collecting index for '{}' to '{}'", getFilename(), makeNewFileInWaybackTempDir.getAbsolutePath());
        } catch (IOException e) {
            log.warn("Could not collect index results from '{}'", path.toString(), e);
        }
        File moveFileToWaybackOutputDir = moveFileToWaybackOutputDir(makeNewFileInWaybackTempDir);
        this.originalIndexFileName = makeNewFileInWaybackTempDir.getName();
        this.isIndexed = true;
        log.info("Indexed '{}' to '{}'. Marking as indexed in DB.", this.filename, moveFileToWaybackOutputDir.getAbsolutePath());
        new ArchiveFileDAO().update(this);
    }

    private void batchIndex() {
        FileBatchJob waybackCDXExtractionWARCBatchJob;
        if (this.filename.matches("(.*)" + Settings.get(CommonSettings.METADATAFILE_REGEX_SUFFIX))) {
            waybackCDXExtractionWARCBatchJob = new DeduplicationCDXExtractionBatchJob();
        } else if (ARCUtils.isARC(this.filename)) {
            waybackCDXExtractionWARCBatchJob = new WaybackCDXExtractionARCBatchJob();
        } else {
            if (!WARCUtils.isWarc(this.filename)) {
                log.warn("Skipping indexing of file with filename '{}'", this.filename);
                return;
            }
            waybackCDXExtractionWARCBatchJob = new WaybackCDXExtractionWARCBatchJob();
        }
        waybackCDXExtractionWARCBatchJob.processOnlyFileNamed(this.filename);
        PreservationArcRepositoryClient preservationInstance = ArcRepositoryClientFactory.getPreservationInstance();
        String str = Settings.get(WaybackSettings.WAYBACK_REPLICA);
        log.info("Submitting {} for {} to {}", waybackCDXExtractionWARCBatchJob.getClass().getName(), getFilename(), str);
        BatchStatus batch = preservationInstance.batch(waybackCDXExtractionWARCBatchJob, str, new String[0]);
        log.info("Batch job for {} returned", getFilename());
        if (!batch.getFilesFailed().isEmpty() || batch.getNoOfFilesProcessed() == 0 || !batch.getExceptions().isEmpty()) {
            logBatchError(batch);
            return;
        }
        if (batch.getNoOfFilesProcessed() > 1) {
            log.warn("Processed '{}' files for {}.\n This may indicate a doublet in the arcrepository. Proceeding with caution.", Integer.valueOf(batch.getNoOfFilesProcessed()), getFilename());
        }
        try {
            collectResults(batch);
        } catch (Exception e) {
            logBatchError(batch);
            log.error("Failed to retrieve results", (Throwable) e);
        }
    }

    private void collectResults(BatchStatus batchStatus) {
        File makeNewFileInWaybackTempDir = makeNewFileInWaybackTempDir();
        log.info("Collecting index for '{}' to '{}'", getFilename(), makeNewFileInWaybackTempDir.getAbsolutePath());
        batchStatus.copyResults(makeNewFileInWaybackTempDir);
        log.info("Finished collecting index for '{}' to '{}'", getFilename(), makeNewFileInWaybackTempDir.getAbsolutePath());
        File moveFileToWaybackOutputDir = moveFileToWaybackOutputDir(makeNewFileInWaybackTempDir);
        this.originalIndexFileName = makeNewFileInWaybackTempDir.getName();
        this.isIndexed = true;
        log.info("Indexed '{}' to '{}'", this.filename, moveFileToWaybackOutputDir.getAbsolutePath());
        new ArchiveFileDAO().update(this);
    }

    private File makeNewFileInWaybackTempDir() {
        String uuid = UUID.randomUUID().toString();
        File file = new File(Settings.get(WaybackSettings.WAYBACK_INDEX_TEMPDIR));
        FileUtils.createDir(file);
        return new File(file, uuid);
    }

    private File moveFileToWaybackOutputDir(File file) {
        File file2 = new File(Settings.get(WaybackSettings.WAYBACK_BATCH_OUTPUTDIR));
        FileUtils.createDir(file2);
        File file3 = new File(file2, file.getName());
        file.renameTo(file3);
        return file3;
    }

    private void logBatchError(BatchStatus batchStatus) {
        String str = "Error indexing file '" + getFilename() + "'\nNumber of files processed: '" + batchStatus.getNoOfFilesProcessed() + "'\nNumber of files failed '" + batchStatus.getFilesFailed().size() + UsableURIFactory.SQUOT;
        if (!batchStatus.getExceptions().isEmpty()) {
            str = str + "\n Exceptions thrown: \n";
            Iterator<FileBatchJob.ExceptionOccurrence> it2 = batchStatus.getExceptions().iterator();
            while (it2.hasNext()) {
                str = str + it2.next().toString() + "\n";
            }
        }
        log.error(str);
        this.indexingFailedAttempts++;
        new ArchiveFileDAO().update(this);
    }

    public boolean equals(Object obj) {
        if (this == obj) {
            return true;
        }
        if (obj == null || getClass() != obj.getClass()) {
            return false;
        }
        ArchiveFile archiveFile = (ArchiveFile) obj;
        if (this.indexingFailedAttempts != archiveFile.indexingFailedAttempts || this.isIndexed != archiveFile.isIndexed || !this.filename.equals(archiveFile.filename)) {
            return false;
        }
        if (this.indexedDate != null) {
            if (!this.indexedDate.equals(archiveFile.indexedDate)) {
                return false;
            }
        } else if (archiveFile.indexedDate != null) {
            return false;
        }
        return this.originalIndexFileName != null ? this.originalIndexFileName.equals(archiveFile.originalIndexFileName) : archiveFile.originalIndexFileName == null;
    }

    public int hashCode() {
        return (31 * ((31 * ((31 * ((31 * this.filename.hashCode()) + (this.isIndexed ? 1 : 0))) + (this.originalIndexFileName != null ? this.originalIndexFileName.hashCode() : 0))) + this.indexingFailedAttempts)) + (this.indexedDate != null ? this.indexedDate.hashCode() : 0);
    }
}
