package dk.netarkivet.harvester.heritrix3;

import dk.netarkivet.common.distribute.indexserver.Index;
import dk.netarkivet.common.distribute.indexserver.IndexClientFactory;
import dk.netarkivet.common.exceptions.ArgumentNotValid;
import dk.netarkivet.common.exceptions.IOFailure;
import dk.netarkivet.common.exceptions.IllegalState;
import dk.netarkivet.common.exceptions.PermissionDenied;
import dk.netarkivet.common.utils.ExceptionUtils;
import dk.netarkivet.common.utils.FileUtils;
import dk.netarkivet.common.utils.Settings;
import dk.netarkivet.common.utils.StringUtils;
import dk.netarkivet.harvester.HarvesterSettings;
import dk.netarkivet.harvester.datamodel.HarvestDefinitionInfo;
import dk.netarkivet.harvester.datamodel.Job;
import dk.netarkivet.harvester.harvesting.PersistentJobData;
import dk.netarkivet.harvester.harvesting.metadata.MetadataEntry;
import java.io.File;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:dk/netarkivet/harvester/heritrix3/HarvestJob.class */
/**
 * Prepares and launches a single harvest job.
 *
 * <p>{@link #init} creates a crawl directory named after the job ID and the current time and
 * writes the job's files into it; {@link #runHarvest()} then starts the crawl through a launcher
 * obtained from {@code HeritrixLauncherFactory}.
 */
public class HarvestJob {
    private static final Logger log = LoggerFactory.getLogger(HarvestJob.class);

    /** Server that is notified when the crawl directory cannot be created. */
    private final HarvestControllerServer hcs;

    /** The job being harvested; set by {@link #init}. */
    private Job job;

    /** Directory holding all files for this crawl; set by {@link #init}. */
    private File crawlDir;

    /** Handle to the files written for this crawl; set by {@link #init}. */
    private Heritrix3Files files;

    /** Name of the crawl directory: {@code <jobID>_<currentTimeMillis>}; set by {@link #init}. */
    private String jobName;

    /**
     * Creates a harvest job bound to the given server.
     *
     * @param harvestControllerServer server used to send an error message if the crawl directory
     *        cannot be created
     */
    public HarvestJob(HarvestControllerServer harvestControllerServer) {
        this.hcs = harvestControllerServer;
    }

    /**
     * Initializes this harvest job: records the job, derives the job name, creates the crawl
     * directory and writes the harvest files into it.
     *
     * @param job the job to harvest
     * @param harvestDefinitionInfo harvest definition info written alongside the job
     * @param list metadata entries associated with the job; {@code null} is treated as empty
     */
    public void init(Job job, HarvestDefinitionInfo harvestDefinitionInfo, List<MetadataEntry> list) {
        this.job = job;
        this.jobName = job.getJobID() + "_" + System.currentTimeMillis();
        this.crawlDir = createCrawlDir();
        this.files = writeHarvestFiles(this.crawlDir, job, harvestDefinitionInfo, list);
    }

    /**
     * @return the files object produced by {@link #init}, or {@code null} if {@link #init} has not
     *         completed
     */
    public Heritrix3Files getHeritrix3Files() {
        return this.files;
    }

    /**
     * Starts the crawl for the job set up by {@link #init}.
     *
     * @throws ArgumentNotValid declared by the original signature; not thrown directly by this method
     */
    public void runHarvest() throws ArgumentNotValid {
        log.info("Starting crawl of job : {}", this.job.getJobID());
        HeritrixLauncherFactory.getInstance(this.files, this.jobName).doCrawl();
    }

    /**
     * Creates the crawl directory for the current job below the directory configured by
     * {@code HarvesterSettings.HARVEST_CONTROLLER_SERVERDIR}.
     *
     * <p>Relies on the job and job name assigned in {@link #init}.
     *
     * @return the created directory
     * @throws PermissionDenied if the directory could not be created; the failure is logged and
     *         reported to the harvest controller server before being rethrown
     */
    public File createCrawlDir() {
        try {
            File serverDir = new File(Settings.get(HarvesterSettings.HARVEST_CONTROLLER_SERVERDIR));
            File dir = new File(serverDir, this.jobName);
            FileUtils.createDir(dir);
            log.info("Created crawl directory: '{}'", dir);
            return dir;
        } catch (PermissionDenied e) {
            String message = "Couldn't create the directory for job " + this.job.getJobID();
            log.warn(message, e);
            this.hcs.sendErrorMessage(this.job.getJobID().longValue(), message, ExceptionUtils.getStackTrace(e));
            throw e;
        }
    }

    /**
     * Writes the files needed for the crawl into the given crawl directory: persistent job data,
     * pre-harvest metadata, the seed list and the order XML. When the order XML reports that
     * deduplication is enabled, a deduplication index is fetched and registered on the returned
     * files object.
     *
     * @param file the crawl directory
     * @param job the job to write files for
     * @param harvestDefinitionInfo harvest definition info stored with the persistent job data
     * @param list metadata entries for the job; {@code null} is treated as empty
     * @return the files object describing the crawl directory
     * @throws IllegalState if persistent job data already exists in the crawl directory
     * @throws IOFailure if the metadata directory could not be created
     */
    public Heritrix3Files writeHarvestFiles(File file, Job job, HarvestDefinitionInfo harvestDefinitionInfo, List<MetadataEntry> list) {
        Heritrix3Files h3Files = Heritrix3Files.getH3HeritrixFiles(file, job);
        if (job.getContinuationOf() != null && Settings.getBoolean(HarvesterSettings.RECOVERlOG_CONTINUATION_ENABLED)) {
            log.warn("Continuation of crawl from a RecoverLog is not implemented for Heritrix3!");
        }
        log.debug("Writing persistent job data for job {} to crawldir '{}'", job.getJobID(), file);
        // Refuse to overwrite job data left in the directory by an earlier run.
        if (PersistentJobData.existsIn(file)) {
            throw new IllegalState("We already found a harvestInfo.xml for the crawldir " + file.getAbsolutePath());
        }
        new PersistentJobData(file).write(job, harvestDefinitionInfo);
        writePreharvestMetadata(job, list, file);
        h3Files.writeSeedsTxt(job.getSeedListAsString());
        h3Files.writeOrderXml(job.getOrderXMLdoc());
        if (job.getOrderXMLdoc().IsDeduplicationEnabled()) {
            log.debug("Deduplication enabled. Fetching deduplication index..");
            h3Files.setIndexDir(fetchDeduplicateIndex(list));
        } else {
            log.debug("Deduplication disabled.");
        }
        return h3Files;
    }

    /**
     * Stores the given metadata entries in a {@code metadata} subdirectory of the crawl directory.
     * Does nothing when there are no entries.
     *
     * @param job the job the metadata belongs to (used for the error message only)
     * @param list the metadata entries; {@code null} is treated as empty
     * @param file the crawl directory
     * @throws IOFailure if the metadata directory does not exist after the attempt to create it
     */
    private void writePreharvestMetadata(Job job, List<MetadataEntry> list, File file) throws IOFailure {
        if (list == null || list.isEmpty()) {
            return;
        }
        File metadataDir = new File(file, "metadata");
        // The return value of mkdir() is not needed: the directory check below covers both a failed
        // creation and a pre-existing non-directory file of the same name.
        metadataDir.mkdir();
        if (!metadataDir.isDirectory()) {
            throw new IOFailure("Unable to write preharvest metadata for job '" + job.getJobID() + "' to directory '" + metadataDir.getAbsolutePath() + "', as directory does not exist.");
        }
        MetadataEntry.storeMetadataToDisk(list, metadataDir);
    }

    /**
     * Requests a deduplication index for the job IDs named in the duplicate-reduction metadata
     * entries and returns the index file.
     *
     * @param list the metadata entries of the job; {@code null} is treated as empty
     * @return the index file of the received index
     */
    private File fetchDeduplicateIndex(List<MetadataEntry> list) {
        Set<Long> requestedJobIds = new HashSet<Long>(parseJobIDsForDuplicateReduction(list));
        Index<Set<Long>> index = IndexClientFactory.getDedupCrawllogInstance().getIndex(requestedJobIds);
        if (log.isDebugEnabled()) {
            // Work out which requested jobs did not make it into the index, for diagnostics only.
            Set<Long> missingJobIds = new HashSet<Long>(requestedJobIds);
            missingJobIds.removeAll(index.getIndexSet());
            log.debug("Received deduplication index containing {} jobs. {}", index.getIndexSet().size(),
                    missingJobIds.isEmpty() ? "" : "Missing jobs: " + StringUtils.conjoin(",", missingJobIds));
        }
        return index.getIndexFile();
    }

    /**
     * Extracts job IDs from the duplicate-reduction metadata entries. Each such entry's data is
     * read as a comma-separated list of job IDs; surrounding whitespace around an ID is ignored.
     * Tokens that are not valid numbers are logged and skipped.
     *
     * @param list the metadata entries to scan; {@code null} is treated as empty
     * @return the parsed job IDs, in encounter order (possibly empty, never {@code null})
     */
    private List<Long> parseJobIDsForDuplicateReduction(List<MetadataEntry> list) {
        List<Long> jobIds = new ArrayList<Long>();
        if (list == null) {
            return jobIds;
        }
        for (MetadataEntry entry : list) {
            if (!entry.isDuplicateReductionMetadataEntry()) {
                continue;
            }
            // NOTE(review): decodes with the platform default charset; confirm this matches how the
            // entry data was encoded before switching to an explicit charset.
            String data = new String(entry.getData());
            if (data.isEmpty()) {
                continue;
            }
            for (String token : data.split(",")) {
                try {
                    // Trim so that lists written as "1, 2" are accepted instead of dropping IDs.
                    jobIds.add(Long.valueOf(token.trim()));
                } catch (NumberFormatException e) {
                    log.warn("Unable to convert String '{}' in duplicate reduction jobid list metadataEntry '{}' to a jobID. Ignoring.", token, data, e);
                }
            }
        }
        return jobIds;
    }
}
