package dk.netarkivet.harvester.harvesting;

import dk.netarkivet.common.Constants;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.net.InetAddress;
import java.net.URI;
import java.net.UnknownHostException;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.commons.lang.StringUtils;
import org.archive.format.warc.WARCConstants;
import org.archive.io.warc.WARCRecordInfo;
import org.archive.io.warc.WARCWriter;
import org.archive.modules.CrawlMetadata;
import org.archive.modules.CrawlURI;
import org.archive.modules.writer.WARCWriterProcessor;
import org.archive.util.ArchiveUtils;
import org.archive.util.anvl.ANVLRecord;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:dk/netarkivet/harvester/harvesting/NasWARCProcessor.class */
/**
 * WARC writer processor that adds NetarchiveSuite "harvestInfo" fields to the
 * warcinfo metadata and writes a per-URI WARC {@code metadata} record.
 *
 * <p>The harvestInfo values are read from {@link #metadataMap}, which is populated
 * through {@link #setMetadataItems(Map)}.
 *
 * <p>NOTE(review): {@link #getMetadata()} and
 * {@code writeMetadata(WARCWriter, String, URI, CrawlURI, ANVLRecord)} presumably
 * override methods of {@link WARCWriterProcessor}; confirm against the Heritrix
 * version in use before adding {@code @Override}.
 */
public class NasWARCProcessor extends WARCWriterProcessor {
    private static final Logger logger = LoggerFactory.getLogger(NasWARCProcessor.class);

    // Keys of the harvestInfo entries expected in metadataMap; the same strings are
    // used as ANVL labels in the warcinfo record.
    private static final String HARVESTINFO_VERSION = "harvestInfo.version";
    private static final String HARVESTINFO_JOBID = "harvestInfo.jobId";
    private static final String HARVESTINFO_CHANNEL = "harvestInfo.channel";
    private static final String HARVESTINFO_HARVESTNUM = "harvestInfo.harvestNum";
    private static final String HARVESTINFO_ORIGHARVESTDEFINITIONID = "harvestInfo.origHarvestDefinitionID";
    private static final String HARVESTINFO_MAXBYTESPERDOMAIN = "harvestInfo.maxBytesPerDomain";
    private static final String HARVESTINFO_MAXOBJECTSPERDOMAIN = "harvestInfo.maxObjectsPerDomain";
    private static final String HARVESTINFO_ORDERXMLNAME = "harvestInfo.templateName";
    private static final String HARVESTINFO_ORDERXMLUPDATEDATE = "harvestInfo.templateLastUpdateDate";
    private static final String HARVESTINFO_ORDERXMLDESCRIPTION = "harvestInfo.templateDescription";
    private static final String HARVESTINFO_ORIGHARVESTDEFINITIONNAME = "harvestInfo.origHarvestDefinitionName";
    private static final String HARVESTINFO_ORIGHARVESTDEFINITIONCOMMENTS = "harvestInfo.origHarvestDefinitionComments";
    private static final String HARVESTINFO_SCHEDULENAME = "harvestInfo.scheduleName";
    private static final String HARVESTINFO_HARVESTFILENAMEPREFIX = "harvestInfo.harvestFilenamePrefix";
    private static final String HARVESTINFO_JOBSUBMITDATE = "harvestInfo.jobSubmitDate";
    private static final String HARVESTINFO_PERFORMER = "harvestInfo.performer";
    private static final String HARVESTINFO_OPERATOR = "harvestInfo.operator";
    private static final String HARVESTINFO_AUDIENCE = "harvestInfo.audience";

    /** Result of the first {@link #getMetadata()} call; {@code null} until then. */
    List<String> cachedMetadata;

    /** harvestInfo key/value pairs; may be replaced (including by {@code null}) via the setter. */
    protected Map<String, String> metadataMap = new HashMap<String, String>();

    /**
     * Tells whether outlinks are written into the per-URI metadata record.
     *
     * @return the configured value, or {@code false} when the property has never been set
     */
    public boolean getWriteMetadataOutlinks() {
        // No default is installed by this class, so an unset property would otherwise
        // fail on unboxing null.
        // NOTE(review): false is assumed to be the intended default - confirm.
        Object configured = this.kp.get("writeMetadataOutlinks");
        return configured != null && ((Boolean) configured).booleanValue();
    }

    /**
     * Sets whether outlinks are written into the per-URI metadata record.
     *
     * @param z {@code true} to write one "outlink" field per outlink
     */
    public void setWriteMetadataOutlinks(boolean z) {
        this.kp.put("writeMetadataOutlinks", Boolean.valueOf(z));
    }

    /**
     * Returns the harvestInfo map currently held by this processor.
     *
     * @return the live map instance (not a copy); may be {@code null} if it was set to {@code null}
     */
    public Map<String, String> getFormItems() {
        return this.metadataMap;
    }

    /**
     * Replaces the harvestInfo map used by {@link #getMetadata()}.
     *
     * @param map the new key/value pairs; stored by reference
     */
    public void setMetadataItems(Map<String, String> map) {
        this.metadataMap = map;
    }

    /**
     * Builds (once) and returns the warcinfo metadata text.
     *
     * <p>The single list element consists of the general crawl fields, followed by a
     * {@code #added by NetarchiveSuite <version>} line, followed by the harvestInfo
     * fields taken from {@link #metadataMap}. The result is cached in
     * {@link #cachedMetadata}; later changes to the map are not reflected.
     *
     * @return a singleton list holding the metadata text
     */
    public List<String> getMetadata() {
        if (this.cachedMetadata != null) {
            return this.cachedMetadata;
        }
        ANVLRecord crawlInfo = new ANVLRecord();
        crawlInfo.addLabelValue("software", "Heritrix/" + ArchiveUtils.VERSION + " http://crawler.archive.org");
        try {
            InetAddress localHost = InetAddress.getLocalHost();
            crawlInfo.addLabelValue("ip", localHost.getHostAddress());
            crawlInfo.addLabelValue("hostname", localHost.getCanonicalHostName());
        } catch (UnknownHostException e) {
            // Best effort: the record is still written, just without ip/hostname.
            logger.warn("Unable to obtain local host address; omitting ip/hostname from warcinfo metadata", e);
        }
        crawlInfo.addLabelValue("format", "WARC File Format 1.0");
        crawlInfo.addLabelValue("conformsTo", "http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf");
        CrawlMetadata metadataProvider = getMetadataProvider();
        addIfNotBlank(crawlInfo, "operator", metadataProvider.getOperator());
        addIfNotBlank(crawlInfo, "publisher", metadataProvider.getOrganization());
        addIfNotBlank(crawlInfo, "audience", metadataProvider.getAudience());
        addIfNotBlank(crawlInfo, "isPartOf", metadataProvider.getJobName());
        addIfNotBlank(crawlInfo, "description", metadataProvider.getDescription());
        addIfNotBlank(crawlInfo, "robots", metadataProvider.getRobotsPolicyName().toLowerCase());
        addIfNotBlank(crawlInfo, "http-header-user-agent", metadataProvider.getUserAgent());
        addIfNotBlank(crawlInfo, "http-header-from", metadataProvider.getOperatorFrom());

        String netarchiveSuiteComment = "#added by NetarchiveSuite " + Constants.getVersionString(false);
        ANVLRecord harvestInfo = buildHarvestInfoRecord();

        this.cachedMetadata = Collections.singletonList(
                crawlInfo.toString() + netarchiveSuiteComment + "\n" + harvestInfo.toString());
        return this.cachedMetadata;
    }

    /**
     * Collects the harvestInfo fields from {@link #metadataMap} into an ANVL record.
     * Any exception while adding a field is logged and ends processing; fields added
     * before the failure are kept.
     */
    private ANVLRecord buildHarvestInfoRecord() {
        ANVLRecord harvestInfo = new ANVLRecord();
        if (this.metadataMap == null) {
            logger.warn("No NetarchiveSuite harvestInfo data available in the template");
            return harvestInfo;
        }
        try {
            // Field order is significant: it is the order in which the fields are emitted.
            addHarvestInfo(harvestInfo, HARVESTINFO_VERSION);
            addHarvestInfo(harvestInfo, HARVESTINFO_JOBID);
            addHarvestInfo(harvestInfo, HARVESTINFO_CHANNEL);
            addHarvestInfo(harvestInfo, HARVESTINFO_HARVESTNUM);
            addHarvestInfo(harvestInfo, HARVESTINFO_ORIGHARVESTDEFINITIONID);
            addHarvestInfo(harvestInfo, HARVESTINFO_MAXBYTESPERDOMAIN);
            addHarvestInfo(harvestInfo, HARVESTINFO_MAXOBJECTSPERDOMAIN);
            addHarvestInfo(harvestInfo, HARVESTINFO_ORDERXMLNAME);
            addHarvestInfoIfPresent(harvestInfo, HARVESTINFO_ORDERXMLUPDATEDATE);
            addHarvestInfoIfPresent(harvestInfo, HARVESTINFO_ORDERXMLDESCRIPTION);
            addHarvestInfo(harvestInfo, HARVESTINFO_ORIGHARVESTDEFINITIONNAME);
            addHarvestInfoIfPresent(harvestInfo, HARVESTINFO_ORIGHARVESTDEFINITIONCOMMENTS);
            addHarvestInfoIfPresent(harvestInfo, HARVESTINFO_SCHEDULENAME);
            addHarvestInfo(harvestInfo, HARVESTINFO_HARVESTFILENAMEPREFIX);
            addHarvestInfo(harvestInfo, HARVESTINFO_JOBSUBMITDATE);
            addHarvestInfoIfPresent(harvestInfo, HARVESTINFO_PERFORMER);
            addHarvestInfoIfPresent(harvestInfo, HARVESTINFO_OPERATOR);
            addHarvestInfoIfPresent(harvestInfo, HARVESTINFO_AUDIENCE);
        } catch (Exception e) {
            logger.warn("Error processing harvest info", e);
        }
        return harvestInfo;
    }

    /** Adds the map value for {@code key} unconditionally, labelled with the key itself. */
    private void addHarvestInfo(ANVLRecord record, String key) {
        record.addLabelValue(key, this.metadataMap.get(key));
    }

    /** Adds the map value for {@code key} only when the map contains that key. */
    private void addHarvestInfoIfPresent(ANVLRecord record, String key) {
        if (this.metadataMap.containsKey(key)) {
            addHarvestInfo(record, key);
        }
    }

    /**
     * Writes a WARC {@code metadata} record describing how {@code crawlURI} was fetched.
     *
     * <p>The record body is an ANVL block that may contain: {@code seed} (for seeds) or
     * {@code force-fetch}/{@code via}/{@code hopsFromSeed}/{@code sourceTag} (for
     * non-seeds), {@code fetchTimeMs}, {@code ftpFetchStatus},
     * {@code charsetForLinkExtraction}, charset annotations, and - when
     * {@link #getWriteMetadataOutlinks()} is {@code true} - one {@code outlink} field per
     * outlink.
     *
     * @param wARCWriter writer the record is written to
     * @param str value passed to {@code setCreate14DigitDate} of the record
     * @param uri base record id that is qualified with {@code type=metadata}
     * @param crawlURI the URI the metadata describes
     * @param aNVLRecord extra headers attached to the record
     * @return the record id of the written metadata record
     * @throws IOException if encoding the body or writing the record fails
     */
    protected URI writeMetadata(WARCWriter wARCWriter, String str, URI uri, CrawlURI crawlURI, ANVLRecord aNVLRecord) throws IOException {
        WARCRecordInfo recordInfo = new WARCRecordInfo();
        recordInfo.setType(WARCConstants.WARCRecordType.metadata);
        recordInfo.setUrl(crawlURI.toString());
        recordInfo.setCreate14DigitDate(str);
        recordInfo.setMimetype("application/warc-fields");
        recordInfo.setExtraHeaders(aNVLRecord);
        recordInfo.setEnforceLength(true);
        recordInfo.setRecordId(qualifyRecordID(uri, "type", WARCConstants.WARCRecordType.metadata.toString()));

        ANVLRecord body = new ANVLRecord();
        if (crawlURI.isSeed()) {
            body.addLabel("seed");
        } else {
            if (crawlURI.forceFetch()) {
                body.addLabel("force-fetch");
            }
            String via = flattenVia(crawlURI);
            if (StringUtils.isNotBlank(via)) {
                body.addLabelValue("via", via);
            }
            if (StringUtils.isNotBlank(crawlURI.getPathFromSeed())) {
                body.addLabelValue("hopsFromSeed", crawlURI.getPathFromSeed());
            }
            if (crawlURI.containsDataKey("source")) {
                body.addLabelValue("sourceTag", (String) crawlURI.getData().get("source"));
            }
        }

        // A negative difference means the fetch timestamps are unusable; skip the field then.
        long fetchDurationMs = crawlURI.getFetchCompletedTime() - crawlURI.getFetchBeginTime();
        if (fetchDurationMs > -1) {
            body.addLabelValue("fetchTimeMs", Long.toString(fetchDurationMs));
        }
        if (crawlURI.getData().containsKey("ftp-fetch-status")) {
            body.addLabelValue("ftpFetchStatus", crawlURI.getData().get("ftp-fetch-status").toString());
        }
        if (crawlURI.getRecorder() != null && crawlURI.getRecorder().getCharset() != null) {
            body.addLabelValue("charsetForLinkExtraction", crawlURI.getRecorder().getCharset().name());
        }

        // Charset annotations are stored as "label:value"; emit them as separate fields.
        for (String annotation : crawlURI.getAnnotations()) {
            if (annotation.startsWith("usingCharsetIn") || annotation.startsWith("inconsistentCharsetIn")) {
                String[] labelAndValue = annotation.split(":", 2);
                if (labelAndValue.length == 2) {
                    body.addLabelValue(labelAndValue[0], labelAndValue[1]);
                } else {
                    // No ':' separator: skip instead of failing the whole record.
                    logger.debug("Skipping charset annotation without ':' separator: {}", annotation);
                }
            }
        }

        if (getWriteMetadataOutlinks()) {
            Collection<CrawlURI> outLinks = crawlURI.getOutLinks();
            if (outLinks != null) {
                for (CrawlURI outLink : outLinks) {
                    body.addLabelValue("outlink",
                            outLink.getURI() + " " + outLink.getLastHop() + " " + outLink.getViaContext());
                }
            }
        }

        // Encode once so the declared content length matches the streamed bytes exactly.
        byte[] payload = body.getUTF8Bytes();
        recordInfo.setContentStream(new ByteArrayInputStream(payload));
        recordInfo.setContentLength(payload.length);
        wARCWriter.writeRecord(recordInfo);
        return recordInfo.getRecordId();
    }
}
