package org.archive.modules.warc;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.net.URI;
import java.util.Collection;
import org.apache.commons.lang.StringUtils;
import org.archive.format.warc.WARCConstants;
import org.archive.io.warc.WARCRecordInfo;
import org.archive.modules.CoreAttributeConstants;
import org.archive.modules.CrawlURI;
import org.archive.modules.fetcher.FetchHTTP;
import org.archive.util.ArchiveUtils;
import org.archive.util.anvl.ANVLRecord;

/* loaded from: input_file:org/archive/modules/warc/MetadataRecordBuilder.class */
/**
 * Builds the WARC {@code metadata} record that accompanies a crawled URI.
 *
 * <p>The record body is an ANVL field list ({@code application/warc-fields})
 * describing how the URI was reached and fetched: seed/force-fetch flags, via,
 * hop path, source tag, fetch duration, FTP status, charset information and
 * the outlinks held by the {@link CrawlURI}.
 */
public class MetadataRecordBuilder extends BaseWARCRecordBuilder {

    /**
     * Decides whether a metadata record is written for the given URI.
     *
     * @param crawlURI the URI being processed
     * @return {@code true} when the lower-cased scheme starts with
     *         {@link FetchHTTP#HTTP_SCHEME} or is exactly {@code ftp} or {@code sftp}
     */
    @Override
    public boolean shouldBuildRecord(CrawlURI crawlURI) {
        String scheme = crawlURI.getUURI().getScheme().toLowerCase();
        return scheme.startsWith(FetchHTTP.HTTP_SCHEME)
                || "ftp".equals(scheme)
                || "sftp".equals(scheme);
    }

    /**
     * Assembles the metadata record for the given URI.
     *
     * @param crawlURI the URI whose crawl metadata is recorded
     * @param uri id of the record this one is concurrent to; when non-null it is
     *            written as a {@code WARC-Concurrent-To} header, when {@code null}
     *            that header is omitted
     * @return a populated record whose content stream holds the UTF-8 encoded
     *         ANVL fields and whose content length matches that byte count
     * @throws IOException declared by the overridden builder contract
     */
    @Override
    public WARCRecordInfo buildRecord(CrawlURI crawlURI, URI uri) throws IOException {
        String timestamp = ArchiveUtils.getLog14Date(crawlURI.getFetchBeginTime());

        WARCRecordInfo recordInfo = new WARCRecordInfo();
        recordInfo.setType(WARCConstants.WARCRecordType.metadata);
        recordInfo.setRecordId(generateRecordID());
        if (uri != null) {
            recordInfo.addExtraHeader("WARC-Concurrent-To", "<" + uri + ">");
        }
        recordInfo.setUrl(crawlURI.toString());
        recordInfo.setCreate14DigitDate(timestamp);
        recordInfo.setMimetype("application/warc-fields");
        recordInfo.setEnforceLength(true);

        ANVLRecord fields = new ANVLRecord();
        if (crawlURI.isSeed()) {
            fields.addLabel("seed");
        } else {
            if (crawlURI.forceFetch()) {
                fields.addLabel("force-fetch");
            }
            // A non-seed URI may still lack a via; guard against null before
            // flattening it to text.
            Object via = crawlURI.getVia();
            String viaText = (via == null) ? null : via.toString();
            if (StringUtils.isNotBlank(viaText)) {
                fields.addLabelValue("via", viaText);
            }
            String pathFromSeed = crawlURI.getPathFromSeed();
            if (StringUtils.isNotBlank(pathFromSeed)) {
                fields.addLabelValue("hopsFromSeed", pathFromSeed);
            }
            if (crawlURI.containsDataKey(CoreAttributeConstants.A_SOURCE_TAG)) {
                fields.addLabelValue("sourceTag",
                        (String) crawlURI.getData().get(CoreAttributeConstants.A_SOURCE_TAG));
            }
        }

        // A negative duration is not reported.
        long fetchDurationMs = crawlURI.getFetchCompletedTime() - crawlURI.getFetchBeginTime();
        if (fetchDurationMs > -1) {
            fields.addLabelValue("fetchTimeMs", Long.toString(fetchDurationMs));
        }

        if (crawlURI.getData().containsKey(CoreAttributeConstants.A_FTP_FETCH_STATUS)) {
            fields.addLabelValue("ftpFetchStatus",
                    crawlURI.getData().get(CoreAttributeConstants.A_FTP_FETCH_STATUS).toString());
        }

        if (crawlURI.getRecorder() != null && crawlURI.getRecorder().getCharset() != null) {
            fields.addLabelValue("charsetForLinkExtraction",
                    crawlURI.getRecorder().getCharset().name());
        }

        // Charset annotations are expected as "label:value"; one without a
        // colon is kept as a bare label instead of failing on a missing value.
        for (String annotation : crawlURI.getAnnotations()) {
            if (annotation.startsWith("usingCharsetIn")
                    || annotation.startsWith("inconsistentCharsetIn")) {
                String[] labelAndValue = annotation.split(":", 2);
                if (labelAndValue.length > 1) {
                    fields.addLabelValue(labelAndValue[0], labelAndValue[1]);
                } else {
                    fields.addLabel(labelAndValue[0]);
                }
            }
        }

        Collection<CrawlURI> outLinks = crawlURI.getOutLinks();
        if (outLinks != null) {
            for (CrawlURI outLink : outLinks) {
                fields.addLabelValue("outlink",
                        outLink.getURI() + " " + outLink.getLastHop() + " " + outLink.getViaContext());
            }
        }

        // Serialize once so the stream and the declared length share the same bytes.
        byte[] payload = fields.getUTF8Bytes();
        recordInfo.setContentStream(new ByteArrayInputStream(payload));
        recordInfo.setContentLength(payload.length);
        return recordInfo;
    }
}
