package org.archive.modules.extractor;

import java.nio.charset.CoderMalfunctionError;
import java.util.concurrent.atomic.AtomicLong;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.commons.httpclient.URIException;
import org.archive.modules.CrawlURI;
import org.archive.modules.Processor;
import org.archive.net.UURI;
import org.archive.net.UURIFactory;
import org.archive.util.UriUtils;
import org.json.JSONException;
import org.json.JSONObject;
import org.springframework.beans.factory.annotation.Autowired;

/* loaded from: input_file:org/archive/modules/extractor/Extractor.class */
/**
 * Base class for processors that pull outlinks out of a {@link CrawlURI}.
 *
 * <p>Subclasses implement {@link #extract(CrawlURI)}. This class wraps that call in
 * {@link #innerProcess(CrawlURI)} so that a small set of known failure types is recorded on
 * the URI instead of propagating, and offers helpers for turning discovered strings/URIs into
 * outlinks of the processed URI.
 *
 * <p>The running link total ({@link #numberOfLinksExtracted}) is included in checkpoints and
 * in {@link #report()}; this class itself never increments it, so subclasses are presumably
 * expected to do so.
 */
public abstract class Extractor extends Processor {

    /** Single class-wide logger (the original declared two identical loggers). */
    private static final Logger LOGGER = Logger.getLogger(Extractor.class.getName());

    /**
     * Parameters used until others are supplied through
     * {@link #setExtractorParameters(ExtractorParameters)}: at most 6000 outlinks, no
     * independent extraction, and no extraction from 404 responses.
     */
    public static final ExtractorParameters DEFAULT_PARAMETERS = new ExtractorParameters() {
        @Override
        public int getMaxOutlinks() {
            return 6000;
        }

        @Override
        public boolean getExtractIndependently() {
            return false;
        }

        @Override
        public boolean getExtract404s() {
            return false;
        }
    };

    /** Total number of links extracted; persisted by the checkpoint methods below. */
    protected AtomicLong numberOfLinksExtracted = new AtomicLong(0);

    /** Sink for URI errors; injected, and {@code transient}, so it may be {@code null}. */
    protected transient UriErrorLoggerModule loggerModule;

    /** Active extraction parameters; starts as {@link #DEFAULT_PARAMETERS}. */
    protected transient ExtractorParameters extractorParameters;

    /** Creates an extractor configured with {@link #DEFAULT_PARAMETERS}. */
    public Extractor() {
        // Assign the field directly rather than going through the public, overridable setter:
        // calling an overridable method here would run subclass code before the subclass's
        // own fields and constructor body have been initialised.
        this.extractorParameters = DEFAULT_PARAMETERS;
    }

    /**
     * @return the module URI errors are reported to, or {@code null} if none has been set
     */
    public UriErrorLoggerModule getLoggerModule() {
        return this.loggerModule;
    }

    /**
     * Sets the module that receives URI errors (see
     * {@link #logUriError(URIException, UURI, CharSequence)}).
     *
     * @param uriErrorLoggerModule the module to report to
     */
    @Autowired
    public void setLoggerModule(UriErrorLoggerModule uriErrorLoggerModule) {
        this.loggerModule = uriErrorLoggerModule;
    }

    /**
     * @return the extraction parameters currently in effect
     */
    public ExtractorParameters getExtractorParameters() {
        return this.extractorParameters;
    }

    /**
     * Replaces the extraction parameters. Injection is optional; when nothing is injected the
     * value assigned by the constructor ({@link #DEFAULT_PARAMETERS}) stays in place.
     *
     * @param extractorParameters the parameters to use from now on
     */
    @Autowired(required = false)
    public void setExtractorParameters(ExtractorParameters extractorParameters) {
        this.extractorParameters = extractorParameters;
    }

    /**
     * Runs {@link #extract(CrawlURI)} and records, rather than propagates, the three failure
     * types listed below. Any other throwable raised by {@code extract} propagates unchanged.
     *
     * @param crawlURI the URI being processed
     * @throws InterruptedException declared by the overridden method
     */
    @Override
    protected final void innerProcess(CrawlURI crawlURI) throws InterruptedException {
        try {
            extract(crawlURI);
        } catch (NullPointerException npe) {
            handleException(crawlURI, npe);
        } catch (StackOverflowError overflow) {
            handleException(crawlURI, overflow);
        } catch (CoderMalfunctionError coderFailure) {
            handleException(crawlURI, coderFailure);
        }
    }

    /**
     * Records a tolerated extraction failure on the URI: adds an {@code err=<class name>}
     * annotation, appends the throwable to the URI's non-fatal failures, and logs it at INFO.
     */
    private void handleException(CrawlURI crawlURI, Throwable failure) {
        crawlURI.getAnnotations().add("err=" + failure.getClass().getName());
        crawlURI.getNonFatalFailures().add(failure);
        LOGGER.log(Level.INFO, "Exception", failure);
    }

    /**
     * Performs the actual link extraction for one URI.
     *
     * @param crawlURI the URI whose content should be examined
     */
    protected abstract void extract(CrawlURI crawlURI);

    /**
     * Resolves {@code str} against the URI of {@code crawlURI} and adds the result as an
     * outlink. Data URIs (as judged by {@code UriUtils.isDataUri}) are skipped.
     *
     * <p>NOTE(review): decompiler output indicates this method was declared
     * {@code protected} in the original source; confirm before relying on it being public.
     *
     * @param crawlURI    the URI the link was found in
     * @param str         the link text, possibly relative
     * @param linkContext context in which the link was found
     * @param hop         hop type of the link
     * @return the newly added outlink, or {@code null} if the link was a data URI or could
     *         not be turned into a valid URI (the latter is reported via
     *         {@link #logUriError(URIException, UURI, CharSequence)})
     */
    public CrawlURI addOutlink(CrawlURI crawlURI, String str, LinkContext linkContext, Hop hop) {
        if (UriUtils.isDataUri(str)) {
            return null;
        }
        try {
            UURI target = UURIFactory.getInstance(crawlURI.getUURI(), str);
            CrawlURI outlink = crawlURI.createCrawlURI(target, linkContext, hop);
            crawlURI.getOutLinks().add(outlink);
            return outlink;
        } catch (URIException e) {
            logUriError(e, crawlURI.getUURI(), str);
            return null;
        }
    }

    /**
     * Adds an already-built URI as an outlink of {@code crawlURI}. URIs whose scheme is
     * {@code data} (case-insensitive) are skipped.
     *
     * <p>NOTE(review): decompiler output indicates this method was declared
     * {@code protected} in the original source; confirm before relying on it being public.
     *
     * @param crawlURI    the URI the link was found in
     * @param uuri        the link target
     * @param linkContext context in which the link was found
     * @param hop         hop type of the link
     */
    public void addOutlink(CrawlURI crawlURI, UURI uuri, LinkContext linkContext, Hop hop) {
        if ("data".equalsIgnoreCase(uuri.getScheme())) {
            return;
        }
        try {
            CrawlURI outlink = crawlURI.createCrawlURI(uuri, linkContext, hop);
            crawlURI.getOutLinks().add(outlink);
        } catch (URIException e) {
            logUriError(e, crawlURI.getUURI(), uuri.toString());
        }
    }

    /**
     * Reports a URI that could not be processed.
     *
     * <p>The report goes to the configured {@link UriErrorLoggerModule}. If none is set, the
     * error is written to this class's logger at INFO instead of failing with a
     * {@link NullPointerException}, which would otherwise hide the original URI error and
     * abort extraction of the remaining links.
     *
     * @param uRIException the error raised while handling the URI
     * @param uuri         the URI of the document the bad link was found in
     * @param charSequence the offending link text
     */
    public void logUriError(URIException uRIException, UURI uuri, CharSequence charSequence) {
        UriErrorLoggerModule module = this.loggerModule;
        if (module == null) {
            LOGGER.log(Level.INFO,
                    "URI error for '" + charSequence + "' found in " + uuri
                            + " (no UriErrorLoggerModule configured)",
                    uRIException);
            return;
        }
        module.logUriError(uRIException, uuri, charSequence);
    }

    /**
     * Extends the superclass checkpoint with the current link total, stored under the key
     * {@code numberOfLinksExtracted}.
     *
     * @return the checkpoint JSON including the link total
     * @throws JSONException propagated from the superclass or from writing the key
     */
    @Override
    public JSONObject toCheckpointJson() throws JSONException {
        JSONObject checkpointJson = super.toCheckpointJson();
        checkpointJson.put("numberOfLinksExtracted", this.numberOfLinksExtracted.get());
        return checkpointJson;
    }

    /**
     * Restores superclass state and then the link total from the key
     * {@code numberOfLinksExtracted}.
     *
     * @param jSONObject checkpoint data previously produced by {@link #toCheckpointJson()}
     * @throws JSONException propagated from the superclass or from reading the key
     */
    @Override
    public void fromCheckpointJson(JSONObject jSONObject) throws JSONException {
        super.fromCheckpointJson(jSONObject);
        this.numberOfLinksExtracted.set(jSONObject.getLong("numberOfLinksExtracted"));
    }

    /**
     * Appends a one-line summary of links extracted versus URIs handled to the superclass
     * report.
     *
     * @return the combined report text
     */
    @Override
    public String report() {
        // Method-local buffer: StringBuilder avoids StringBuffer's needless synchronisation.
        StringBuilder text = new StringBuilder();
        text.append(super.report());
        text.append("  ")
                .append(this.numberOfLinksExtracted.get())
                .append(" links from ")
                .append(getURICount())
                .append(" CrawlURIs\n");
        return text.toString();
    }

    /**
     * Resolves {@code charSequence} against the base URI of {@code crawlURI} and adds it as an
     * outlink, unless it is a data URI or the outlink limit {@code i} has been reached.
     *
     * @param crawlURI     the URI the link was found in
     * @param i            maximum number of outlinks allowed on {@code crawlURI}
     * @param charSequence the link text, possibly relative
     * @param linkContext  context in which the link was found
     * @param hop          hop type of the link
     * @return the added outlink, or {@code null} if the link was a data URI or was discarded
     *         because the limit was reached
     * @throws URIException if the link cannot be turned into a valid URI
     */
    public static CrawlURI addRelativeToBase(CrawlURI crawlURI, int i, CharSequence charSequence, LinkContext linkContext, Hop hop) throws URIException {
        if (UriUtils.isDataUri(charSequence)) {
            return null;
        }
        UURI target = UURIFactory.getInstance(crawlURI.getBaseURI(), charSequence.toString());
        return add2(crawlURI, i, target, linkContext, hop);
    }

    /**
     * Resolves {@code str} against the via URI of {@code crawlURI} and adds it as an outlink,
     * unless it is a data URI or the outlink limit {@code i} has been reached.
     *
     * <p>When the URI has no via, its base URI is used instead; the first time this happens
     * for a given URI it is logged and the URI is annotated with {@code usedBaseForVia}.
     *
     * @param crawlURI    the URI the link was found in
     * @param i           maximum number of outlinks allowed on {@code crawlURI}
     * @param str         the link text, possibly relative
     * @param linkContext context in which the link was found
     * @param hop         hop type of the link
     * @return the added outlink, or {@code null} if the link was a data URI or was discarded
     *         because the limit was reached
     * @throws URIException if the link cannot be turned into a valid URI
     */
    public static CrawlURI addRelativeToVia(CrawlURI crawlURI, int i, String str, LinkContext linkContext, Hop hop) throws URIException {
        if (UriUtils.isDataUri(str)) {
            return null;
        }
        UURI via = crawlURI.getVia();
        if (via == null) {
            // The annotation doubles as a "already warned" marker so the log line is emitted
            // only once per URI.
            if (!crawlURI.getAnnotations().contains("usedBaseForVia")) {
                LOGGER.info("no via where expected; using base instead: " + crawlURI);
                crawlURI.getAnnotations().add("usedBaseForVia");
            }
            via = crawlURI.getBaseURI();
        }
        return add2(crawlURI, i, UURIFactory.getInstance(via, str), linkContext, hop);
    }

    /**
     * Adds {@code str}, taken as a complete URI, as an outlink of {@code crawlURI} unless the
     * outlink limit {@code i} has been reached.
     *
     * <p>NOTE(review): unlike the two {@code addRelativeTo...} helpers, this method performs
     * no data-URI check; confirm whether that difference is intentional.
     *
     * @param crawlURI    the URI the link was found in
     * @param i           maximum number of outlinks allowed on {@code crawlURI}
     * @param str         the link as a URI string
     * @param linkContext context in which the link was found
     * @param hop         hop type of the link
     * @throws URIException if the link cannot be turned into a valid URI
     */
    public static void add(CrawlURI crawlURI, int i, String str, LinkContext linkContext, Hop hop) throws URIException {
        add2(crawlURI, i, UURIFactory.getInstance(str), linkContext, hop);
    }

    /**
     * Shared tail of the static add helpers: enforces the outlink limit, then creates and
     * attaches the outlink.
     *
     * @return the added outlink, or {@code null} if the limit was already reached (in which
     *         case the URI's discarded-outlink counter is incremented)
     */
    private static CrawlURI add2(CrawlURI crawlURI, int maxOutlinks, UURI uuri, LinkContext linkContext, Hop hop) throws URIException {
        if (crawlURI.getOutLinks().size() >= maxOutlinks) {
            crawlURI.incrementDiscardedOutLinks();
            return null;
        }
        CrawlURI outlink = crawlURI.createCrawlURI(uuri, linkContext, hop);
        crawlURI.getOutLinks().add(outlink);
        return outlink;
    }
}
