package dk.netarkivet.harvester.harvesting.extractor;

import java.io.IOException;
import java.util.Locale;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import org.apache.commons.httpclient.URIException;
import org.apache.commons.lang.StringEscapeUtils;
import org.archive.crawler.datamodel.CoreAttributeConstants;
import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.extractor.Extractor;
import org.archive.crawler.extractor.Link;
import org.archive.crawler.framework.CrawlController;
import org.archive.io.ReplayCharSequence;
import org.archive.util.DevUtils;
import org.archive.util.TextUtils;
import org.archive.util.UriUtils;

/* loaded from: input_file:dk/netarkivet/harvester/harvesting/extractor/ExtractorJS.class */
public class ExtractorJS extends Extractor implements CoreAttributeConstants {
    private static final long serialVersionUID = -2231962381454717720L;
    static final String JAVASCRIPT_STRING_EXTRACTOR = "(\\\\{0,8}+(?:\"|'))(\\S{0,2083}?)(?:\\1)";
    protected long numberOfCURIsHandled;
    private static final Logger LOGGER = Logger.getLogger(ExtractorJS.class.getName());
    protected static long numberOfLinksExtracted = 0;
    protected static final String[] EXTRACTOR_URI_EXCEPTIONS = {"http://www.google-analytics.com/urchin.js"};

    public ExtractorJS(String str) {
        super(str, "JavaScript extractor. Link extraction on JavaScript files (.js).");
        this.numberOfCURIsHandled = 0L;
    }

    public void extract(CrawlURI crawlURI) {
        String contentType;
        for (String str : EXTRACTOR_URI_EXCEPTIONS) {
            if (crawlURI.toString().equals(str)) {
                return;
            }
        }
        if (isHttpTransactionContentToProcess(crawlURI) && (contentType = crawlURI.getContentType()) != null) {
            if (contentType.indexOf("javascript") >= 0 || contentType.indexOf("jscript") >= 0 || contentType.indexOf("ecmascript") >= 0 || crawlURI.toString().toLowerCase().endsWith(".js") || (crawlURI.getViaContext() != null && crawlURI.getViaContext().toString().toLowerCase().startsWith("script"))) {
                this.numberOfCURIsHandled++;
                ReplayCharSequence replayCharSequence = null;
                try {
                    replayCharSequence = crawlURI.getHttpRecorder().getReplayCharSequence();
                } catch (IOException e) {
                    crawlURI.addLocalizedError(getName(), e, "Failed get of replay char sequence.");
                }
                try {
                    if (replayCharSequence == null) {
                        LOGGER.warning("Failed getting ReplayCharSequence: " + crawlURI.toString());
                        return;
                    }
                    try {
                        numberOfLinksExtracted += considerStrings(crawlURI, replayCharSequence, getController(), true);
                    } catch (StackOverflowError e2) {
                        DevUtils.warnHandle(e2, "ExtractorJS StackOverflowError");
                    }
                    crawlURI.linkExtractorFinished();
                    if (replayCharSequence != null) {
                        try {
                            replayCharSequence.close();
                        } catch (IOException e3) {
                            LOGGER.warning(TextUtils.exceptionToString("Failed close of ReplayCharSequence.", e3));
                        }
                    }
                } catch (Throwable th) {
                    if (replayCharSequence != null) {
                        try {
                            replayCharSequence.close();
                        } catch (IOException e4) {
                            LOGGER.warning(TextUtils.exceptionToString("Failed close of ReplayCharSequence.", e4));
                        }
                    }
                    throw th;
                }
            }
        }
    }

    public static long considerStrings(CrawlURI crawlURI, CharSequence charSequence, CrawlController crawlController, boolean z) {
        long j = 0;
        Matcher matcher = TextUtils.getMatcher(JAVASCRIPT_STRING_EXTRACTOR, charSequence);
        while (matcher.find()) {
            CharSequence subSequence = charSequence.subSequence(matcher.start(2), matcher.end(2));
            if (UriUtils.isLikelyUriJavascriptContextLegacy(subSequence)) {
                String speculativeFixup = UriUtils.speculativeFixup(StringEscapeUtils.unescapeJavaScript(subSequence.toString()), crawlURI.getUURI());
                j++;
                if (z) {
                    try {
                        crawlURI.createAndAddLinkRelativeToVia(speculativeFixup, Link.JS_MISC, 'X');
                    } catch (URIException e) {
                        if (crawlController != null) {
                            crawlController.logUriError(e, crawlURI.getUURI(), speculativeFixup);
                        } else {
                            LOGGER.info(crawlURI + ", " + speculativeFixup + ": " + e.getMessage());
                        }
                    }
                } else {
                    crawlURI.createAndAddLinkRelativeToBase(speculativeFixup, Link.JS_MISC, 'X');
                }
            } else {
                j += considerStrings(crawlURI, subSequence, crawlController, z);
            }
        }
        TextUtils.recycleMatcher(matcher);
        return j;
    }

    public String report() {
        StringBuffer stringBuffer = new StringBuffer();
        stringBuffer.append("Processor: dk.netarkivet.harvester.harvesting.extractor.ExtractorJS\n");
        stringBuffer.append("  Function:          Link extraction on JavaScript code\n");
        stringBuffer.append("  CrawlURIs handled: " + this.numberOfCURIsHandled + "\n");
        stringBuffer.append("  Links extracted:   " + numberOfLinksExtracted + "\n\n");
        return stringBuffer.toString();
    }
}
