package org.archive.modules.extractor;

import com.itextpdf.text.pdf.PdfReader;
import com.itextpdf.text.pdf.parser.PdfReaderContentParser;
import com.itextpdf.text.pdf.parser.SimpleTextExtractionStrategy;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.archive.modules.CrawlURI;

/* loaded from: input_file:org/archive/modules/extractor/ExtractorPDFContent.class */
/**
 * Content extractor that scans the text of a PDF document for http/https URLs and registers
 * every candidate it finds as an outlink of the crawled URI.
 *
 * <p>Text is extracted one page at a time with iText's {@link SimpleTextExtractionStrategy} and
 * matched against {@link #URLPattern}. Because extracted text is noisy, a single match may
 * produce several candidate URLs (see {@link #innerExtract(CrawlURI)}).
 */
public class ExtractorPDFContent extends ContentExtractor {
    private static final long serialVersionUID = 3L;
    private static final Logger LOGGER = Logger.getLogger(ExtractorPDFContent.class.getName());

    /**
     * Case-insensitive pattern for an http/https URL, optionally preceded by {@code (}.
     *
     * <p>Capture groups used by this class: 1 = scheme, 2 = optional user-info including the
     * trailing {@code @}, 6 = host with optional port, 13 = path and query found on the first
     * line, 19 = optional continuation that starts with a newline (not followed by
     * {@code http://}).
     */
    public static final Pattern URLPattern = Pattern.compile("(?i)\\(?(https?):\\/\\/(([a-z0-9$_\\.\\+!\\*\\'\\(\\),;\\?&=-]|%[0-9a-f]{2})+(:([a-z0-9$_\\.\\+!\\*\\'\\(\\),;\\?&=-]|%[0-9a-f]{2})+)?@)?(?)((([a-z0-9]\\.|[a-z0-9][a-z0-9-]*[a-z0-9]\\.)*[a-z][a-z0-9-]*[a-z0-9]|((\\d|[1-9]\\d|1\\d{2}|2[0-4][0-9]|25[0-5])\\.){3}(\\d|[1-9]\\d|1\\d{2}|2[0-4][0-9]|25[0-5]))(:\\d+)?)(((\\/+([a-z0-9$_\\.\\+!\\*\\'\\(\\),;:@&=-]|%[0-9a-f]{2})*)*(\\?([a-z0-9$_\\.\\+!\\*\\'\\(\\),;:@&=-]|%[0-9a-f]{2})*)?)?)?(\\n(?!http://)((\\/)?([a-z0-9$_\\.\\+!\\*\\'\\(\\),;:@&=-]|%[0-9a-f]{2})*)*(\\?([a-z0-9$_\\.\\+!\\*\\'\\(\\),;:@&=-]|%[0-9a-f]{2})*)?)?");

    /**
     * Returns the largest recorded-input size for which {@link #shouldExtract(CrawlURI)} still
     * accepts a document.
     *
     * @return the value stored under the {@code "maxSizeToParse"} key of the inherited
     *         {@code kp} store
     */
    public long getMaxSizeToParse() {
        return ((Long) this.kp.get("maxSizeToParse")).longValue();
    }

    /**
     * Sets the largest recorded-input size that will be parsed.
     *
     * @param j the new limit, compared against the recorded input's {@code getSize()}
     *          (presumably bytes; confirm against the recorder API)
     */
    public void setMaxSizeToParse(long j) {
        this.kp.put("maxSizeToParse", Long.valueOf(j));
    }

    /** Creates an extractor with a default size limit of 10485760 (10 * 1024 * 1024). */
    public ExtractorPDFContent() {
        setMaxSizeToParse(10485760L);
    }

    /**
     * Extracts URL candidates from every page of the recorded PDF and adds them as outlinks.
     *
     * <p>For each pattern match up to three candidates are added: the match itself (with one
     * pair of wrapping parentheses removed), the same text without a trailing {@code .}, and,
     * when the match continued across a newline, the part that was on the first line only.
     *
     * @param crawlURI the URI whose recorded content is parsed
     * @return {@code true} if the document was processed (even when no links were found);
     *         {@code false} if an {@link IOException} or {@link RuntimeException} occurred, in
     *         which case the exception is recorded in the URI's non-fatal failures
     */
    protected boolean innerExtract(CrawlURI crawlURI) {
        List<String> links = new ArrayList<String>();
        try {
            PdfReader pdfReader = new PdfReader(crawlURI.getRecorder().getContentReplayInputStream());
            try {
                // iText page numbers are 1-based.
                int pageCount = pdfReader.getNumberOfPages();
                for (int page = 1; page <= pageCount; page++) {
                    collectLinks(extractPageText(pdfReader, page), links);
                }
            } finally {
                // Release the reader on every path; it was previously never closed.
                closeQuietly(pdfReader);
            }
            if (links.isEmpty()) {
                return true;
            }
            for (String link : links) {
                addOutlink(crawlURI, link, LinkContext.NAVLINK_MISC, Hop.NAVLINK);
            }
            this.numberOfLinksExtracted.addAndGet(links.size());
            if (LOGGER.isLoggable(Level.FINE)) {
                LOGGER.fine(crawlURI + " has " + links.size() + " links.");
            }
            return true;
        } catch (IOException e) {
            crawlURI.getNonFatalFailures().add(e);
            return false;
        } catch (RuntimeException e2) {
            crawlURI.getNonFatalFailures().add(e2);
            return false;
        }
    }

    /**
     * Appends every URL candidate found in {@code pageText} to {@code links}.
     */
    private static void collectLinks(String pageText, List<String> links) {
        Matcher matcher = URLPattern.matcher(pageText);
        while (matcher.find()) {
            String candidate = matcher.group().trim();

            // Remember the opening parenthesis before stripping it: the alternate URL below
            // needs to know about it, and testing the already-stripped text can never succeed.
            boolean parenthesized = candidate.startsWith("(");
            if (parenthesized) {
                candidate = candidate.substring(1);
                if (candidate.endsWith(")")) {
                    candidate = candidate.substring(0, candidate.length() - 1);
                }
            }
            links.add(candidate);

            // A URL at the end of a sentence picks up the full stop; also offer it without.
            if (candidate.endsWith(".") && candidate.length() > 2) {
                links.add(candidate.substring(0, candidate.length() - 1));
            }

            // The match continued past a newline, which may or may not belong to the URL,
            // so also offer the part that was on the first line only.
            if (matcher.group(19) != null) {
                String userInfo = matcher.group(2) != null ? matcher.group(2) : "";
                String firstLineOnly = matcher.group(1) + "://" + userInfo + matcher.group(6) + matcher.group(13);
                if (parenthesized && firstLineOnly.endsWith(")")) {
                    firstLineOnly = firstLineOnly.substring(0, firstLineOnly.length() - 1);
                }
                links.add(firstLineOnly);
            }
        }
    }

    /**
     * Closes the reader, logging instead of propagating a runtime failure so that a problem
     * during cleanup does not change the outcome of the extraction.
     */
    private static void closeQuietly(PdfReader pdfReader) {
        try {
            pdfReader.close();
        } catch (RuntimeException e) {
            LOGGER.log(Level.FINE, "Failed to close PDF reader", e);
        }
    }

    /**
     * Extracts the text of a single page.
     *
     * @param pdfReader the open reader for the document
     * @param i the page number passed to iText's content parser
     * @return the page text, or an empty string if an {@link IOException} occurred (the
     *         failure is logged at WARNING level)
     */
    public String extractPageText(PdfReader pdfReader, int i) {
        try {
            return new PdfReaderContentParser(pdfReader)
                    .processContent(i, new SimpleTextExtractionStrategy())
                    .getResultantText();
        } catch (IOException e) {
            LOGGER.log(Level.WARNING, "Failed to parse pdf text in " + Thread.currentThread().getName(), e);
            return "";
        }
    }

    /**
     * Decides whether the URI's content should be parsed.
     *
     * @param crawlURI the URI under consideration
     * @return {@code true} when the recorded input size does not exceed
     *         {@link #getMaxSizeToParse()} and the content type is non-null and starts with
     *         {@code application/pdf}
     */
    protected boolean shouldExtract(CrawlURI crawlURI) {
        if (crawlURI.getRecorder().getRecordedInput().getSize() > getMaxSizeToParse()) {
            return false;
        }
        String contentType = crawlURI.getContentType();
        return contentType != null && contentType.startsWith("application/pdf");
    }
}
