package org.archive.modules.extractor;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.httpclient.URIException;
import org.archive.modules.CrawlURI;

/* loaded from: input_file:org/archive/modules/extractor/ExtractorRobotsTxt.class */
/**
 * Extracts {@code Sitemap:} declarations from fetched robots.txt documents and
 * adds each one as an outlink annotated with {@link #ANNOTATION_IS_SITEMAP}.
 *
 * <p>Only URIs that are prerequisites and whose URI string matches
 * {@code http(s)://host/robots.txt} are processed.
 */
public class ExtractorRobotsTxt extends ContentExtractor {
    private static final Logger LOGGER = Logger.getLogger(ExtractorRobotsTxt.class.getName());

    /** Matches a robots.txt located directly under the host root (literal dot). */
    private static final Pattern ROBOTS_PATTERN = Pattern.compile("^https?://[^/]+/robots\\.txt$");

    /** Matches a whole line of the form {@code Sitemap: <value>}, case-insensitively. */
    private static final Pattern SITEMAP_PATTERN = Pattern.compile("(?i)Sitemap:\\s*(.+)$");

    /** Annotation added to every outlink that was discovered via a Sitemap line. */
    public static final String ANNOTATION_IS_SITEMAP = "isSitemap";

    /**
     * Decides whether the given URI should be handed to {@link #innerExtract}.
     *
     * @param crawlURI the URI under consideration
     * @return {@code true} only when the URI is a prerequisite and its URI
     *         string matches the robots.txt pattern
     */
    @Override
    protected boolean shouldExtract(CrawlURI crawlURI) {
        if (!crawlURI.isPrerequisite()) {
            return false;
        }
        boolean isRobotsTxt = ROBOTS_PATTERN.matcher(crawlURI.getURI()).matches();
        LOGGER.finest("Checked prerequisite " + crawlURI + " GOT " + isRobotsTxt);
        return isRobotsTxt;
    }

    /**
     * Reads robots.txt content line by line and collects the values of all
     * {@code Sitemap:} lines.
     *
     * <p>The stream is decoded as UTF-8. It is not closed by this method; the
     * caller retains ownership. If reading fails part-way through, the failure
     * is logged and the sitemaps collected up to that point are returned.
     *
     * @param inputStream the robots.txt content
     * @return the sitemap locations in document order, trimmed of surrounding
     *         whitespace; never {@code null}, possibly empty
     */
    public List<String> parseRobotsTxt(InputStream inputStream) {
        List<String> sitemaps = new ArrayList<>();
        BufferedReader reader =
                new BufferedReader(new InputStreamReader(inputStream, StandardCharsets.UTF_8));
        // The try wraps the whole loop: a read failure must end parsing rather
        // than be retried, otherwise a persistently failing stream never exits.
        try {
            String line;
            while ((line = reader.readLine()) != null) {
                Matcher matcher = SITEMAP_PATTERN.matcher(line);
                if (!matcher.matches()) {
                    continue;
                }
                // (.+) also captures trailing whitespace, and for a line such
                // as "Sitemap:   " it captures only whitespace.
                String location = matcher.group(1).trim();
                if (!location.isEmpty()) {
                    sitemaps.add(location);
                }
            }
        } catch (IOException e) {
            LOGGER.log(Level.WARNING, "Failed while reading robots.txt content", e);
        }
        return sitemaps;
    }

    /**
     * Parses the recorded response body for sitemap declarations and adds each
     * as an outlink of {@code crawlURI}.
     *
     * <p>A sitemap whose URI cannot be built is reported through
     * {@code logUriError} and skipped; the remaining sitemaps are still
     * processed. An {@link IOException} is logged and recorded in the URI's
     * non-fatal failures.
     *
     * @param crawlURI the robots.txt URI whose recorded content is parsed
     * @return {@code true} if at least one sitemap line was found,
     *         {@code false} if none was found or an I/O error occurred
     */
    @Override
    protected boolean innerExtract(CrawlURI crawlURI) {
        // try-with-resources closes the replay stream on every exit path.
        try (InputStream content = crawlURI.getRecorder().getContentReplayInputStream()) {
            List<String> sitemaps = parseRobotsTxt(content);
            LOGGER.finest("Checked " + crawlURI + " GOT " + sitemaps);
            int maxOutlinks = getExtractorParameters().getMaxOutlinks();
            for (String sitemap : sitemaps) {
                try {
                    LOGGER.fine("Found site map: " + sitemap);
                    this.numberOfLinksExtracted.incrementAndGet();
                    CrawlURI outlink = addRelativeToBase(
                            crawlURI, maxOutlinks, sitemap, LinkContext.MANIFEST_MISC, Hop.MANIFEST);
                    // Checked inside the try so that a failed link is never
                    // confused with the outlink of a previous iteration.
                    if (outlink != null) {
                        outlink.getAnnotations().add(ANNOTATION_IS_SITEMAP);
                    }
                } catch (URIException e) {
                    logUriError(e, crawlURI.getUURI(), sitemap);
                }
            }
            return !sitemaps.isEmpty();
        } catch (IOException e) {
            LOGGER.log(Level.WARNING, crawlURI.getURI(), e);
            crawlURI.getNonFatalFailures().add(e);
            return false;
        }
    }
}
