package org.archive.modules.extractor;

import crawlercommons.sitemaps.AbstractSiteMap;
import crawlercommons.sitemaps.SiteMap;
import crawlercommons.sitemaps.SiteMapIndex;
import crawlercommons.sitemaps.SiteMapParser;
import crawlercommons.sitemaps.SiteMapURL;
import crawlercommons.sitemaps.UnknownFormatException;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.util.Date;
import java.util.Locale;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Pattern;
import org.apache.commons.httpclient.URIException;
import org.apache.commons.io.IOUtils;
import org.archive.modules.CrawlURI;

/* loaded from: input_file:org/archive/modules/extractor/ExtractorSitemap.class */
/**
 * Extractor that parses sitemap and sitemap-index documents and records every
 * URL they list as an outlink of the fetched URI.
 *
 * <p>A URI is treated as a sitemap when any of the following holds:
 * <ul>
 *   <li>it carries the {@link ExtractorRobotsTxt#ANNOTATION_IS_SITEMAP}
 *       annotation and was fetched with a 2XX status;</li>
 *   <li>a URL pattern has been configured and the whole URI matches it;</li>
 *   <li>its content type starts with {@code text/xml} or
 *       {@code application/xml} and the beginning of the body looks like an XML
 *       document containing a {@code <urlset} or {@code <sitemapindex}
 *       element.</li>
 * </ul>
 */
public class ExtractorSitemap extends ContentExtractor {
    private static final Logger LOGGER = Logger.getLogger(ExtractorSitemap.class.getName());

    /** Number of leading characters of the body inspected when sniffing content. */
    private static final int SNIFF_PREFIX_LENGTH = 400;

    /** Size (50 * 1024 * 1024 bytes) above which a warning is logged; parsing still proceeds. */
    private static final int OVERSIZE_WARNING_BYTES = 52428800;

    /**
     * Limit handed to {@code addRelativeToBase} for each recorded link.
     * NOTE(review): presumably the maximum number of outlinks per URI - confirm
     * against the inherited helper.
     */
    private static final int MAX_OUTLINKS = 50000;

    /** Matches a prefix that starts with an XML declaration, optionally preceded by a byte-order mark. */
    private static final Pattern XML_PREAMBLE = Pattern.compile("(?is)[\\ufeff]?<\\?xml\\s.*");

    /** Matches a prefix that contains the opening of a sitemap or sitemap-index root element. */
    private static final Pattern SITEMAP_ROOT = Pattern.compile("(?is).*(?:<urlset|<sitemapindex[>\\s]).*");

    /** Optional regular expression; a URI that fully matches it is always extracted. */
    private String urlPattern = null;

    /** When {@code true}, the sitemap parser is created with its first (strictness) argument disabled. */
    private boolean enableLenientExtraction = false;

    /**
     * Decides whether the given URI should be parsed as a sitemap.
     *
     * @param crawlURI the fetched URI under consideration
     * @return {@code true} if the URI is declared as, configured as, or sniffed
     *         to be a sitemap; {@code false} otherwise
     */
    @Override
    protected boolean shouldExtract(CrawlURI crawlURI) {
        // 1. Explicitly declared as a sitemap (annotation added elsewhere).
        if (crawlURI.getAnnotations().contains(ExtractorRobotsTxt.ANNOTATION_IS_SITEMAP)) {
            if (crawlURI.is2XXSuccess()) {
                LOGGER.fine("This url (" + crawlURI + ") is declared to be a sitemap (via robots.txt) and is a HTTP 200.");
                return true;
            }
            // Not a success: fall through so the remaining checks still apply.
            LOGGER.fine("This url (" + crawlURI + ") is declared to be a sitemap (via robots.txt) but is a HTTP " + crawlURI.getFetchStatus() + ".");
        }

        // 2. Operator-configured pattern; the whole URI must match.
        if (this.urlPattern != null && crawlURI.getURI().matches(this.urlPattern)) {
            return true;
        }

        // 3. Content sniffing, restricted to XML content types.
        String contentType = crawlURI.getContentType();
        if (contentType == null) {
            return false;
        }
        // Locale.ROOT keeps the comparison independent of the default locale
        // (e.g. Turkish lower-casing of 'I' would break "application/xml").
        String normalizedType = contentType.toLowerCase(Locale.ROOT);
        if (!normalizedType.startsWith("text/xml") && !normalizedType.startsWith("application/xml")) {
            return false;
        }

        String prefix = crawlURI.getRecorder().getContentReplayPrefixString(SNIFF_PREFIX_LENGTH);
        if (!XML_PREAMBLE.matcher(prefix).matches() || !SITEMAP_ROOT.matcher(prefix).matches()) {
            return false;
        }
        LOGGER.info("Based on content sniffing, this is a sitemap: " + crawlURI);
        return true;
    }

    /**
     * Parses the body of the URI as a sitemap and records each listed URL as an
     * outlink. Entries of a sitemap index are additionally annotated as sitemaps.
     *
     * @param crawlURI the fetched URI whose recorded content is parsed
     * @return always {@code false}
     */
    @Override
    protected boolean innerExtract(CrawlURI crawlURI) {
        AbstractSiteMap parsed = parseSiteMap(crawlURI);
        if (parsed == null) {
            return false;
        }
        // Dispatch on the runtime type so that neither cast can fail.
        if (parsed instanceof SiteMapIndex) {
            for (AbstractSiteMap child : ((SiteMapIndex) parsed).getSitemaps()) {
                if (child != null) {
                    recordOutlink(crawlURI, child.getUrl(), child.getLastModified(), true);
                }
            }
        } else if (parsed instanceof SiteMap) {
            for (SiteMapURL entry : ((SiteMap) parsed).getSiteMapUrls()) {
                if (entry != null) {
                    recordOutlink(crawlURI, entry.getUrl(), entry.getLastModified(), false);
                }
            }
        }
        return false;
    }

    /**
     * Reads the full recorded body of the URI and hands it to the sitemap parser.
     *
     * @param crawlURI the fetched URI whose recorded content is parsed
     * @return the parsed sitemap or sitemap index, or {@code null} if reading or
     *         parsing failed (the failure is logged at WARNING level)
     */
    private AbstractSiteMap parseSiteMap(CrawlURI crawlURI) {
        // First argument is the inverse of the lenient flag, second is always true.
        // NOTE(review): see the crawler-commons SiteMapParser constructor for the
        // exact meaning of the two flags.
        SiteMapParser parser = new SiteMapParser(!isEnableLenientExtraction(), true);

        // try-with-resources: the replay stream is closed even when parsing fails.
        try (InputStream replay = crawlURI.getRecorder().getContentReplayInputStream()) {
            byte[] content = IOUtils.toByteArray(replay);
            if (content.length > OVERSIZE_WARNING_BYTES) {
                // Only a warning: the oversized document is still parsed.
                LOGGER.warning("Found sitemap exceeding 50MB " + crawlURI + " " + content.length);
            }
            return parser.parseSiteMap(content, new URL(crawlURI.getURI()));
        } catch (IOException e) {
            LOGGER.log(Level.WARNING, "I/O Exception when parsing sitemap " + crawlURI, e);
        } catch (UnknownFormatException e) {
            LOGGER.log(Level.WARNING, "UnknownFormatException when parsing sitemap " + crawlURI, e);
        }
        return null;
    }

    /**
     * Adds one URL found in a sitemap as an outlink of the source URI and bumps
     * the extracted-links counter.
     *
     * @param crawlURI     the sitemap URI the link was found in
     * @param url          the link target; ignored when {@code null}
     * @param lastModified last-modified date reported for the entry (used for logging only)
     * @param isSitemap    whether the target is itself a sitemap and should be annotated as such
     */
    private void recordOutlink(CrawlURI crawlURI, URL url, Date lastModified, boolean isSitemap) {
        if (url == null) {
            // Nothing to record; also avoids a NullPointerException on url.toString().
            return;
        }
        String target = url.toString();
        try {
            CrawlURI outlink = addRelativeToBase(crawlURI, MAX_OUTLINKS, target, LinkContext.MANIFEST_MISC, Hop.MANIFEST);
            if (outlink == null) {
                return;
            }
            if (isSitemap) {
                outlink.getAnnotations().add(ExtractorRobotsTxt.ANNOTATION_IS_SITEMAP);
            }
            // Called once per link, so skip building the message unless it will be emitted.
            if (LOGGER.isLoggable(Level.FINE)) {
                LOGGER.fine("Found " + url + " from " + crawlURI + " Dated " + lastModified + " and with isSitemap = " + isSitemap);
            }
            this.numberOfLinksExtracted.incrementAndGet();
        } catch (URIException e) {
            logUriError(e, crawlURI.getUURI(), target);
        }
    }

    /**
     * @return the configured URL regular expression, or {@code null} if none is set
     */
    public String getUrlPattern() {
        return this.urlPattern;
    }

    /**
     * @param urlPattern regular expression that a whole URI must match to be
     *                   treated as a sitemap; {@code null} disables the check
     */
    public void setUrlPattern(String urlPattern) {
        this.urlPattern = urlPattern;
    }

    /**
     * @return whether lenient extraction is enabled
     */
    public boolean isEnableLenientExtraction() {
        return this.enableLenientExtraction;
    }

    /**
     * @param enableLenientExtraction {@code true} to create the sitemap parser
     *                                with its first (strictness) argument disabled
     */
    public void setEnableLenientExtraction(boolean enableLenientExtraction) {
        this.enableLenientExtraction = enableLenientExtraction;
    }
}
