package org.archive.modules.extractor;

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import org.apache.commons.codec.DecoderException;
import org.apache.commons.httpclient.URIException;
import org.archive.modules.CrawlURI;
import org.archive.net.UURI;
import org.archive.net.UURIFactory;
import org.archive.url.LaxURLCodec;
import org.archive.util.TextUtils;

/* loaded from: input_file:org/archive/modules/extractor/ExtractorURI.class */
/**
 * Extractor that inspects the URIs of a CrawlURI's existing outlinks and looks
 * inside their query strings for values that are themselves absolute
 * http(s) URIs. Every such embedded URI is added to the CrawlURI as a
 * speculative outlink.
 */
public class ExtractorURI extends Extractor {
    private static final long serialVersionUID = 3;
    private static final Logger LOGGER = Logger.getLogger(ExtractorURI.class.getName());

    /** Matches a complete string that is an absolute http/https URI without whitespace or angle brackets. */
    protected static final String ABS_HTTP_URI_PATTERN = "^https?://[^\\s<>]*$";

    /**
     * This extractor applies to every CrawlURI.
     *
     * @param crawlURI the URI being considered (unused)
     * @return always {@code true}
     */
    @Override // org.archive.modules.Processor
    protected boolean shouldProcess(CrawlURI crawlURI) {
        return true;
    }

    /**
     * Runs {@link #extractLink(CrawlURI, CrawlURI)} on each outlink currently
     * recorded on the given CrawlURI.
     *
     * @param crawlURI the URI whose outlinks are examined and to which new
     *                 speculative outlinks are added
     */
    @Override // org.archive.modules.extractor.Extractor
    public void extract(CrawlURI crawlURI) {
        // NOTE(review): extractLink calls addOutlink, which presumably adds to the
        // same collection returned by getOutLinks() (not visible here - confirm).
        // Iterating over a snapshot guarantees that such additions cannot
        // invalidate the iteration in progress.
        List<CrawlURI> snapshot = new ArrayList<CrawlURI>();
        for (CrawlURI outLink : crawlURI.getOutLinks()) {
            snapshot.add(outLink);
        }
        for (CrawlURI outLink : snapshot) {
            extractLink(crawlURI, outLink);
        }
    }

    /**
     * Examines one outlink for URIs embedded in its query string and adds each
     * one found to {@code crawlURI} as a speculative outlink. An outlink whose
     * URI cannot be parsed, or an embedded candidate that cannot be turned into
     * a UURI, is logged at FINE and skipped.
     *
     * @param crawlURI  the URI receiving any newly discovered outlinks
     * @param crawlURI2 the existing outlink whose URI is inspected
     */
    protected void extractLink(CrawlURI crawlURI, CrawlURI crawlURI2) {
        UURI source;
        try {
            source = UURIFactory.getInstance(crawlURI2.getURI());
        } catch (URIException e) {
            LOGGER.log(Level.FINE, "bad URI", e);
            return;
        }
        if (source == null) {
            return;
        }
        for (String candidate : extractQueryStringLinks(source)) {
            try {
                addOutlink(crawlURI, UURIFactory.getInstance(candidate), LinkContext.SPECULATIVE_MISC, Hop.SPECULATIVE);
                this.numberOfLinksExtracted.incrementAndGet();
            } catch (URIException e) {
                LOGGER.log(Level.FINE, "bad URI", e);
            }
        }
    }

    /**
     * Collects strings from the query part of {@code uuri} that look like
     * absolute http(s) URIs according to {@link #ABS_HTTP_URI_PATTERN}.
     *
     * <p>Two places are checked: the query string as a whole (as returned by
     * {@code getQuery()}), and the decoded value of every {@code key=value}
     * parameter obtained by splitting the raw query on {@code &}. Parameters
     * that do not split into exactly two parts on {@code =} are ignored, as are
     * values that fail to decode.
     *
     * @param uuri the URI whose query string is inspected
     * @return the matching strings in discovery order; empty (never
     *         {@code null}) if there is no query or nothing matches
     */
    protected static List<String> extractQueryStringLinks(UURI uuri) {
        List<String> results = new ArrayList<String>();
        Matcher matcher = null;
        try {
            String query = uuri.getQuery();
            if (query == null) {
                return results;
            }
            // First candidate: the entire query string.
            matcher = TextUtils.getMatcher(ABS_HTTP_URI_PATTERN, query);
            if (matcher.matches()) {
                results.add(query);
            }
            // Further candidates: each individual parameter value.
            for (String param : new String(uuri.getRawQuery()).split("&")) {
                String[] keyValue = param.split("=");
                if (keyValue.length != 2) {
                    continue;
                }
                String candidate;
                try {
                    candidate = LaxURLCodec.DEFAULT.decode(keyValue[1]);
                } catch (DecoderException e) {
                    LOGGER.log(Level.FINE, "undecodable query parameter value", e);
                    continue;
                }
                matcher.reset(candidate);
                if (matcher.matches()) {
                    results.add(candidate);
                }
            }
        } catch (URIException e) {
            // Return whatever was collected before the failure.
            LOGGER.log(Level.FINE, "bad URI", e);
        } finally {
            // Hand the matcher back exactly once, and only after its last use;
            // it must not be touched again once recycled.
            if (matcher != null) {
                TextUtils.recycleMatcher(matcher);
            }
        }
        return results;
    }
}
