package org.archive.modules.extractor;

import com.google.gson.stream.JsonReader;
import com.google.gson.stream.JsonToken;
import java.io.EOFException;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.RandomAccessFile;
import java.io.Reader;
import java.io.UnsupportedEncodingException;
import java.net.URI;
import java.nio.channels.Channels;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.commons.httpclient.URIException;
import org.archive.crawler.reporting.CrawlerLoggerModule;
import org.archive.format.warc.WARCConstants;
import org.archive.io.warc.WARCRecordInfo;
import org.archive.modules.CrawlURI;
import org.archive.modules.warc.BaseWARCRecordBuilder;
import org.archive.modules.warc.WARCRecordBuilder;
import org.archive.net.UURIFactory;
import org.archive.util.ArchiveUtils;
import org.archive.util.MimetypeUtils;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.context.Lifecycle;

/* loaded from: input_file:org/archive/modules/extractor/ExtractorYoutubeDL.class */
public class ExtractorYoutubeDL extends Extractor implements Lifecycle, WARCRecordBuilder {
    /** Logger for operational messages emitted by this extractor. */
    private static final Logger logger = Logger.getLogger(ExtractorYoutubeDL.class.getName());

    /** CrawlURI data key under which a video outlink carries the content digest of the page it was found on. */
    protected static final String YDL_CONTAINING_PAGE_DIGEST = "ydl-containing-page-digest";

    /** CrawlURI data key under which a video outlink carries the 17-digit fetch timestamp of the page it was found on. */
    protected static final String YDL_CONTAINING_PAGE_TIMESTAMP = "ydl-containing-page-timestamp";

    /** CrawlURI data key under which a video outlink carries the URI of the page it was found on. */
    protected static final String YDL_CONTAINING_PAGE_URI = "ydl-containing-page-uri";

    /** Upper bound on playlist entries; equals the {@code --playlist-end} value passed to youtube-dl. */
    protected static final int MAX_VIDEOS_PER_PAGE = 1000;

    /** Dedicated log for youtube-dl results; assigned in {@link #start()}. */
    protected transient Logger ydlLogger = null;

    /**
     * Per-thread scratch file that receives a copy of the raw youtube-dl JSON
     * output. The backing temp file is unlinked right after it is opened so
     * that it does not outlive the process; when the platform refuses to
     * delete an open file, deletion is deferred to JVM exit instead.
     */
    protected transient ThreadLocal<RandomAccessFile> tempfile = new ThreadLocal<RandomAccessFile>() {
        @Override
        protected RandomAccessFile initialValue() {
            File backingFile = null;
            try {
                backingFile = File.createTempFile("ydl", ".json");
                RandomAccessFile scratch = new RandomAccessFile(backingFile, "rw");
                if (!backingFile.delete()) {
                    // e.g. platforms that cannot delete an open file
                    backingFile.deleteOnExit();
                }
                return scratch;
            } catch (IOException e) {
                // do not leave an orphaned temp file behind if opening it failed
                if (backingFile != null && !backingFile.delete()) {
                    backingFile.deleteOnExit();
                }
                throw new RuntimeException("could not create youtube-dl scratch file", e);
            }
        }
    };

    /** Source of the simple log assigned to {@link #ydlLogger}; injected through its setter. */
    protected CrawlerLoggerModule crawlerLoggerModule;

    /* JADX INFO: Access modifiers changed from: package-private */
    /* renamed from: org.archive.modules.extractor.ExtractorYoutubeDL$3, reason: invalid class name */
    /* loaded from: input_file:org/archive/modules/extractor/ExtractorYoutubeDL$3.class */
    /**
     * Compiler-generated lookup table for a {@code switch} over {@link JsonToken}.
     * The array is indexed by {@code JsonToken.ordinal()} and holds the case
     * label (1..10) that streamYdlOutput switches on; entries left at 0 fall
     * into that switch's {@code default} branch.
     */
    public static /* synthetic */ class AnonymousClass3 {
        static final /* synthetic */ int[] $SwitchMap$com$google$gson$stream$JsonToken = new int[JsonToken.values().length];

        static {
            // Each constant is mapped in its own try block so that a constant
            // missing at runtime (NoSuchFieldError) only leaves its own slot unmapped.
            try {
                $SwitchMap$com$google$gson$stream$JsonToken[JsonToken.BEGIN_ARRAY.ordinal()] = 1;
            } catch (NoSuchFieldError e) {
            }
            try {
                $SwitchMap$com$google$gson$stream$JsonToken[JsonToken.BEGIN_OBJECT.ordinal()] = 2;
            } catch (NoSuchFieldError e2) {
            }
            try {
                $SwitchMap$com$google$gson$stream$JsonToken[JsonToken.BOOLEAN.ordinal()] = 3;
            } catch (NoSuchFieldError e3) {
            }
            try {
                $SwitchMap$com$google$gson$stream$JsonToken[JsonToken.END_ARRAY.ordinal()] = 4;
            } catch (NoSuchFieldError e4) {
            }
            try {
                $SwitchMap$com$google$gson$stream$JsonToken[JsonToken.END_DOCUMENT.ordinal()] = 5;
            } catch (NoSuchFieldError e5) {
            }
            try {
                $SwitchMap$com$google$gson$stream$JsonToken[JsonToken.END_OBJECT.ordinal()] = 6;
            } catch (NoSuchFieldError e6) {
            }
            try {
                $SwitchMap$com$google$gson$stream$JsonToken[JsonToken.NAME.ordinal()] = 7;
            } catch (NoSuchFieldError e7) {
            }
            try {
                $SwitchMap$com$google$gson$stream$JsonToken[JsonToken.NULL.ordinal()] = 8;
            } catch (NoSuchFieldError e8) {
            }
            try {
                $SwitchMap$com$google$gson$stream$JsonToken[JsonToken.NUMBER.ordinal()] = 9;
            } catch (NoSuchFieldError e9) {
            }
            try {
                $SwitchMap$com$google$gson$stream$JsonToken[JsonToken.STRING.ordinal()] = 10;
            } catch (NoSuchFieldError e10) {
            }
        }
    }

    /* JADX INFO: Access modifiers changed from: package-private */
    /* loaded from: input_file:org/archive/modules/extractor/ExtractorYoutubeDL$TeedInputStream.class */
    /**
     * Input stream that copies every byte it reads from a wrapped stream
     * into a {@link RandomAccessFile}, so the data can be replayed later.
     */
    public static class TeedInputStream extends InputStream {
        private final InputStream in;
        private final RandomAccessFile out;

        /**
         * @param inputStream stream to read from
         * @param randomAccessFile file that receives a copy of everything read
         */
        public TeedInputStream(InputStream inputStream, RandomAccessFile randomAccessFile) {
            this.in = inputStream;
            this.out = randomAccessFile;
        }

        /** Reads one byte and, unless end of stream was reached, writes it to the file. */
        @Override
        public int read() throws IOException {
            int value = in.read();
            if (value >= 0) {
                out.write(value);
            }
            return value;
        }

        /**
         * Reads up to {@code i2} bytes into {@code bArr} starting at offset
         * {@code i} and writes the bytes actually read to the file.
         *
         * @return number of bytes read, or a negative value at end of stream
         */
        @Override
        public int read(byte[] bArr, int i, int i2) throws IOException {
            int count = in.read(bArr, i, i2);
            if (count > 0) {
                out.write(bArr, i, count);
            }
            return count;
        }

        /**
         * Closes the wrapped input stream. The target file is deliberately
         * left open: it is owned by whoever supplied it and is read back
         * after the copy is finished.
         */
        @Override
        public void close() throws IOException {
            in.close();
        }
    }

    /* JADX INFO: Access modifiers changed from: protected */
    /* loaded from: input_file:org/archive/modules/extractor/ExtractorYoutubeDL$YoutubeDLResults.class */
    /**
     * Holder for the outcome of one youtube-dl run: the file that receives
     * the raw JSON plus the URLs picked out of that JSON.
     */
    public static class YoutubeDLResults {
        /** File receiving the raw youtube-dl JSON output. */
        RandomAccessFile jsonFile;

        /** Values found at the JSON paths {@code $.url} and {@code $.entries[N].url}. */
        List<String> videoUrls = new ArrayList<>();

        /** Values found at the JSON paths {@code $.webpage_url} and {@code $.entries[N].webpage_url}. */
        List<String> pageUrls = new ArrayList<>();

        /**
         * Truncates the given file to zero length and rewinds it, so that
         * content left over from a previous use cannot leak into this result.
         *
         * @param randomAccessFile file that will receive the JSON output
         * @throws RuntimeException if the file cannot be truncated or rewound
         */
        public YoutubeDLResults(RandomAccessFile randomAccessFile) {
            this.jsonFile = randomAccessFile;
            try {
                this.jsonFile.setLength(0L);
                this.jsonFile.seek(0L);
            } catch (IOException e) {
                throw new RuntimeException("could not reset youtube-dl json file", e);
            }
        }
    }

    /**
     * @return the logger module currently configured on this extractor
     */
    public CrawlerLoggerModule getCrawlerLoggerModule() {
        return crawlerLoggerModule;
    }

    /**
     * Stores the logger module that {@link #start()} uses to create the
     * youtube-dl log. Marked {@code @Autowired} for injection.
     *
     * @param crawlerLoggerModule the module to use
     */
    @Autowired
    public void setCrawlerLoggerModule(CrawlerLoggerModule crawlerLoggerModule) {
        this.crawlerLoggerModule = crawlerLoggerModule;
    }

    /**
     * Starts this extractor. When it is not already running, the crawler
     * logger module is started first and the youtube-dl log is created
     * under this bean's name; the superclass start always runs afterwards.
     */
    @Override
    public void start() {
        if (isRunning) {
            super.start();
            return;
        }
        getCrawlerLoggerModule().start();
        ydlLogger = getCrawlerLoggerModule().setupSimpleLog(getBeanName());
        super.start();
    }

    /**
     * Drains a reader into a string.
     *
     * @param reader source to read until end of stream
     * @return everything the reader produced, in order
     * @throws IOException if reading fails
     */
    protected String readToEnd(Reader reader) throws IOException {
        final StringBuilder collected = new StringBuilder();
        final char[] chunk = new char[4096];
        for (int n = reader.read(chunk); n >= 0; n = reader.read(chunk)) {
            collected.append(chunk, 0, n);
        }
        return collected.toString();
    }

    /**
     * Processes one URI.
     * <p>
     * A URI that already carries a {@code youtube-dl:} annotation is not fed
     * to youtube-dl again: for a 3xx fetch status the annotation and
     * containing-page data are passed on to its redirect outlinks, otherwise
     * the capture is written to the youtube-dl log.
     * <p>
     * Any other URI is handed to youtube-dl. Every video URL found becomes an
     * embed outlink, every page URL a navlink outlink, and if at least one
     * video was found the URI is annotated with {@code youtube-dl:<count>}
     * and logged as a containing page.
     *
     * @param crawlURI the URI being processed
     */
    protected void extract(CrawlURI crawlURI) {
        String findYdlAnnotation = findYdlAnnotation(crawlURI);
        if (findYdlAnnotation != null) {
            int fetchStatus = crawlURI.getFetchStatus();
            if (fetchStatus >= 300 && fetchStatus < 400) {
                doRedirectInheritance(crawlURI, findYdlAnnotation);
            } else {
                logCapturedVideo(crawlURI, findYdlAnnotation);
            }
            return;
        }

        YoutubeDLResults runYoutubeDL = runYoutubeDL(crawlURI);
        if (runYoutubeDL == null) {
            // runYoutubeDL returns null (after logging) when youtube-dl could
            // not be started or its output could not be read
            return;
        }

        int videoCount = runYoutubeDL.videoUrls.size();
        for (int i = 0; i < videoCount; i++) {
            addVideoOutlink(crawlURI, runYoutubeDL.videoUrls.get(i), i, videoCount);
        }
        for (String str : runYoutubeDL.pageUrls) {
            try {
                crawlURI.getOutLinks().add(crawlURI.createCrawlURI(UURIFactory.getInstance(crawlURI.getUURI(), str), LinkContext.NAVLINK_MISC, Hop.NAVLINK));
            } catch (URIException e) {
                logUriError(e, crawlURI.getUURI(), str);
            }
        }
        if (videoCount > 0) {
            String str2 = "youtube-dl:" + videoCount;
            crawlURI.getAnnotations().add(str2);
            logContainingPage(crawlURI, str2);
        }
    }

    /**
     * Adds one video URL as an embed outlink of the containing page. The
     * outlink is annotated {@code youtube-dl:<i+1>/<i2>} and carries the
     * containing page's URI, 17-digit fetch timestamp and content digest in
     * its data map. A URL that cannot be parsed is reported through
     * {@code logUriError} and skipped.
     *
     * @param crawlURI the containing page
     * @param str the video URL, resolved against the containing page's URI
     * @param i zero-based index of this video
     * @param i2 total number of videos on the page
     */
    protected void addVideoOutlink(CrawlURI crawlURI, String str, int i, int i2) {
        try {
            CrawlURI videoLink = crawlURI.createCrawlURI(
                    UURIFactory.getInstance(crawlURI.getUURI(), str),
                    LinkContext.EMBED_MISC,
                    Hop.EMBED);

            String annotation = "youtube-dl:" + (i + 1) + "/" + i2;
            videoLink.getAnnotations().add(annotation);

            // remember where the video came from
            videoLink.getData().put(YDL_CONTAINING_PAGE_URI, crawlURI.toString());
            videoLink.getData().put(YDL_CONTAINING_PAGE_TIMESTAMP,
                    ArchiveUtils.get17DigitDate(crawlURI.getFetchBeginTime()));
            videoLink.getData().put(YDL_CONTAINING_PAGE_DIGEST,
                    crawlURI.getContentDigestSchemeString());

            crawlURI.getOutLinks().add(videoLink);
        } catch (URIException badUri) {
            logUriError(badUri, crawlURI.getUURI(), str);
        }
    }

    /**
     * Looks for an annotation beginning with {@code youtube-dl:}.
     *
     * @param crawlURI the URI whose annotations are searched
     * @return the first matching annotation in iteration order, or
     *         {@code null} if there is none
     */
    protected String findYdlAnnotation(CrawlURI crawlURI) {
        return crawlURI.getAnnotations().stream()
                .filter(annotation -> annotation.startsWith("youtube-dl:"))
                .findFirst()
                .orElse(null);
    }

    /**
     * Writes one space-separated line describing a captured video to the
     * youtube-dl log: fetch status, size, truncated content type, digest,
     * 17-digit fetch time, URI, annotation, the three containing-page data
     * values and the source tag ({@code -} when no "source" data key exists).
     * <p>
     * The size is the content length for an HTTP transaction that reports a
     * non-negative one, otherwise the content size when positive, otherwise
     * {@code -}.
     *
     * @param crawlURI the captured video URI
     * @param str the {@code youtube-dl:} annotation of that URI
     */
    protected void logCapturedVideo(CrawlURI crawlURI, String str) {
        String size;
        if (crawlURI.isHttpTransaction() && crawlURI.getContentLength() >= 0) {
            size = Long.toString(crawlURI.getContentLength());
        } else if (crawlURI.getContentSize() > 0) {
            size = Long.toString(crawlURI.getContentSize());
        } else {
            size = "-";
        }

        StringBuilder line = new StringBuilder();
        line.append(crawlURI.getFetchStatus()).append(" ");
        line.append(size).append(" ");
        line.append(MimetypeUtils.truncate(crawlURI.getContentType())).append(" ");
        line.append(crawlURI.getContentDigestSchemeString()).append(" ");
        line.append(ArchiveUtils.get17DigitDate(crawlURI.getFetchBeginTime())).append(" ");
        line.append(crawlURI).append(" ");
        line.append(str).append(" ");
        line.append(crawlURI.getData().get(YDL_CONTAINING_PAGE_DIGEST)).append(" ");
        line.append(crawlURI.getData().get(YDL_CONTAINING_PAGE_TIMESTAMP)).append(" ");
        line.append(crawlURI.getData().get(YDL_CONTAINING_PAGE_URI)).append(" ");
        if (crawlURI.containsDataKey("source")) {
            line.append(crawlURI.getSourceTag());
        } else {
            line.append("-");
        }
        this.ydlLogger.info(line.toString());
    }

    /**
     * Writes one line for a page on which videos were found. The first six
     * columns are {@code -} placeholders, followed by the annotation, the
     * page's digest, 17-digit fetch time, URI and source tag ({@code -} when
     * no "source" data key exists).
     *
     * @param crawlURI the containing page
     * @param str the {@code youtube-dl:} annotation added to that page
     */
    protected void logContainingPage(CrawlURI crawlURI, String str) {
        StringBuilder line = new StringBuilder("- - - - - - ");
        line.append(str).append(" ");
        line.append(crawlURI.getContentDigestSchemeString()).append(" ");
        line.append(ArchiveUtils.get17DigitDate(crawlURI.getFetchBeginTime())).append(" ");
        line.append(crawlURI).append(" ");
        if (crawlURI.containsDataKey("source")) {
            line.append(crawlURI.getSourceTag());
        } else {
            line.append("-");
        }
        this.ydlLogger.info(line.toString());
    }

    /**
     * Copies the youtube-dl annotation and the three containing-page data
     * values from a URI onto each of its outlinks whose last hop is
     * {@code "R"}; other outlinks are left untouched.
     *
     * @param crawlURI the URI whose outlinks are examined
     * @param str the {@code youtube-dl:} annotation to pass on
     */
    protected void doRedirectInheritance(CrawlURI crawlURI, String str) {
        for (CrawlURI outlink : crawlURI.getOutLinks()) {
            if (!"R".equals(outlink.getLastHop())) {
                continue;
            }
            outlink.getAnnotations().add(str);
            outlink.getData().put(YDL_CONTAINING_PAGE_URI,
                    crawlURI.getData().get(YDL_CONTAINING_PAGE_URI));
            outlink.getData().put(YDL_CONTAINING_PAGE_TIMESTAMP,
                    crawlURI.getData().get(YDL_CONTAINING_PAGE_TIMESTAMP));
            outlink.getData().put(YDL_CONTAINING_PAGE_DIGEST,
                    crawlURI.getData().get(YDL_CONTAINING_PAGE_DIGEST));
        }
    }

    /** JSON path of a playlist entry's media URL, e.g. {@code $.entries[3].url}. */
    private static final java.util.regex.Pattern ENTRY_VIDEO_URL_PATH =
            java.util.regex.Pattern.compile("^\\$\\.entries\\[\\d+\\]\\.url$");

    /** JSON path of a playlist entry's page URL, e.g. {@code $.entries[3].webpage_url}. */
    private static final java.util.regex.Pattern ENTRY_PAGE_URL_PATH =
            java.util.regex.Pattern.compile("^\\$\\.entries\\[\\d+\\]\\.webpage_url$");

    /**
     * Streams youtube-dl's JSON output token by token, without building the
     * whole document in memory. Everything read is also copied into
     * {@code youtubeDLResults.jsonFile}. String values at {@code $.url} or
     * {@code $.entries[N].url} are collected as video URLs; string values at
     * {@code $.webpage_url} or {@code $.entries[N].webpage_url} as page URLs.
     * All other tokens are consumed and ignored.
     *
     * @param inputStream raw JSON as produced by youtube-dl
     * @param youtubeDLResults receives the JSON copy and the collected URLs
     * @throws IOException if reading or parsing fails
     * @throws RuntimeException if the reader reports a token type not handled here
     */
    protected void streamYdlOutput(InputStream inputStream, YoutubeDLResults youtubeDLResults) throws IOException {
        try (JsonReader jsonReader = new JsonReader(new InputStreamReader(
                new TeedInputStream(inputStream, youtubeDLResults.jsonFile), "UTF-8"))) {
            while (true) {
                JsonToken peek = jsonReader.peek();
                switch (peek) {
                    case BEGIN_ARRAY:
                        jsonReader.beginArray();
                        break;
                    case BEGIN_OBJECT:
                        jsonReader.beginObject();
                        break;
                    case BOOLEAN:
                        jsonReader.nextBoolean();
                        break;
                    case END_ARRAY:
                        jsonReader.endArray();
                        break;
                    case END_DOCUMENT:
                        return;
                    case END_OBJECT:
                        jsonReader.endObject();
                        break;
                    case NAME:
                        jsonReader.nextName();
                        break;
                    case NULL:
                        jsonReader.nextNull();
                        break;
                    case NUMBER:
                        // the numeric value is not needed; just consume the token
                        jsonReader.nextString();
                        break;
                    case STRING:
                        String nextString = jsonReader.nextString();
                        String path = jsonReader.getPath();
                        if ("$.url".equals(path) || ENTRY_VIDEO_URL_PATH.matcher(path).matches()) {
                            youtubeDLResults.videoUrls.add(nextString);
                        } else if ("$.webpage_url".equals(path) || ENTRY_PAGE_URL_PATH.matcher(path).matches()) {
                            youtubeDLResults.pageUrls.add(nextString);
                        }
                        break;
                    default:
                        throw new RuntimeException("unexpected json token " + peek);
                }
            }
        }
    }

    /**
     * Runs {@code youtube-dl} in simulate mode against the given URI and
     * parses the JSON it prints on standard output. Standard error is
     * drained on a separate thread so the child cannot block on a full
     * pipe; its content is only logged when the JSON ends prematurely.
     *
     * @param crawlURI the page to inspect
     * @return the parsed results (possibly partial if the JSON ended
     *         prematurely), or {@code null} if youtube-dl could not be
     *         started or its output could not be read
     */
    protected YoutubeDLResults runYoutubeDL(CrawlURI crawlURI) {
        ProcessBuilder processBuilder = new ProcessBuilder("youtube-dl", "--ignore-config", "--simulate", "--dump-single-json", "--format=best[height <=? 576]", "--playlist-end=" + MAX_VIDEOS_PER_PAGE, crawlURI.toString());
        logger.info("running: " + String.join(" ", processBuilder.command()));

        final Process proc;
        try {
            proc = processBuilder.start();
        } catch (IOException e) {
            logger.log(Level.WARNING, "youtube-dl failed " + processBuilder.command(), e);
            return null;
        }

        final Reader stderr;
        try {
            stderr = new InputStreamReader(proc.getErrorStream(), "UTF-8");
        } catch (UnsupportedEncodingException e) {
            proc.destroyForcibly();
            throw new RuntimeException(e);
        }

        ExecutorService stderrPool = Executors.newSingleThreadExecutor();
        try {
            Callable<String> drainStderr = () -> readToEnd(stderr);
            Future<String> stderrFuture = stderrPool.submit(drainStderr);

            YoutubeDLResults results = new YoutubeDLResults(this.tempfile.get());
            try {
                streamYdlOutput(proc.getInputStream(), results);
            } catch (EOFException e) {
                // truncated or empty output: keep whatever was parsed and
                // record what youtube-dl wrote to stderr
                logger.log(Level.FINE, "problem parsing json from youtube-dl " + processBuilder.command() + " " + awaitStderr(stderrFuture));
            }
            return results;
        } catch (IOException e) {
            logger.log(Level.WARNING, "problem reading output from youtube-dl " + processBuilder.command(), e);
            return null;
        } finally {
            reapProcess(proc);
            stderrPool.shutdown();
        }
    }

    /** Waits for the stderr text, converting failures of the reader task into IOException. */
    private static String awaitStderr(Future<String> stderrFuture) throws IOException {
        try {
            return stderrFuture.get();
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            throw new IOException(e);
        } catch (ExecutionException e) {
            throw new IOException(e);
        }
    }

    /** Gives the child one second to exit and kills it if it is still running or the wait is interrupted. */
    private static void reapProcess(Process proc) {
        try {
            // waitFor(timeout) signals a timeout through its return value, not an exception
            if (!proc.waitFor(1L, TimeUnit.SECONDS)) {
                logger.warning("youtube-dl still running? killing it");
                proc.destroyForcibly();
            }
        } catch (InterruptedException e) {
            logger.warning("youtube-dl still running? killing it");
            proc.destroyForcibly();
            Thread.currentThread().interrupt();
        }
    }

    /**
     * Decides whether this extractor handles the given URI: always when it
     * already carries a {@code youtube-dl:} annotation, otherwise when
     * {@link #shouldExtract(CrawlURI)} accepts it.
     *
     * @param crawlURI the candidate URI
     * @return {@code true} if the URI should be processed
     */
    protected boolean shouldProcess(CrawlURI crawlURI) {
        return findYdlAnnotation(crawlURI) != null || shouldExtract(crawlURI);
    }

    /**
     * Decides whether youtube-dl should be run on a fetched page. The fetch
     * status must be 200, the content length must be greater than 0 and
     * below 200,000,000, and the content type must start with one of the
     * HTML/XHTML/WML media types listed below (case-insensitive).
     *
     * @param crawlURI the fetched URI
     * @return {@code true} if the page qualifies
     */
    protected boolean shouldExtract(CrawlURI crawlURI) {
        if (crawlURI.getFetchStatus() != 200) {
            return false;
        }
        long contentLength = crawlURI.getContentLength();
        if (contentLength <= 0 || contentLength >= 200000000) {
            return false;
        }
        String contentType = crawlURI.getContentType();
        if (contentType == null) {
            return false;
        }
        // Locale.ROOT: the default-locale variant maps 'I' to a dotless i under
        // a Turkish locale, which would make "APPLICATION/..." fail to match
        String lowerCase = contentType.toLowerCase(java.util.Locale.ROOT);
        return lowerCase.startsWith("text/html")
                || lowerCase.startsWith("application/xhtml")
                || lowerCase.startsWith("text/vnd.wap.wml")
                || lowerCase.startsWith("application/vnd.wap.wml")
                || lowerCase.startsWith("application/vnd.wap.xhtml");
    }

    /**
     * A record is built only for URIs whose {@code youtube-dl:} annotation
     * contains no {@code /}. In this class that is the
     * {@code youtube-dl:<count>} form put on containing pages, as opposed to
     * the {@code youtube-dl:<n>/<total>} form put on video outlinks.
     *
     * @param crawlURI the URI being written
     * @return {@code true} if {@link #buildRecord} should be called for it
     */
    public boolean shouldBuildRecord(CrawlURI crawlURI) {
        String annotation = findYdlAnnotation(crawlURI);
        return annotation != null && !annotation.contains("/");
    }

    /**
     * Builds a WARC metadata record whose body is the content of the calling
     * thread's youtube-dl JSON scratch file, read from its beginning.
     *
     * @param crawlURI the page the JSON belongs to; the record URL is
     *        {@code youtube-dl:} followed by this URI
     * @param uri id of the record this one is concurrent to, or {@code null}
     *        to omit the {@code WARC-Concurrent-To} header
     * @return the populated record description
     * @throws IOException if the scratch file cannot be rewound or measured
     */
    public WARCRecordInfo buildRecord(CrawlURI crawlURI, URI uri) throws IOException {
        final String timestamp = ArchiveUtils.getLog14Date(crawlURI.getFetchBeginTime());

        final WARCRecordInfo record = new WARCRecordInfo();
        record.setType(WARCConstants.WARCRecordType.metadata);
        record.setRecordId(BaseWARCRecordBuilder.generateRecordID());
        if (uri != null) {
            record.addExtraHeader("WARC-Concurrent-To", "<" + uri + ">");
        }
        record.setUrl("youtube-dl:" + crawlURI);
        record.setCreate14DigitDate(timestamp);
        record.setMimetype("application/vnd.youtube-dl_formats+json;charset=utf-8");
        record.setEnforceLength(true);

        // replay the JSON captured for this thread from the start of the file
        final RandomAccessFile json = this.tempfile.get();
        json.seek(0L);
        record.setContentStream(Channels.newInputStream(json.getChannel()));
        record.setContentLength(json.length());

        logger.info("built record timestamp=" + timestamp + " url=" + record.getUrl());
        return record;
    }

    /**
     * Manual smoke test: parses two sample youtube-dl JSON files from
     * {@code /tmp}, printing the URLs found in each followed by the JSON
     * that was copied to the scratch file.
     *
     * @param strArr ignored
     * @throws IOException if a sample file cannot be read or parsed
     */
    public static void main(String[] strArr) throws IOException {
        ExtractorYoutubeDL extractorYoutubeDL = new ExtractorYoutubeDL();
        dumpSample(extractorYoutubeDL, "/tmp/ydl-single-video.json");
        dumpSample(extractorYoutubeDL, "/tmp/ydl-uncgreensboro-limited.json");
    }

    /** Parses one sample file and prints the extracted URLs and the teed JSON to standard output. */
    private static void dumpSample(ExtractorYoutubeDL extractor, String path) throws IOException {
        YoutubeDLResults results;
        // close the sample file even when parsing fails
        try (FileInputStream sample = new FileInputStream(path)) {
            results = new YoutubeDLResults(extractor.tempfile.get());
            extractor.streamYdlOutput(sample, results);
        }
        System.out.println("video urls: " + results.videoUrls);
        System.out.println("page urls: " + results.pageUrls);

        results.jsonFile.seek(0L);
        byte[] buffer = new byte[4096];
        for (int n = results.jsonFile.read(buffer); n >= 0; n = results.jsonFile.read(buffer)) {
            System.out.write(buffer, 0, n);
        }
        System.out.flush();
    }
}
