package dk.netarkivet.wayback.hadoop;

import dk.netarkivet.common.CommonSettings;
import dk.netarkivet.common.exceptions.IOFailure;
import dk.netarkivet.common.utils.Settings;
import dk.netarkivet.common.utils.archive.ArchiveHeaderBase;
import dk.netarkivet.common.utils.archive.ArchiveRecordBase;
import dk.netarkivet.common.utils.batch.ARCBatchFilter;
import dk.netarkivet.common.utils.batch.ArchiveBatchFilter;
import dk.netarkivet.common.utils.batch.WARCBatchFilter;
import dk.netarkivet.wayback.batch.UrlCanonicalizerFactory;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import org.apache.hadoop.util.Progressable;
import org.archive.io.ArchiveReader;
import org.archive.io.ArchiveReaderFactory;
import org.archive.io.ArchiveRecord;
import org.archive.io.arc.ARCRecord;
import org.archive.io.warc.WARCRecord;
import org.archive.wayback.UrlCanonicalizer;
import org.archive.wayback.core.CaptureSearchResult;
import org.archive.wayback.resourceindex.cdx.SearchResultToCDXLineAdapter;
import org.archive.wayback.resourcestore.indexer.ARCRecordToSearchResultAdapter;
import org.archive.wayback.resourcestore.indexer.WARCRecordToSearchResultAdapter;
import org.jwat.common.ByteCountingPushBackInputStream;
import org.jwat.common.ContentType;
import org.jwat.common.HttpHeader;

/* loaded from: input_file:dk/netarkivet/wayback/hadoop/CDXIndexer.class */
/**
 * Produces CDX index lines for archive files.
 *
 * <p>Files whose name matches the configured metadata-file suffix
 * ({@link CommonSettings#METADATAFILE_REGEX_SUFFIX}) are indexed record by record with a fixed
 * {@code A e b m n g v} column layout; all other files are indexed through the wayback
 * ARC/WARC record adapters.
 */
public class CDXIndexer implements Indexer {
    /** Size in bytes of the push-back buffer used while parsing an HTTP response header. */
    private static final int HTTP_HEADER_PUSHBACK_BUFFER_SIZE = 1048576;

    /** Placeholder written for a CDX column that has no value. */
    private static final String MISSING_FIELD = "-";

    protected final WARCRecordToSearchResultAdapter warcAdapter = new WARCRecordToSearchResultAdapter();
    protected final ARCRecordToSearchResultAdapter arcAdapter = new ARCRecordToSearchResultAdapter();
    protected final SearchResultToCDXLineAdapter cdxLineCreator = new SearchResultToCDXLineAdapter();
    protected final UrlCanonicalizer urlCanonicalizer = UrlCanonicalizerFactory.getDefaultUrlCanonicalizer();

    /**
     * Indexes the archive data read from the given stream.
     *
     * @param inputStream stream holding the archive file contents
     * @param str name of the archive file; passed to the reader factory and matched against the
     *     metadata-file suffix to select the indexing strategy
     * @param progressable receives a {@code progress()} call for every record read
     * @return the CDX lines produced for the archive, in record order
     * @throws IOException if the archive reader cannot be created or closed
     */
    public List<String> index(InputStream inputStream, String str, Progressable progressable) throws IOException {
        try (ArchiveReader archiveReader = ArchiveReaderFactory.get(str, inputStream, false)) {
            boolean isMetadataFile = str.matches("(.*)" + Settings.get(CommonSettings.METADATAFILE_REGEX_SUFFIX));
            if (isMetadataFile) {
                return extractMetadataCDXLines(archiveReader, progressable);
            }
            return extractCDXLines(archiveReader, progressable);
        }
    }

    /**
     * Indexes the given archive file.
     *
     * @param file the archive file to index
     * @param progressable receives a {@code progress()} call for every record read
     * @return the CDX lines produced for the file
     * @throws IOException if the file cannot be opened or the archive reader fails
     */
    @Override // dk.netarkivet.wayback.hadoop.Indexer
    public List<String> indexFile(File file, Progressable progressable) throws IOException {
        // The stream is owned here so it is released even when no archive reader could be
        // created for it; closing a FileInputStream twice is harmless.
        try (InputStream fileStream = new FileInputStream(file)) {
            return index(fileStream, file.getName(), progressable);
        }
    }

    /**
     * Builds one CDX line per record accepted by
     * {@link ArchiveBatchFilter#EXCLUDE_NON_WARCINFO_RECORDS}.
     *
     * @param archiveReader reader positioned on the archive to index
     * @param progressable receives a {@code progress()} call for every record read
     * @return the CDX lines, each with the columns {@code A e b m n g v}
     * @throws IOFailure if an HTTP response header cannot be read or closed
     */
    private List<String> extractMetadataCDXLines(ArchiveReader archiveReader, Progressable progressable) {
        List<String> cdxLines = new ArrayList<>();
        for (ArchiveRecord archiveRecord : archiveReader) {
            progressable.progress();
            ArchiveRecordBase record = ArchiveRecordBase.wrapArchiveRecord(archiveRecord);
            if (!ArchiveBatchFilter.EXCLUDE_NON_WARCINFO_RECORDS.accept(record)) {
                continue;
            }
            ArchiveHeaderBase header = record.getHeader();
            String url = header.getUrl();
            String ip = header.getIp();
            String date = header.getArcDateStr();
            String length = Long.toString(header.getLength());
            String archiveFileName = header.getArchiveFile().getName();
            String offset = Long.toString(header.getOffset());
            String mimetype = resolveMimetype(record, header);
            // Column order: A (url), e (ip), b (date), m (mimetype), n (length),
            // g (archive file name), v (offset).
            cdxLines.add(joinCdxFields(url, ip, date, mimetype, length, archiveFileName, offset));
        }
        return cdxLines;
    }

    /**
     * Determines the mimetype to report for a record. The record header's mimetype is used in its
     * short form when it parses as a content type; for {@code application/http; msgtype=response}
     * records the content type of the embedded HTTP response header takes precedence when present
     * and parseable.
     */
    private String resolveMimetype(ArchiveRecordBase record, ArchiveHeaderBase header) {
        String mimetype = header.getMimetype();
        ContentType recordType = ContentType.parseContentType(mimetype);
        if (recordType == null) {
            return mimetype;
        }
        mimetype = recordType.toStringShort();
        // Constant-first comparisons: a content type without a "msgtype" parameter is simply
        // not an HTTP response rather than a NullPointerException.
        boolean isHttpResponse = "application".equals(recordType.contentType)
                && "http".equals(recordType.mediaType)
                && "response".equals(recordType.getParameter("msgtype"));
        if (!isHttpResponse) {
            return mimetype;
        }
        // The push-back stream is only needed (and only allocated) when a header is parsed.
        ByteCountingPushBackInputStream payload =
                new ByteCountingPushBackInputStream(record.getInputStream(), HTTP_HEADER_PUSHBACK_BUFFER_SIZE);
        HttpHeader httpHeader;
        try {
            httpHeader = HttpHeader.processPayload(HttpHeader.HT_RESPONSE, payload, header.getLength(), (String) null);
            if (httpHeader.contentType != null) {
                ContentType responseType = ContentType.parseContentType(httpHeader.contentType);
                if (responseType != null) {
                    mimetype = responseType.toStringShort();
                }
            }
        } catch (IOException e) {
            throw new IOFailure("Error reading httpresponse header", e);
        }
        try {
            httpHeader.close();
        } catch (IOException e) {
            throw new IOFailure("Error closing httpresponse header", e);
        }
        return mimetype;
    }

    /** Joins the column values with single spaces, writing {@code -} for a null value. */
    private static String joinCdxFields(String... values) {
        StringBuilder line = new StringBuilder();
        for (int i = 0; i < values.length; i++) {
            if (i > 0) {
                line.append(' ');
            }
            line.append(values[i] == null ? MISSING_FIELD : values[i]);
        }
        return line.toString();
    }

    /**
     * Builds CDX lines for an ordinary (non-metadata) archive. WARC records accepted by
     * {@link WARCBatchFilter#EXCLUDE_NON_RESPONSE_RECORDS} and ARC records accepted by
     * {@link ARCBatchFilter#EXCLUDE_FILE_HEADERS} are adapted to search results and rendered as
     * CDX lines; records for which the adapter yields no result are skipped.
     *
     * @param archiveReader reader positioned on the archive to index
     * @param progressable receives a {@code progress()} call for every record read
     * @return the CDX lines, in record order
     */
    protected List<String> extractCDXLines(ArchiveReader archiveReader, Progressable progressable) {
        // The canonicalizer never changes, so configure both adapters once up front.
        this.warcAdapter.setCanonicalizer(this.urlCanonicalizer);
        this.arcAdapter.setCanonicalizer(this.urlCanonicalizer);

        List<String> cdxLines = new ArrayList<>();
        for (ArchiveRecord archiveRecord : archiveReader) {
            progressable.progress();
            CaptureSearchResult searchResult = null;
            if (archiveRecord instanceof WARCRecord) {
                WARCRecord warcRecord = (WARCRecord) archiveRecord;
                if (WARCBatchFilter.EXCLUDE_NON_RESPONSE_RECORDS.accept(warcRecord)) {
                    searchResult = this.warcAdapter.adapt(warcRecord);
                }
            } else {
                // Anything that is not a WARC record is treated as an ARC record.
                ARCRecord arcRecord = (ARCRecord) archiveRecord;
                if (ARCBatchFilter.EXCLUDE_FILE_HEADERS.accept(arcRecord)) {
                    searchResult = this.arcAdapter.adapt(arcRecord);
                }
            }
            if (searchResult != null) {
                cdxLines.add(this.cdxLineCreator.adapt(searchResult));
            }
        }
        return cdxLines;
    }
}
