package is.hi.bok.deduplicator;

import dk.netarkivet.common.Constants;
import java.io.File;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.List;
import java.util.regex.Pattern;
import org.apache.commons.cli.Option;
import org.apache.commons.configuration.tree.DefaultExpressionEngine;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.archive.util.ArchiveUtils;

/**
 * Writes crawl data items into a Lucene index, one document per item.
 *
 * <p>Every document carries the item's URL and content digest. Depending on how the indexer was
 * configured it may also carry a normalized URL, a timestamp, an etag and an origin. Which of the
 * URL / digest fields are searchable (indexed) rather than merely stored is controlled by the
 * indexing mode ({@link #MODE_URL}, {@link #MODE_HASH} or {@link #MODE_BOTH}).
 */
public class DigestIndexer {
    public static final String FIELD_URL = "url";
    public static final String FIELD_DIGEST = "digest";
    public static final String FIELD_TIMESTAMP = "date";
    public static final String FIELD_ETAG = "etag";
    public static final String FIELD_URL_NORMALIZED = "url-normalized";
    public static final String FIELD_ORIGIN = "origin";
    public static final String MODE_URL = "URL";
    public static final String MODE_HASH = "HASH";
    public static final String MODE_BOTH = "BOTH";

    /** Patterns used by {@link #stripURL(String)}; compiled once instead of on every call. */
    private static final Pattern WWW_PREFIX = Pattern.compile("www[0-9]*\\.");
    private static final Pattern QUERY_STRING = Pattern.compile("\\?.*$");
    private static final Pattern TRAILING_SLASH = Pattern.compile("/$");

    /** Field type for values that are stored in the document but not indexed. */
    private static final FieldType STORED_ONLY = storedOnlyType();

    /** Field type for values that are stored and indexed, with norms not omitted. */
    private static final FieldType STORED_AND_INDEXED = storedAndIndexedType();

    private final Directory luceneDirectory;
    private final IndexWriter index;
    private final boolean etag;
    private final boolean equivalent;
    private final boolean timestamp;
    private final boolean indexURL;
    private final boolean indexDigest;

    /**
     * Returns the underlying Lucene index writer.
     *
     * @return the writer that documents are added to
     */
    public IndexWriter getIndex() {
        return this.index;
    }

    /**
     * Opens (or creates) a Lucene index on disk and prepares it for writing.
     *
     * @param str file system location of the index
     * @param str2 indexing mode: {@link #MODE_URL} indexes only the URL field, {@link #MODE_HASH}
     *     indexes only the digest field; any other value (including {@code null}) indexes both
     * @param z whether to add a normalized form of each URL (see {@link #stripURL(String)})
     * @param z2 whether to add each item's timestamp
     * @param z3 whether to add each item's etag (when the item has one)
     * @param z4 {@code true} to append to an existing index if there is one, {@code false} to
     *     create a new index at the location
     * @throws IOException if the directory or the index writer cannot be opened
     */
    public DigestIndexer(String str, String str2, boolean z, boolean z2, boolean z3, boolean z4) throws IOException {
        this.etag = z3;
        this.equivalent = z;
        this.timestamp = z2;
        // Constant-first comparison: an unknown or null mode falls back to indexing both fields.
        this.indexURL = !MODE_HASH.equals(str2);
        this.indexDigest = !MODE_URL.equals(str2);

        IndexWriterConfig indexWriterConfig = new IndexWriterConfig(Constants.LUCENE_VERSION, new WhitespaceAnalyzer(Constants.LUCENE_VERSION));
        indexWriterConfig.setOpenMode(z4 ? IndexWriterConfig.OpenMode.CREATE_OR_APPEND : IndexWriterConfig.OpenMode.CREATE);

        Directory directory = FSDirectory.open(new File(str));
        IndexWriter writer = null;
        try {
            writer = new IndexWriter(directory, indexWriterConfig);
        } finally {
            if (writer == null) {
                // The writer could not be created: do not leave the directory open behind us.
                try {
                    directory.close();
                } catch (IOException ignored) {
                    // The failure that prevented the writer from opening is the one worth reporting.
                }
            }
        }
        this.luceneDirectory = directory;
        this.index = writer;
    }

    /**
     * Indexes all items from the iterator without skipping items flagged as duplicates.
     *
     * <p>Equivalent to calling the six-argument overload with its last argument {@code false}.
     *
     * @param crawlDataIterator source of the items to index
     * @param str regular expression that is matched against each item's mime type
     * @param z if {@code true}, items whose mime type matches {@code str} are skipped (blacklist);
     *     if {@code false}, items whose mime type does not match are skipped (whitelist)
     * @param str2 origin to record for items that have none; {@code null} disables the origin field
     * @param z2 whether to print progress to standard output
     * @return the number of items added to the index
     * @throws IOException if reading the crawl data or writing the index fails
     */
    public long writeToIndex(CrawlDataIterator crawlDataIterator, String str, boolean z, String str2, boolean z2) throws IOException {
        return writeToIndex(crawlDataIterator, str, z, str2, z2, false);
    }

    /**
     * Indexes the items supplied by the iterator and commits the index.
     *
     * @param crawlDataIterator source of the items to index
     * @param str regular expression that is matched against each item's mime type
     * @param z if {@code true}, items whose mime type matches {@code str} are skipped (blacklist);
     *     if {@code false}, items whose mime type does not match are skipped (whitelist)
     * @param str2 origin to record for items that have none; {@code null} disables the origin field
     * @param z2 whether to print progress to standard output
     * @param z3 whether to skip items that are flagged as duplicates
     * @return the number of items added to the index
     * @throws IOException if reading the crawl data or writing the index fails
     * @throws java.util.regex.PatternSyntaxException if {@code str} is not a valid regular expression
     */
    public long writeToIndex(CrawlDataIterator crawlDataIterator, String str, boolean z, String str2, boolean z2, boolean z3) throws IOException {
        // Compile the mime filter once; String.matches would recompile it for every item.
        final Pattern mimeFilter = Pattern.compile(str);
        // long counters: the method returns long, and an int would overflow on very large crawls.
        long indexed = 0;
        long skipped = 0;
        while (crawlDataIterator.hasNext()) {
            CrawlDataItem item = crawlDataIterator.next();
            // Short-circuit order matters: the mime type is not inspected for skipped duplicates.
            if ((z3 && item.duplicate) || mimeFilter.matcher(item.mimetype).matches() == z) {
                skipped++;
                continue;
            }
            indexed++;
            if (z2 && indexed % 10000 == 0) {
                System.out.println("Indexed " + indexed + " - Last URL from " + item.getTimestamp());
            }
            this.index.addDocument(createDocument(item, str2));
        }
        this.index.commit();
        if (z2) {
            System.out.println("Indexed " + indexed + " items (skipped " + skipped + ")");
        }
        return indexed;
    }

    /**
     * Builds the Lucene document for a single crawl data item.
     *
     * @param item the item to convert
     * @param defaultOrigin origin used when the item has none; {@code null} omits the origin field
     * @return a document holding the configured fields of {@code item}
     */
    private Document createDocument(CrawlDataItem item, String defaultOrigin) {
        Document document = new Document();

        // The URL (and its normalized form) is searchable only when the mode indexes URLs.
        FieldType urlType = this.indexURL ? STORED_AND_INDEXED : STORED_ONLY;
        document.add(new Field(FIELD_URL, item.getURL(), urlType));
        if (this.equivalent) {
            document.add(new Field(FIELD_URL_NORMALIZED, stripURL(item.getURL()), urlType));
        }

        FieldType digestType = this.indexDigest ? STORED_AND_INDEXED : STORED_ONLY;
        document.add(new Field(FIELD_DIGEST, item.getContentDigest(), digestType));

        if (this.timestamp) {
            document.add(new Field(FIELD_TIMESTAMP, item.getTimestamp(), STORED_ONLY));
        }
        if (this.etag) {
            String etagValue = item.getEtag();
            if (etagValue != null) {
                document.add(new Field(FIELD_ETAG, etagValue, STORED_ONLY));
            }
        }
        if (defaultOrigin != null) {
            String origin = item.getOrigin();
            document.add(new Field(FIELD_ORIGIN, origin != null ? origin : defaultOrigin, STORED_ONLY));
        }
        return document;
    }

    /**
     * Closes the index writer and then the directory it writes to.
     *
     * <p>The directory is closed even when closing the writer fails.
     *
     * @throws IOException if closing the writer or the directory fails
     */
    public void close() throws IOException {
        try {
            this.index.close(true);
        } finally {
            this.luceneDirectory.close();
        }
    }

    /**
     * Produces a normalized form of a URL.
     *
     * <p>In order: every occurrence of {@code www}, optional digits and a dot is removed; then
     * everything from the first {@code ?} onwards is removed; finally a single trailing slash is
     * removed.
     *
     * @param str the URL to normalize; must not be {@code null}
     * @return the normalized URL
     */
    public static String stripURL(String str) {
        String withoutWww = WWW_PREFIX.matcher(str).replaceAll("");
        String withoutQuery = QUERY_STRING.matcher(withoutWww).replaceAll("");
        return TRAILING_SLASH.matcher(withoutQuery).replaceAll("");
    }

    /**
     * Command line entry point: reads crawl data from a source and writes it to an index.
     *
     * <p>Expects exactly two positional arguments, the crawl data source and the index location.
     *
     * @param strArr the command line options and arguments
     * @throws Exception if the iterator class cannot be instantiated or indexing fails
     */
    public static void main(String[] strArr) throws Exception {
        CommandLineParser commandLineParser = new CommandLineParser(strArr, new PrintWriter(System.out));
        long startTime = System.currentTimeMillis();

        boolean includeEtag = false;
        boolean includeEquivalent = false;
        boolean includeTimestamp = false;
        String mode = MODE_BOTH;
        boolean addToIndex = false;
        String mimeFilter = "^text/.*";
        boolean blacklist = true;
        String iteratorClassName = CrawlLogIterator.class.getName();
        String origin = null;
        boolean skipDuplicates = false;

        for (Option option : commandLineParser.getCommandLineOptions()) {
            // Option ids are the option characters.
            switch (option.getId()) {
                case 'a':
                    addToIndex = true;
                    break;
                case 'd':
                    skipDuplicates = true;
                    break;
                case 'e':
                    includeEtag = true;
                    break;
                case 'h':
                    commandLineParser.usage(0);
                    break;
                case 'i':
                    iteratorClassName = option.getValue();
                    break;
                case 'm':
                    mimeFilter = option.getValue();
                    break;
                case 'o':
                    mode = option.getValue();
                    break;
                case 'r':
                    origin = option.getValue();
                    break;
                case 's':
                    includeEquivalent = true;
                    break;
                case 't':
                    includeTimestamp = true;
                    break;
                case 'w':
                    blacklist = false;
                    break;
                default:
                    System.err.println("Unhandled option id: " + option.getId());
                    break;
            }
        }

        List<?> commandLineArguments = commandLineParser.getCommandLineArguments();
        if (commandLineArguments.size() != 2) {
            // NOTE(review): execution continues past this point unless usage() terminates the JVM - confirm.
            commandLineParser.usage(0);
        }

        // The iterator implementation is chosen by class name and built from the source argument.
        CrawlDataIterator crawlDataIterator = (CrawlDataIterator) Class.forName(iteratorClassName).getConstructor(String.class).newInstance((String) commandLineArguments.get(0));

        System.out.println("Indexing: " + commandLineArguments.get(0));
        System.out.println(" - Mode: " + mode);
        System.out.println(" - Mime filter: " + mimeFilter + " (" + (blacklist ? "blacklist" : "whitelist") + ")");
        System.out.println(" - Includes" + (includeEquivalent ? " <equivalent URL>" : "") + (includeTimestamp ? " <timestamp>" : "") + (includeEtag ? " <etag>" : ""));
        System.out.println(" - Skip duplicates: " + (skipDuplicates ? "yes" : "no"));
        System.out.println(" - Iterator: " + iteratorClassName);
        System.out.println("   - " + crawlDataIterator.getSourceType());
        System.out.println("Target: " + commandLineArguments.get(1));
        if (addToIndex) {
            System.out.println(" - Add to existing index (if any)");
        } else {
            System.out.println(" - New index (erases any existing index at that location)");
        }

        DigestIndexer digestIndexer = new DigestIndexer((String) commandLineArguments.get(1), mode, includeEquivalent, includeTimestamp, includeEtag, addToIndex);
        digestIndexer.writeToIndex(crawlDataIterator, mimeFilter, blacklist, origin, true, skipDuplicates);
        digestIndexer.close();
        System.out.println("Total run time: " + ArchiveUtils.formatMillisecondsToConventional(System.currentTimeMillis() - startTime));
    }

    /** Creates the shared, frozen field type for stored but non-indexed values. */
    private static FieldType storedOnlyType() {
        FieldType type = new FieldType(StringField.TYPE_STORED);
        type.setIndexed(false);
        type.freeze();
        return type;
    }

    /** Creates the shared, frozen field type for stored and indexed values with norms not omitted. */
    private static FieldType storedAndIndexedType() {
        FieldType type = new FieldType(StringField.TYPE_STORED);
        type.setOmitNorms(false);
        type.freeze();
        return type;
    }
}
