package dk.netarkivet.harvester.indexserver;

import dk.netarkivet.common.distribute.indexserver.Index;
import dk.netarkivet.common.distribute.indexserver.JobIndexCache;
import dk.netarkivet.common.exceptions.IOFailure;
import dk.netarkivet.common.utils.FileUtils;
import dk.netarkivet.common.utils.Settings;
import dk.netarkivet.common.utils.TimeUtils;
import dk.netarkivet.common.utils.ZipUtils;
import dk.netarkivet.harvester.HarvesterSettings;
import is.hi.bok.deduplicator.CrawlDataIterator;
import is.hi.bok.deduplicator.DigestIndexer;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.UUID;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.SimpleFSDirectory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:dk/netarkivet/harvester/indexserver/CrawlLogIndexCache.class */
public abstract class CrawlLogIndexCache extends CombiningMultiFileBasedCache<Long> implements JobIndexCache {
    /** Logger shared by all instances of this class. */
    private static final Logger log = LoggerFactory.getLogger(CrawlLogIndexCache.class);
    /**
     * Cache queried once per job id (via {@code cache(jobId)} / {@code getCacheFile(...)}) to obtain the
     * job's CDX file; jobs for which it yields {@code null} are left out of the index.
     */
    private final CDXDataCache cdxcache;
    /** Forwarded to {@link DigestOptions} when indexing workers are set up in {@code combine}. */
    private boolean useBlacklist;
    /** Mime filter forwarded to {@link DigestOptions} when indexing workers are set up in {@code combine}. */
    private String mimeFilter;
    /**
     * Value handed to {@link Thread#sleep(long)} (i.e. milliseconds) between two polls of the outstanding
     * indexing jobs. Read from {@code HarvesterSettings.INDEXSERVER_INDEXING_CHECKINTERVAL} in the constructor.
     */
    private final long sleepintervalBetweenCompletenessChecks;
    /** Number of {@code combine} invocations started on this instance; only used to tag log messages. */
    private int indexingJobCount;

    /**
     * Sets up the cache on top of a fresh {@code CrawlLogDataCache} and a fresh {@code CDXDataCache}.
     *
     * @param str passed unchanged to the superclass constructor (presumably the cache name - confirm
     *            against {@code CombiningMultiFileBasedCache})
     * @param z whether the blacklist flag is handed to the indexing options
     * @param str2 the mime filter handed to the indexing options
     */
    public CrawlLogIndexCache(String str, boolean z, String str2) {
        super(str, new CrawlLogDataCache());
        // Plain field assignments first ...
        this.useBlacklist = z;
        this.mimeFilter = str2;
        this.indexingJobCount = 0;
        // ... then the two initialisations that call into other components, in their original order.
        this.cdxcache = new CDXDataCache();
        this.sleepintervalBetweenCompletenessChecks =
                Settings.getLong(HarvesterSettings.INDEXSERVER_INDEXING_CHECKINTERVAL);
    }

    /* JADX INFO: Access modifiers changed from: protected */
    /**
     * Prepares the per-job input files for {@code combine} and drops every job for which the CDX cache
     * cannot deliver data.
     *
     * <p>The superclass implementation supplies the initial job-id to file mapping; each job id in it is
     * then requested from the CDX cache, and ids for which the cache answers {@code null} are removed
     * from the mapping (after a single warning listing them).
     *
     * @param set the ids of the jobs an index is wanted for
     * @return the mapping returned by the superclass, reduced to the jobs that also have CDX data
     */
    @Override // dk.netarkivet.harvester.indexserver.CombiningMultiFileBasedCache
    public Map<Long, File> prepareCombine(Set<Long> set) {
        log.info("Starting to generate {} for the {} jobs: {}",
                new Object[] {getCacheDir().getName(), Integer.valueOf(set.size()), set});
        Map<Long, File> filesByJob = super.prepareCombine(set);

        // Collect first, remove afterwards: the key set must not be modified while it is iterated.
        Set<Long> jobsWithoutCdx = new HashSet<>();
        for (Long jobId : filesByJob.keySet()) {
            // The job id is passed as-is; a Long can never be cast to CDXDataCache.
            if (this.cdxcache.cache(jobId) == null) {
                jobsWithoutCdx.add(jobId);
            }
        }

        if (!jobsWithoutCdx.isEmpty()) {
            log.warn("Data not found for {} jobs: {}", Integer.valueOf(jobsWithoutCdx.size()), jobsWithoutCdx);
            for (Long jobId : jobsWithoutCdx) {
                filesByJob.remove(jobId);
            }
        }
        return filesByJob;
    }

    /** Number of finished subindices that are collected before they are merged into the main index. */
    private static final int SUBINDEX_MERGE_BATCH_SIZE = 200;

    /**
     * Builds one combined, gzip-compressed Lucene index from the files of the given jobs.
     *
     * <p>Steps:
     * <ol>
     * <li>A main index is created next to the cache file (cache file path + {@code ".luceneDir"}).</li>
     * <li>For every job with CDX data a {@code DigestIndexerWorker} is submitted to a fixed-size thread
     *     pool; each worker writes to its own temporary directory. Jobs without CDX data are skipped
     *     with a warning.</li>
     * <li>The outstanding workers are polled. Successful subindices are merged into the main index in
     *     batches of {@value #SUBINDEX_MERGE_BATCH_SIZE}; failed jobs are logged and dropped. Polling
     *     stops when no job is outstanding or when the configured indexing timeout has passed.</li>
     * <li>The remaining subindices are merged, the main index is closed and its files are gzipped
     *     into the cache file.</li>
     * </ol>
     * Whatever the outcome, the thread pool is shut down, the main index is closed, and the main index
     * directory and all temporary subindex directories are removed.
     *
     * @param map job id to input file, as produced by {@code prepareCombine}
     * @throws IOFailure if an {@link IOException} occurs while building or compressing the index
     */
    @Override // dk.netarkivet.harvester.indexserver.CombiningMultiFileBasedCache
    protected void combine(Map<Long, File> map) {
        this.indexingJobCount++;
        final long datasetSize = map.values().size();
        log.info("Starting combine task #{}. This combines a dataset with {} crawl logs (thread = {})",
                new Object[] {Integer.valueOf(this.indexingJobCount), Long.valueOf(datasetSize),
                        Thread.currentThread().getName()});

        final File resultFile = getCacheFile(map.keySet());
        final String indexLocation = resultFile.getAbsolutePath() + ".luceneDir";
        final Set<File> tmpSubindexDirs = new HashSet<>();
        ThreadPoolExecutor executor = null;
        DigestIndexer indexer = null;
        try {
            indexer = createStandardIndexer(indexLocation);
            final DigestOptions indexingOptions = new DigestOptions(this.useBlacklist, false, this.mimeFilter);
            final Set<IndexingState> outstandingJobs = new HashSet<>();
            final int maxThreads = Settings.getInt(HarvesterSettings.INDEXSERVER_INDEXING_MAXTHREADS);
            executor = new ThreadPoolExecutor(maxThreads, maxThreads, 0L, TimeUnit.MILLISECONDS,
                    new LinkedBlockingQueue<Runnable>());
            executor.setRejectedExecutionHandler(new ThreadPoolExecutor.CallerRunsPolicy());

            // Submit one indexing worker per job that has CDX data.
            long taskNumber = 0;
            for (Map.Entry<Long, File> entry : map.entrySet()) {
                final Long jobId = entry.getKey();
                final File jobFile = entry.getValue();
                final File tmpSubindexDir = new File(FileUtils.getTempDir(), UUID.randomUUID().toString());
                // Registered for cleanup up front so it is removed whatever happens to the worker.
                tmpSubindexDirs.add(tmpSubindexDir);
                final String subindexLocation = tmpSubindexDir.getAbsolutePath();
                final Long cachedJobId = this.cdxcache.cache(jobId);
                if (cachedJobId == null) {
                    log.warn("Skipping the ingest of logs for job {}. Unable to retrieve cdx-file for job.",
                            entry.getKey());
                    continue;
                }
                final File cdxFile = this.cdxcache.getCacheFile(cachedJobId);
                taskNumber++;
                final String taskId = taskNumber + " out of " + datasetSize;
                log.debug("Making subthread for indexing job " + jobId + " - task " + taskId);
                outstandingJobs.add(new IndexingState(jobId, subindexLocation, executor.submit(
                        new DigestIndexerWorker(subindexLocation, jobId, jobFile, cdxFile, indexingOptions, taskId))));
            }

            // Poll the workers and merge finished subindices batch-wise into the main index.
            final Set<Directory> finishedSubindices = new HashSet<>();
            final long timeout = Settings.getLong(HarvesterSettings.INDEXSERVER_INDEXING_TIMEOUT);
            final long deadline = System.currentTimeMillis() + timeout;
            final IndexWriter mainIndex = indexer.getIndex();
            final int maxSegments = Settings.getInt(HarvesterSettings.INDEXSERVER_INDEXING_MAX_SEGMENTS);
            int mergedSubindexCount = 0;
            while (!outstandingJobs.isEmpty()) {
                log.info("Outstanding jobs in combine task #{} is now {}", Integer.valueOf(this.indexingJobCount),
                        Integer.valueOf(outstandingJobs.size()));
                if (deadline < System.currentTimeMillis()) {
                    log.warn("Max indexing time exceeded for one index ({}). Indexing stops here, although missing subindices for {} jobs",
                            TimeUtils.readableTimeInterval(timeout), Integer.valueOf(outstandingJobs.size()));
                    break;
                }

                final Iterator<IndexingState> jobs = outstandingJobs.iterator();
                while (jobs.hasNext() && finishedSubindices.size() < SUBINDEX_MERGE_BATCH_SIZE) {
                    final IndexingState job = jobs.next();
                    if (!job.getResultObject().isDone()) {
                        continue;
                    }
                    try {
                        if (job.getResultObject().get().booleanValue()) {
                            finishedSubindices.add(new SimpleFSDirectory(new File(job.getIndex())));
                        } else {
                            log.warn("Indexing of job {} failed.", job.getJobIdentifier());
                        }
                    } catch (InterruptedException | ExecutionException e) {
                        // The job is treated as failed; it is dropped from the outstanding set below.
                        log.warn("Unable to get Result back from indexing thread", e);
                    }
                    jobs.remove();
                }

                if (finishedSubindices.size() >= SUBINDEX_MERGE_BATCH_SIZE) {
                    log.info("Adding {} subindices to main index. Forcing index to contain max {} files (related to combine task #{})",
                            new Object[] {Integer.valueOf(finishedSubindices.size()), Integer.valueOf(maxSegments),
                                    Integer.valueOf(this.indexingJobCount)});
                    mergeSubindices(mainIndex, finishedSubindices, maxSegments);
                    mergedSubindexCount += finishedSubindices.size();
                    log.info("Completed adding {} subindices to main index, now containing {} subindices(related to combine task #{})",
                            new Object[] {Integer.valueOf(finishedSubindices.size()),
                                    Integer.valueOf(mergedSubindexCount), Integer.valueOf(this.indexingJobCount)});
                    finishedSubindices.clear();
                } else if (!outstandingJobs.isEmpty()) {
                    // Only wait when there is still something to wait for.
                    sleepAwhile();
                }
            }

            log.info("Adding the final {} subindices to main index. Forcing index to contain max {} files (related to combine task #{})",
                    new Object[] {Integer.valueOf(finishedSubindices.size()), Integer.valueOf(maxSegments),
                            Integer.valueOf(this.indexingJobCount)});
            mergeSubindices(mainIndex, finishedSubindices, maxSegments);
            finishedSubindices.clear();
            log.info("Adding operation completed (combine task #{})!", Integer.valueOf(this.indexingJobCount));

            final long entryCount = mainIndex.numDocs();
            indexer.close();
            indexer = null; // successfully closed; nothing left for the cleanup below to close
            log.info("Closed index (related to combine task #{}", Integer.valueOf(this.indexingJobCount));

            final File indexDir = new File(indexLocation);
            final String[] indexFileNames = indexDir.list(); // null if the directory cannot be listed
            log.info("Gzip-compressing the individual {} index files of combine task # {}",
                    Integer.valueOf(indexFileNames == null ? 0 : indexFileNames.length),
                    Integer.valueOf(this.indexingJobCount));
            ZipUtils.gzipFiles(indexDir, resultFile);
            log.info("Completed combine task #{} that combined a dataset with {} crawl logs (entries in combined index: {}) - compressed index has size {}",
                    new Object[] {Integer.valueOf(this.indexingJobCount), Long.valueOf(datasetSize),
                            Long.valueOf(entryCount), FileUtils.getHumanReadableFileSize(resultFile)});
        } catch (IOException e) {
            throw new IOFailure("Error setting up craw.log index framework for " + resultFile.getAbsolutePath(), e);
        } finally {
            // Stop workers first so nothing writes to the directories that are removed below.
            closeDownThreadpoolQuietly(executor);
            closeIndexerQuietly(indexer);
            FileUtils.removeRecursively(new File(indexLocation));
            for (File tmpSubindexDir : tmpSubindexDirs) {
                FileUtils.removeRecursively(tmpSubindexDir);
            }
        }
    }

    /** Adds the given subindices to the main index, merges down to maxSegments, commits, and closes the subindices. */
    private static void mergeSubindices(IndexWriter mainIndex, Set<Directory> subindices, int maxSegments)
            throws IOException {
        mainIndex.addIndexes(subindices.toArray(new Directory[0]));
        mainIndex.forceMerge(maxSegments);
        mainIndex.commit();
        for (Directory subindex : subindices) {
            subindex.close();
        }
    }

    /** Closes an indexer that is still open after a failed combine; problems are logged, never thrown. */
    private static void closeIndexerQuietly(DigestIndexer indexer) {
        if (indexer == null) {
            return;
        }
        try {
            indexer.close();
        } catch (Exception e) {
            // Best effort only: an error here must not hide the exception that aborted the combine task.
            log.warn("Unable to close the main index while cleaning up after a combine task", e);
        }
    }

    /**
     * Shuts the given pool down via {@link ThreadPoolExecutor#shutdownNow()} unless there is nothing
     * to do, i.e. unless the pool is {@code null} or already shut down.
     *
     * @param threadPoolExecutor the pool to stop; may be {@code null}
     */
    private void closeDownThreadpoolQuietly(ThreadPoolExecutor threadPoolExecutor) {
        final boolean needsShutdown = threadPoolExecutor != null && !threadPoolExecutor.isShutdown();
        if (needsShutdown) {
            threadPoolExecutor.shutdownNow();
        }
    }

    /**
     * Pauses the calling thread for the configured interval between two completeness checks.
     * An interruption merely ends the pause early; it is logged at trace level and not propagated.
     */
    private void sleepAwhile() {
        final long pause = this.sleepintervalBetweenCompletenessChecks;
        try {
            Thread.sleep(pause);
        } catch (InterruptedException wokenEarly) {
            log.trace("Was awoken early from sleep: ", wokenEarly);
        }
    }

    /* JADX INFO: Access modifiers changed from: protected */
    /**
     * Writes the entries of one job's crawl log, combined with the job's CDX data, to the given index.
     *
     * <p>Sorted temporary copies of both input files are created, wrapped in a
     * {@code CDXOriginCrawlLogIterator} and handed to {@code digestIndexer.writeToIndex}. The iterator,
     * the CDX reader and both temporary files are always cleaned up afterwards. Each cleanup step is
     * independent, so a failing {@code close()} is logged and does not prevent the remaining steps.
     *
     * @param l the id of the job the files belong to (used in log and error messages)
     * @param file the crawl log of the job
     * @param file2 the CDX file of the job
     * @param digestIndexer the indexer to write to
     * @param digestOptions supplies the blacklist flag, mime filter and verbose flag for the indexer
     * @throws IOFailure if an {@link IOException} occurs while reading the files or writing to the index
     */
    public static void indexFile(Long l, File file, File file2, DigestIndexer digestIndexer, DigestOptions digestOptions) {
        log.debug("Ingesting the crawl.log file '{}' related to job {}", file.getAbsolutePath(), l);
        final boolean useBlacklist = digestOptions.getUseBlacklist();
        final String mimeFilter = digestOptions.getMimeFilter();
        final boolean verboseMode = digestOptions.getVerboseMode();

        CrawlDataIterator crawlLogIterator = null;
        File sortedCdx = null;
        File sortedCrawlLog = null;
        BufferedReader cdxReader = null;
        try {
            sortedCdx = getSortedCDX(file2);
            // NOTE(review): FileReader decodes with the platform default charset - confirm this is intended.
            cdxReader = new BufferedReader(new FileReader(sortedCdx));
            sortedCrawlLog = getSortedCrawlLog(file);
            crawlLogIterator = new CDXOriginCrawlLogIterator(sortedCrawlLog, cdxReader);
            digestIndexer.writeToIndex(crawlLogIterator, mimeFilter, useBlacklist, "ERROR", verboseMode);
        } catch (IOException e) {
            throw new IOFailure("Fatal error indexing " + l, e);
        } finally {
            // Close the readers before deleting the files they read from.
            if (crawlLogIterator != null) {
                try {
                    crawlLogIterator.close();
                } catch (IOException e) {
                    log.warn("Error cleaning up after crawl log index cache generation", e);
                }
            }
            if (cdxReader != null) {
                try {
                    cdxReader.close();
                } catch (IOException e) {
                    log.warn("Error cleaning up after crawl log index cache generation", e);
                }
            }
            if (sortedCrawlLog != null) {
                FileUtils.remove(sortedCrawlLog);
            }
            if (sortedCdx != null) {
                FileUtils.remove(sortedCdx);
            }
        }
    }

    /**
     * Creates a temporary file in the temp dir and lets {@code FileUtils.sortCDX} write the sorted
     * version of the given CDX file into it.
     *
     * <p>The temporary file is registered for deletion at JVM exit as soon as it exists, and it is
     * deleted right away if sorting fails, so a failed sort does not leave a stray file behind.
     *
     * @param file the CDX file to sort
     * @return the temporary file holding the sorted data; the caller is expected to remove it when done
     * @throws IOFailure if an {@link IOException} occurs while creating or filling the temporary file
     */
    protected static File getSortedCDX(File file) {
        try {
            File sortedCdx = File.createTempFile("sorted", "cdx", FileUtils.getTempDir());
            sortedCdx.deleteOnExit();
            boolean sorted = false;
            try {
                FileUtils.sortCDX(file, sortedCdx);
                sorted = true;
            } finally {
                if (!sorted) {
                    // Best effort; deleteOnExit above remains as the fallback if this does not succeed.
                    sortedCdx.delete();
                }
            }
            return sortedCdx;
        } catch (IOException e) {
            throw new IOFailure("Error while making tmp file for " + file, e);
        }
    }

    /**
     * Creates a temporary file in the temp dir and lets {@code FileUtils.sortCrawlLog} write the sorted
     * version of the given crawl log into it.
     *
     * <p>The temporary file is registered for deletion at JVM exit as soon as it exists, and it is
     * deleted right away if sorting fails, so a failed sort does not leave a stray file behind.
     *
     * @param file the crawl log to sort
     * @return the temporary file holding the sorted log; the caller is expected to remove it when done
     * @throws IOFailure if an {@link IOException} occurs while creating or filling the temporary file
     */
    protected static File getSortedCrawlLog(File file) {
        try {
            File sortedCrawlLog = File.createTempFile("sorted", "crawllog", FileUtils.getTempDir());
            sortedCrawlLog.deleteOnExit();
            boolean sorted = false;
            try {
                FileUtils.sortCrawlLog(file, sortedCrawlLog);
                sorted = true;
            } finally {
                if (!sorted) {
                    // Best effort; deleteOnExit above remains as the fallback if this does not succeed.
                    sortedCrawlLog.delete();
                }
            }
            return sortedCrawlLog;
        } catch (IOException e) {
            throw new IOFailure("Error creating sorted crawl log file for '" + file + "'", e);
        }
    }

    /* JADX INFO: Access modifiers changed from: protected */
    /**
     * Creates the {@code DigestIndexer} configuration used throughout this class: mode {@code "BOTH"}
     * with the flag sequence {@code false, true, true, false}.
     * NOTE(review): the meaning of the individual flags is defined by the {@code DigestIndexer}
     * constructor and is not visible here - consult its documentation before changing them.
     *
     * @param str the location handed to the indexer as its first constructor argument
     * @return the newly created indexer
     * @throws IOException if the {@code DigestIndexer} constructor fails
     */
    public static DigestIndexer createStandardIndexer(String str) throws IOException {
        final DigestIndexer standardIndexer = new DigestIndexer(str, "BOTH", false, true, true, false);
        return standardIndexer;
    }

    /**
     * Bridge overload that forwards to the superclass implementation of {@code getIndex}.
     *
     * <p>The argument is forwarded unchanged. Casting it to {@code CrawlLogIndexCache} would fail with a
     * {@link ClassCastException} for every ordinary {@link Set}.
     *
     * @param set the set identifying the wanted index (the job ids, judging by the class's type argument)
     * @return whatever the superclass {@code getIndex} returns for that set
     */
    @SuppressWarnings({"rawtypes", "unchecked"}) // raw signature kept so existing callers and overrides still match
    public /* bridge */ /* synthetic */ Index getIndex(Set set) {
        return super.getIndex(set);
    }
}
