package is.hi.bok.deduplicator;

import dk.netarkivet.common.utils.AllDocsCollector;
import java.io.File;
import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import org.apache.commons.httpclient.HttpConnection;
import org.apache.commons.httpclient.HttpMethod;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.ConstantScoreQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermRangeFilter;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.BytesRef;
import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.fetcher.FetchHTTP;
import org.archive.crawler.frontier.AdaptiveRevisitAttributeConstants;
import org.archive.crawler.settings.SimpleType;
import org.archive.httpclient.HttpRecorderMethod;
import org.archive.util.ArchiveUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:is/hi/bok/deduplicator/DeDupFetchHTTP.class */
public class DeDupFetchHTTP extends FetchHTTP implements AdaptiveRevisitAttributeConstants {
    /** Searcher over the de-duplication index; assigned by {@link #initialTasks()}. */
    protected IndexSearcher index;
    /** Reader associated with the index, if one has been recorded; may be {@code null}. */
    protected IndexReader indexReader;
    /** Regular expression matched against the content type of each URI (see {@link #isDuplicate}). */
    protected String mimefilter;
    /** {@code true} when the MIME filter excludes matching types, {@code false} when it selects them. */
    protected boolean blacklist;
    /** Parser for the {@code last-modified} response header; created in {@link #initialTasks()}. */
    SimpleDateFormat sdfLastModified;
    /** Parser for the {@code date} field of index documents; created in {@link #initialTasks()}. */
    SimpleDateFormat sdfIndexDate;
    /** Number of URIs whose content type passed the MIME filter (incremented in {@link #isDuplicate}). */
    protected long processedURLs;
    /** Number of URIs judged unchanged (incremented in {@link #checkMidfetchAbort}). */
    protected long unchangedURLs;
    /** Value of the {@code use-sparse-range-filter} setting as read by {@link #initialTasks()}. */
    protected boolean useSparseRangeFilter;

    /** Name of the setting selecting how header and index data are compared. */
    public static final String ATTR_DECISION_SCHEME = "decision-scheme";
    /** Decision scheme: compare the datestamp only. */
    public static final String SCHEME_TIMESTAMP = "Timestamp only";
    /** Decision scheme used when none is configured. */
    public static final String DEFAULT_DECISION_SCHEME = SCHEME_TIMESTAMP;
    /** Name of the setting holding the path of the Lucene index. */
    public static final String ATTR_INDEX_LOCATION = "index-location";
    /** Default index location (empty). */
    public static final String DEFAULT_INDEX_LOCATION = "";
    /** Name of the setting holding the MIME type regular expression. */
    public static final String ATTR_MIME_FILTER = "mime-filter";
    /** Default MIME type regular expression. */
    public static final String DEFAULT_MIME_FILTER = "^text/.*";
    /** Name of the setting choosing between blacklist and whitelist filtering. */
    public static final String ATTR_FILTER_MODE = "filter-mode";
    /** Name of the setting enabling the sparse range filter. */
    public static final String ATTR_USE_SPARSE_RANGE_FILTER = "use-sparse-range-filter";
    private static final long serialVersionUID = ArchiveUtils.classnameBasedUID(DeDupFetchHTTP.class, 1);
    // NOTE(review): the logger is named after FetchHTTP rather than this class;
    // kept as-is because logging configuration may depend on it -- confirm.
    private static final Logger log = LoggerFactory.getLogger(FetchHTTP.class.getName());
    /** Decision scheme: compare the Etag only. */
    public static final String SCHEME_ETAG = "Etag only";
    /** Decision scheme: a URI is unchanged only when both datestamp and Etag indicate non-change. */
    public static final String SCHEME_TIMESTAMP_AND_ETAG = "Timestamp AND Etag";
    /** Decision scheme: a URI is unchanged when either datestamp or Etag indicates non-change. */
    public static final String SCHEME_TIMESTAMP_OR_ETAG = "Timestamp OR Etag";
    /** All decision schemes offered for {@link #ATTR_DECISION_SCHEME}. */
    public static final String[] AVAILABLE_DECISION_SCHEMES = {SCHEME_TIMESTAMP, SCHEME_ETAG, SCHEME_TIMESTAMP_AND_ETAG, SCHEME_TIMESTAMP_OR_ETAG};
    /** All filter modes offered for {@link #ATTR_FILTER_MODE}. */
    public static final String[] AVAILABLE_FILTER_MODES = {"Blacklist", "Whitelist"};
    /** Default filter mode: the first entry of {@link #AVAILABLE_FILTER_MODES} ("Blacklist"). */
    public static final String DEFAULT_FILTER_MODE = AVAILABLE_FILTER_MODES[0];
    /** Default for {@link #ATTR_USE_SPARSE_RANGE_FILTER}; uses the shared constant instead of the deprecated constructor. */
    public static final Boolean DEFAULT_USE_SPARSE_RANGE_FILTER = Boolean.FALSE;

    /**
     * Creates the processor, sets the in-memory defaults and registers its
     * settings: decision scheme, index location, MIME filter, filter mode and
     * the sparse range filter switch.
     *
     * @param str name handed to the {@code FetchHTTP} constructor
     */
    public DeDupFetchHTTP(String str) {
        super(str);

        // Defaults in effect until initialTasks() reads the configured values.
        this.mimefilter = DEFAULT_MIME_FILTER;
        this.blacklist = true;
        this.processedURLs = 0L;
        this.unchangedURLs = 0L;
        this.useSparseRangeFilter = DEFAULT_USE_SPARSE_RANGE_FILTER.booleanValue();

        setDescription("Fetch HTTP processor that aborts downloading of unchanged documents. "
                + "This processor extends the standard FetchHTTP processor, adding a check after the header is downloaded "
                + "where the header information for 'last-modified' and 'etag' is compared against values stored in a "
                + "Lucene index built using the DigestIndexer.\n"
                + " Note that the index must have been built indexed by URL and the Timestamp and/or Etag info must have "
                + "been included in the index!");

        // The decision scheme is the only setting left overrideable.
        addElementToDefinition(new SimpleType(
                ATTR_DECISION_SCHEME,
                "The different schmes for deciding when to re-download a page given an old version of the same page "
                        + "(or rather meta-data on it)\n"
                        + " Timestamp only: Download when a datestamp is missing in either the downloaded header or index "
                        + "or if the header datestamp is newer then the one in the index.\n"
                        + " Etag only: Download when the Etag is missing in either theheader download or the index "
                        + "or the header Etag and the one in the index differ.\n"
                        + " Timestamp AND Etag: When both datestamp and Etag are available in both the header download and the index, "
                        + "download if EITHER of them indicates change."
                        + "Timestamp OR Etag: When both datestamp and Etag are available in both the header download and the index, "
                        + "download only if BOTH of them indicate change.",
                DEFAULT_DECISION_SCHEME,
                AVAILABLE_DECISION_SCHEMES));

        registerNonOverrideableSetting(
                new SimpleType(
                        ATTR_INDEX_LOCATION,
                        "Location of index (full path). Can not be changed at run time.",
                        DEFAULT_INDEX_LOCATION),
                false);

        registerNonOverrideableSetting(
                new SimpleType(
                        ATTR_MIME_FILTER,
                        "A regular expression that the mimetype of all documents will be compared against. "
                                + "Only those that pass will be considered. Others are given a pass. \n"
                                + "If the attribute filter-mode is set to 'Blacklist' then all the documents whose mimetype matches "
                                + "will be ignored by this processor. "
                                + "If the filter-mode is set to 'Whitelist' only those documents whose mimetype matches will be processed.",
                        DEFAULT_MIME_FILTER),
                true);

        registerNonOverrideableSetting(
                new SimpleType(
                        ATTR_FILTER_MODE,
                        "Determines if the mime-filter acts as a blacklist (declares what should be ignored) or whitelist (declares what should be processed).",
                        DEFAULT_FILTER_MODE,
                        AVAILABLE_FILTER_MODES),
                true);

        registerNonOverrideableSetting(
                new SimpleType(
                        ATTR_USE_SPARSE_RANGE_FILTER,
                        "If set to true, then Lucene queries use a custom 'sparse' range filter. This uses less memory at the cost of some lost performance. Suitable for very large indexes.",
                        DEFAULT_USE_SPARSE_RANGE_FILTER),
                true);
    }

    /** Marks a setting as not overrideable (and optionally expert-only), then adds it to the definition. */
    private void registerNonOverrideableSetting(SimpleType setting, boolean expert) {
        setting.setOverrideable(false);
        if (expert) {
            setting.setExpertSetting(true);
        }
        addElementToDefinition(setting);
    }

    /**
     * Decides whether the fetch should be aborted once the headers are in.
     * Prerequisite URIs are never aborted. Otherwise the parent check runs
     * first, and a URI judged a duplicate by {@link #isDuplicate} is always
     * aborted, counted and annotated.
     *
     * @param crawlURI the URI being fetched
     * @param httpRecorderMethod passed through to the parent check
     * @param httpConnection passed through to the parent check
     * @return {@code true} if the download should be aborted
     */
    protected boolean checkMidfetchAbort(CrawlURI crawlURI, HttpRecorderMethod httpRecorderMethod, HttpConnection httpConnection) {
        // Checked before calling the parent: prerequisites are never aborted here.
        if (crawlURI.isPrerequisite()) {
            return false;
        }

        // The parent check always runs, even when the URI turns out to be a duplicate.
        final boolean parentDecision = super.checkMidfetchAbort(crawlURI, httpRecorderMethod, httpConnection);
        if (!isDuplicate(crawlURI)) {
            return parentDecision;
        }

        this.unchangedURLs++;
        // NOTE(review): presumably the adaptive-revisit content-state key with
        // 0 meaning "unchanged" -- confirm against AdaptiveRevisitAttributeConstants.
        crawlURI.putInt("ar-state", 0);
        crawlURI.addAnnotation("header-duplicate");
        return true;
    }

    /**
     * Compares the response headers of a URI against the newest matching index
     * document using the configured decision scheme.
     *
     * A URI is only evaluated when it has a content type and that type passes
     * the MIME filter: in blacklist mode the type must not match the filter, in
     * whitelist mode it must match. Every evaluated URI increments
     * {@code processedURLs}.
     *
     * @param crawlURI the URI whose headers have been received
     * @return {@code true} if the configured scheme indicates the document is
     *         unchanged; {@code false} otherwise, including when the URI is
     *         filtered out, has no index entry, carries no HTTP transaction, or
     *         the scheme is missing or unknown
     */
    protected boolean isDuplicate(CrawlURI crawlURI) {
        String contentType = crawlURI.getContentType();
        if (contentType == null || contentType.matches(this.mimefilter) == this.blacklist) {
            return false;
        }
        this.processedURLs++;

        HttpMethod httpMethod = (HttpMethod) crawlURI.getObject("http-transaction");
        String scheme = (String) getUncheckedAttribute(crawlURI, ATTR_DECISION_SCHEME);
        if (httpMethod == null) {
            // Without the HTTP transaction there are no headers to compare.
            log.debug("No http-transaction on {}; treating as changed.", crawlURI);
            return false;
        }

        Document indexed = lookup(crawlURI);
        if (indexed == null) {
            return false;
        }

        // Constant-first comparisons so a null scheme lands in the error branch
        // instead of throwing a NullPointerException.
        if (SCHEME_TIMESTAMP.equals(scheme)) {
            return datestampIndicatesNonChange(httpMethod, indexed);
        }
        if (SCHEME_ETAG.equals(scheme)) {
            return etagIndicatesNonChange(httpMethod, indexed);
        }
        if (SCHEME_TIMESTAMP_AND_ETAG.equals(scheme)) {
            return datestampIndicatesNonChange(httpMethod, indexed) && etagIndicatesNonChange(httpMethod, indexed);
        }
        if (SCHEME_TIMESTAMP_OR_ETAG.equals(scheme)) {
            return datestampIndicatesNonChange(httpMethod, indexed) || etagIndicatesNonChange(httpMethod, indexed);
        }
        log.error("Unknown decision sceme: {}", scheme);
        return false;
    }

    /**
     * Checks whether the {@code last-modified} response header is strictly
     * earlier than the {@code date} field of the index document.
     *
     * @param httpMethod the HTTP transaction holding the response headers
     * @param document the index document to compare against
     * @return {@code true} only if both dates are present, both parse, and the
     *         header date is before the index date; {@code false} otherwise
     */
    protected boolean datestampIndicatesNonChange(HttpMethod httpMethod, Document document) {
        String headerDate = null;
        if (httpMethod.getResponseHeader("last-modified") != null) {
            headerDate = httpMethod.getResponseHeader("last-modified").getValue();
        }
        String indexDate = document.get("date");
        if (headerDate == null || indexDate == null) {
            return false;
        }
        try {
            // SimpleDateFormat is not thread-safe. Parsing on per-call clones
            // keeps the shared instances untouched if this processor is invoked
            // from several threads at once (assumed -- confirm for this crawler).
            SimpleDateFormat headerFormat = (SimpleDateFormat) this.sdfLastModified.clone();
            SimpleDateFormat indexFormat = (SimpleDateFormat) this.sdfIndexDate.clone();
            return headerFormat.parse(headerDate).before(indexFormat.parse(indexDate));
        } catch (Exception e) {
            // Unparseable dates (or formatters not yet initialised) are treated
            // the same as missing date information.
            log.debug("Could not compare dates '{}' and '{}'; treating as changed.", headerDate, indexDate, e);
            return false;
        }
    }

    /**
     * Checks whether the entity tag of the response equals the {@code etag}
     * field of the index document.
     *
     * The tag is read from the {@code etag} response header. The
     * {@code last-etag} name that this method used to read exclusively is still
     * consulted as a fallback, so behaviour is unchanged for any response that
     * actually carries it.
     *
     * @param httpMethod the HTTP transaction holding the response headers
     * @param document the index document to compare against
     * @return {@code true} only if both tags are present and equal
     */
    protected boolean etagIndicatesNonChange(HttpMethod httpMethod, Document document) {
        String headerEtag = null;
        if (httpMethod.getResponseHeader("etag") != null) {
            headerEtag = httpMethod.getResponseHeader("etag").getValue();
        } else if (httpMethod.getResponseHeader("last-etag") != null) {
            headerEtag = httpMethod.getResponseHeader("last-etag").getValue();
        }
        String indexEtag = document.get("etag");
        if (headerEtag == null || indexEtag == null) {
            return false;
        }
        return headerEtag.equals(indexEtag);
    }

    /**
     * Finds the index document for a URI by an exact match on the {@code url}
     * field.
     *
     * When several documents match, a candidate replaces the current selection
     * if the candidate has no {@code date}, or if the selection's {@code date}
     * sorts after the candidate's. A selection without a {@code date} is
     * replaced by any later candidate (this case used to throw a
     * NullPointerException).
     *
     * @param crawlURI the URI to look up; its {@code toString()} is the term
     * @return the selected document, or {@code null} if the index is not open,
     *         nothing matches, or the index cannot be read
     */
    protected Document lookup(CrawlURI crawlURI) {
        if (this.index == null) {
            // The index was never opened, so nothing can match.
            return null;
        }
        try {
            // BytesRef(CharSequence) encodes as UTF-8 regardless of the platform charset.
            BytesRef urlTerm = new BytesRef(crawlURI.toString());
            // A range with identical inclusive bounds is an exact term match.
            ConstantScoreQuery query = new ConstantScoreQuery(new TermRangeFilter("url", urlTerm, urlTerm, true, true));
            AllDocsCollector collector = new AllDocsCollector();
            this.index.search(query, collector);
            List<?> hits = collector.getHits();
            if (hits == null || hits.isEmpty()) {
                return null;
            }
            Document selected = null;
            String selectedDate = null;
            for (Object hit : hits) {
                Document candidate = this.index.doc(((ScoreDoc) hit).doc);
                String candidateDate = candidate.get("date");
                // NOTE(review): compareTo(...) > 0 keeps the candidate with the
                // EARLIER date string; confirm whether the most recent entry
                // was intended. Left unchanged here.
                if (selected == null
                        || candidateDate == null
                        || selectedDate == null
                        || selectedDate.compareTo(candidateDate) > 0) {
                    selected = candidate;
                    selectedDate = candidateDate;
                }
            }
            return selected;
        } catch (IOException e) {
            log.error("Error accessing index.", e);
            return null;
        }
    }

    /**
     * Runs the parent's final tasks and then closes the reader behind the
     * de-duplication index so its resources are not held after the processor is
     * finished. A failure to close is logged and otherwise ignored.
     */
    public void finalTasks() {
        super.finalTasks();
        // Prefer the reader the searcher is actually using; fall back to the
        // separately recorded reader when no searcher exists.
        IndexReader reader = (this.index != null) ? this.index.getIndexReader() : this.indexReader;
        if (reader == null) {
            return;
        }
        try {
            reader.close();
        } catch (IOException e) {
            log.warn("Error closing index.", e);
        }
    }

    /**
     * Runs the parent's initial tasks, opens the Lucene index named by the
     * {@code index-location} setting, reads the MIME filter, filter mode and
     * sparse range filter settings, and creates the date parsers.
     *
     * Each step is attempted independently: a failure is logged and the
     * corresponding field keeps its previous value (the sparse range filter
     * flag is reset to its default).
     */
    public void initialTasks() {
        super.initialTasks();
        try {
            FSDirectory directory = FSDirectory.open(new File((String) getAttribute(ATTR_INDEX_LOCATION)));
            directory.setReadChunkSize(directory.getReadChunkSize() / 2);
            DirectoryReader reader = DirectoryReader.open(directory);
            // Record the reader as well as the searcher built on top of it.
            this.indexReader = reader;
            this.index = new IndexSearcher(reader);
        } catch (Exception e) {
            log.error("Unable to find/open index.", e);
        }
        try {
            this.mimefilter = (String) getAttribute(ATTR_MIME_FILTER);
        } catch (Exception e) {
            log.error("Unable to get attribute mime-filter", e);
        }
        try {
            // Blacklist is the default mode; anything else is treated as whitelist.
            this.blacklist = ((String) getAttribute(ATTR_FILTER_MODE)).equals(DEFAULT_FILTER_MODE);
        } catch (Exception e) {
            log.error("Unable to get attribute filter-mode", e);
        }
        // A fixed English locale is required: the header pattern contains day
        // and month names, which would not parse under a non-English default
        // locale, and a fixed locale also pins the calendar used for the
        // numeric index pattern.
        // NOTE(review): both parsers use the JVM default time zone; confirm the
        // zone in which the index 'date' field was written.
        this.sdfLastModified = new SimpleDateFormat("EEE, dd MMM yyyy HH:mm:ss z", Locale.ENGLISH);
        this.sdfIndexDate = new SimpleDateFormat("yyyyMMddHHmmssSSS", Locale.ENGLISH);
        try {
            this.useSparseRangeFilter = ((Boolean) getAttribute(ATTR_USE_SPARSE_RANGE_FILTER)).booleanValue();
        } catch (Exception e) {
            log.error("Unable to get attribute use-sparse-range-filter", e);
            this.useSparseRangeFilter = DEFAULT_USE_SPARSE_RANGE_FILTER.booleanValue();
        }
    }

    /**
     * Builds a plain-text report: a header line for this processor, the number
     * of URLs compared against the index, the number judged unchanged, and
     * finally the parent processor's own report.
     *
     * @return the report text
     */
    public String report() {
        final StringBuilder text = new StringBuilder();
        text.append("Processor: is.hi.bok.deduplicator.DeDupFetchHTTP\n")
                .append("  URLs compared against index: ").append(this.processedURLs).append("\n")
                .append("  URLs judged unchanged:       ").append(this.unchangedURLs).append("\n")
                .append("  processor extends (parent report)\n")
                .append(super.report());
        return text.toString();
    }
}
