package is.hi.bok.deduplicator;

import dk.netarkivet.common.utils.AllDocsCollector;
import java.io.File;
import java.io.IOException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.commons.httpclient.HttpMethod;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.ConstantScoreQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermRangeFilter;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.BytesRef;
import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.framework.Processor;
import org.archive.crawler.frontier.AdaptiveRevisitAttributeConstants;
import org.archive.crawler.settings.SimpleType;
import org.archive.util.ArchiveUtils;
import org.archive.util.Base32;

/* loaded from: input_file:is/hi/bok/deduplicator/DeDuplicator.class */
public class DeDuplicator extends Processor implements AdaptiveRevisitAttributeConstants {
    /** Searcher over the duplicate index; stays {@code null} when the index could not be opened. */
    protected IndexSearcher index;
    /** Reader slot for the duplicate index; the constructor initialises it to {@code null}. */
    protected IndexReader indexReader;
    /** {@code true} when matching is done by URL, {@code false} when done by content digest. */
    protected boolean lookupByURL;
    /** Whether an equivalent (normalized) URL with the same digest also counts as a duplicate. */
    protected boolean equivalent;
    /** Regular expression that content types are matched against. */
    protected String mimefilter;
    /** {@code true}: content types matching {@link #mimefilter} are ignored; {@code false}: only those are processed. */
    protected boolean blacklist;
    /** Whether last-modified based statistics are gathered. */
    protected boolean doTimestampAnalysis;
    /** Whether the "Timestamp and ETag" analysis mode was selected. */
    protected boolean doETagAnalysis;
    /** Whether statistics are additionally tracked per host. */
    protected boolean statsPerHost;
    /** Whether the content size of a duplicate is reset to zero. */
    protected boolean changeContentSize;
    /** Whether origin information is appended to the duplicate annotation. */
    protected boolean useOrigin;
    /** Whether the origin is preferably taken from the index document. */
    protected boolean useOriginFromIndex;
    /** Value of the {@code use-sparse-range-filter} setting. */
    protected boolean useSparseRangeFilter;
    /** Overall statistics; created in {@code initialTasks()}. */
    protected Statistics stats;
    /** Statistics keyed by host name; only created when {@link #statsPerHost} is set. */
    protected HashMap<String, Statistics> perHostStats;
    /** Default for the {@code skip-writing} setting; the effective value is re-read per duplicate. */
    protected boolean skipWriting;

    // ---- setting names and their defaults ----
    public static final String ATTR_INDEX_LOCATION = "index-location";
    public static final String DEFAULT_INDEX_LOCATION = "";
    public static final String ATTR_MATCHING_METHOD = "matching-method";
    public static final String ATTR_EQUIVALENT = "try-equivalent";
    public static final String ATTR_MIME_FILTER = "mime-filter";
    public static final String DEFAULT_MIME_FILTER = "^text/.*";
    public static final String ATTR_FILTER_MODE = "filter-mode";
    public static final String ATTR_ANALYSIS_MODE = "analysis-mode";
    public static final String ATTR_CHANGE_CONTENT_SIZE = "change-content-size";
    public static final String ATTR_LOG_LEVEL = "log-level";
    public static final String ATTR_STATS_PER_HOST = "stats-per-host";
    public static final String ATTR_ORIGIN_HANDLING = "origin-handling";
    public static final String ORIGIN_HANDLING_NONE = "No origin information";
    public static final String DEFAULT_ORIGIN_HANDLING = ORIGIN_HANDLING_NONE;
    public static final String ATTR_ORIGIN = "origin";
    public static final String DEFAULT_ORIGIN = "";
    public static final String ATTR_SKIP_WRITE = "skip-writing";
    public static final String ATTR_USE_SPARSE_RANGE_FILTER = "use-sparse-range-filter";
    private static final long serialVersionUID = ArchiveUtils.classnameBasedUID(DeDuplicator.class, 1);
    private static final Logger logger = Logger.getLogger(DeDuplicator.class.getName());

    // ---- legal values; the first entry of each array doubles as the default ----
    public static final String[] AVAILABLE_MATCHING_METHODS = {"By URL", "By content digest"};
    public static final String DEFAULT_MATCHING_METHOD = AVAILABLE_MATCHING_METHODS[0];
    // Boolean.TRUE / Boolean.FALSE replace the deprecated boxing constructor.
    public static final Boolean DEFAULT_EQUIVALENT = Boolean.FALSE;
    public static final String[] AVAILABLE_FILTER_MODES = {"Blacklist", "Whitelist"};
    public static final String DEFAULT_FILTER_MODE = AVAILABLE_FILTER_MODES[0];
    public static final String[] AVAILABLE_ANALYSIS_MODES = {"None", "Timestamp", "Timestamp and ETag"};
    public static final String DEFAULT_ANALYSIS_MODE = AVAILABLE_ANALYSIS_MODES[0];
    public static final Boolean DEFAULT_CHANGE_CONTENT_SIZE = Boolean.TRUE;
    public static final String[] AVAILABLE_LOG_LEVELS = {Level.SEVERE.toString(), Level.INFO.toString(), Level.FINEST.toString()};
    public static final String DEFAULT_LOG_LEVEL = AVAILABLE_LOG_LEVELS[0];
    public static final Boolean DEFAULT_STATS_PER_HOST = Boolean.FALSE;
    public static final String ORIGIN_HANDLING_PROCESSOR = "Use processor setting";
    public static final String ORIGIN_HANDLING_INDEX = "Use index information";
    public static final String[] AVAILABLE_ORIGIN_HANDLING = {ORIGIN_HANDLING_NONE, ORIGIN_HANDLING_PROCESSOR, ORIGIN_HANDLING_INDEX};
    public static final Boolean DEFAULT_SKIP_WRITE = Boolean.TRUE;
    public static final Boolean DEFAULT_USE_SPARSE_RANGE_FILTER = Boolean.FALSE;

    /**
     * Creates the processor, resets every field to its default and registers
     * the configurable settings with the settings framework.
     *
     * @param str the name handed to the {@code Processor} super constructor
     */
    public DeDuplicator(String str) {
        super(str, "Aborts the processing of URIs (skips to post processing chain) if a duplicate is found in the specified index. Note that any changes made to this processors configuration at run time will be ignored unless otherwise stated.");

        // Field defaults; initialTasks() replaces them with the configured values.
        this.index = null;
        this.indexReader = null;
        this.lookupByURL = true;
        this.equivalent = DEFAULT_EQUIVALENT.booleanValue();
        this.mimefilter = DEFAULT_MIME_FILTER;
        this.blacklist = true;
        this.doTimestampAnalysis = false;
        this.doETagAnalysis = false;
        this.statsPerHost = DEFAULT_STATS_PER_HOST.booleanValue();
        this.changeContentSize = DEFAULT_CHANGE_CONTENT_SIZE.booleanValue();
        this.useOrigin = false;
        this.useOriginFromIndex = false;
        this.useSparseRangeFilter = DEFAULT_USE_SPARSE_RANGE_FILTER.booleanValue();
        this.stats = null;
        this.perHostStats = null;
        this.skipWriting = DEFAULT_SKIP_WRITE.booleanValue();

        // Settings are registered in the same order as before.
        SimpleType indexLocation = new SimpleType(ATTR_INDEX_LOCATION, "Location of index (full path). Can not be changed at run time.", DEFAULT_INDEX_LOCATION);
        indexLocation.setOverrideable(false);
        addElementToDefinition(indexLocation);

        SimpleType matchingMethod = new SimpleType(ATTR_MATCHING_METHOD, "Select if we should lookup by URL or by content digest (counts mirror matches).", DEFAULT_MATCHING_METHOD, AVAILABLE_MATCHING_METHODS);
        matchingMethod.setOverrideable(false);
        addElementToDefinition(matchingMethod);

        SimpleType tryEquivalent = new SimpleType(ATTR_EQUIVALENT, "If an exact match of URI and content digest is not found then an equivalent URI (i.e. one with any www[0-9]*, trailing slashes and parameters removed) can be checked. If an equivalent URI has an identical content digest then enabling this feature will cause the processor to consider this a duplicate. Equivalent matches are noted in the crawl log and their number is tracked seperately.", DEFAULT_EQUIVALENT);
        tryEquivalent.setOverrideable(false);
        addElementToDefinition(tryEquivalent);

        SimpleType mimeFilter = new SimpleType(ATTR_MIME_FILTER, "A regular expression that the mimetype of all documents will be compared against. \nIf the attribute filter-mode is set to 'Blacklist' then all the documents whose mimetype matches will be ignored by this processor. If the filter-mode is set to 'Whitelist' only those documents whose mimetype matches will be processed.", DEFAULT_MIME_FILTER);
        mimeFilter.setOverrideable(false);
        mimeFilter.setExpertSetting(true);
        addElementToDefinition(mimeFilter);

        SimpleType filterMode = new SimpleType(ATTR_FILTER_MODE, "Determines if the mime-filter acts as a blacklist (declares what should be ignored) or whitelist (declares what should be processed).", DEFAULT_FILTER_MODE, AVAILABLE_FILTER_MODES);
        filterMode.setOverrideable(false);
        filterMode.setExpertSetting(true);
        addElementToDefinition(filterMode);

        SimpleType analysisMode = new SimpleType(ATTR_ANALYSIS_MODE, "If enabled, the processor can analyse the timestamp (last-modified) and ETag info of the HTTP headers and compare their predictions as to whether or not the document had changed against the result of the index lookup. This is ONLY for the purpose of gathering statistics about the usefulness and accuracy of the HTTP header information in question and has no effect on the processing of documents. Analysis is only possible if the relevant data was included in the index.", DEFAULT_ANALYSIS_MODE, AVAILABLE_ANALYSIS_MODES);
        analysisMode.setOverrideable(false);
        analysisMode.setExpertSetting(true);
        addElementToDefinition(analysisMode);

        SimpleType logLevel = new SimpleType(ATTR_LOG_LEVEL, "Adjust the verbosity of the processor. By default, it only reports serious (Java runtime) errors. By setting the log level higher, various additional data can be logged. * Serious - Default logging level, only serious errors. Note that it is possible that a more permissive default logging level has been set via the heritrix.properties file. This setting (severe) will not affect that.\n* Info - Records some anomalies. Such as the information on URIs that the HTTP header info falsely predicts no-change on.\n* Finest - Full logging of all URIs processed. For debugging purposes only!", DEFAULT_LOG_LEVEL, AVAILABLE_LOG_LEVELS);
        logLevel.setOverrideable(false);
        logLevel.setExpertSetting(true);
        addElementToDefinition(logLevel);

        SimpleType hostStats = new SimpleType(ATTR_STATS_PER_HOST, "If enabled the processor will keep track of the number of processed uris, duplicates found etc. per host. The listing will be added to the processor report (not the host-report).", DEFAULT_STATS_PER_HOST);
        hostStats.setOverrideable(false);
        hostStats.setExpertSetting(true);
        addElementToDefinition(hostStats);

        SimpleType changeSize = new SimpleType(ATTR_CHANGE_CONTENT_SIZE, "If set to true then the processor will set the content size of the CrawlURI to zero when a duplicate is discovered. ", DEFAULT_CHANGE_CONTENT_SIZE);
        changeSize.setOverrideable(false);
        addElementToDefinition(changeSize);

        SimpleType originHandling = new SimpleType(ATTR_ORIGIN_HANDLING, "The origin of duplicate URLs can be handled a few different ways. It is important to note that the 'origin' information is malleable and may be anything from a ARC name and offset to a simple ID of a particular crawl. It is entirely at the operators discretion.\n No origin information - No origin information is associated with the URLs.\n Use processor setting - Duplicate URLs are all given the same origin, specified by the 'origin' setting of this processor.\n Use index information - The origin of each duplicate URL is read from the index. If the index does not contain any origin information for an URL, the processor setting is used as a fallback!", DEFAULT_ORIGIN_HANDLING, AVAILABLE_ORIGIN_HANDLING);
        originHandling.setOverrideable(false);
        addElementToDefinition(originHandling);

        // The origin setting keeps whatever overrideable flag SimpleType starts with.
        SimpleType origin = new SimpleType(ATTR_ORIGIN, "The origin of duplicate URLs.", DEFAULT_ORIGIN);
        addElementToDefinition(origin);

        SimpleType skipWrite = new SimpleType(ATTR_SKIP_WRITE, "If set to true, then processing of duplicate URIs will be skipped directly to the post processing chain. If false, processing of duplicates will skip directly to the writer chain that precedes the post processing chain.", DEFAULT_SKIP_WRITE);
        skipWrite.setOverrideable(true);
        addElementToDefinition(skipWrite);

        SimpleType sparseRangeFilter = new SimpleType(ATTR_USE_SPARSE_RANGE_FILTER, "If set to true, then Lucene queries use a custom 'sparse' range filter. This uses less memory at the cost of some lost performance. Suitable for very large indexes.", DEFAULT_USE_SPARSE_RANGE_FILTER);
        sparseRangeFilter.setOverrideable(false);
        sparseRangeFilter.setExpertSetting(true);
        addElementToDefinition(sparseRangeFilter);
    }

    /**
     * Opens the duplicate index and copies the configured settings into the
     * processor's fields.
     *
     * <p>A failure to open the index is logged at SEVERE and leaves
     * {@link #index} unset. The opened reader is now also stored in
     * {@link #indexReader}, which previously stayed {@code null}. All flags
     * are assigned outright, so running this method again cannot keep a stale
     * {@code true} from an earlier run.
     */
    protected void initialTasks() {
        try {
            FSDirectory directory = FSDirectory.open(new File((String) readAttribute(ATTR_INDEX_LOCATION, DEFAULT_INDEX_LOCATION)));
            // Halve the directory's read chunk size, as the original code did.
            directory.setReadChunkSize(directory.getReadChunkSize() / 2);
            this.indexReader = DirectoryReader.open(directory);
            this.index = new IndexSearcher(this.indexReader);
        } catch (Exception e) {
            logger.log(Level.SEVERE, "Unable to find/open index.", e);
        }

        this.lookupByURL = ((String) readAttribute(ATTR_MATCHING_METHOD, DEFAULT_MATCHING_METHOD)).equals(DEFAULT_MATCHING_METHOD);
        this.equivalent = ((Boolean) readAttribute(ATTR_EQUIVALENT, DEFAULT_EQUIVALENT)).booleanValue();
        this.mimefilter = (String) readAttribute(ATTR_MIME_FILTER, DEFAULT_MIME_FILTER);
        this.blacklist = ((String) readAttribute(ATTR_FILTER_MODE, DEFAULT_FILTER_MODE)).equals(DEFAULT_FILTER_MODE);

        // "Timestamp and ETag" implies timestamp analysis as well.
        String analysisMode = (String) readAttribute(ATTR_ANALYSIS_MODE, DEFAULT_ANALYSIS_MODE);
        this.doETagAnalysis = analysisMode.equals(AVAILABLE_ANALYSIS_MODES[2]);
        this.doTimestampAnalysis = this.doETagAnalysis || analysisMode.equals(AVAILABLE_ANALYSIS_MODES[1]);

        // Only INFO and FINEST change the logger; any other value leaves its level alone.
        String logLevel = (String) readAttribute(ATTR_LOG_LEVEL, DEFAULT_LOG_LEVEL);
        if (logLevel.equals(Level.FINEST.toString())) {
            logger.setLevel(Level.FINEST);
        } else if (logLevel.equals(Level.INFO.toString())) {
            logger.setLevel(Level.INFO);
        }

        this.statsPerHost = ((Boolean) readAttribute(ATTR_STATS_PER_HOST, DEFAULT_STATS_PER_HOST)).booleanValue();
        this.changeContentSize = ((Boolean) readAttribute(ATTR_CHANGE_CONTENT_SIZE, DEFAULT_CHANGE_CONTENT_SIZE)).booleanValue();

        String originHandling = (String) readAttribute(ATTR_ORIGIN_HANDLING, DEFAULT_ORIGIN_HANDLING);
        this.useOrigin = !originHandling.equals(ORIGIN_HANDLING_NONE);
        this.useOriginFromIndex = originHandling.equals(ORIGIN_HANDLING_INDEX);

        this.useSparseRangeFilter = ((Boolean) readAttribute(ATTR_USE_SPARSE_RANGE_FILTER, DEFAULT_USE_SPARSE_RANGE_FILTER)).booleanValue();

        this.stats = new Statistics();
        if (this.statsPerHost) {
            this.perHostStats = new HashMap<>();
        }
    }

    /**
     * Reads a setting, falling back to a default when the lookup fails.
     *
     * @param str name of the attribute to read
     * @param obj value to return when reading the attribute throws
     * @return the attribute's value, or {@code obj} after a logged failure
     */
    protected Object readAttribute(String str, Object obj) {
        Object value;
        try {
            value = getAttribute(str);
        } catch (Exception failure) {
            logger.log(Level.SEVERE, "Unable read " + str + " attribute", failure);
            value = obj;
        }
        return value;
    }

    /**
     * Looks the URI up in the duplicate index and, when a duplicate is found,
     * annotates the URI and diverts it past the remaining processing.
     *
     * <p>URIs that did not succeed, are prerequisites, are not http(s), have
     * no content type, are excluded by the mime filter or already carry an
     * "ar-state" of 0 are left untouched.
     *
     * @param crawlURI the URI being processed
     * @throws InterruptedException declared by the overridden method
     */
    protected void innerProcess(CrawlURI crawlURI) throws InterruptedException {
        if (!isDedupCandidate(crawlURI)) {
            return;
        }
        logger.finest("Processing " + crawlURI.toString() + "(" + crawlURI.getContentType() + ")");
        this.stats.handledNumber++;
        this.stats.totalAmount += crawlURI.getContentSize();

        // Stays null unless per-host statistics are enabled; callees guard on statsPerHost.
        Statistics hostStats = null;
        if (this.statsPerHost) {
            hostStats = perHostStatisticsFor(crawlURI);
            hostStats.handledNumber++;
            hostStats.totalAmount += crawlURI.getContentSize();
        }

        Document duplicate = this.lookupByURL ? lookupByURL(crawlURI, hostStats) : lookupByDigest(crawlURI, hostStats);
        if (duplicate != null) {
            markAsDuplicate(crawlURI, duplicate, hostStats);
        }
        if (this.doTimestampAnalysis) {
            doAnalysis(crawlURI, hostStats, duplicate != null);
        }
    }

    /** Decides whether the URI should be looked up at all, logging the reason when it is skipped. */
    private boolean isDedupCandidate(CrawlURI crawlURI) {
        if (!crawlURI.isSuccess()) {
            logger.finest("Not handling " + crawlURI.toString() + ", did not succeed.");
            return false;
        }
        if (crawlURI.isPrerequisite()) {
            logger.finest("Not handling " + crawlURI.toString() + ", prerequisite.");
            return false;
        }
        // Success and prerequisite were already checked, so only the scheme test remains.
        if (!crawlURI.toString().startsWith("http")) {
            logger.finest("Not handling " + crawlURI.toString() + ", non-http.");
            return false;
        }
        if (crawlURI.getContentType() == null) {
            logger.finest("Not handling " + crawlURI.toString() + ", missing content (mime) type");
            return false;
        }
        // In blacklist mode a match excludes; in whitelist mode a non-match excludes.
        if (crawlURI.getContentType().matches(this.mimefilter) == this.blacklist) {
            logger.finest("Not handling " + crawlURI.toString() + ", excluded by mimefilter (" + crawlURI.getContentType() + ").");
            return false;
        }
        // NOTE(review): "ar-state" == 0 presumably means "content unchanged" in the adaptive-revisit constants - confirm.
        if (crawlURI.containsKey("ar-state") && crawlURI.getInt("ar-state") == 0) {
            logger.finest("Not handling " + crawlURI.toString() + ", already flagged as unchanged.");
            return false;
        }
        return true;
    }

    /** Returns the statistics object for the URI's host, creating it on first use. */
    private Statistics perHostStatisticsFor(CrawlURI crawlURI) {
        synchronized (this.perHostStats) {
            String hostName = getController().getServerCache().getHostFor(crawlURI).getHostName();
            Statistics hostStats = this.perHostStats.get(hostName);
            if (hostStats == null) {
                hostStats = new Statistics();
                this.perHostStats.put(hostName, hostStats);
            }
            return hostStats;
        }
    }

    /** Updates duplicate counters, diverts the URI to a later chain and annotates it as a duplicate. */
    private void markAsDuplicate(CrawlURI crawlURI, Document duplicate, Statistics hostStats) {
        // Counted before the content size is possibly reset below.
        this.stats.duplicateAmount += crawlURI.getContentSize();
        this.stats.duplicateNumber++;
        if (this.statsPerHost) {
            hostStats.duplicateAmount += crawlURI.getContentSize();
            hostStats.duplicateNumber++;
        }

        // skip-writing is overrideable, hence read per URI rather than cached.
        if (((Boolean) readAttribute(ATTR_SKIP_WRITE, DEFAULT_SKIP_WRITE)).booleanValue()) {
            crawlURI.skipToProcessorChain(getController().getPostprocessorChain());
        } else {
            crawlURI.skipToProcessorChain(getController().getProcessorChainList().getProcessorChain("write-processors"));
        }

        String annotation = "duplicate";
        if (this.useOrigin) {
            String indexOrigin = this.useOriginFromIndex ? duplicate.get(ATTR_ORIGIN) : null;
            if (indexOrigin != null) {
                annotation = annotation + ":\"" + indexOrigin + "\"";
            } else {
                // Processor-level origin, also the fallback when the index has none.
                String configuredOrigin = (String) getUncheckedAttribute(crawlURI, ATTR_ORIGIN);
                if (configuredOrigin != null && configuredOrigin.trim().length() > 0) {
                    annotation = annotation + ":\"" + configuredOrigin + "\"";
                }
            }
        }
        crawlURI.addAnnotation(annotation);

        if (this.changeContentSize) {
            crawlURI.setContentSize(0L);
        }
        crawlURI.putInt("ar-state", 0);
    }

    /**
     * Searches the index for a record with the URI's URL (or, when enabled,
     * its normalized URL) whose digest equals the URI's content digest.
     *
     * <p>Digests are compared case-insensitively for both the exact and the
     * equivalent match; the equivalent match used to be case-sensitive.
     * Indexed documents without a digest are skipped instead of causing a
     * {@code NullPointerException}.
     *
     * @param crawlURI   the URI to look up
     * @param statistics per-host statistics; only touched when per-host stats are enabled
     * @return the matching index document, or {@code null} when there is no
     *         match, the URI has no digest, or the index cannot be read
     */
    protected Document lookupByURL(CrawlURI crawlURI, Statistics statistics) {
        String currentDigest = getDigestAsString(crawlURI);
        if (currentDigest == null) {
            // Without a digest nothing in the index can match, so skip the searches.
            logger.finest("No content digest for " + crawlURI.toString() + ", no lookup done.");
            return null;
        }
        String url = crawlURI.toString();
        try {
            AllDocsCollector collector = new AllDocsCollector();
            this.index.search(queryField("url", url), collector);
            Document exact = firstHitWithDigest(collector.getHits(), currentDigest);
            if (exact != null) {
                this.stats.exactURLDuplicates++;
                if (this.statsPerHost) {
                    statistics.exactURLDuplicates++;
                }
                logger.finest("Found exact match for " + url);
                return exact;
            }
            if (!this.equivalent) {
                return null;
            }

            String normalizedUrl = DigestIndexer.stripURL(url);
            collector.reset();
            this.index.search(queryField("url-normalized", normalizedUrl), collector);
            Document similar = firstHitWithDigest(collector.getHits(), currentDigest);
            if (similar == null) {
                return null;
            }
            String equivalentUrl = similar.get("url");
            crawlURI.addAnnotation("equivalent to " + equivalentUrl);
            this.stats.equivalentURLDuplicates++;
            if (this.statsPerHost) {
                statistics.equivalentURLDuplicates++;
            }
            logger.finest("Found equivalent match for " + url + ". Normalized: " + normalizedUrl + ". Equivalent to: " + equivalentUrl);
            return similar;
        } catch (IOException e) {
            logger.log(Level.SEVERE, "Error accessing index.", e);
            return null;
        }
    }

    /** Returns the first hit whose stored digest equals {@code digest} ignoring case, or {@code null}. */
    private Document firstHitWithDigest(List<?> hits, String digest) throws IOException {
        if (hits == null) {
            return null;
        }
        for (Object hit : hits) {
            Document candidate = this.index.doc(((ScoreDoc) hit).doc);
            // Called on the non-null argument so a missing "digest" field is simply no match.
            if (digest.equalsIgnoreCase(candidate.get("digest"))) {
                return candidate;
            }
        }
        return null;
    }

    /**
     * Searches the index for records with the URI's content digest and picks
     * the first one whose URL (or, when enabled, normalized URL) matches.
     *
     * <p>When records with the digest exist but none matches the URL, the hit
     * is counted as a mirror and {@code null} is returned.
     *
     * @param crawlURI   the URI to look up
     * @param statistics per-host statistics; only touched when per-host stats are enabled
     * @return the matching index document, or {@code null} when there is no
     *         URL match, the URI has no digest, or the index cannot be read
     */
    protected Document lookupByDigest(CrawlURI crawlURI, Statistics statistics) {
        String digest = getDigestAsString(crawlURI);
        if (digest == null) {
            logger.warning("Digest received from CrawlURI is null. Null Document returned");
            return null;
        }
        Query digestQuery = queryField("digest", digest);
        String url = crawlURI.toString();
        Document match = null;
        try {
            AllDocsCollector collector = new AllDocsCollector();
            this.index.search(digestQuery, collector);
            List<?> hits = collector.getHits();
            if (hits == null || hits.isEmpty()) {
                return null;
            }

            // Computed at most once, and only if an equivalence check is actually reached.
            String normalizedUrl = null;
            StringBuilder mirrors = new StringBuilder("mirrors: ");
            for (Object hit : hits) {
                Document candidate = this.index.doc(((ScoreDoc) hit).doc);
                String indexedUrl = candidate.get("url");

                if (url.equals(indexedUrl)) {
                    match = candidate;
                    this.stats.exactURLDuplicates++;
                    if (this.statsPerHost) {
                        statistics.exactURLDuplicates++;
                    }
                    logger.finest("Found exact match for " + url);
                    break;
                }

                if (this.equivalent) {
                    if (normalizedUrl == null) {
                        normalizedUrl = DigestIndexer.stripURL(url);
                    }
                    if (normalizedUrl.equals(candidate.get("url-normalized"))) {
                        match = candidate;
                        this.stats.equivalentURLDuplicates++;
                        if (this.statsPerHost) {
                            statistics.equivalentURLDuplicates++;
                        }
                        crawlURI.addAnnotation("equivalent to " + indexedUrl);
                        logger.finest("Found equivalent match for " + url + ". Normalized: " + normalizedUrl + ". Equivalent to: " + indexedUrl);
                        break;
                    }
                }

                // Same digest under a different URL: remember it for the mirror log line.
                mirrors.append(indexedUrl).append(" ");
            }

            if (match == null) {
                this.stats.mirrorNumber++;
                if (this.statsPerHost) {
                    statistics.mirrorNumber++;
                }
                logger.log(Level.FINEST, "Found mirror URLs for " + url + ". " + mirrors);
            }
        } catch (IOException e) {
            logger.log(Level.SEVERE, "Error accessing index.", e);
        }
        return match;
    }

    /**
     * Builds the textual processor report from the collected statistics.
     *
     * <p>The per-host table header now always ends with a line break.
     * Previously the break was only written when timestamp analysis was
     * enabled, so without it the first host row was appended to the header
     * line.
     *
     * @return the multi-line report, terminated by an empty line
     */
    public String report() {
        StringBuilder out = new StringBuilder();
        out.append("Processor: is.hi.bok.digest.DeDuplicator\n");
        out.append("  Function:          Abort processing of duplicate records\n");
        out.append("                     - Lookup by " + (this.lookupByURL ? "url" : "digest") + " in use\n");
        out.append("  Total handled:     " + this.stats.handledNumber + "\n");
        out.append("  Duplicates found:  " + this.stats.duplicateNumber + " " + getPercentage(this.stats.duplicateNumber, this.stats.handledNumber) + "\n");
        out.append("  Bytes total:       " + this.stats.totalAmount + " (" + ArchiveUtils.formatBytesForDisplay(this.stats.totalAmount) + ")\n");
        out.append("  Bytes discarded:   " + this.stats.duplicateAmount + " (" + ArchiveUtils.formatBytesForDisplay(this.stats.duplicateAmount) + ") " + getPercentage(this.stats.duplicateAmount, this.stats.totalAmount) + "\n");
        // "New" is everything handled that produced no exact, equivalent or mirror hit.
        out.append("  New (no hits):     " + (this.stats.handledNumber - ((this.stats.mirrorNumber + this.stats.exactURLDuplicates) + this.stats.equivalentURLDuplicates)) + "\n");
        out.append("  Exact hits:        " + this.stats.exactURLDuplicates + "\n");
        out.append("  Equivalent hits:   " + this.stats.equivalentURLDuplicates + "\n");
        if (!this.lookupByURL) {
            out.append("  Mirror hits:       " + this.stats.mirrorNumber + "\n");
        }
        if (this.doTimestampAnalysis) {
            out.append("  Timestamp predicts: (Where exact URL existed in the index)\n");
            out.append("  Change correctly:  " + this.stats.timestampChangeCorrect + "\n");
            out.append("  Change falsly:     " + this.stats.timestampChangeFalse + "\n");
            out.append("  Non-change correct:" + this.stats.timestampNoChangeCorrect + "\n");
            out.append("  Non-change falsly: " + this.stats.timestampNoChangeFalse + "\n");
            out.append("  Missing timpestamp:" + this.stats.timestampMissing + "\n");
        }

        if (this.statsPerHost) {
            out.append("  [Host] [total] [duplicates] [bytes] [bytes discarded] [new] [exact] [equiv]");
            if (!this.lookupByURL) {
                out.append(" [mirror]");
            }
            if (this.doTimestampAnalysis) {
                out.append(" [change correct] [change falsly]");
                out.append(" [non-change correct] [non-change falsly]");
                out.append(" [no timestamp]");
            }
            // Terminate the header whichever optional columns were written.
            out.append("\n");

            synchronized (this.perHostStats) {
                for (String host : this.perHostStats.keySet()) {
                    Statistics hostStats = this.perHostStats.get(host);
                    out.append("  " + host);
                    out.append(" ").append(hostStats.handledNumber);
                    out.append(" ").append(hostStats.duplicateNumber);
                    out.append(" ").append(hostStats.totalAmount);
                    out.append(" ").append(hostStats.duplicateAmount);
                    out.append(" ").append(hostStats.handledNumber - ((hostStats.mirrorNumber + hostStats.exactURLDuplicates) + hostStats.equivalentURLDuplicates));
                    out.append(" ").append(hostStats.exactURLDuplicates);
                    out.append(" ").append(hostStats.equivalentURLDuplicates);
                    if (!this.lookupByURL) {
                        out.append(" ").append(hostStats.mirrorNumber);
                    }
                    if (this.doTimestampAnalysis) {
                        out.append(" ").append(hostStats.timestampChangeCorrect);
                        out.append(" ").append(hostStats.timestampChangeFalse);
                        out.append(" ").append(hostStats.timestampNoChangeCorrect);
                        out.append(" ").append(hostStats.timestampNoChangeFalse);
                        out.append(" ").append(hostStats.timestampMissing);
                    }
                    out.append("\n");
                }
            }
        }
        out.append("\n");
        return out.toString();
    }

    /**
     * Formats {@code d / d2} as a percentage, truncated (not rounded) to at
     * most two decimals, e.g. {@code "33.33%"} or {@code "25.0%"}.
     *
     * <p>An undefined ratio (division by zero) is reported as {@code "0.0%"};
     * it used to come out as the clipped text {@code "Na%"} or {@code "In%"}.
     * Values that {@code Double.toString} prints in scientific notation are
     * expanded to plain decimals before truncating, so a tiny ratio is no
     * longer shown as its mantissa.
     *
     * @param d  the part
     * @param d2 the whole
     * @return the percentage followed by {@code %}
     */
    protected static String getPercentage(double d, double d2) {
        double percentage = (d / d2) * 100.0d;
        if (Double.isNaN(percentage) || Double.isInfinite(percentage)) {
            return "0.0%";
        }
        // valueOf goes through Double.toString, so the digits match the previous output;
        // toPlainString only removes the exponent form.
        String text = java.math.BigDecimal.valueOf(percentage).toPlainString();
        int dot = text.indexOf('.');
        if (dot >= 0 && dot + 3 < text.length()) {
            text = text.substring(0, dot + 3);
        }
        return text + "%";
    }

    /**
     * Returns the URI's content digest as a Base32 string.
     *
     * @param crawlURI the URI whose digest is wanted
     * @return the encoded digest, or {@code null} when the URI has none
     */
    private static String getDigestAsString(CrawlURI crawlURI) {
        Object rawDigest = crawlURI.getContentDigest();
        return rawDigest == null ? null : Base32.encode((byte[]) rawDigest);
    }

    /**
     * Finds the most recent index record for the URI's exact URL and runs the
     * timestamp analysis against it.
     *
     * <p>Records without a "date" field are skipped; they used to cause a
     * {@code NullPointerException}. The newest record is chosen because
     * {@link #doTimestampAnalysis} reports its date as "Last fetched"; the
     * previous comparison selected the oldest one.
     *
     * @param crawlURI   the URI being analysed
     * @param statistics per-host statistics; only touched when per-host stats are enabled
     * @param z          whether the URI was found to be a duplicate
     */
    protected void doAnalysis(CrawlURI crawlURI, Statistics statistics, boolean z) {
        try {
            AllDocsCollector collector = new AllDocsCollector();
            this.index.search(queryField("url", crawlURI.toString()), collector);
            List<?> hits = collector.getHits();
            if (hits == null || hits.isEmpty()) {
                return;
            }

            Document latest = null;
            String latestDate = null;
            for (Object hit : hits) {
                Document candidate = this.index.doc(((ScoreDoc) hit).doc);
                String date = candidate.get("date");
                if (date == null) {
                    continue;
                }
                // Dates are parsed elsewhere as fixed-width "yyyyMMddHHmmssSSS",
                // so string order is chronological order.
                if (latestDate == null || date.compareTo(latestDate) > 0) {
                    latest = candidate;
                    latestDate = date;
                }
            }
            if (latest == null) {
                logger.finest("No indexed date for " + crawlURI.toString() + ", timestamp analysis skipped.");
                return;
            }

            doTimestampAnalysis(crawlURI, latest, statistics, z);
            // ETag analysis (doETagAnalysis) has no implementation in this class.
        } catch (IOException e) {
            logger.log(Level.SEVERE, "Error accessing index.", e);
        }
    }

    /**
     * Compares the response's last-modified header with the date stored in
     * the index and records whether the header correctly predicted a change.
     *
     * <p>A last-modified later than the indexed date counts as "change
     * predicted"; {@code z} says whether the content was actually a
     * duplicate. A missing header is now logged whether or not per-host
     * statistics are enabled, and a missing indexed date is logged instead of
     * throwing.
     *
     * @param crawlURI   the URI being analysed
     * @param document   the index record to compare against
     * @param statistics per-host statistics; only touched when per-host stats are enabled
     * @param z          whether the URI was found to be a duplicate
     */
    protected void doTimestampAnalysis(CrawlURI crawlURI, Document document, Statistics statistics, boolean z) {
        HttpMethod httpMethod = (HttpMethod) crawlURI.getObject("http-transaction");
        if (httpMethod.getResponseHeader("last-modified") == null) {
            this.stats.timestampMissing++;
            if (this.statsPerHost) {
                statistics.timestampMissing++;
            }
            logger.finest("Missing timestamp on " + crawlURI.toString());
            return;
        }

        Date lastModified;
        try {
            lastModified = new SimpleDateFormat("EEE, dd MMM yyyy HH:mm:ss z", Locale.ENGLISH).parse(httpMethod.getResponseHeader("last-modified").getValue());
        } catch (ParseException e) {
            logger.log(Level.INFO, "Exception parsing last modified of " + crawlURI.toString(), e);
            return;
        }

        String indexedDate = document.get("date");
        if (indexedDate == null) {
            logger.log(Level.WARNING, "No indexed date for " + document.get("url"));
            return;
        }
        Date lastFetched;
        try {
            // NOTE(review): parsed in the JVM default time zone; confirm which zone the indexer writes.
            lastFetched = new SimpleDateFormat("yyyyMMddHHmmssSSS").parse(indexedDate);
        } catch (ParseException e) {
            logger.log(Level.WARNING, "Exception parsing indexed date for " + document.get("url"), e);
            return;
        }

        if (lastModified.after(lastFetched)) {
            // Header predicts a change.
            if (z) {
                this.stats.timestampChangeFalse++;
                if (this.statsPerHost) {
                    statistics.timestampChangeFalse++;
                }
                logger.finest("Last-modified falsly predicts change on " + crawlURI.toString());
            } else {
                this.stats.timestampChangeCorrect++;
                if (this.statsPerHost) {
                    statistics.timestampChangeCorrect++;
                }
                logger.finest("Last-modified correctly predicts change on " + crawlURI.toString());
            }
        } else if (z) {
            // Header predicts no change and the content is indeed a duplicate.
            this.stats.timestampNoChangeCorrect++;
            if (this.statsPerHost) {
                statistics.timestampNoChangeCorrect++;
            }
            logger.finest("Last-modified correctly predicts no-change on " + crawlURI.toString());
        } else {
            // Header predicts no change, yet the content differs.
            logger.log(Level.INFO, "Last-modified incorrectly indicated no-change on " + crawlURI.toString() + " " + crawlURI.getContentType() + ". last-modified: " + lastModified + ". Last fetched: " + lastFetched);
            this.stats.timestampNoChangeFalse++;
            if (this.statsPerHost) {
                statistics.timestampNoChangeFalse++;
            }
        }
    }

    /**
     * Builds a constant-score query matching documents whose field holds
     * exactly the given value, expressed as an inclusive range with identical
     * bounds.
     *
     * <p>The value is encoded as UTF-8 through {@code BytesRef(CharSequence)}
     * rather than {@code String.getBytes()}, whose result depends on the
     * platform default charset.
     *
     * @param str  name of the index field
     * @param str2 value the field must equal
     * @return the query
     */
    protected Query queryField(String str, String str2) {
        BytesRef term = new BytesRef(str2);
        return new ConstantScoreQuery(new TermRangeFilter(str, term, term, true, true));
    }

    /**
     * Releases the index resources opened in {@code initialTasks()}.
     *
     * <p>Closes the reader behind {@link #index} and, when it is a
     * {@code DirectoryReader}, the directory it was opened on. A failure to
     * close is logged and otherwise ignored. Does nothing when no index was
     * opened.
     */
    protected void finalTasks() {
        IndexReader reader = this.indexReader;
        if (reader == null && this.index != null) {
            // The field may never have been set; the searcher still knows its reader.
            reader = this.index.getIndexReader();
        }
        if (reader == null) {
            return;
        }
        try {
            reader.close();
            if (reader instanceof DirectoryReader) {
                // Closing a reader does not close the directory it was opened on.
                ((DirectoryReader) reader).directory().close();
            }
        } catch (IOException e) {
            logger.log(Level.WARNING, "Error closing index.", e);
        }
    }
}
