package dk.netarkivet.harvester.harvesting.report;

import dk.netarkivet.common.exceptions.ArgumentNotValid;
import dk.netarkivet.common.exceptions.IOFailure;
import dk.netarkivet.common.utils.DomainUtils;
import dk.netarkivet.common.utils.FileUtils;
import dk.netarkivet.common.utils.Settings;
import dk.netarkivet.common.utils.StringUtils;
import dk.netarkivet.harvester.HarvesterSettings;
import dk.netarkivet.harvester.datamodel.StopReason;
import dk.netarkivet.harvester.harvesting.HeritrixFiles;
import dk.netarkivet.harvester.harvesting.distribute.DomainStats;
import gnu.inet.encoding.IDNA;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.net.URISyntaxException;
import java.util.HashMap;
import java.util.Map;
import org.jwat.common.Uri;
import org.jwat.common.UriProfile;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:dk/netarkivet/harvester/harvesting/report/HarvestReportGenerator.class */
/**
 * Builds per-domain harvest statistics by reading a Heritrix crawl.log line by line.
 *
 * <p>For every usable line the generator determines a domain name (from the seed URL in
 * field 11 when that is enabled and available, otherwise from the object URL in field 4),
 * and updates the {@link DomainStats} entry for that domain with object count, byte count
 * and stop reason. The default stop reason is derived from the last line of the progress
 * statistics log.
 *
 * <p>NOTE(review): the instance keeps its statistics in a plain {@link HashMap} and exposes
 * it directly via {@link #getDomainStatsMap()}; nothing in this class synchronizes access.
 */
public class HarvestReportGenerator {
    private static final Logger log = LoggerFactory.getLogger(HarvestReportGenerator.class);

    /** Prefix of the crawl.log annotation that carries the size of the fetched content. */
    private static final String CONTENT_SIZE_ANNOTATION_PREFIX = "content-size:";
    /** Annotation that maps a quota-blocked line to {@link StopReason#SIZE_LIMIT}. */
    private static final String SIZE_LIMIT_ANNOTATION = "Q:group-max-all-kb";
    /** Annotation that maps a quota-blocked line to {@link StopReason#OBJECT_LIMIT}. */
    private static final String OBJECT_LIMIT_ANNOTATION = "Q:group-max-fetch-successes";
    /**
     * Response code for which the quota annotations are inspected.
     * NOTE(review): presumably Heritrix's "blocked by quota" fetch status - confirm.
     */
    private static final long QUOTA_RESPONSE_CODE = -5003L;
    /** Minimum number of whitespace-separated fields a crawl.log line must have. */
    private static final int MIN_CRAWL_LOG_FIELDS = 11;
    /** Split limit: fields 1-11 plus everything else (the annotations) as field 12. */
    private static final int MAX_CRAWL_LOG_FIELDS = 12;

    /** Statistics collected so far, keyed by domain name. */
    private final Map<String, DomainStats> domainstats = new HashMap<>();
    /** Harvest files this generator was created for; null when the no-arg constructor is used. */
    private HeritrixFiles heritrixFiles;
    /** Stop reason assigned to new domains; null when the no-arg constructor is used. */
    private StopReason defaultStopReason;

    /* loaded from: input_file:dk/netarkivet/harvester/harvesting/report/HarvestReportGenerator$ProgressStatisticsConstants.class */
    /**
     * Text fragments that {@link #findDefaultStopReason(File)} searches for in the last line
     * of the progress statistics log.
     */
    public enum ProgressStatisticsConstants {
        ORDERLY_FINISH("CRAWL ENDED"),
        TIMELIMIT_EXCEEDED("Timelimit hit"),
        HARVEST_ABORTED("Ended by operator");

        /** The text searched for in the log line. */
        private final String pattern;

        ProgressStatisticsConstants(String str) {
            this.pattern = str;
        }
    }

    /**
     * Creates a generator without any harvest files. No log is read, and both the
     * default stop reason and the harvest files reference remain null.
     */
    public HarvestReportGenerator() {
    }

    /**
     * Creates a generator and immediately processes the crawl.log of the given harvest.
     *
     * @param heritrixFiles the files of the harvest; must not be null
     * @throws ArgumentNotValid if {@code heritrixFiles} is null
     * @throws IOFailure if the crawl.log is missing, unreadable or cannot be read
     */
    public HarvestReportGenerator(HeritrixFiles heritrixFiles) {
        ArgumentNotValid.checkNotNull(heritrixFiles, "files");
        this.heritrixFiles = heritrixFiles;
        this.defaultStopReason = findDefaultStopReason(this.heritrixFiles.getProgressStatisticsLog());
        // NOTE(review): preProcess is public and overridable but is invoked from the
        // constructor; a subclass override would run before the subclass is initialized.
        preProcess(this.heritrixFiles);
    }

    /**
     * Reads the crawl.log of the given harvest and accumulates per-domain statistics.
     *
     * @param heritrixFiles the files of the harvest whose crawl.log is parsed
     * @throws IOFailure if the crawl.log is not a regular, readable file or reading it fails
     */
    public void preProcess(HeritrixFiles heritrixFiles) {
        if (log.isInfoEnabled()) {
            log.info("Starting pre-processing of harvest report for job {}", heritrixFiles.getJobID());
        }
        long startTime = System.currentTimeMillis();
        File crawlLog = heritrixFiles.getCrawlLog();
        if (!crawlLog.isFile() || !crawlLog.canRead()) {
            throw new IOFailure("Not a file or not readable: " + crawlLog.getAbsolutePath());
        }
        parseCrawlLog(heritrixFiles.getCrawlLog());
        if (log.isInfoEnabled()) {
            log.info("Finished pre-processing of harvest report for job {}, operation took {}", heritrixFiles.getJobID(), StringUtils.formatDuration(System.currentTimeMillis() - startTime));
        }
    }

    /**
     * Returns the statistics entry for a domain, creating and registering an empty one
     * (zero objects, zero bytes, the default stop reason) if none exists yet.
     *
     * @param str the domain name used as key
     * @return the existing or newly created statistics entry, never null
     */
    protected DomainStats getOrCreateDomainStats(String str) {
        DomainStats stats = this.domainstats.get(str);
        if (stats == null) {
            stats = new DomainStats(0L, 0L, this.defaultStopReason);
            this.domainstats.put(str, stats);
        }
        return stats;
    }

    /**
     * Derives the default stop reason from the last line of a progress statistics log.
     *
     * <p>The result is {@link StopReason#DOWNLOAD_UNFINISHED} when the file does not exist,
     * when the last line lacks the orderly-finish text, or when it contains the
     * aborted-by-operator text. Otherwise it is {@link StopReason#TIME_LIMIT} if the line
     * contains the time-limit text and {@link StopReason#DOWNLOAD_COMPLETE} if not.
     *
     * @param file the progress statistics log; must not be null
     * @return the stop reason deduced from the log
     * @throws ArgumentNotValid if {@code file} is null
     */
    public static StopReason findDefaultStopReason(File file) {
        ArgumentNotValid.checkNotNull(file, "File logFile");
        if (!file.exists()) {
            return StopReason.DOWNLOAD_UNFINISHED;
        }
        String lastLine = FileUtils.readLastLine(file);
        boolean endedOrderly = lastLine.contains(ProgressStatisticsConstants.ORDERLY_FINISH.pattern);
        boolean abortedByOperator = lastLine.contains(ProgressStatisticsConstants.HARVEST_ABORTED.pattern);
        if (!endedOrderly || abortedByOperator) {
            return StopReason.DOWNLOAD_UNFINISHED;
        }
        if (lastLine.contains(ProgressStatisticsConstants.TIMELIMIT_EXCEEDED.pattern)) {
            return StopReason.TIME_LIMIT;
        }
        return StopReason.DOWNLOAD_COMPLETE;
    }

    /**
     * Reads the crawl.log line by line and feeds every line to
     * {@link #processHarvestLine(String, boolean)}. Lines rejected with
     * {@link ArgumentNotValid} are logged at debug level and skipped.
     *
     * @param file the crawl.log to read
     * @throws IOFailure if the file cannot be opened or read
     */
    private void parseCrawlLog(File file) throws IOFailure {
        boolean disregardSeedUrlInfo = Settings.getBoolean(HarvesterSettings.DISREGARD_SEEDURL_INFORMATION_IN_CRAWLLOG);
        BufferedReader reader = null;
        try {
            // NOTE(review): FileReader decodes with the platform default charset.
            reader = new BufferedReader(new FileReader(file));
            int lineNumber = 0;
            String line;
            while ((line = reader.readLine()) != null) {
                lineNumber++;
                try {
                    processHarvestLine(line, disregardSeedUrlInfo);
                } catch (ArgumentNotValid e) {
                    log.debug("Invalid line in '{}' line {}: '{}'. Ignoring due to reason: {}", file.getAbsolutePath(), lineNumber, line, e.getMessage(), e);
                }
            }
        } catch (IOException e) {
            String message = "Unable to open/read crawl.log file '" + file.getAbsolutePath() + "'.";
            log.warn(message, e);
            throw new IOFailure(message, e);
        } finally {
            // Closing is deliberately best-effort: a failure to close must not turn an
            // otherwise successful parse into an IOFailure, so try-with-resources is not used.
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException e) {
                    log.debug("Unable to close {}", file, e);
                }
            }
        }
    }

    /**
     * Parses a single crawl.log line and updates the statistics of the domain it belongs to.
     *
     * <p>The domain is taken from the seed URL (field 11) unless that field is "-", seed
     * information is disregarded, or no domain can be extracted from it; in those cases the
     * object URL (field 4) is used. Object and byte counts are only increased for
     * non-negative response codes. A domain's stop reason is only replaced while it still
     * equals the default stop reason.
     *
     * @param line the raw crawl.log line
     * @param disregardSeedUrlInfo true to ignore the seed URL field and always use the object URL
     * @throws ArgumentNotValid if the line has fewer than 11 fields, no domain can be
     *         determined, or the response code or content-size annotation is not a number
     */
    private void processHarvestLine(String line, boolean disregardSeedUrlInfo) {
        String[] fields = line.split("\\s+", MAX_CRAWL_LOG_FIELDS);
        if (fields.length < MIN_CRAWL_LOG_FIELDS) {
            throw new ArgumentNotValid("Not enough fields for line in crawl.log: '" + line + "'. Was only " + fields.length + " fields. Should have been at least 11");
        }

        String seedUrl = fields[10];
        boolean useSeedUrl = !disregardSeedUrlInfo && !seedUrl.equals("-");
        String seedDomain = null;
        if (useSeedUrl) {
            seedDomain = extractUnicodeDomain(seedUrl, "Unable to extract a domain from the seedURL found in field 11 of crawl.log: '{}'.");
        }
        String objectDomain = extractUnicodeDomain(fields[3], "Unable to extract a domain from the object URL found in field 4 of crawl.log: '{}'.");

        // seedDomain can only be non-null when the seed URL is in use, so it takes precedence.
        String domainName = seedDomain != null ? seedDomain : objectDomain;
        if (domainName == null) {
            throw new ArgumentNotValid("Unable to find a domainName in the line: '" + line + "'.");
        }

        long responseCode;
        try {
            responseCode = Long.parseLong(fields[1]);
        } catch (NumberFormatException e) {
            throw new ArgumentNotValid("Unparsable response code in field 2 of crawl.log: '" + fields[1] + "'.", e);
        }

        StopReason lineStopReason = getDefaultStopReason();
        long byteCount = 0;
        if (fields.length > MIN_CRAWL_LOG_FIELDS) {
            String annotations = fields[11];
            for (String rawAnnotation : annotations.split(",")) {
                // Trim once and use the trimmed text for both the prefix test and the number,
                // so surrounding whitespace cannot shift the substring or break parsing.
                String annotation = rawAnnotation.trim();
                if (annotation.startsWith(CONTENT_SIZE_ANNOTATION_PREFIX)) {
                    String sizeText = annotation.substring(CONTENT_SIZE_ANNOTATION_PREFIX.length()).trim();
                    try {
                        byteCount = Long.parseLong(sizeText);
                    } catch (NumberFormatException e) {
                        throw new ArgumentNotValid("Unparsable annotation in field 12 of crawl.log: '" + annotations + "'.", e);
                    }
                }
                if (responseCode == QUOTA_RESPONSE_CODE) {
                    if (annotation.equals(SIZE_LIMIT_ANNOTATION)) {
                        lineStopReason = StopReason.SIZE_LIMIT;
                    } else if (annotation.equals(OBJECT_LIMIT_ANNOTATION)) {
                        lineStopReason = StopReason.OBJECT_LIMIT;
                    }
                }
            }
        }

        DomainStats stats = getOrCreateDomainStats(domainName);
        if (responseCode >= 0) {
            stats.setObjectCount(stats.getObjectCount() + 1);
            stats.setByteCount(stats.getByteCount() + byteCount);
        }
        // A stop reason that already differs from the default is kept.
        if (stats.getStopReason() == this.defaultStopReason) {
            stats.setStopReason(lineStopReason);
        }
    }

    /**
     * Extracts the domain name from a URI and converts it with {@link IDNA#toUnicode(String)}.
     *
     * @param uriString the URI to extract the domain from
     * @param failureMessage debug log format (one "{}" placeholder for the URI) used when
     *        the URI cannot be parsed
     * @return the converted domain name, or null if the URI is invalid or yields no domain
     */
    private String extractUnicodeDomain(String uriString, String failureMessage) {
        try {
            String domain = getDomainNameFromURIString(uriString);
            return domain == null ? null : IDNA.toUnicode(domain);
        } catch (URISyntaxException e) {
            log.debug(failureMessage, uriString, e);
            return null;
        }
    }

    /**
     * Determines the domain name of a URI. For the "dns" scheme the URI path is treated as
     * the host name; for every other scheme the URI host is used. The host name is then
     * passed to {@link DomainUtils#domainNameFromHostname(String)}.
     *
     * @param str the URI to inspect
     * @return the result of the domain lookup, or null if the URI has no host name
     * @throws URISyntaxException if the URI cannot be parsed
     */
    private String getDomainNameFromURIString(String str) throws URISyntaxException {
        Uri uri = new Uri(str, UriProfile.RFC3986_ABS_16BIT_LAX);
        String hostName = "dns".equals(uri.getScheme()) ? uri.getPath() : uri.getHost();
        if (hostName == null) {
            log.debug("Not possible to extract domainname from URL: {}", str);
            return null;
        }
        return DomainUtils.domainNameFromHostname(hostName);
    }

    /**
     * @return the stop reason given to domains for which the crawl.log shows no other reason
     */
    public StopReason getDefaultStopReason() {
        return this.defaultStopReason;
    }

    /**
     * @return the live (not copied) map from domain name to collected statistics
     */
    public Map<String, DomainStats> getDomainStatsMap() {
        return this.domainstats;
    }

    /**
     * Processes the crawl.log of the given harvest and wraps the result in a report.
     *
     * @param heritrixFiles the files of the harvest; must not be null
     * @return a report holding the per-domain statistics and the default stop reason
     */
    public static DomainStatsReport getDomainStatsReport(HeritrixFiles heritrixFiles) {
        HarvestReportGenerator generator = new HarvestReportGenerator(heritrixFiles);
        return new DomainStatsReport(generator.getDomainStatsMap(), generator.getDefaultStopReason());
    }
}
