package dk.netarkivet.wayback.batch;

import dk.netarkivet.common.exceptions.ArgumentNotValid;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.text.SimpleDateFormat;
import java.time.LocalDateTime;
import java.time.format.DateTimeFormatter;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.archive.wayback.UrlCanonicalizer;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:dk/netarkivet/wayback/batch/DeduplicateToCDXAdapter.class */
public class DeduplicateToCDXAdapter implements DeduplicateToCDXAdapterInterface {
    UrlCanonicalizer canonicalizer = UrlCanonicalizerFactory.getDefaultUrlCanonicalizer();
    private static final String DUPLICATE_MATCHING_STRING = "duplicate:";
    private static final Logger log = LoggerFactory.getLogger(DeduplicateToCDXAdapter.class);
    private static final String crawlDateFormatString = "yyyy-MM-dd'T'HH:mm:ss.SSS'Z'";
    private static final SimpleDateFormat crawlDateFormat = new SimpleDateFormat(crawlDateFormatString);
    private static final String cdxDateFormatString = "yyyyMMddHHmmss";
    private static final SimpleDateFormat cdxDateFormat = new SimpleDateFormat(cdxDateFormatString);
    private static final String duplicateRecordPatternString = "duplicate:\"(.*),(.*)\",(.*)";
    private static final Pattern duplicateRecordPattern = Pattern.compile(duplicateRecordPatternString);

    @Override // dk.netarkivet.wayback.batch.DeduplicateToCDXAdapterInterface
    public String adaptLine(String str) {
        if (str == null || !str.contains(DUPLICATE_MATCHING_STRING)) {
            return null;
        }
        try {
            String[] split = str.split("\\s+");
            StringBuffer stringBuffer = new StringBuffer();
            String str2 = split[3];
            stringBuffer.append(this.canonicalizer.urlStringToKey(str2)).append(' ');
            stringBuffer.append(cdxDateFormat.format(crawlDateFormat.parse(split[0]))).append(' ').append(str2).append(' ');
            stringBuffer.append(split[6]).append(' ');
            stringBuffer.append(split[1]).append(' ');
            stringBuffer.append(split[9].replaceAll("sha1:", "")).append(" - ");
            String str3 = split[11];
            if (!str3.startsWith(DUPLICATE_MATCHING_STRING)) {
                String[] split2 = str3.split(DUPLICATE_MATCHING_STRING);
                if (split2.length == 2) {
                    String str4 = DUPLICATE_MATCHING_STRING + split2[1];
                    log.warn("Duplicate-record changed from '{}' to '{}'", str3, str4);
                    str3 = str4;
                }
            }
            Matcher matcher = duplicateRecordPattern.matcher(str3);
            if (!matcher.matches()) {
                throw new ArgumentNotValid("crawl record did not match expected pattern for duplicate record: '" + str3 + "'");
            }
            stringBuffer.append(matcher.group(2)).append(' ').append(matcher.group(1));
            return stringBuffer.toString();
        } catch (Exception e) {
            log.error("Could not adapt deduplicate record to CDX line: '{}'", str, e);
            return null;
        }
    }

    @Override // dk.netarkivet.wayback.batch.DeduplicateToCDXAdapterInterface
    public void adaptStream(InputStream inputStream, OutputStream outputStream) {
        ArgumentNotValid.checkNotNull(inputStream, "is");
        ArgumentNotValid.checkNotNull(outputStream, "os");
        try {
            BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(inputStream));
            while (true) {
                String readLine = bufferedReader.readLine();
                if (readLine == null) {
                    return;
                }
                String adaptLine = adaptLine(readLine);
                if (adaptLine != null) {
                    outputStream.write((adaptLine + "\n").getBytes());
                }
            }
        } catch (IOException e) {
            log.error("Exception reading crawl log;", e);
        }
    }
}
