Source code

001/*
002 * #%L
003 * Netarchivesuite - wayback
004 * %%
005 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
006 *             the National Library of France and the Austrian National Library.
007 * %%
008 * This program is free software: you can redistribute it and/or modify
009 * it under the terms of the GNU Lesser General Public License as
010 * published by the Free Software Foundation, either version 2.1 of the
011 * License, or (at your option) any later version.
012 *
013 * This program is distributed in the hope that it will be useful,
014 * but WITHOUT ANY WARRANTY; without even the implied warranty of
015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
016 * GNU General Lesser Public License for more details.
017 *
018 * You should have received a copy of the GNU General Lesser Public
019 * License along with this program.  If not, see
020 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
021 * #L%
022 */
023package dk.netarkivet.wayback.batch;
024
025import java.io.BufferedReader;
026import java.io.IOException;
027import java.io.InputStream;
028import java.io.InputStreamReader;
029import java.io.OutputStream;
030import java.text.SimpleDateFormat;
031import java.util.regex.Matcher;
032import java.util.regex.Pattern;
033
034import org.archive.wayback.UrlCanonicalizer;
035import org.slf4j.Logger;
036import org.slf4j.LoggerFactory;
037
038import dk.netarkivet.common.exceptions.ArgumentNotValid;
039
040/**
041 * Class containing methods for turning duplicate entries in a crawl log into lines in a CDX index file.
042 */
043public class DeduplicateToCDXAdapter implements DeduplicateToCDXAdapterInterface {
044
045    /** Logger for this class. */
046    private static final Logger log = LoggerFactory.getLogger(DeduplicateToCDXAdapter.class);
047
048    /** Define SimpleDateFormat objects for the representation of timestamps in crawl logs and cdx files respectively. */
049    private static final String crawlDateFormatString = "yyyy-MM-dd'T'HH:mm:ss.SSS'Z'";
050    private static final String cdxDateFormatString = "yyyyMMddHHmmss";
051    private static final SimpleDateFormat crawlDateFormat = new SimpleDateFormat(crawlDateFormatString);
052    private static final SimpleDateFormat cdxDateFormat = new SimpleDateFormat(cdxDateFormatString);
053
054    /** Pattern representing the part of a crawl log entry describing a duplicate record. */
055    private static final String duplicateRecordPatternString = "duplicate:\"(.*),(.*)\",(.*)";
056    private static final Pattern duplicateRecordPattern = Pattern.compile(duplicateRecordPatternString);
057
058    /** canonicalizer used to canonicalize urls. */
059    UrlCanonicalizer canonicalizer;
060
061    /** String for identifying crawl-log entries representing duplicates. */
062    private static final String DUPLICATE_MATCHING_STRING = "duplicate:";
063
064    /**
065     * Default constructor. Initializes the canonicalizer.
066     */
067    public DeduplicateToCDXAdapter() {
068        canonicalizer = UrlCanonicalizerFactory.getDefaultUrlCanonicalizer();
069    }
070
071    /**
072     * If the input line is a crawl log entry representing a duplicate then a CDX entry is written to the output.
073     * Otherwise returns null. In the event of an error returns null.
074     *
075     * @param line the crawl-log line to be analysed
076     * @return a CDX line (without newline) or null
077     */
078    @Override
079    public String adaptLine(String line) {
080        if (line != null && line.contains(DUPLICATE_MATCHING_STRING)) {
081            try {
082                String[] crawlElements = line.split("\\s+");
083                StringBuffer result = new StringBuffer();
084                String originalUrl = crawlElements[3];
085                String canonicalUrl = canonicalizer.urlStringToKey(originalUrl);
086                result.append(canonicalUrl).append(' ');
087                String cdxDate = cdxDateFormat.format(crawlDateFormat.parse(crawlElements[0]));
088                result.append(cdxDate).append(' ').append(originalUrl).append(' ');
089                String mimetype = crawlElements[6];
090                result.append(mimetype).append(' ');
091                String httpCode = crawlElements[1];
092                result.append(httpCode).append(' ');
093                String digest = crawlElements[9].replaceAll("sha1:", "");
094                result.append(digest).append(" - ");
095                String duplicateRecord = crawlElements[11];
096                if (!duplicateRecord.startsWith(DUPLICATE_MATCHING_STRING)) {
097                    // Probably an Exception starting with "le:" is injected before the
098                    // DUPLICATE_MATCHING_STRING, Try splitting on duplicate:
099                    String[] parts = duplicateRecord.split(DUPLICATE_MATCHING_STRING);
100                    if (parts.length == 2) {
101                        String newDuplicateRecord = DUPLICATE_MATCHING_STRING + parts[1];
102                        log.warn("Duplicate-record changed from '{}' to '{}'", duplicateRecord, newDuplicateRecord);
103                        duplicateRecord = newDuplicateRecord;
104                    }
105                }
106                Matcher m = duplicateRecordPattern.matcher(duplicateRecord);
107                if (m.matches()) {
108                    String arcfile = m.group(1);
109                    String offset = m.group(2);
110                    result.append(offset).append(' ').append(arcfile);
111                } else {
112                    throw new ArgumentNotValid("crawl record did not match " + "expected pattern for duplicate"
113                            + " record: '" + duplicateRecord + "'");
114                }
115                return result.toString();
116            } catch (Exception e) {
117                log.error("Could not adapt deduplicate record to CDX line: '{}'", line, e);
118                return null;
119            }
120        } else {
121            return null;
122        }
123    }
124
125    /**
126     * Reads an input stream representing a crawl log line by line and converts any lines representing duplicate entries
127     * to wayback-compliant cdx lines.
128     *
129     * @param is The input stream from which data is read.
130     * @param os The output stream to which the cdx lines are written.
131     */
132    public void adaptStream(InputStream is, OutputStream os) {
133        ArgumentNotValid.checkNotNull(is, "is");
134        ArgumentNotValid.checkNotNull(os, "os");
135        try {
136            BufferedReader reader = new BufferedReader(new InputStreamReader(is));
137            String line;
138            while ((line = reader.readLine()) != null) {
139                String cdxLine = adaptLine(line);
140                if (cdxLine != null) {
141                    os.write((cdxLine + "\n").getBytes());
142                }
143            }
144        } catch (IOException e) {
145            log.error("Exception reading crawl log;", e);
146        }
147    }
148}