Source code

001/*
002 * #%L
003 * Netarchivesuite - wayback
004 * %%
005 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
006 *             the National Library of France and the Austrian National Library.
007 * %%
008 * This program is free software: you can redistribute it and/or modify
009 * it under the terms of the GNU Lesser General Public License as
010 * published by the Free Software Foundation, either version 2.1 of the
011 * License, or (at your option) any later version.
012 *
013 * This program is distributed in the hope that it will be useful,
014 * but WITHOUT ANY WARRANTY; without even the implied warranty of
015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
016 * GNU General Lesser Public License for more details.
017 *
018 * You should have received a copy of the GNU General Lesser Public
019 * License along with this program.  If not, see
020 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
021 * #L%
022 */
023package dk.netarkivet.wayback.batch;
024
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.text.SimpleDateFormat;
import java.util.TimeZone;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.archive.wayback.UrlCanonicalizer;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import dk.netarkivet.common.exceptions.ArgumentNotValid;
039
040/**
041 * Class containing methods for turning duplicate entries in a crawl log into lines in a CDX index file.
042 */
043public class DeduplicateToCDXAdapter implements DeduplicateToCDXAdapterInterface {
044
045    /** Logger for this class. */
046    private static final Logger log = LoggerFactory.getLogger(DeduplicateToCDXAdapter.class);
047
048    /** Define SimpleDateFormat objects for the representation of timestamps in crawl logs and cdx files respectively. */
049    private static final String crawlDateFormatString = "yyyy-MM-dd'T'HH:mm:ss.SSS'Z'";
050    private static final String cdxDateFormatString = "yyyyMMddHHmmss";
051    private static final SimpleDateFormat crawlDateFormat = new SimpleDateFormat(crawlDateFormatString);
052    private static final SimpleDateFormat cdxDateFormat = new SimpleDateFormat(cdxDateFormatString);
053
054    /** Pattern representing the part of a crawl log entry describing a duplicate record. */
055    private static final String duplicateRecordPatternString = "duplicate:\"(.*),(.*)\",(.*)";  //e.g. duplicate:"arcfile,offset"
056    // The extended format is made to preserve the date of the record pointed to by arcfile,offset argument
057    private static final String extendedDuplicateRecordPatternString = "duplicate:\"(.*),(.*),(.*)\",(.*)"; //e.g. duplicate:"arcfile,offset,timestamp"
058    
059    private static final Pattern duplicateRecordPattern = Pattern.compile(duplicateRecordPatternString);
060    private static final Pattern extendedDuplicateRecordPattern = Pattern.compile(extendedDuplicateRecordPatternString);
061
062    /** canonicalizer used to canonicalize urls. */
063    UrlCanonicalizer canonicalizer;
064
065    /** String for identifying crawl-log entries representing duplicates. */
066    private static final String DUPLICATE_MATCHING_STRING = "duplicate:";
067
068    /**
069     * Default constructor. Initializes the canonicalizer.
070     */
071    public DeduplicateToCDXAdapter() {
072        canonicalizer = UrlCanonicalizerFactory.getDefaultUrlCanonicalizer();
073    }
074
075    /**
076     * If the input line is a crawl log entry representing a duplicate then a CDX entry is written to the output.
077     * Otherwise returns null. In the event of an error returns null.
078     *
079     * @param line the crawl-log line to be analysed
080     * @return a CDX line (without newline) or null
081     */
082    @Override
083    public String adaptLine(String line) {
084        if (line != null && line.contains(DUPLICATE_MATCHING_STRING)) {
085            try {
086                String[] crawlElements = line.split("\\s+");
087                StringBuffer result = new StringBuffer();
088                String originalUrl = crawlElements[3];
089                String canonicalUrl = canonicalizer.urlStringToKey(originalUrl);
090                result.append(canonicalUrl).append(' ');
091                String cdxDate = cdxDateFormat.format(crawlDateFormat.parse(crawlElements[0]));
092                result.append(cdxDate).append(' ').append(originalUrl).append(' ');
093                String mimetype = crawlElements[6];
094                result.append(mimetype).append(' ');
095                String httpCode = crawlElements[1];
096                result.append(httpCode).append(' ');
097                String digest = crawlElements[9].replaceAll("sha1:", "");
098                result.append(digest).append(" - ");
099                String duplicateRecord = crawlElements[11];
100                if (!duplicateRecord.startsWith(DUPLICATE_MATCHING_STRING)) {
101                    // Probably an Exception starting with "le:" is injected before the
102                    // DUPLICATE_MATCHING_STRING, Try splitting on duplicate:
103                    String[] parts = duplicateRecord.split(DUPLICATE_MATCHING_STRING);
104                    if (parts.length == 2) {
105                        String newDuplicateRecord = DUPLICATE_MATCHING_STRING + parts[1];
106                        log.warn("Duplicate-record changed from '{}' to '{}'", duplicateRecord, newDuplicateRecord);
107                        duplicateRecord = newDuplicateRecord;
108                    }
109                }
110                Matcher m = duplicateRecordPattern.matcher(duplicateRecord);
111                Matcher m1 = extendedDuplicateRecordPattern.matcher(duplicateRecord);
112                if (m.matches()) {
113                    String arcfile = m.group(1);
114                    String offset = m.group(2);
115                    result.append(offset).append(' ').append(arcfile);
116                } else if (m1.matches()) {
117                        String arcfile = m1.group(1);
118                        String offset = m1.group(2);
119                        result.append(offset).append(' ').append(arcfile);
120                } else {
121                    throw new ArgumentNotValid("crawl record did not match " + "expected pattern for duplicate"
122                            + " record: '" + duplicateRecord + "'");
123                }
124                return result.toString();
125            } catch (Exception e) {
126                log.error("Could not adapt deduplicate record to CDX line: '{}'", line, e);
127                return null;
128            }
129        } else {
130            return null;
131        }
132    }
133
134    /**
135     * Reads an input stream representing a crawl log line by line and converts any lines representing duplicate entries
136     * to wayback-compliant cdx lines.
137     *
138     * @param is The input stream from which data is read.
139     * @param os The output stream to which the cdx lines are written.
140     */
141    public void adaptStream(InputStream is, OutputStream os) {
142        ArgumentNotValid.checkNotNull(is, "is");
143        ArgumentNotValid.checkNotNull(os, "os");
144        try {
145            BufferedReader reader = new BufferedReader(new InputStreamReader(is));
146            String line;
147            while ((line = reader.readLine()) != null) {
148                String cdxLine = adaptLine(line);
149                if (cdxLine != null) {
150                    os.write((cdxLine + "\n").getBytes());
151                }
152            }
153        } catch (IOException e) {
154            log.error("Exception reading crawl log;", e);
155        }
156    }
157}