/*
 * #%L
 * Netarchivesuite - wayback
 * %%
 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
 * the National Library of France and the Austrian National Library.
 * %%
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation, either version 2.1 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Lesser Public License for more details.
 *
 * You should have received a copy of the GNU General Lesser Public
 * License along with this program. If not, see
 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
 * #L%
 */
package dk.netarkivet.wayback.batch;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.nio.charset.StandardCharsets;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.TimeZone;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.archive.wayback.UrlCanonicalizer;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import dk.netarkivet.common.exceptions.ArgumentNotValid;

/**
 * Class containing methods for turning duplicate entries in a crawl log into lines in a CDX index file.
 */
public class DeduplicateToCDXAdapter implements DeduplicateToCDXAdapterInterface {

    /** Logger for this class. */
    private static final Logger log = LoggerFactory.getLogger(DeduplicateToCDXAdapter.class);

    /** Date patterns for the representation of timestamps in crawl logs and cdx files respectively. */
    private static final String crawlDateFormatString = "yyyy-MM-dd'T'HH:mm:ss.SSS'Z'";
    private static final String cdxDateFormatString = "yyyyMMddHHmmss";

    /**
     * Time zone applied to both date formats. Using the same fixed, DST-free zone for parsing and formatting makes
     * the conversion a pure re-arrangement of the timestamp fields, independent of the JVM default time zone.
     */
    private static final TimeZone UTC = TimeZone.getTimeZone("UTC");

    /** Pattern representing the part of a crawl log entry describing a duplicate record. */
    private static final String duplicateRecordPatternString = "duplicate:\"(.*),(.*)\",(.*)";
    private static final Pattern duplicateRecordPattern = Pattern.compile(duplicateRecordPatternString);

    /** canonicalizer used to canonicalize urls. */
    UrlCanonicalizer canonicalizer;

    /** String for identifying crawl-log entries representing duplicates. */
    private static final String DUPLICATE_MATCHING_STRING = "duplicate:";

    /** Number of whitespace-separated fields a crawl-log line must have, as the highest index read is 11. */
    private static final int MIN_CRAWL_LOG_FIELDS = 12;

    /**
     * Default constructor. Initializes the canonicalizer.
     */
    public DeduplicateToCDXAdapter() {
        canonicalizer = UrlCanonicalizerFactory.getDefaultUrlCanonicalizer();
    }

    /**
     * If the input line is a crawl log entry representing a duplicate then the corresponding CDX entry is returned.
     * Otherwise returns null. In the event of an error the error is logged and null is returned.
     *
     * @param line the crawl-log line to be analysed (may be null)
     * @return a CDX line (without newline) or null
     */
    @Override
    public String adaptLine(String line) {
        if (line == null || !line.contains(DUPLICATE_MATCHING_STRING)) {
            return null;
        }
        try {
            String[] crawlElements = line.split("\\s+");
            if (crawlElements.length < MIN_CRAWL_LOG_FIELDS) {
                // Check explicitly rather than relying on an ArrayIndexOutOfBoundsException below.
                log.error("Could not adapt deduplicate record to CDX line, expected at least {} fields: '{}'",
                        MIN_CRAWL_LOG_FIELDS, line);
                return null;
            }
            StringBuilder result = new StringBuilder();
            String originalUrl = crawlElements[3];
            String canonicalUrl = canonicalizer.urlStringToKey(originalUrl);
            result.append(canonicalUrl).append(' ');
            String cdxDate = toCdxDate(crawlElements[0]);
            result.append(cdxDate).append(' ').append(originalUrl).append(' ');
            String mimetype = crawlElements[6];
            result.append(mimetype).append(' ');
            String httpCode = crawlElements[1];
            result.append(httpCode).append(' ');
            // Literal replacement; the prefix contains no regex syntax.
            String digest = crawlElements[9].replace("sha1:", "");
            result.append(digest).append(" - ");
            String duplicateRecord = crawlElements[11];
            if (!duplicateRecord.startsWith(DUPLICATE_MATCHING_STRING)) {
                // Probably an Exception starting with "le:" is injected before the
                // DUPLICATE_MATCHING_STRING, Try splitting on duplicate:
                String[] parts = duplicateRecord.split(DUPLICATE_MATCHING_STRING);
                if (parts.length == 2) {
                    String newDuplicateRecord = DUPLICATE_MATCHING_STRING + parts[1];
                    log.warn("Duplicate-record changed from '{}' to '{}'", duplicateRecord, newDuplicateRecord);
                    duplicateRecord = newDuplicateRecord;
                }
            }
            Matcher m = duplicateRecordPattern.matcher(duplicateRecord);
            if (!m.matches()) {
                throw new ArgumentNotValid("crawl record did not match " + "expected pattern for duplicate"
                        + " record: '" + duplicateRecord + "'");
            }
            String arcfile = m.group(1);
            String offset = m.group(2);
            result.append(offset).append(' ').append(arcfile);
            return result.toString();
        } catch (Exception e) {
            // Boundary catch: any failure on a single line is logged and reported as "no CDX line".
            log.error("Could not adapt deduplicate record to CDX line: '{}'", line, e);
            return null;
        }
    }

    /**
     * Converts a crawl-log timestamp into the timestamp format used in cdx files.
     *
     * SimpleDateFormat is not thread-safe, so fresh instances are created per call instead of sharing static ones.
     * The 'Z' in the crawl-log pattern is a quoted literal and is not interpreted as a zone by the parser, so both
     * formats are explicitly pinned to the same zone.
     *
     * @param crawlTimestamp timestamp in the crawl-log format
     * @return the same timestamp in the cdx format
     * @throws ParseException if the timestamp cannot be parsed with the crawl-log pattern
     */
    private static String toCdxDate(String crawlTimestamp) throws ParseException {
        SimpleDateFormat crawlDateFormat = new SimpleDateFormat(crawlDateFormatString);
        crawlDateFormat.setTimeZone(UTC);
        SimpleDateFormat cdxDateFormat = new SimpleDateFormat(cdxDateFormatString);
        cdxDateFormat.setTimeZone(UTC);
        return cdxDateFormat.format(crawlDateFormat.parse(crawlTimestamp));
    }

    /**
     * Reads an input stream representing a crawl log line by line and converts any lines representing duplicate entries
     * to wayback-compliant cdx lines. Each cdx line is written followed by a newline character. Input is decoded and
     * output is encoded as UTF-8 so that the result does not depend on the platform default charset.
     *
     * Neither stream is closed by this method. An IOException during reading or writing is logged and ends the
     * processing; it is not rethrown.
     *
     * @param is The input stream from which data is read.
     * @param os The output stream to which the cdx lines are written.
     */
    public void adaptStream(InputStream is, OutputStream os) {
        ArgumentNotValid.checkNotNull(is, "is");
        ArgumentNotValid.checkNotNull(os, "os");
        try {
            BufferedReader reader = new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8));
            String line;
            while ((line = reader.readLine()) != null) {
                String cdxLine = adaptLine(line);
                if (cdxLine != null) {
                    os.write((cdxLine + "\n").getBytes(StandardCharsets.UTF_8));
                }
            }
        } catch (IOException e) {
            log.error("Exception reading crawl log;", e);
        }
    }
}