/*
 * #%L
 * Netarchivesuite - wayback
 * %%
 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
 * the National Library of France and the Austrian National Library.
 * %%
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation, either version 2.1 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Lesser Public License for more details.
 *
 * You should have received a copy of the GNU General Lesser Public
 * License along with this program. If not, see
 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
 * #L%
 */
package dk.netarkivet.wayback.batch;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.text.SimpleDateFormat;
import java.util.TimeZone;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.archive.wayback.UrlCanonicalizer;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import dk.netarkivet.common.exceptions.ArgumentNotValid;

/**
 * Class containing methods for turning duplicate entries in a crawl log into lines in a CDX index file.
 * <p>
 * Each instance owns its own {@link SimpleDateFormat} objects. {@code SimpleDateFormat} is not synchronized, so a
 * single adapter instance should not be used from several threads without external synchronization.
 */
public class DeduplicateToCDXAdapter implements DeduplicateToCDXAdapterInterface {

    /** Logger for this class. */
    private static final Logger log = LoggerFactory.getLogger(DeduplicateToCDXAdapter.class);

    /** Format of the timestamp found in the first field of a crawl-log line. */
    private static final String crawlDateFormatString = "yyyy-MM-dd'T'HH:mm:ss.SSS'Z'";
    /** Format of the timestamp written to the cdx line. */
    private static final String cdxDateFormatString = "yyyyMMddHHmmss";

    /**
     * Pattern representing the part of a crawl log entry describing a duplicate record, e.g.
     * duplicate:"arcfile,offset",... The quoted components may contain neither a comma nor a double quote, so that
     * this pattern can never swallow the three-component extended form or text following the closing quote.
     */
    private static final String duplicateRecordPatternString = "duplicate:\"([^\",]*),([^\",]*)\",(.*)";
    /**
     * The extended format is made to preserve the date of the record pointed to by the arcfile,offset argument, e.g.
     * duplicate:"arcfile,offset,timestamp",...
     */
    private static final String extendedDuplicateRecordPatternString =
            "duplicate:\"([^\",]*),([^\",]*),([^\",]*)\",(.*)";

    private static final Pattern duplicateRecordPattern = Pattern.compile(duplicateRecordPatternString);
    private static final Pattern extendedDuplicateRecordPattern = Pattern
            .compile(extendedDuplicateRecordPatternString);

    /** String for identifying crawl-log entries representing duplicates. */
    private static final String DUPLICATE_MATCHING_STRING = "duplicate:";

    /** Parser for crawl-log timestamps; per instance because SimpleDateFormat is not synchronized. */
    private final SimpleDateFormat crawlDateFormat = newUtcDateFormat(crawlDateFormatString);
    /** Formatter for cdx timestamps; per instance because SimpleDateFormat is not synchronized. */
    private final SimpleDateFormat cdxDateFormat = newUtcDateFormat(cdxDateFormatString);

    /** canonicalizer used to canonicalize urls. */
    UrlCanonicalizer canonicalizer;

    /**
     * Default constructor. Initializes the canonicalizer.
     */
    public DeduplicateToCDXAdapter() {
        canonicalizer = UrlCanonicalizerFactory.getDefaultUrlCanonicalizer();
    }

    /**
     * Creates a date format that is pinned to UTC.
     * <p>
     * The crawl-log pattern ends in a literal 'Z'. Parsing and re-formatting in the JVM default time zone would
     * shift wall-clock times that fall into a daylight-saving gap of that zone; using UTC for both directions makes
     * the conversion a pure re-arrangement of digits.
     *
     * @param pattern a {@link SimpleDateFormat} pattern
     * @return a new format using the given pattern and the UTC time zone
     */
    private static SimpleDateFormat newUtcDateFormat(String pattern) {
        SimpleDateFormat format = new SimpleDateFormat(pattern);
        format.setTimeZone(TimeZone.getTimeZone("UTC"));
        return format;
    }

    /**
     * If the input line is a crawl log entry representing a duplicate then the corresponding CDX line is returned.
     * Otherwise returns null. In the event of an error the problem is logged and null is returned.
     * <p>
     * The returned line consists of the following space-separated fields: canonicalized url, timestamp
     * (yyyyMMddHHmmss), original url, mimetype, http code, digest (without any "sha1:" prefix), a literal "-", offset
     * and arcfile.
     *
     * @param line the crawl-log line to be analysed
     * @return a CDX line (without newline) or null
     */
    @Override
    public String adaptLine(String line) {
        if (line == null || !line.contains(DUPLICATE_MATCHING_STRING)) {
            return null;
        }
        try {
            // Fields are addressed by position after splitting on whitespace. A line with too few fields raises
            // an ArrayIndexOutOfBoundsException which is handled by the catch block below.
            String[] crawlElements = line.split("\\s+");
            String originalUrl = crawlElements[3];
            String canonicalUrl = canonicalizer.urlStringToKey(originalUrl);
            String cdxDate = cdxDateFormat.format(crawlDateFormat.parse(crawlElements[0]));
            String mimetype = crawlElements[6];
            String httpCode = crawlElements[1];
            String digest = crawlElements[9].replace("sha1:", "");
            String duplicateRecord = normalizeDuplicateRecord(crawlElements[11]);

            // The two patterns are mutually exclusive; the extended one is tried first so that a
            // three-component record is never misread as "arcfile,offset" + "timestamp".
            Matcher matcher = extendedDuplicateRecordPattern.matcher(duplicateRecord);
            if (!matcher.matches()) {
                matcher = duplicateRecordPattern.matcher(duplicateRecord);
                if (!matcher.matches()) {
                    throw new ArgumentNotValid("crawl record did not match expected pattern for duplicate"
                            + " record: '" + duplicateRecord + "'");
                }
            }
            String arcfile = matcher.group(1);
            String offset = matcher.group(2);

            StringBuilder result = new StringBuilder();
            result.append(canonicalUrl).append(' ');
            result.append(cdxDate).append(' ').append(originalUrl).append(' ');
            result.append(mimetype).append(' ');
            result.append(httpCode).append(' ');
            result.append(digest).append(" - ");
            result.append(offset).append(' ').append(arcfile);
            return result.toString();
        } catch (Exception e) {
            // Deliberately broad: any malformed line is reported and skipped instead of aborting the caller.
            log.error("Could not adapt deduplicate record to CDX line: '{}'", line, e);
            return null;
        }
    }

    /**
     * Makes sure the given crawl-log field starts with the duplicate marker.
     * <p>
     * Probably an Exception starting with "le:" is sometimes injected before the marker. If the field splits into
     * exactly two parts around the marker, everything before the marker is dropped and a warning is logged.
     * Otherwise the field is returned unchanged.
     *
     * @param duplicateRecord the crawl-log field expected to hold the duplicate information
     * @return the field, possibly stripped of text preceding the duplicate marker
     */
    private static String normalizeDuplicateRecord(String duplicateRecord) {
        if (duplicateRecord.startsWith(DUPLICATE_MATCHING_STRING)) {
            return duplicateRecord;
        }
        String[] parts = duplicateRecord.split(DUPLICATE_MATCHING_STRING);
        if (parts.length != 2) {
            return duplicateRecord;
        }
        String newDuplicateRecord = DUPLICATE_MATCHING_STRING + parts[1];
        log.warn("Duplicate-record changed from '{}' to '{}'", duplicateRecord, newDuplicateRecord);
        return newDuplicateRecord;
    }

    /**
     * Reads an input stream representing a crawl log line by line and converts any lines representing duplicate entries
     * to wayback-compliant cdx lines. Each cdx line is written followed by a single "\n".
     * <p>
     * Neither stream is closed by this method. An IOException while reading or writing is logged and ends the
     * processing; it is not propagated to the caller.
     *
     * @param is The input stream from which data is read.
     * @param os The output stream to which the cdx lines are written.
     */
    public void adaptStream(InputStream is, OutputStream os) {
        ArgumentNotValid.checkNotNull(is, "is");
        ArgumentNotValid.checkNotNull(os, "os");
        try {
            // NOTE(review): decoding and encoding both use the platform default charset - confirm which
            // encoding the crawl log is written in before pinning an explicit one here.
            BufferedReader reader = new BufferedReader(new InputStreamReader(is));
            String line;
            while ((line = reader.readLine()) != null) {
                String cdxLine = adaptLine(line);
                if (cdxLine != null) {
                    os.write((cdxLine + "\n").getBytes());
                }
            }
        } catch (IOException e) {
            log.error("Exception reading crawl log;", e);
        }
    }
}