/*
 * #%L
 * Netarchivesuite - harvester
 * %%
 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
 * the National Library of France and the Austrian National Library.
 * %%
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation, either version 2.1 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Lesser Public License for more details.
 *
 * You should have received a copy of the GNU General Lesser Public
 * License along with this program. If not, see
 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
 * #L%
 */

package dk.netarkivet.harvester.indexserver;

import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.util.Locale;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import dk.netarkivet.common.exceptions.ArgumentNotValid;
import dk.netarkivet.common.exceptions.IOFailure;
import dk.netarkivet.common.utils.cdx.CDXRecord;
import is.hi.bok.deduplicator.CrawlDataItem;
import is.hi.bok.deduplicator.CrawlLogIterator;

/**
 * This subclass of CrawlLogIterator adds the layer of digging an origin of the form "arcfile,offset" out of a
 * corresponding CDX index. This may cause some of the entries in the crawl log to be skipped. The two files are read in
 * parallel.
 */
public class CDXOriginCrawlLogIterator extends CrawlLogIterator {

    /** The log. */
    private static final Logger log = LoggerFactory.getLogger(CDXOriginCrawlLogIterator.class);

    /** The reader of the (sorted) CDX index. */
    protected BufferedReader reader;

    /**
     * The last record we read from the reader. We may overshoot on the CDX reading if there are entries not in CDX, so
     * we hang onto this until the reading of the crawl.log catches up.
     */
    protected CDXRecord lastRecord;

    /**
     * The constant prefixed checksums in newer versions of Heritrix indicating the digest method. The deduplicator
     * currently doesn't use the equivalent prefix, so we need to strip it off (see bug #1004).
     */
    private static final String SHA1_PREFIX = "sha1:";

    /**
     * Create a new CDXOriginCrawlLogIterator from crawl.log and CDX sources.
     *
     * @param source File containing a crawl.log sorted by URL (LANG=C sort -k 4b)
     * @param cdx A reader of a sorted CDX file. This is given as a reader so that it may be closed after use
     * (CrawlLogIterator provides no close())
     * @throws IOException If the underlying CrawlLogIterator fails, e.g. due to missing files.
     */
    public CDXOriginCrawlLogIterator(File source, BufferedReader cdx) throws IOException {
        super(source.getAbsolutePath());
        ArgumentNotValid.checkNotNull(cdx, "BufferedReader cdx");
        reader = cdx;
    }

    /**
     * Parse a crawl.log line into a valid CrawlDataItem.
     * <p>
     * If CrawlLogIterator is ok with this line, we must make sure that it has an origin by finding missing ones in the
     * CDX file. If multiple origins are found in the CDX files, the one that was harvested last is chosen. If no origin
     * can be found, the item is rejected.
     * <p>
     * We assume that super.parseLine() delivers us the items in the crawl.log in the given (sorted) order with non-null
     * URLs, though we admit that some undeclared exceptions can be thrown by it.
     *
     * @param line A crawl.log line to parse.
     * @return A CrawlDataItem with a valid origin field, or null if we could not determine an appropriate origin.
     * @throws IOFailure if there is an error reading the files.
     */
    @Override
    protected CrawlDataItem parseLine(String line) throws IOFailure {
        CrawlDataItem item;
        log.trace("Processing crawl-log line: {}", line);
        try {
            item = super.parseLine(line);
        } catch (RuntimeException e) {
            // Parameterized form defers message building until debug is enabled and matches
            // the SLF4J style used in the rest of this method; the throwable is still logged.
            log.debug("Skipping over bad crawl-log line '{}'", line, e);
            return null;
        }

        // Hack that works around bug #1004: sha1: prefix not accounted for.
        // Locale.ROOT keeps the case-fold locale-independent for this protocol token.
        if (item != null && item.getContentDigest() != null
                && item.getContentDigest().toLowerCase(Locale.ROOT).startsWith(SHA1_PREFIX)) {
            item.setContentDigest(item.getContentDigest().substring(SHA1_PREFIX.length()));
        }

        // If an origin was found in the crawl log, we accept that as correct.
        // Otherwise we must find the origin in the CDX file.
        if (item != null && item.getOrigin() == null) {
            // Iterate through the sorted CDX file until lastRecord is not null
            // and lastRecord.getURL() is lexicographically higher than
            // item.getURL(), indicating that there are no more matches.
            CDXRecord foundRecord = null;
            while (lastRecord == null || lastRecord.getURL().compareTo(item.getURL()) <= 0) {
                // If the cdx URL is the one we are looking for, we have a
                // potential origin.
                if (lastRecord != null && lastRecord.getURL().equals(item.getURL())) {
                    // If this is our first potential origin, or if it is better
                    // than the one we currently consider best, we remember this
                    // entry. A better origin is defined as one with a later
                    // date than the current choice.
                    if (foundRecord == null || lastRecord.getDate().compareTo(foundRecord.getDate()) > 0) {
                        foundRecord = lastRecord;
                        log.trace("Foundrecord set to '{},{}'", foundRecord.getArcfile(), foundRecord.getOffset());
                    }
                }

                // Read the next line
                try {
                    String record = reader.readLine();
                    if (record == null) {
                        break; // EOF, nothing to do
                    }
                    if (record.length() == 0) {
                        continue; // skip empty lines
                    }
                    try {
                        lastRecord = new CDXRecord(record);
                    } catch (ArgumentNotValid e) {
                        log.debug("Skipping over bad CDX line '{}'", record, e);
                        continue;
                    }
                    log.trace("lastrecord is '{}'", record);
                } catch (IOException e) {
                    throw new IOFailure("Error reading CDX record", e);
                }
            }
            if (foundRecord == null) {
                if (lastRecord == null) {
                    log.trace("No matching CDX for URL '{}'. No last CDX was found.", item.getURL());
                } else {
                    log.trace("No matching CDX for URL '{}'. Last CDX was for URL '{}'", item.getURL(),
                            lastRecord.getURL());
                }

                return null;
            }

            String origin = foundRecord.getArcfile() + "," + foundRecord.getOffset();
            item.setOrigin(origin);
            log.trace("URL '{}' combined with origin '{}'.", item.getURL(), origin);
        }
        return item;
    }

}