/* CrawlLogIterator
 *
 * Created on 10.04.2006
 *
 * Copyright (C) 2006 National and University Library of Iceland
 *
 * This file is part of the DeDuplicator (Heritrix add-on module).
 *
 * DeDuplicator is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * DeDuplicator is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with DeDuplicator; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */
package is.hi.bok.deduplicator;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.NoSuchElementException;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

/**
 * An implementation of a {@link is.hi.bok.deduplicator.CrawlDataIterator} capable of iterating over a Heritrix-style
 * <code>crawl.log</code>.
 *
 * @author Kristinn Sigurðsson
 * @author Lars Clausen
 */
public class CrawlLogIterator extends CrawlDataIterator {

    private Log logger = LogFactory.getLog(getClass().getName());

    protected final String crawlDateFormatStr = "yyyyMMddHHmmss";
    protected final String fallbackCrawlDateFormatStr = "yyyy-MM-dd'T'HH:mm:ss.SSS'Z'";

    /**
     * The date formats used in crawl.log files: the primary format of the fetch timestamp in the
     * ArcTimeAndDuration field, and the fallback format of the full timestamp at the start of each line.
     */
    protected final SimpleDateFormat crawlDateFormat = new SimpleDateFormat(crawlDateFormatStr);
    protected final SimpleDateFormat fallbackCrawlDateFormat = new SimpleDateFormat(fallbackCrawlDateFormatStr);

    /**
     * The date format specified by the {@link CrawlDataItem} for dates entered into it (and eventually into the index)
     */
    protected final SimpleDateFormat crawlDataItemFormat = new SimpleDateFormat(CrawlDataItem.dateFormat);

    /**
     * A reader for the crawl.log file being processed
     */
    protected BufferedReader in;

    /**
     * The next item to be issued (if ready) or null if the next item has not been prepared or there are no more
     * elements
     */
    protected CrawlDataItem next;

    /**
     * Create a new CrawlLogIterator that reads items from a Heritrix crawl.log
     *
     * @param source The path of a Heritrix crawl.log file.
     * @throws IOException If errors were found reading the log.
     */
    public CrawlLogIterator(String source) throws IOException {
        super(source);
        in = new BufferedReader(new InputStreamReader(new FileInputStream(new File(source))));
    }

    /**
     * Returns true if there are more items available.
     *
     * @return True if at least one more item can be fetched with next().
     */
    public boolean hasNext() throws IOException {
        if (next == null) {
            prepareNext();
        }
        return next != null;
    }

    /**
     * Returns the next valid item from the crawl log.
     *
     * @return An item from the crawl log.
     * @throws IOException If there is an error reading the item <i>after</i> the item to be returned from the
     *             crawl.log.
     * @throws NoSuchElementException If there are no more items.
     */
    public CrawlDataItem next() throws IOException {
        if (hasNext()) {
            CrawlDataItem tmp = next;
            this.next = null;
            return tmp;
        }
        throw new NoSuchElementException("No more items");
    }
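
    /*
     * Usage sketch (illustrative only, not part of the shipped API): drain a
     * crawl.log and hand each item to a caller-supplied processing step. The
     * path and the process() callback are hypothetical placeholders; only the
     * constructor, hasNext(), next(), and close() below are real.
     *
     *   CrawlLogIterator iter = new CrawlLogIterator("/crawls/job-1/crawl.log");
     *   try {
     *       while (iter.hasNext()) {
     *           CrawlDataItem item = iter.next();
     *           process(item); // hypothetical handler
     *       }
     *   } finally {
     *       iter.close();
     *   }
     */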

    /**
     * Ready the next item. This method will skip over items that parseLine() rejects. When the method returns, either
     * next is non-null or there are no more items in the crawl log.
     * <p>
     * Note: This method should only be called when <code>next == null</code>.
     */
    protected void prepareNext() throws IOException {
        String line;
        while ((line = in.readLine()) != null) {
            next = parseLine(line);
            if (next != null) {
                return;
            }
        }
    }
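
    /*
     * For reference, an illustrative crawl.log line (all field values are
     * hypothetical, shown wrapped here) as parseLine() below expects it,
     * split on whitespace into at most 12 parts:
     *
     *   2017-01-16T16:14:21.630Z 200 2600 http://example.com/style.css LE
     *       http://example.com/ text/css #042 20170116161421526+52
     *       sha1:G6EPHXNSS3MZRQW4TKB5UFLVX7YQZDMA - deduplicate:"hypothetical-origin"
     *
     * Index 0 is the log timestamp, 3 the URL, 6 the MIME type, 8 the fetch
     * begin time and duration, 9 the digest, and 11 the optional annotations.
     */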

    /**
     * Parse a line in the crawl log.
     * <p>
     * Override this method to change how individual crawl log items are processed and accepted/rejected. This method
     * is called from within the loop in prepareNext().
     *
     * @param line A line from the crawl log. Must not be null.
     * @return A {@link CrawlDataItem} if the line yielded a usable item, null otherwise.
     */
    protected CrawlDataItem parseLine(String line) {
        if (line != null && line.length() > 42) {
            // Split the line on whitespace.
            // Limit to 12 parts (annotations may contain spaces, but are
            // always at the end of the line).
            String[] lineParts = line.split("\\s+", 12);

            if (lineParts.length < 10) {
                // Fewer than 10 parts means the line is malformed.
                return null;
            }

            // Index 0: Timestamp
            String timestamp;
            try {
                // Convert from the crawl.log format to the format specified
                // by CrawlDataItem. lineParts[8] holds the fetch begin time
                // and duration, for example 20170116161421526+52; keep only
                // the leading yyyyMMddHHmmss digits: 20170116161421
                String timestampTrunc = lineParts[8].substring(0, crawlDateFormatStr.length());
                timestamp = crawlDataItemFormat.format(crawlDateFormat.parse(timestampTrunc));
            } catch (Exception e) {
                // Fall back to the full timestamp at the start of the line.
                try {
                    timestamp = crawlDataItemFormat.format(fallbackCrawlDateFormat.parse(lineParts[0]));
                } catch (ParseException e1) {
                    logger.debug("Error parsing date for crawl log entry: " + line);
                    return null;
                }
            }

            // Index 1: Status return code (ignore)
            // Index 2: File size (ignore)

            // Index 3: URL
            String url = lineParts[3];

            // Index 4: Hop path (ignore)
            // Index 5: Parent URL (ignore)

            // Index 6: Mime type
            String mime = lineParts[6];

            // Index 7: ToeThread number (ignore)
            // Index 8: ArcTimeAndDuration (used above for the timestamp)

            // Index 9: Digest
            String digest = lineParts[9];
            // The digest may contain a prefix. The prefix is terminated by a
            // colon, which is immediately followed by the actual digest.
            if (digest.lastIndexOf(":") >= 0) {
                digest = digest.substring(digest.lastIndexOf(":") + 1);
            }

            // Index 10: Source tag (ignore)

            // Index 11: Annotations (may be missing)
            String origin = null;
            boolean duplicate = false;
            if (lineParts.length == 12) {
                // Have an annotation field. Look for origin inside it.
                // Origin can be found in the 'annotations' field, preceded by
                // 'deduplicate:' (no quotes) and contained within a pair of
                // double quotes. Example: deduplicate:"origin".
                // Can very possibly be missing.
                String annotation = lineParts[11];

                int startIndex = annotation.indexOf("duplicate:\"");
                if (startIndex >= 0) {
                    // The annotation field contains origin info. Extract it.
                    startIndex += 11; // Skip over the 'duplicate:"' marker
                    int endIndex = annotation.indexOf('"', startIndex);
                    if (endIndex >= 0) {
                        origin = annotation.substring(startIndex, endIndex);
                    }
                    // That also means this is a duplicate of a URL from an
                    // earlier crawl.
                    duplicate = true;
                } else if (annotation.contains("duplicate")) {
                    // Is a duplicate of a URL from an earlier crawl, but
                    // no origin information was recorded.
                    duplicate = true;
                }
            }
            // Got a valid item.
            return new CrawlDataItem(url, digest, timestamp, null, mime, origin, duplicate);
        }
        return null;
    }

    /**
     * Closes the crawl.log file.
     */
    public void close() throws IOException {
        in.close();
    }

    /*
     * (non-Javadoc)
     *
     * @see is.hi.bok.deduplicator.CrawlDataIterator#getSourceType()
     */
    public String getSourceType() {
        return "Handles Heritrix style crawl.log files";
    }

}