/* CrawlLogIterator
 *
 * Created on 10.04.2006
 *
 * Copyright (C) 2006 National and University Library of Iceland
 *
 * This file is part of the DeDuplicator (Heritrix add-on module).
 *
 * DeDuplicator is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * DeDuplicator is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with DeDuplicator; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */
package is.hi.bok.deduplicator;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.NoSuchElementException;

/**
 * An implementation of a {@link is.hi.bok.deduplicator.CrawlDataIterator} capable of iterating over a Heritrix's style
 * <code>crawl.log</code>.
 * <p>
 * Instances are <em>not</em> thread-safe: they hold a reader position and mutable
 * {@link SimpleDateFormat} fields.
 *
 * @author Kristinn Sigurðsson
 * @author Lars Clausen
 */
public class CrawlLogIterator extends CrawlDataIterator {

    /**
     * Length of the <code>duplicate:"</code> marker that precedes an origin value
     * inside the annotations field.
     */
    private static final String ORIGIN_MARKER = "duplicate:\"";

    /**
     * The date format used in crawl.log files.
     * NOTE: SimpleDateFormat is not thread-safe; this field must only be used by
     * the thread driving this iterator.
     */
    protected final SimpleDateFormat crawlDateFormat = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSS'Z'");

    /**
     * The date format specified by the {@link CrawlDataItem} for dates entered into it (and eventually into the index).
     * NOTE: SimpleDateFormat is not thread-safe; this field must only be used by
     * the thread driving this iterator.
     */
    protected final SimpleDateFormat crawlDataItemFormat = new SimpleDateFormat(CrawlDataItem.dateFormat);

    /**
     * A reader for the crawl.log file being processed
     */
    protected BufferedReader in;

    /**
     * The next item to be issued (if ready) or null if the next item has not been prepared or there are no more
     * elements
     */
    protected CrawlDataItem next;

    /**
     * Create a new CrawlLogIterator that reads items from a Heritrix crawl.log
     *
     * @param source The path of a Heritrix crawl.log file.
     * @throws IOException If errors were found reading the log.
     */
    public CrawlLogIterator(String source) throws IOException {
        super(source);
        // Decode explicitly as UTF-8 rather than the platform default charset;
        // Heritrix writes its logs as UTF-8 (NOTE(review): confirm against the
        // Heritrix version in use).
        in = new BufferedReader(
                new InputStreamReader(new FileInputStream(new File(source)), StandardCharsets.UTF_8));
    }

    /**
     * Returns true if there are more items available.
     *
     * @return True if at least one more item can be fetched with next().
     */
    public boolean hasNext() throws IOException {
        if (next == null) {
            prepareNext();
        }
        return next != null;
    }

    /**
     * Returns the next valid item from the crawl log.
     *
     * @return An item from the crawl log. Note that unlike the Iterator interface, this method returns null if there
     *         are no more items to fetch.
     * @throws IOException If there is an error reading the item *after* the item to be returned from the crawl.log.
     * @throws NoSuchElementException If there are no more items
     */
    public CrawlDataItem next() throws IOException {
        if (hasNext()) {
            CrawlDataItem tmp = next;
            this.next = null;
            return tmp;
        }
        throw new NoSuchElementException("No more items");
    }

    /**
     * Ready the next item. This method will skip over items that parseLine() rejects. When the method returns, either
     * next is non-null or there are no more items in the crawl log.
     * <p>
     * Note: This method should only be called when <code>next==null</code>
     */
    protected void prepareNext() throws IOException {
        String line;
        while ((line = in.readLine()) != null) {
            next = parseLine(line);
            if (next != null) {
                return;
            }
        }
    }

    /**
     * Parse a line in the crawl log.
     * <p>
     * Override this method to change how individual crawl log items are processed and accepted/rejected. This method is
     * called from within the loop in prepareNext().
     *
     * @param line A line from the crawl log. Must not be null.
     * @return A {@link CrawlDataItem} if the next line in the crawl log yielded a usable item, null otherwise.
     */
    protected CrawlDataItem parseLine(String line) {
        // Lines shorter than this cannot possibly contain all mandatory fields.
        if (line != null && line.length() > 42) {
            // Split the line up by whitespaces.
            // Limit to 12 parts (annotations may contain spaces, but will
            // always be at the end of each line).
            String[] lineParts = line.split("\\s+", 12);

            if (lineParts.length < 10) {
                // If the lineParts are fewer than 10 then the line is
                // malformed.
                return null;
            }

            // Index 0: Timestamp
            String timestamp;
            try {
                // Convert from crawl.log format to the format specified by
                // CrawlDataItem
                timestamp = crawlDataItemFormat.format(crawlDateFormat.parse(lineParts[0]));
            } catch (ParseException e) {
                System.err.println("Error parsing date for: " + line);
                e.printStackTrace();
                return null;
            }

            // Index 1: status return code (ignore)
            // Index 2: File size (ignore)

            // Index 3: URL
            String url = lineParts[3];

            // Index 4: Hop path (ignore)
            // Index 5: Parent URL (ignore)

            // Index 6: Mime type
            String mime = lineParts[6];

            // Index 7: ToeThread number (ignore)
            // Index 8: ArcTimeAndDuration (ignore)

            // Index 9: Digest
            String digest = lineParts[9];
            // The digest may contain a prefix.
            // The prefix will be terminated by a : which is immediately
            // followed by the actual digest.
            int prefixEnd = digest.lastIndexOf(':');
            if (prefixEnd >= 0) {
                digest = digest.substring(prefixEnd + 1);
            }

            // Index 10: Source tag (ignore)

            // Index 11: Annotations (may be missing)
            String origin = null;
            boolean duplicate = false;
            if (lineParts.length == 12) {
                // Have an annotation field. Look for origin inside it.
                // Origin can be found in the 'annotations' field, preceded by
                // 'deduplicate:' (no quotes) and contained within a pair of
                // double quotes. Example: deduplicate:"origin".
                // Can very possibly be missing.
                String annotation = lineParts[11];

                // Matching on 'duplicate:"' also catches 'deduplicate:"'.
                int startIndex = annotation.indexOf(ORIGIN_MARKER);
                if (startIndex >= 0) {
                    // The annotation field contains origin info. Extract it.
                    // Skip over the 'duplicate:"' part to the first character
                    // of the origin value.
                    startIndex += ORIGIN_MARKER.length();
                    // BUGFIX: search for the closing quote starting AT
                    // startIndex (not startIndex + 1), so an empty origin
                    // (deduplicate:"") is handled, and guard against a missing
                    // closing quote instead of throwing from substring().
                    int endIndex = annotation.indexOf('"', startIndex);
                    if (endIndex >= 0) {
                        origin = annotation.substring(startIndex, endIndex);
                    }
                    // That also means this is a duplicate of an URL from an
                    // earlier crawl
                    duplicate = true;
                } else if (annotation.contains("duplicate")) {
                    // Is a duplicate of an URL from an earlier crawl but
                    // no origin information was recorded
                    duplicate = true;
                }
            }
            // Got a valid item.
            return new CrawlDataItem(url, digest, timestamp, null, mime, origin, duplicate);
        }
        return null;
    }

    /**
     * Closes the crawl.log file.
     */
    public void close() throws IOException {
        in.close();
    }

    /*
     * (non-Javadoc)
     *
     * @see is.hi.bok.deduplicator.CrawlDataIterator#getSourceType()
     */
    public String getSourceType() {
        return "Handles Heritrix style crawl.log files";
    }

}