/* CrawlLogIterator
 *
 * Created on 10.04.2006
 *
 * Copyright (C) 2006 National and University Library of Iceland
 *
 * This file is part of the DeDuplicator (Heritrix add-on module).
 *
 * DeDuplicator is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * DeDuplicator is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with DeDuplicator; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */
package is.hi.bok.deduplicator;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.NoSuchElementException;

/**
 * An implementation of a {@link is.hi.bok.deduplicator.CrawlDataIterator} capable of iterating over a Heritrix's style
 * <code>crawl.log</code>.
 * <p>
 * Instances are <em>not</em> thread-safe: they hold a reader position and mutable
 * {@link SimpleDateFormat} fields.
 *
 * @author Kristinn Sigurðsson
 * @author Lars Clausen
 */
public class CrawlLogIterator extends CrawlDataIterator {

    /**
     * Length of the <code>duplicate:"</code> marker that precedes an origin value
     * inside the annotations field.
     */
    private static final String ORIGIN_MARKER = "duplicate:\"";

    /**
     * The date format used in crawl.log files.
     * NOTE: SimpleDateFormat is not thread-safe; this field must only be used by
     * the thread driving this iterator.
     */
    protected final SimpleDateFormat crawlDateFormat = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSS'Z'");

    /**
     * The date format specified by the {@link CrawlDataItem} for dates entered into it (and eventually into the index).
     * NOTE: SimpleDateFormat is not thread-safe; this field must only be used by
     * the thread driving this iterator.
     */
    protected final SimpleDateFormat crawlDataItemFormat = new SimpleDateFormat(CrawlDataItem.dateFormat);

    /**
     * A reader for the crawl.log file being processed
     */
    protected BufferedReader in;

    /**
     * The next item to be issued (if ready) or null if the next item has not been prepared or there are no more
     * elements
     */
    protected CrawlDataItem next;

    /**
     * Create a new CrawlLogIterator that reads items from a Heritrix crawl.log
     *
     * @param source The path of a Heritrix crawl.log file.
     * @throws IOException If errors were found reading the log.
     */
    public CrawlLogIterator(String source) throws IOException {
        super(source);
        // Decode explicitly as UTF-8 rather than the platform default charset;
        // Heritrix writes its logs as UTF-8 (NOTE(review): confirm against the
        // Heritrix version in use).
        in = new BufferedReader(
                new InputStreamReader(new FileInputStream(new File(source)), StandardCharsets.UTF_8));
    }

    /**
     * Returns true if there are more items available.
     *
     * @return True if at least one more item can be fetched with next().
     */
    public boolean hasNext() throws IOException {
        if (next == null) {
            prepareNext();
        }
        return next != null;
    }

    /**
     * Returns the next valid item from the crawl log.
     *
     * @return An item from the crawl log. Note that unlike the Iterator interface, this method returns null if there
     *         are no more items to fetch.
     * @throws IOException If there is an error reading the item *after* the item to be returned from the crawl.log.
     * @throws NoSuchElementException If there are no more items
     */
    public CrawlDataItem next() throws IOException {
        if (hasNext()) {
            CrawlDataItem tmp = next;
            this.next = null;
            return tmp;
        }
        throw new NoSuchElementException("No more items");
    }

    /**
     * Ready the next item. This method will skip over items that parseLine() rejects. When the method returns, either
     * next is non-null or there are no more items in the crawl log.
     * <p>
     * Note: This method should only be called when <code>next==null</code>
     */
    protected void prepareNext() throws IOException {
        String line;
        while ((line = in.readLine()) != null) {
            next = parseLine(line);
            if (next != null) {
                return;
            }
        }
    }

    /**
     * Parse a line in the crawl log.
     * <p>
     * Override this method to change how individual crawl log items are processed and accepted/rejected. This method is
     * called from within the loop in prepareNext().
     *
     * @param line A line from the crawl log. Must not be null.
     * @return A {@link CrawlDataItem} if the next line in the crawl log yielded a usable item, null otherwise.
     */
    protected CrawlDataItem parseLine(String line) {
        // Lines shorter than this cannot possibly contain all mandatory fields.
        if (line != null && line.length() > 42) {
            // Split the line up by whitespaces.
            // Limit to 12 parts (annotations may contain spaces, but will
            // always be at the end of each line).
            String[] lineParts = line.split("\\s+", 12);

            if (lineParts.length < 10) {
                // If the lineParts are fewer than 10 then the line is
                // malformed.
                return null;
            }

            // Index 0: Timestamp
            String timestamp;
            try {
                // Convert from crawl.log format to the format specified by
                // CrawlDataItem
                timestamp = crawlDataItemFormat.format(crawlDateFormat.parse(lineParts[0]));
            } catch (ParseException e) {
                System.err.println("Error parsing date for: " + line);
                e.printStackTrace();
                return null;
            }

            // Index 1: status return code (ignore)
            // Index 2: File size (ignore)

            // Index 3: URL
            String url = lineParts[3];

            // Index 4: Hop path (ignore)
            // Index 5: Parent URL (ignore)

            // Index 6: Mime type
            String mime = lineParts[6];

            // Index 7: ToeThread number (ignore)
            // Index 8: ArcTimeAndDuration (ignore)

            // Index 9: Digest
            String digest = lineParts[9];
            // The digest may contain a prefix.
            // The prefix will be terminated by a : which is immediately
            // followed by the actual digest.
            int prefixEnd = digest.lastIndexOf(':');
            if (prefixEnd >= 0) {
                digest = digest.substring(prefixEnd + 1);
            }

            // Index 10: Source tag (ignore)

            // Index 11: Annotations (may be missing)
            String origin = null;
            boolean duplicate = false;
            if (lineParts.length == 12) {
                // Have an annotation field. Look for origin inside it.
                // Origin can be found in the 'annotations' field, preceded by
                // 'deduplicate:' (no quotes) and contained within a pair of
                // double quotes. Example: deduplicate:"origin".
                // Can very possibly be missing.
                String annotation = lineParts[11];

                // Matching on 'duplicate:"' also catches 'deduplicate:"'.
                int startIndex = annotation.indexOf(ORIGIN_MARKER);
                if (startIndex >= 0) {
                    // The annotation field contains origin info. Extract it.
                    // Skip over the 'duplicate:"' part to the first character
                    // of the origin value.
                    startIndex += ORIGIN_MARKER.length();
                    // BUGFIX: search for the closing quote starting AT
                    // startIndex (not startIndex + 1), so an empty origin
                    // (deduplicate:"") is handled, and guard against a missing
                    // closing quote instead of throwing from substring().
                    int endIndex = annotation.indexOf('"', startIndex);
                    if (endIndex >= 0) {
                        origin = annotation.substring(startIndex, endIndex);
                    }
                    // That also means this is a duplicate of an URL from an
                    // earlier crawl
                    duplicate = true;
                } else if (annotation.contains("duplicate")) {
                    // Is a duplicate of an URL from an earlier crawl but
                    // no origin information was recorded
                    duplicate = true;
                }
            }
            // Got a valid item.
            return new CrawlDataItem(url, digest, timestamp, null, mime, origin, duplicate);
        }
        return null;
    }

    /**
     * Closes the crawl.log file.
     */
    public void close() throws IOException {
        in.close();
    }

    /*
     * (non-Javadoc)
     *
     * @see is.hi.bok.deduplicator.CrawlDataIterator#getSourceType()
     */
    public String getSourceType() {
        return "Handles Heritrix style crawl.log files";
    }

}