/* CrawlLogIterator
 *
 * Created on 10.04.2006
 *
 * Copyright (C) 2006 National and University Library of Iceland
 *
 * This file is part of the DeDuplicator (Heritrix add-on module).
 *
 * DeDuplicator is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * DeDuplicator is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with DeDuplicator; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */
package is.hi.bok.deduplicator;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.NoSuchElementException;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
/**
 * An implementation of a {@link is.hi.bok.deduplicator.CrawlDataIterator} capable of iterating over a Heritrix-style
 * <code>crawl.log</code>.
 *
 * @author Kristinn Sigur&eth;sson
 * @author Lars Clausen
 */
public class CrawlLogIterator extends CrawlDataIterator {

    private Log logger = LogFactory.getLog(getClass().getName());

    protected final String crawlDateFormatStr = "yyyyMMddHHmmss";
    protected final String fallbackCrawlDateFormatStr = "yyyy-MM-dd'T'HH:mm:ss.SSS'Z'";
    /**
     * The date format used in crawl.log files.
     */
    protected final SimpleDateFormat crawlDateFormat = new SimpleDateFormat(crawlDateFormatStr);
    protected final SimpleDateFormat fallbackCrawlDateFormat = new SimpleDateFormat(fallbackCrawlDateFormatStr);
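    // Illustrative timestamp shapes accepted by the two formats above
    // (example values, consistent with the parsing comments in parseLine()):
    //   crawlDateFormat:         20170116161421           (truncated ArcTimeAndDuration field)
    //   fallbackCrawlDateFormat: 2017-01-16T16:14:21.526Z (leading timestamp of a crawl.log line)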

    /**
     * The date format specified by the {@link CrawlDataItem} for dates entered into it (and eventually into the index).
     */
    protected final SimpleDateFormat crawlDataItemFormat = new SimpleDateFormat(CrawlDataItem.dateFormat);

    /**
     * A reader for the crawl.log file being processed.
     */
    protected BufferedReader in;

    /**
     * The next item to be issued (if ready) or null if the next item has not been prepared or there are no more
     * elements.
     */
    protected CrawlDataItem next;

    /**
     * Create a new CrawlLogIterator that reads items from a Heritrix crawl.log.
     *
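     * A minimal usage sketch (illustrative; IOException handling omitted):
     * <pre>
     * CrawlLogIterator it = new CrawlLogIterator("crawl.log");
     * try {
     *     while (it.hasNext()) {
     *         CrawlDataItem item = it.next();
     *         // process item
     *     }
     * } finally {
     *     it.close();
     * }
     * </pre>
     *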
     * @param source The path of a Heritrix crawl.log file.
     * @throws IOException If errors were found reading the log.
     */
    public CrawlLogIterator(String source) throws IOException {
        super(source);
        in = new BufferedReader(new InputStreamReader(new FileInputStream(new File(source))));
    }

    /**
     * Returns true if there are more items available.
     *
     * @return True if at least one more item can be fetched with next().
     */
    public boolean hasNext() throws IOException {
        if (next == null) {
            prepareNext();
        }
        return next != null;
    }

    /**
     * Returns the next valid item from the crawl log.
     *
     * @return The next item from the crawl log.
     * @throws IOException If there is an error reading the item *after* the item to be returned from the crawl.log.
     * @throws NoSuchElementException If there are no more items.
     */
    public CrawlDataItem next() throws IOException {
        if (hasNext()) {
            CrawlDataItem tmp = next;
            this.next = null;
            return tmp;
        }
        throw new NoSuchElementException("No more items");
    }

    /**
     * Ready the next item. This method will skip over items that parseLine() rejects. When the method returns, either
     * next is non-null or there are no more items in the crawl log.
     * <p>
     * Note: This method should only be called when <code>next == null</code>.
     */
    protected void prepareNext() throws IOException {
        String line;
        while ((line = in.readLine()) != null) {
            next = parseLine(line);
            if (next != null) {
                return;
            }
        }
    }

    /**
     * Parse a single line of the crawl log.
     * <p>
     * Override this method to change how individual crawl log items are processed and accepted/rejected. This method is
     * called from within the loop in prepareNext().
     *
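     * An example line (illustrative; the field layout is inferred from the indices parsed below):
     * <pre>
     * 2017-01-16T16:14:21.526Z   200   2248 http://example.com/ L http://example.com/index.html text/html #042 20170116161421526+52 sha1:AAAA... - deduplicate:"origin"
     * </pre>
     *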
     * @param line A line from the crawl log. Must not be null.
     * @return A {@link CrawlDataItem} if the line yielded a usable item, null otherwise.
     */
    protected CrawlDataItem parseLine(String line) {
        if (line != null && line.length() > 42) {
            // Split the line up by whitespace.
            // Limit to 12 parts (annotations may contain spaces, but will
            // always be at the end of each line).
            String[] lineParts = line.split("\\s+", 12);

            if (lineParts.length < 10) {
                // Fewer than 10 parts means the line is malformed.
                return null;
            }

            // Index 0: Timestamp
            String timestamp;
            try {
                // Convert from the crawl.log format to the format specified by
                // CrawlDataItem. The preferred source is lineParts[8]
                // (ArcTimeAndDuration), for example 20170116161421526+52;
                // keep the leading digits down to whole seconds: 20170116161421.
                String timestampTrunc = lineParts[8].substring(0, crawlDateFormatStr.length());
                timestamp = crawlDataItemFormat.format(crawlDateFormat.parse(timestampTrunc));
            } catch (Exception e) {
                // Fall back to the leading timestamp of the line (lineParts[0]).
                try {
                    timestamp = crawlDataItemFormat.format(fallbackCrawlDateFormat.parse(lineParts[0]));
                } catch (ParseException e1) {
                    logger.debug("Error parsing date for crawl log entry: " + line);
                    return null;
                }
            }

            // Index 1: status return code (ignore)
            // Index 2: File size (ignore)

            // Index 3: URL
            String url = lineParts[3];

            // Index 4: Hop path (ignore)
            // Index 5: Parent URL (ignore)

            // Index 6: Mime type
            String mime = lineParts[6];

            // Index 7: ToeThread number (ignore)
            // Index 8: ArcTimeAndDuration (used above for the timestamp)

            // Index 9: Digest
            String digest = lineParts[9];
            // The digest may contain a prefix. The prefix is terminated by
            // a ':' which is immediately followed by the actual digest.
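            // For example, an illustrative value "sha1:ABCDEF" becomes "ABCDEF".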
            if (digest.lastIndexOf(":") >= 0) {
                digest = digest.substring(digest.lastIndexOf(":") + 1);
            }

            // Index 10: Source tag (ignore)

            // Index 11: Annotations (may be missing)
            String origin = null;
            boolean duplicate = false;
            if (lineParts.length == 12) {
                // Have an annotation field. Look for origin inside it.
                // Origin can be found in the 'annotations' field, preceded by
                // 'deduplicate:' (no quotes) and contained within a pair of
                // double quotes. Example: deduplicate:"origin".
                // It may well be missing.
                String annotation = lineParts[11];

                int startIndex = annotation.indexOf("duplicate:\"");
                if (startIndex >= 0) {
                    // The annotation field contains origin info. Extract it.
                    startIndex += 11; // Skip over the 'duplicate:"' part
                    int endIndex = annotation.indexOf('"', startIndex);
                    // Guard against a missing closing quote.
                    if (endIndex >= 0) {
                        origin = annotation.substring(startIndex, endIndex);
                    }
                    // That also means this is a duplicate of a URL from an
                    // earlier crawl.
                    duplicate = true;
                } else if (annotation.contains("duplicate")) {
                    // Is a duplicate of a URL from an earlier crawl but
                    // no origin information was recorded.
                    duplicate = true;
                }
            }
            // Got a valid item.
            return new CrawlDataItem(url, digest, timestamp, null, mime, origin, duplicate);
        }
        return null;
    }
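
    // A hypothetical override sketch: subclasses can filter what parseLine()
    // accepts. The getter name getMimeType() on CrawlDataItem is assumed for
    // illustration and may differ in the actual class.
    //
    //   CrawlLogIterator textOnly = new CrawlLogIterator("crawl.log") {
    //       @Override
    //       protected CrawlDataItem parseLine(String line) {
    //           CrawlDataItem item = super.parseLine(line);
    //           return (item != null && item.getMimeType().startsWith("text/"))
    //                   ? item : null;
    //       }
    //   };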

    /**
     * Closes the crawl.log file.
     */
    public void close() throws IOException {
        in.close();
    }

    /*
     * (non-Javadoc)
     *
     * @see is.hi.bok.deduplicator.CrawlDataIterator#getSourceType()
     */
    public String getSourceType() {
        return "Handles Heritrix style crawl.log files";
    }

}