/*
 * #%L
 * Netarchivesuite - harvester
 * %%
 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
 *             the National Library of France and the Austrian National Library.
 * %%
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation, either version 2.1 of the
 * License, or (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 * 
 * You should have received a copy of the GNU Lesser General Public
 * License along with this program.  If not, see
 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
 * #L%
 */

package dk.netarkivet.harvester.indexserver;

import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import dk.netarkivet.common.exceptions.ArgumentNotValid;
import dk.netarkivet.common.exceptions.IOFailure;
import dk.netarkivet.common.utils.cdx.CDXRecord;
import is.hi.bok.deduplicator.CrawlDataItem;
import is.hi.bok.deduplicator.CrawlLogIterator;
/**
 * This subclass of CrawlLogIterator adds the layer of digging an origin of the form "arcfile,offset" out of a
 * corresponding CDX index. This may cause some entries in the crawl log to be skipped (those for which no origin can
 * be determined). The two files are read in parallel.
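 * <p>
 * A minimal usage sketch, assuming both inputs are sorted as described on the constructor. The file names are
 * hypothetical, and the {@code hasNext()}/{@code next()} iteration is the one inherited from CrawlLogIterator:
 *
 * <pre>{@code
 * BufferedReader cdx = new BufferedReader(new FileReader("sorted.cdx"));
 * try {
 *     CDXOriginCrawlLogIterator iterator = new CDXOriginCrawlLogIterator(new File("sorted-crawl.log"), cdx);
 *     while (iterator.hasNext()) {
 *         CrawlDataItem item = iterator.next(); // item.getOrigin() is of the form "arcfile,offset"
 *     }
 * } finally {
 *     cdx.close(); // the caller owns the CDX reader; CrawlLogIterator provides no close()
 * }
 * }</pre>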
 */
public class CDXOriginCrawlLogIterator extends CrawlLogIterator {

    /** The log. */
    private static final Logger log = LoggerFactory.getLogger(CDXOriginCrawlLogIterator.class);

    /** The reader of the (sorted) CDX index. */
    protected BufferedReader reader;

    /**
     * The last record read from the CDX reader. The CDX reading may overshoot when the crawl log contains entries
     * that are missing from the CDX, so we hang onto this record until the reading of the crawl.log catches up.
     */
    protected CDXRecord lastRecord;

    /**
     * The constant prefix that newer versions of Heritrix put on checksums to indicate the digest method. The
     * deduplicator currently doesn't use the equivalent prefix, so we need to strip it off (see bug #1004).
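     * <p>
     * For example, a content digest of {@code sha1:<digest>} is reduced to just {@code <digest>}.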
     */
    private static final String SHA1_PREFIX = "sha1:";

    /**
     * Create a new CDXOriginCrawlLogIterator from crawl.log and CDX sources.
     *
     * @param source File containing a crawl.log sorted by URL (LANG=C sort -k 4b)
     * @param cdx A reader of a sorted CDX file. This is given as a reader so that it may be closed after use
     * (CrawlLogIterator provides no close())
     * @throws IOException If the underlying CrawlLogIterator fails, e.g. due to missing files.
     */
    public CDXOriginCrawlLogIterator(File source, BufferedReader cdx) throws IOException {
        super(source.getAbsolutePath());
        ArgumentNotValid.checkNotNull(cdx, "BufferedReader cdx");
        reader = cdx;
    }

    /**
     * Parse a crawl.log line into a valid CrawlDataItem.
     * <p>
     * If CrawlLogIterator accepts the line, we must make sure that the resulting item has an origin, looking up
     * missing ones in the CDX file. If multiple origins are found in the CDX file, the one that was harvested last is
     * chosen. If no origin can be found, the item is rejected.
     * <p>
     * We assume that super.parseLine() delivers the items in the crawl.log in the given (sorted) order with non-null
     * URLs, though it may throw undeclared exceptions.
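     * <p>
     * For example, if the CDX input contained two entries for the (hypothetical) URL {@code http://www.example.com/},
     * dated {@code 20140101120000} and {@code 20140315120000}, the origin would be built from the arcfile and offset
     * of the 20140315 entry, since its date compares greater.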
     *
     * @param line A crawl.log line to parse.
     * @return A CrawlDataItem with a valid origin field, or null if we could not determine an appropriate origin.
     * @throws IOFailure if there is an error reading the files.
     */
    @Override
    protected CrawlDataItem parseLine(String line) throws IOFailure {
        CrawlDataItem item;
        log.trace("Processing crawl-log line: {}", line);
        try {
            item = super.parseLine(line);
        } catch (RuntimeException e) {
            log.debug("Skipping over bad crawl-log line '{}'", line, e);
            return null;
        }

        // Hack that works around bug #1004: sha1: prefix not accounted for
        if (item != null && item.getContentDigest() != null
                && item.getContentDigest().toLowerCase().startsWith(SHA1_PREFIX)) {
            item.setContentDigest(item.getContentDigest().substring(SHA1_PREFIX.length()));
        }

        // If an origin was found in the crawl log, we accept that as correct.
        // Otherwise we must find the origin in the CDX file.
        if (item != null && item.getOrigin() == null) {
            // Iterate through the sorted CDX file until lastRecord is not null
            // and lastRecord.getURL() is lexicographically higher than
            // item.getURL(), indicating that there are no more matches.
            CDXRecord foundRecord = null;
            while (lastRecord == null || lastRecord.getURL().compareTo(item.getURL()) <= 0) {
                // If the cdx URL is the one we are looking for, we have a
                // potential origin.
                if (lastRecord != null && lastRecord.getURL().equals(item.getURL())) {
                    // If this is our first potential origin, or if it is better
                    // than the one we currently consider best, we remember this
                    // entry. A better origin is defined as one with a later
                    // date than the current choice.
                    if (foundRecord == null || lastRecord.getDate().compareTo(foundRecord.getDate()) > 0) {
                        foundRecord = lastRecord;
                        log.trace("foundRecord set to '{},{}'", foundRecord.getArcfile(), foundRecord.getOffset());
                    }
                }

                // Read the next line
                try {
                    String record = reader.readLine();
                    if (record == null) {
                        break; // EOF, nothing to do
                    }
                    if (record.length() == 0) {
                        continue; // skip empty lines
                    }
                    try {
                        lastRecord = new CDXRecord(record);
                    } catch (ArgumentNotValid e) {
                        log.debug("Skipping over bad CDX line '{}'", record, e);
                        continue;
                    }
                    log.trace("lastRecord is '{}'", record);
                } catch (IOException e) {
                    throw new IOFailure("Error reading CDX record", e);
                }
            }
            if (foundRecord == null) {
                if (lastRecord == null) {
                    log.trace("No matching CDX for URL '{}'. No last CDX was found.", item.getURL());
                } else {
                    log.trace("No matching CDX for URL '{}'. Last CDX was for URL '{}'", item.getURL(),
                            lastRecord.getURL());
                }

                return null;
            }

            String origin = foundRecord.getArcfile() + "," + foundRecord.getOffset();
            item.setOrigin(origin);
            log.trace("URL '{}' combined with origin '{}'.", item.getURL(), origin);
        }
        return item;
    }

}