/*
 * #%L
 * Netarchivesuite - harvester
 * %%
 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
 * the National Library of France and the Austrian National Library.
 * %%
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation, either version 2.1 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this program. If not, see
 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
 * #L%
 */
package dk.netarkivet.harvester.harvesting.report;

import gnu.inet.encoding.IDNA;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.net.URISyntaxException;
import java.util.HashMap;
import java.util.Map;

import org.jwat.common.Uri;
import org.jwat.common.UriProfile;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import dk.netarkivet.common.exceptions.ArgumentNotValid;
import dk.netarkivet.common.exceptions.IOFailure;
import dk.netarkivet.common.utils.DomainUtils;
import dk.netarkivet.common.utils.FileUtils;
import dk.netarkivet.common.utils.Settings;
import dk.netarkivet.common.utils.StringUtils;
import dk.netarkivet.harvester.HarvesterSettings;
import dk.netarkivet.harvester.datamodel.StopReason;
import dk.netarkivet.harvester.harvesting.HeritrixFiles;
import dk.netarkivet.harvester.harvesting.distribute.DomainStats;

/**
 * Base implementation for a harvest report. The constructor reads the data in a crawl.log file and parses it. The
 * crawl.log format is described in the Heritrix user manual, section 8.2.1:
 * http://crawler.archive.org/articles/user_manual/analysis.html#logs Note: invalid lines are logged and then
 * ignored.
 * <p>
 * Each URL listed in the file is assigned to a domain, and the total object count and byte count per domain are
 * calculated. Finally, a StopReason is found for each domain: when the response is CrawlURI.S_BLOCKED_BY_QUOTA
 * (currently = -5003), the StopReason is set to StopReason.SIZE_LIMIT if the annotation equals
 * "Q:group-max-all-kb", or to StopReason.OBJECT_LIMIT if the annotation equals "Q:group-max-fetch-successes".
 */
@SuppressWarnings({"serial"})
public class HarvestReportGenerator {

    /** The logger for this class. */
    private static final Logger log = LoggerFactory.getLogger(HarvestReportGenerator.class);

    /**
     * Strings found in the progress-statistics.log, used to devise the default stop reason for domains.
     */
    public static enum ProgressStatisticsConstants {

        /**
         * String that Heritrix writes as the last entry in the progress-statistics.log when a crawl ends in an
         * orderly fashion.
         */
        ORDERLY_FINISH("CRAWL ENDED"),

        /**
         * String written when Heritrix terminates the job because its time limit (max-time-sec) was reached.
         */
        TIMELIMIT_EXCEEDED("Timelimit hit"),

        /**
         * String which shows that the harvest was deliberately aborted from the Heritrix GUI, or forcibly stopped
         * by the NetarchiveSuite software due to an inactivity timeout.
         */
        HARVEST_ABORTED("Ended by operator");

        /** The pattern associated with a given enum value. */
        private final String pattern;

        /**
         * Constructor for this enum class.
         *
         * @param pattern The pattern associated with a given enum value.
         */
        ProgressStatisticsConstants(String pattern) {
            this.pattern = pattern;
        }

    }
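
    /*
     * For illustration only: the lines below paraphrase how a progress-statistics.log can end (they are not
     * verbatim Heritrix output) and show how the patterns above combine in findDefaultStopReason(File) further
     * down:
     *
     *   last line contains "CRAWL ENDED" only                     -> StopReason.DOWNLOAD_COMPLETE
     *   last line contains "CRAWL ENDED" and "Timelimit hit"      -> StopReason.TIME_LIMIT
     *   last line contains "CRAWL ENDED" and "Ended by operator"  -> StopReason.DOWNLOAD_UNFINISHED
     *   no "CRAWL ENDED", or the log file does not exist          -> StopReason.DOWNLOAD_UNFINISHED
     */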

    /** Data structure holding the domain information contained in one harvest. */
    private final Map<String, DomainStats> domainstats = new HashMap<String, DomainStats>();

    /** The Heritrix report files this report is generated from. */
    private HeritrixFiles heritrixFiles;

    /**
     * The default reason why we stopped harvesting this domain. This value is set by looking for a CRAWL ENDED
     * entry in the progress-statistics.log.
     */
    private StopReason defaultStopReason;

    /**
     * Default constructor that does nothing. The real construction is supposed to be done in the subclasses by
     * filling out the domainStats map with crawl results.
     */
    public HarvestReportGenerator() {
    }

    /**
     * Constructor from Heritrix report files. Subclasses might use a different set of Heritrix reports.
     *
     * @param files the set of Heritrix reports.
     */
    public HarvestReportGenerator(HeritrixFiles files) {
        ArgumentNotValid.checkNotNull(files, "files");
        this.heritrixFiles = files;
        this.defaultStopReason = findDefaultStopReason(heritrixFiles.getProgressStatisticsLog());
        preProcess(heritrixFiles);
    }

    /**
     * Pre-processing happens when the report is built just at the end of the crawl, before the ARC files are
     * uploaded.
     *
     * @param files the set of Heritrix reports containing the crawl.log to parse.
     * @throws IOFailure if the crawl.log is not a readable file.
     */
    public void preProcess(HeritrixFiles files) {
        if (log.isInfoEnabled()) {
            log.info("Starting pre-processing of harvest report for job {}", files.getJobID());
        }
        long startTime = System.currentTimeMillis();

        File crawlLog = files.getCrawlLog();
        if (!crawlLog.isFile() || !crawlLog.canRead()) {
            String errorMsg = "Not a file or not readable: " + crawlLog.getAbsolutePath();
            throw new IOFailure(errorMsg);
        }
        parseCrawlLog(files.getCrawlLog());

        if (log.isInfoEnabled()) {
            long time = System.currentTimeMillis() - startTime;
            log.info("Finished pre-processing of harvest report for job {}, operation took {}", files.getJobID(),
                    StringUtils.formatDuration(time));
        }
    }

    /**
     * Attempts to get an already existing {@link DomainStats} object for that domain, and if not found creates one
     * with zero values.
     *
     * @param domainName the name of the domain to get DomainStats for.
     * @return a DomainStats object for the given domain name.
     */
    protected DomainStats getOrCreateDomainStats(String domainName) {
        DomainStats dhi = domainstats.get(domainName);
        if (dhi == null) {
            dhi = new DomainStats(0L, 0L, defaultStopReason);
            domainstats.put(domainName, dhi);
        }

        return dhi;
    }

    /**
     * Find out whether we stopped normally, according to the progress-statistics log.
     *
     * @param logFile A progress-statistics.log file.
     * @return StopReason.DOWNLOAD_COMPLETE if the progress statistics end with CRAWL ENDED, StopReason.TIME_LIMIT
     * if the crawl hit its time limit, and StopReason.DOWNLOAD_UNFINISHED otherwise (including when the harvest
     * was aborted by an operator or the file does not exist).
     */
    public static StopReason findDefaultStopReason(File logFile) {
        ArgumentNotValid.checkNotNull(logFile, "File logFile");
        if (!logFile.exists()) {
            return StopReason.DOWNLOAD_UNFINISHED;
        }
        String lastLine = FileUtils.readLastLine(logFile);
        if (lastLine.contains(ProgressStatisticsConstants.ORDERLY_FINISH.pattern)) {
            if (lastLine.contains(ProgressStatisticsConstants.HARVEST_ABORTED.pattern)) {
                return StopReason.DOWNLOAD_UNFINISHED;
            } else if (lastLine.contains(ProgressStatisticsConstants.TIMELIMIT_EXCEEDED.pattern)) {
                return StopReason.TIME_LIMIT;
            } else {
                return StopReason.DOWNLOAD_COMPLETE;
            }
        } else {
            return StopReason.DOWNLOAD_UNFINISHED;
        }
    }
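
    /*
     * Usage sketch for findDefaultStopReason (the file path below is hypothetical):
     *
     *   File progressLog = new File("/harvests/job-42/logs/progress-statistics.log");
     *   StopReason reason = HarvestReportGenerator.findDefaultStopReason(progressLog);
     */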

    /**
     * Computes the domain-name/byte-count, domain-name/object-count and domain-name/stop-reason maps for a
     * crawl.log.
     *
     * @param file the local file to be processed
     * @throws IOFailure if there is a problem reading the file
     */
    private void parseCrawlLog(File file) throws IOFailure {
        // Read whether or not to disregard the SeedURL information in the crawl.log
        boolean disregardSeedUrls = Settings.getBoolean(HarvesterSettings.DISREGARD_SEEDURL_INFORMATION_IN_CRAWLLOG);
        BufferedReader in = null;

        try {
            in = new BufferedReader(new FileReader(file));
            String line;
            int lineCnt = 0;
            while ((line = in.readLine()) != null) {
                ++lineCnt;
                try {
                    processHarvestLine(line, disregardSeedUrls);
                } catch (ArgumentNotValid e) {
                    log.debug("Invalid line in '{}' line {}: '{}'. Ignoring due to reason: {}",
                            file.getAbsolutePath(), lineCnt, line, e.getMessage(), e);
                }
            }
        } catch (IOException e) {
            String msg = "Unable to open/read crawl.log file '" + file.getAbsolutePath() + "'.";
            log.warn(msg, e);
            throw new IOFailure(msg, e);
        } finally {
            if (in != null) {
                try {
                    in.close();
                } catch (IOException e) {
                    log.debug("Unable to close {}", file, e);
                    // Can't throw here, as it would hide the real exception
                }
            }
        }
    }
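
    /*
     * An illustrative crawl.log line, paraphrased rather than copied from real Heritrix output (shown wrapped over
     * two lines here). The whitespace-separated fields used by processHarvestLine below are field 2 (status code),
     * field 4 (object URL), field 11 (seed/source URL) and field 12 (annotations):
     *
     *   2014-02-01T12:00:00.000Z 200 12567 http://www.example.com/page.html LLE http://www.example.com/
     *       text/html #042 20140201120000000+123 sha1:EXAMPLEDIGEST http://www.example.com/ content-size:12567
     */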

    /**
     * Processes a harvest-line, updating the object and byte maps.
     *
     * @param line the line to process.
     * @param disregardSeedUrlInfo whether or not to disregard the SeedURL information
     * @throws ArgumentNotValid if the line is not a valid crawl.log line
     */
    private void processHarvestLine(final String line, boolean disregardSeedUrlInfo) {
        // A legal crawl.log line has at least 11 fields, plus an optional annotations field
        final int MIN_CRAWL_LOG_PARTS = 11;
        final int MAX_PARTS = 12;
        final int ANNOTATION_PART_INDEX = 11;
        String[] parts = line.split("\\s+", MAX_PARTS);
        if (parts.length < MIN_CRAWL_LOG_PARTS) {
            throw new ArgumentNotValid("Not enough fields for line in crawl.log: '" + line + "'. Was only "
                    + parts.length + " fields. Should have been at least " + MIN_CRAWL_LOG_PARTS);
        }

        // Check the seed url (field 11 of the crawl-log line). If it equals "-", the seed url was not written to
        // the log, and this information is disregarded. It is also disregarded if the setting
        // disregard_seed_url_information is enabled.
        String seedURL = parts[10];

        boolean sourceTagEnabled = true;
        if (seedURL.equals("-") || disregardSeedUrlInfo) {
            sourceTagEnabled = false;
        }
        String seedDomain = null;

        if (sourceTagEnabled) {
            try {
                seedDomain = getDomainNameFromURIString(seedURL);
                if (seedDomain != null) {
                    // Transform any IDNA encoded seedDomain back to Unicode
                    seedDomain = IDNA.toUnicode(seedDomain);
                }
            } catch (URISyntaxException e) {
                log.debug("Unable to extract a domain from the seedURL found in field 11 of crawl.log: '{}'.",
                        seedURL, e);
            }
        }

        // Get the object domain name from the URL in the fourth field
        String objectDomain = null;
        String objectUrl = parts[3];

        try {
            objectDomain = getDomainNameFromURIString(objectUrl);
            if (objectDomain != null) {
                // Transform any IDNA encoded domain back to Unicode
                objectDomain = IDNA.toUnicode(objectDomain);
            }
        } catch (URISyntaxException e) {
            log.debug("Unable to extract a domain from the object URL found in field 4 of crawl.log: '{}'.",
                    objectUrl, e);
        }

        if (objectDomain == null && seedDomain == null) {
            throw new ArgumentNotValid("Unable to find a domainName in the line: '" + line + "'.");
        }

        String domainName = null;

        if (sourceTagEnabled && seedDomain != null) {
            domainName = seedDomain;
        } else if (objectDomain != null) {
            domainName = objectDomain;
        } else {
            throw new ArgumentNotValid("Unable to find valid domainname");
        }

        // Get the response code for the URL in the second field
        long response;
        try {
            response = Long.parseLong(parts[1]);
        } catch (NumberFormatException e) {
            throw new ArgumentNotValid("Unparsable response code in field 2 of crawl.log: '" + parts[1] + "'.");
        }

        // Get the byte count from the "content-size" annotation, and the stop reason from the annotation field
        // if the status code is -5003
        StopReason stopReason = getDefaultStopReason();
        long byteCounter = 0;
        if (parts.length > MIN_CRAWL_LOG_PARTS) {
            // test if any annotations exist
            String[] annotations = parts[ANNOTATION_PART_INDEX].split(",");
            for (String annotation : annotations) {
                // ContentSizeAnnotationPostProcessor.CONTENT_SIZE_ANNOTATION_PREFIX
                if (annotation.trim().startsWith(Heritrix1Constants.CONTENT_SIZE_ANNOTATION_PREFIX)) {
                    try {
                        byteCounter = Long.parseLong(annotation
                                .substring(Heritrix1Constants.CONTENT_SIZE_ANNOTATION_PREFIX.length()));
                    } catch (NumberFormatException e) {
                        throw new ArgumentNotValid("Unparsable annotation in field 12 of crawl.log: '"
                                + parts[ANNOTATION_PART_INDEX] + "'.", e);
                    }
                }
                if (response == Heritrix1Constants.CRAWLURI_S_BLOCKED_BY_QUOTA) {
                    if (annotation.trim().equals("Q:group-max-all-kb")) {
                        stopReason = StopReason.SIZE_LIMIT;
                    } else if (annotation.trim().equals("Q:group-max-fetch-successes")) {
                        stopReason = StopReason.OBJECT_LIMIT;
                    }
                }
            }
        }

        // Update stats for domain
        DomainStats dhi = getOrCreateDomainStats(domainName);

        // Only count harvested URIs
        if (response >= 0) {
            long oldObjectCount = dhi.getObjectCount();
            dhi.setObjectCount(oldObjectCount + 1);
            long oldByteCount = dhi.getByteCount();
            dhi.setByteCount(oldByteCount + byteCounter);
        }
        // Only if the reason has not already been set
        if (dhi.getStopReason() == defaultStopReason) {
            dhi.setStopReason(stopReason);
        }
    }
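
    /*
     * Annotation handling, by example (the values are hypothetical): given status -5003
     * (Heritrix1Constants.CRAWLURI_S_BLOCKED_BY_QUOTA) and an annotation field of
     * "content-size:12567,Q:group-max-all-kb", processHarvestLine above parses 12567 into the byte counter (which
     * is only added to the domain totals when the status is non-negative) and sets the stop reason to
     * StopReason.SIZE_LIMIT; with "Q:group-max-fetch-successes" it sets StopReason.OBJECT_LIMIT instead.
     */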

    /**
     * Extract the domain name from a URI string. Does not handle Danish characters in the URI.
     *
     * @param uriAsString a given URI as string.
     * @return the domain name if possible, or null if not possible
     * @throws URISyntaxException if unable to create a valid URI from the given string
     */
    private String getDomainNameFromURIString(String uriAsString) throws URISyntaxException {
        Uri uri = new Uri(uriAsString, UriProfile.RFC3986_ABS_16BIT_LAX);
        String hostName;
        if ("dns".equals(uri.getScheme())) {
            hostName = uri.getPath();
        } else {
            hostName = uri.getHost();
        }
        if (hostName == null) {
            log.debug("Not possible to extract domainname from URL: {}", uriAsString);
            return null;
        }
        return DomainUtils.domainNameFromHostname(hostName);
    }

    /**
     * @return the default stopReason
     */
    public StopReason getDefaultStopReason() {
        return defaultStopReason;
    }

    /**
     * @return the map from domain name to the DomainStats collected for that domain.
     */
    public Map<String, DomainStats> getDomainStatsMap() {
        return this.domainstats;
    }

    /**
     * Generates a DomainStatsReport from the given Heritrix reports.
     *
     * @param files the set of Heritrix reports.
     * @return a DomainStatsReport holding the collected domain statistics and the default stop reason.
     */
    public static DomainStatsReport getDomainStatsReport(HeritrixFiles files) {
        HarvestReportGenerator hrg = new HarvestReportGenerator(files);
        return new DomainStatsReport(hrg.getDomainStatsMap(), hrg.getDefaultStopReason());
    }

}
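
/*
 * Illustrative usage of this class (a sketch; how the HeritrixFiles instance is obtained depends on the
 * surrounding harvester code and is elided here):
 *
 *   HeritrixFiles files = ...;
 *   HarvestReportGenerator hrg = new HarvestReportGenerator(files);
 *   StopReason defaultReason = hrg.getDefaultStopReason();
 *   for (Map.Entry<String, DomainStats> entry : hrg.getDomainStatsMap().entrySet()) {
 *       // one DomainStats per harvested domain: object count, byte count and stop reason
 *   }
 */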