/*
 * #%L
 * Netarchivesuite - harvester
 * %%
 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
 * the National Library of France and the Austrian National Library.
 * %%
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation, either version 2.1 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Lesser Public License for more details.
 *
 * You should have received a copy of the GNU General Lesser Public
 * License along with this program. If not, see
 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
 * #L%
 */
package dk.netarkivet.harvester.harvesting.report;

import gnu.inet.encoding.IDNA;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URISyntaxException;
import java.util.HashMap;
import java.util.Map;

import org.jwat.common.Uri;
import org.jwat.common.UriProfile;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import dk.netarkivet.common.exceptions.ArgumentNotValid;
import dk.netarkivet.common.exceptions.IOFailure;
import dk.netarkivet.common.utils.DomainUtils;
import dk.netarkivet.common.utils.FileUtils;
import dk.netarkivet.common.utils.Settings;
import dk.netarkivet.common.utils.StringUtils;
import dk.netarkivet.harvester.HarvesterSettings;
import dk.netarkivet.harvester.datamodel.StopReason;
import dk.netarkivet.harvester.harvesting.HeritrixFiles;
import dk.netarkivet.harvester.harvesting.distribute.DomainStats;

/**
 * Generates per-domain harvest statistics (object counts, byte counts and stop reasons) by parsing
 * the crawl.log and progress-statistics.log produced by a Heritrix crawl.
 *
 * <p>Not thread-safe: each instance parses one set of Heritrix files and accumulates results in a
 * private map.
 */
public class HarvestReportGenerator {

    /** The logger for this class. */
    private static final Logger log = LoggerFactory.getLogger(HarvestReportGenerator.class);

    /**
     * Strings found in the progress-statistics.log, used to devise the default stop reason for domains.
     */
    public static enum ProgressStatisticsConstants {

        /**
         * String in crawl.log, that Heritrix writes as the last entry in the progress-statistics.log.
         */
        ORDERLY_FINISH("CRAWL ENDED"),

        /**
         * This is written when Heritrix terminates the job due to its timelimit (max-time-sec) being reached.
         */
        TIMELIMIT_EXCEEDED("Timelimit hit"),

        /**
         * String which shows that the harvest was deliberately aborted from the Heritrix GUI or forcibly stopped by the
         * Netarchive Suite software due to an inactivity timeout.
         */
        HARVEST_ABORTED("Ended by operator");

        /** The pattern associated with a given enum value. */
        private final String pattern;

        /**
         * Constructor for this enum class.
         *
         * @param pattern The pattern associated with a given enum value.
         */
        ProgressStatisticsConstants(String pattern) {
            this.pattern = pattern;
        }

    }

    /** Datastructure holding the domain-information contained in one harvest, keyed by domain name. */
    private final Map<String, DomainStats> domainstats = new HashMap<String, DomainStats>();

    /** The set of Heritrix report files this generator reads; null when the no-arg constructor is used. */
    private HeritrixFiles heritrixFiles;

    /**
     * The default reason why we stopped harvesting this domain. This value is set by looking for a CRAWL ENDED in the
     * crawl.log.
     */
    private StopReason defaultStopReason;

    /**
     * Default constructor that does nothing. The real construction is supposed to be done in the subclasses by filling
     * out the domainStats map with crawl results.
     */
    public HarvestReportGenerator() {
    }

    /**
     * Constructor from Heritrix report files. Subclasses might use a different set of Heritrix reports.
     * Derives the default stop reason from the progress-statistics.log and immediately parses the crawl.log.
     *
     * @param files the set of Heritrix reports.
     * @throws ArgumentNotValid if files is null.
     * @throws IOFailure if the crawl.log is missing or unreadable.
     */
    public HarvestReportGenerator(HeritrixFiles files) {
        ArgumentNotValid.checkNotNull(files, "files");
        this.heritrixFiles = files;
        this.defaultStopReason = findDefaultStopReason(heritrixFiles.getProgressStatisticsLog());
        preProcess(heritrixFiles);
    }

    /**
     * Pre-processing happens when the report is built just at the end of the crawl, before the ARC files upload.
     * Parses the crawl.log and fills the domain statistics map.
     *
     * @param files the set of Heritrix reports whose crawl.log is parsed.
     * @throws IOFailure if the crawl.log is not an ordinary readable file.
     */
    public void preProcess(HeritrixFiles files) {
        if (log.isInfoEnabled()) {
            log.info("Starting pre-processing of harvest report for job {}", files.getJobID());
        }
        long startTime = System.currentTimeMillis();

        File crawlLog = files.getCrawlLog();
        if (!crawlLog.isFile() || !crawlLog.canRead()) {
            String errorMsg = "Not a file or not readable: " + crawlLog.getAbsolutePath();
            throw new IOFailure(errorMsg);
        }
        parseCrawlLog(files.getCrawlLog());

        if (log.isInfoEnabled()) {
            long time = System.currentTimeMillis() - startTime;
            log.info("Finished pre-processing of harvest report for job {}, operation took {}", files.getJobID(),
                    StringUtils.formatDuration(time));
        }
    }

    /**
     * Attempts to get an already existing {@link DomainStats} object for that domain, and if not found creates one with
     * zero values and the default stop reason.
     *
     * @param domainName the name of the domain to get DomainStats for.
     * @return a DomainStats object for the given domain-name.
     */
    protected DomainStats getOrCreateDomainStats(String domainName) {
        DomainStats dhi = domainstats.get(domainName);
        if (dhi == null) {
            dhi = new DomainStats(0L, 0L, defaultStopReason);
            domainstats.put(domainName, dhi);
        }

        return dhi;
    }

    /**
     * Find out whether we stopped normally in progress statistics log.
     *
     * @param logFile A progress-statistics.log file.
     * @return StopReason.DOWNLOAD_COMPLETE for progress statistics ending with CRAWL ENDED,
     * StopReason.DOWNLOAD_UNFINISHED otherwise or if file does not exist.
     * @throws ArgumentNotValid if logFile is null.
     */
    public static StopReason findDefaultStopReason(File logFile) {
        ArgumentNotValid.checkNotNull(logFile, "File logFile");
        if (!logFile.exists()) {
            return StopReason.DOWNLOAD_UNFINISHED;
        }
        String lastLine = FileUtils.readLastLine(logFile);
        if (lastLine.contains(ProgressStatisticsConstants.ORDERLY_FINISH.pattern)) {
            // "CRAWL ENDED" is also present when the crawl was aborted or timed out,
            // so check the more specific markers first.
            if (lastLine.contains(ProgressStatisticsConstants.HARVEST_ABORTED.pattern)) {
                return StopReason.DOWNLOAD_UNFINISHED;
            } else if (lastLine.contains(ProgressStatisticsConstants.TIMELIMIT_EXCEEDED.pattern)) {
                return StopReason.TIME_LIMIT;
            } else {
                return StopReason.DOWNLOAD_COMPLETE;
            }
        } else {
            return StopReason.DOWNLOAD_UNFINISHED;
        }
    }

    /**
     * Computes the domain-name/byte-count and domain-name/object-count and domain-name/stopreason maps for a crawl.log.
     * Invalid lines are logged and skipped; they do not abort the parse.
     *
     * @param file the local file to be processed
     * @throws IOFailure if there is problem reading the file
     */
    private void parseCrawlLog(File file) throws IOFailure {
        // read whether or not to disregard the SeedURL information
        // in the crawl.log
        boolean disregardSeedUrls = Settings.getBoolean(HarvesterSettings.DISREGARD_SEEDURL_INFORMATION_IN_CRAWLLOG);
        BufferedReader in = null;

        try {
            // Heritrix writes crawl.log in UTF-8, so decode explicitly instead of relying
            // on the platform default charset (which a plain FileReader would use).
            in = new BufferedReader(new InputStreamReader(new FileInputStream(file), "UTF-8"));
            String line;
            int lineCnt = 0;
            while ((line = in.readLine()) != null) {
                ++lineCnt;
                try {
                    processHarvestLine(line, disregardSeedUrls);
                } catch (ArgumentNotValid e) {
                    log.debug("Invalid line in '{}' line {}: '{}'. Ignoring due to reason: {}", file.getAbsolutePath(),
                            lineCnt, line, e.getMessage(), e);
                }
            }
        } catch (IOException e) {
            String msg = "Unable to open/read crawl.log file '" + file.getAbsolutePath() + "'.";
            log.warn(msg, e);
            throw new IOFailure(msg, e);
        } finally {
            if (in != null) {
                try {
                    in.close();
                } catch (IOException e) {
                    log.debug("Unable to close {}", file, e);
                    // Can't throw here, as would destroy the real exception
                }
            }
        }
    }

    /**
     * Processes a harvest-line, updating the object and byte maps.
     *
     * @param line the line to process.
     * @param disregardSeedUrlInfo Boolean saying whether or not to disregard SeedURL Information
     * @throws ArgumentNotValid if the line is malformed: too few fields, no extractable domain name,
     * unparsable response code, or an unparsable content-size annotation.
     */
    private void processHarvestLine(final String line, boolean disregardSeedUrlInfo) {
        // A legal crawl log line has at least 11 parts, + optional annotations

        final int MIN_CRAWL_LOG_PARTS = 11;
        final int MAX_PARTS = 12;
        final int ANNOTATION_PART_INDEX = 11;
        String[] parts = line.split("\\s+", MAX_PARTS);
        if (parts.length < MIN_CRAWL_LOG_PARTS) {
            throw new ArgumentNotValid("Not enough fields for line in crawl.log: '" + line + "'. Was only "
                    + parts.length + " fields. Should have been at least " + MIN_CRAWL_LOG_PARTS);
        }

        // Check the seed url (part 11 of the crawl-log-line).
        // If equal to "-", the seed url is not written to the log,
        // and this information is disregarded
        // Note This information is disregarded if setting disregard_seed_url_information
        // is enabled.

        String seedURL = parts[10];

        boolean sourceTagEnabled = true;
        if (seedURL.equals("-") || disregardSeedUrlInfo) {
            sourceTagEnabled = false;
        }
        String seedDomain = null;

        if (sourceTagEnabled) {
            try {
                seedDomain = getDomainNameFromURIString(seedURL);
                if (seedDomain != null) {
                    // Transform any IDNA encoded seedDomain back to Unicode
                    seedDomain = IDNA.toUnicode(seedDomain);
                }
            } catch (URISyntaxException e) {
                log.debug("Unable to extract a domain from the seedURL found in field 11 of crawl.log: '{}'.", seedURL,
                        e);
            }
        }

        // Get the object domain name from the URL in the fourth field
        String objectDomain = null;
        String objectUrl = parts[3];

        try {
            objectDomain = getDomainNameFromURIString(objectUrl);
            if (objectDomain != null) {
                // Transform the any IDNA encoded domain back to Unicode
                objectDomain = IDNA.toUnicode(objectDomain);
            }
        } catch (URISyntaxException e) {
            log.debug("Unable to extract a domain from the object URL found in field 4 of crawl.log: '{}'.", objectUrl,
                    e);
        }

        if (objectDomain == null && seedDomain == null) {
            throw new ArgumentNotValid("Unable to find a domainName in the line: '" + line + "'.");
        }

        String domainName = null;

        // Prefer the seed domain when seed-URL info is in use; fall back to the object domain.
        if (sourceTagEnabled && seedDomain != null) {
            domainName = seedDomain;
        } else if (objectDomain != null) {
            domainName = objectDomain;
        } else {
            throw new ArgumentNotValid("Unable to find valid domainname");
        }

        // Get the response code for the URL in the second field
        long response;
        try {
            response = Long.parseLong(parts[1]);
        } catch (NumberFormatException e) {
            throw new ArgumentNotValid("Unparsable response code in field 2 of crawl.log: '" + parts[1] + "'.");
        }

        // Get the byte count from annotation field "content-size"
        // and the stop reason from annotation field if status code is -5003
        StopReason stopReason = getDefaultStopReason();
        long byteCounter = 0;
        if (parts.length > MIN_CRAWL_LOG_PARTS) {
            // test if any annotations exist
            String[] annotations = parts[ANNOTATION_PART_INDEX].split(",");
            for (String annotation : annotations) {
                // ContentSizeAnnotationPostProcessor.CONTENT_SIZE_ANNOTATION_PREFIX
                if (annotation.trim().startsWith(Heritrix1Constants.CONTENT_SIZE_ANNOTATION_PREFIX)) {
                    try {
                        // ContentSizeAnnotationPostProcessor.CONTENT_SIZE_ANNOTATION_PREFIX
                        byteCounter = Long.parseLong(annotation
                                .substring(Heritrix1Constants.CONTENT_SIZE_ANNOTATION_PREFIX.length()));
                    } catch (NumberFormatException e) {
                        throw new ArgumentNotValid("Unparsable annotation in field 12 of crawl.log: '"
                                + parts[ANNOTATION_PART_INDEX] + "'.", e);
                    }
                }
                if (response == Heritrix1Constants.CRAWLURI_S_BLOCKED_BY_QUOTA) {
                    if (annotation.trim().equals("Q:group-max-all-kb")) {
                        stopReason = StopReason.SIZE_LIMIT;
                    } else if (annotation.trim().equals("Q:group-max-fetch-successes")) {
                        stopReason = StopReason.OBJECT_LIMIT;
                    }
                }
            }
        }

        // Update stats for domain
        DomainStats dhi = getOrCreateDomainStats(domainName);

        // Only count harvested URIs
        if (response >= 0) {
            long oldObjectCount = dhi.getObjectCount();
            dhi.setObjectCount(oldObjectCount + 1);
            long oldByteCount = dhi.getByteCount();
            dhi.setByteCount(oldByteCount + byteCounter);
        }
        // Only if reason not set
        if (dhi.getStopReason() == defaultStopReason) {
            dhi.setStopReason(stopReason);
        }
    }

    /**
     * Extract DomainName from URI string. Does not handle Danish characters in URI.
     *
     * @param uriAsString a given URI as string.
     * @return the domainName if possible or null, if not possible
     * @throws URISyntaxException If unable to create valid URI from the given string
     */
    private String getDomainNameFromURIString(String uriAsString) throws URISyntaxException {
        Uri uri = new Uri(uriAsString, UriProfile.RFC3986_ABS_16BIT_LAX);
        String hostName;
        if ("dns".equals(uri.getScheme())) {
            // For dns: URIs the looked-up host is in the path component, not the host component.
            hostName = uri.getPath();
        } else {
            hostName = uri.getHost();
        }
        if (hostName == null) {
            log.debug("Not possible to extract domainname from URL: {}", uriAsString);
            return null;
        }
        return DomainUtils.domainNameFromHostname(hostName);
    }

    /**
     * Gets the default stop reason derived from the progress-statistics.log.
     *
     * @return the default stop reason, or null if this generator was built with the no-arg constructor.
     */
    public StopReason getDefaultStopReason() {
        return defaultStopReason;
    }

    /**
     * Gets the accumulated per-domain statistics. The returned map is the live internal map, not a copy.
     *
     * @return a map from domain name to its {@link DomainStats}.
     */
    public Map<String, DomainStats> getDomainStatsMap() {
        return this.domainstats;
    }

    /**
     * Convenience factory: parses the given Heritrix files and bundles the result into a report.
     *
     * @param files the set of Heritrix reports to parse.
     * @return a {@link DomainStatsReport} holding the domain statistics and the default stop reason.
     */
    public static DomainStatsReport getDomainStatsReport(HeritrixFiles files) {
        HarvestReportGenerator hrg = new HarvestReportGenerator(files);
        return new DomainStatsReport(hrg.getDomainStatsMap(), hrg.getDefaultStopReason());
    }

}