Source code

001/*
002 * #%L
003 * Netarchivesuite - harvester
004 * %%
005 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
006 *             the National Library of France and the Austrian National Library.
007 * %%
008 * This program is free software: you can redistribute it and/or modify
009 * it under the terms of the GNU Lesser General Public License as
010 * published by the Free Software Foundation, either version 2.1 of the
011 * License, or (at your option) any later version.
012 * 
013 * This program is distributed in the hope that it will be useful,
014 * but WITHOUT ANY WARRANTY; without even the implied warranty of
015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
016 * GNU General Lesser Public License for more details.
017 * 
018 * You should have received a copy of the GNU General Lesser Public
019 * License along with this program.  If not, see
020 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
021 * #L%
022 */
023package dk.netarkivet.harvester.heritrix3.report;
024
025import gnu.inet.encoding.IDNA;
026
027import java.io.BufferedReader;
028import java.io.File;
029import java.io.FileReader;
030import java.io.IOException;
031import java.util.HashMap;
032import java.util.Map;
033
034import org.apache.commons.httpclient.URIException;
035import org.archive.url.UsableURI;
036import org.slf4j.Logger;
037import org.slf4j.LoggerFactory;
038
039import dk.netarkivet.common.exceptions.ArgumentNotValid;
040import dk.netarkivet.common.exceptions.IOFailure;
041import dk.netarkivet.common.utils.DomainUtils;
042import dk.netarkivet.common.utils.FileUtils;
043import dk.netarkivet.common.utils.FixedUURI;
044import dk.netarkivet.common.utils.Settings;
045import dk.netarkivet.common.utils.StringUtils;
046import dk.netarkivet.harvester.datamodel.StopReason;
047import dk.netarkivet.harvester.harvesting.distribute.DomainStats;
048import dk.netarkivet.harvester.harvesting.report.DomainStatsReport;
049import dk.netarkivet.harvester.harvesting.report.Heritrix1Constants;
050import dk.netarkivet.harvester.heritrix3.Heritrix3Files;
051import dk.netarkivet.harvester.heritrix3.Heritrix3Settings;
052
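/**
 * Generates per-domain harvest statistics (object count, byte count and stop reason) for a Heritrix3 crawl by
 * parsing its crawl.log and the last line of its progress-statistics.log.
 */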
056@SuppressWarnings({"serial"})
057public class HarvestReportGenerator {
058
059    /** The logger for this class. */
060    private static final Logger log = LoggerFactory.getLogger(HarvestReportGenerator.class);
061
062
063    private static final String BYTE_LIMIT_REACHED_ANNOTATION = "Q:groupMaxAllKb";
064    private static final String OBJECT_LIMIT_REACHED_ANNOTATION = "Q:groupMaxFetchSuccesses";
065
066    /**
067     * Strings found in the progress-statistics.log, used to devise the default stop reason for domains.
068     */
069    public static enum ProgressStatisticsConstants {
070
071        /**
072         * String in crawl.log, that Heritrix writes as the last entry in the progress-statistics.log.
073         */
074        ORDERLY_FINISH("CRAWL ENDED"),
075
076        /**
077         * This is written when Heritrix terminates the job due to its timelimit (max-time-sec) being reached.
078         */
079        TIMELIMIT_EXCEEDED("Timelimit hit"),
080
081        /**
082         * String which shows that the harvest was deliberately aborted from the Heritrix GUI or forcibly stopped by the
083         * Netarchive Suite software due to an inactivity timeout.
084         */
085        HARVEST_ABORTED("Ended by operator");
086
087        /** The pattern associated with a given enum value. */
088        private final String pattern;
089
090        /**
091         * Constructor for this enum class.
092         *
093         * @param pattern The pattern associated with a given enum value.
094         */
095        ProgressStatisticsConstants(String pattern) {
096            this.pattern = pattern;
097        }
098
099    }
100    
101    /** Datastructure holding the domain-information contained in one harvest. */
102    private final Map<String, DomainStats> domainstats = new HashMap<String, DomainStats>();
103
104    private Heritrix3Files heritrixFiles;
105
    /**
     * The default reason why we stopped harvesting this domain. This value is set by looking for a CRAWL ENDED in the
     * last line of the progress-statistics.log.
     */
110    private StopReason defaultStopReason;
111
112    /**
113     * Default constructor that does nothing. The real construction is supposed to be done in the subclasses by filling
114     * out the domainStats map with crawl results.
115     */
116    public HarvestReportGenerator() {
117    }
118
119    /**
120     * Constructor from Heritrix report files. Subclasses might use a different set of Heritrix reports.
121     *
122     * @param files the set of Heritrix reports.
123     */
124    public HarvestReportGenerator(Heritrix3Files files) {
125        ArgumentNotValid.checkNotNull(files, "files");
126        this.heritrixFiles = files;
127        this.defaultStopReason = findDefaultStopReason(heritrixFiles.getProgressStatisticsLog());
128        preProcess(heritrixFiles);
129    }
130
131    /**
132     * Pre-processing happens when the report is built just at the end of the crawl, before the ARC files upload.
133     */
134    public void preProcess(Heritrix3Files files) {
135        if (log.isInfoEnabled()) {
136            log.info("Starting pre-processing of harvest report for job {}", files.getJobID());
137        }
138        long startTime = System.currentTimeMillis();
139
140        File crawlLog = files.getCrawlLog();
141        if (!crawlLog.isFile() || !crawlLog.canRead()) {
142            String errorMsg = "Not a file or not readable: " + crawlLog.getAbsolutePath();
143            throw new IOFailure(errorMsg);
144        }
145        parseCrawlLog(files.getCrawlLog());
146
147        if (log.isInfoEnabled()) {
148            long time = System.currentTimeMillis() - startTime;
149            log.info("Finished pre-processing of harvest report for job {}, operation took {}", files.getJobID(),
150                    StringUtils.formatDuration(time));
151        }
152    }
153   
154    /**
155     * Attempts to get an already existing {@link DomainStats} object for that domain, and if not found creates one with
156     * zero values.
157     *
158     * @param domainName the name of the domain to get DomainStats for.
159     * @return a DomainStats object for the given domain-name.
160     */
161    protected DomainStats getOrCreateDomainStats(String domainName) {
162        DomainStats dhi = domainstats.get(domainName);
163        if (dhi == null) {
164            dhi = new DomainStats(0L, 0L, defaultStopReason);
165            domainstats.put(domainName, dhi);
166        }
167
168        return dhi;
169    }
170
171    /**
172     * Find out whether we stopped normally in progress statistics log.
173     *
174     * @param logFile A progress-statistics.log file.
175     * @return StopReason.DOWNLOAD_COMPLETE for progress statistics ending with CRAWL ENDED,
176     * StopReason.DOWNLOAD_UNFINISHED otherwise or if file does not exist.
177     */
178    public static StopReason findDefaultStopReason(File logFile) {
179        ArgumentNotValid.checkNotNull(logFile, "File logFile");
180        if (!logFile.exists()) {
181            return StopReason.DOWNLOAD_UNFINISHED;
182        }
183        String lastLine = FileUtils.readLastLine(logFile);
184        if (lastLine.contains(ProgressStatisticsConstants.ORDERLY_FINISH.pattern)) {
185            if (lastLine.contains(ProgressStatisticsConstants.HARVEST_ABORTED.pattern)) {
186                return StopReason.DOWNLOAD_UNFINISHED;
187            } else if (lastLine.contains(ProgressStatisticsConstants.TIMELIMIT_EXCEEDED.pattern)) {
188                return StopReason.TIME_LIMIT;
189            } else {
190                return StopReason.DOWNLOAD_COMPLETE;
191            }
192        } else {
193            return StopReason.DOWNLOAD_UNFINISHED;
194        }
195    }
196
197    /**
198     * Computes the domain-name/byte-count and domain-name/object-count and domain-name/stopreason maps for a crawl.log.
199     *
200     * @param file the local file to be processed
201     * @throws IOFailure if there is problem reading the file
202     */
203    private void parseCrawlLog(File file) throws IOFailure {
204        // read whether or not to disregard the SeedURL information
205        // in the crawl.log
206        boolean disregardSeedUrls = Settings.getBoolean(Heritrix3Settings.DISREGARD_SEEDURL_INFORMATION_IN_CRAWLLOG);
207        log.info("DISREGARD_SEEDURL_INFORMATION_IN_CRAWLLOG: " + disregardSeedUrls); 
208        BufferedReader in = null;
209
210        try {
211            in = new BufferedReader(new FileReader(file));
212            String line;
213            int lineCnt = 0;
214            while ((line = in.readLine()) != null) {
215                ++lineCnt;
216                try {
217                    processHarvestLine(line, disregardSeedUrls);
218                } catch (ArgumentNotValid e) {
219                    log.debug("Invalid line in '{}' line {}: '{}'. Ignoring due to reason: {}", file.getAbsolutePath(),
220                            lineCnt, line, e.getMessage(), e);
221                }
222            }
223        } catch (IOException e) {
224            String msg = "Unable to open/read crawl.log file '" + file.getAbsolutePath() + "'.";
225            log.warn(msg, e);
226            throw new IOFailure(msg, e);
227        } finally {
228                
229            if (in != null) {
230                try {
231                    in.close();
232                } catch (IOException e) {
233                    log.debug("Unable to close {}", file, e);
234                    // Can't throw here, as would destroy the real exception
235                }
236            }
237        }
238    }
239
240    /**
241     * Processes a harvest-line, updating the object and byte maps.
242     *
243     * @param line the line to process.
244     * @param disregardSeedUrlInfo Boolean saying whether or not to disregard SeedURL Information
245     */
246    private void processHarvestLine(final String line, boolean disregardSeedUrlInfo) {
247        // A legal crawl log line has at least 11 parts, + optional annotations
248
249        final int MIN_CRAWL_LOG_PARTS = 11;
250        final int MAX_PARTS = 12;
251        final int ANNOTATION_PART_INDEX = 11;
252        String[] parts = line.split("\\s+", MAX_PARTS);
253        if (parts.length < MIN_CRAWL_LOG_PARTS) {
254            throw new ArgumentNotValid("Not enough fields for line in crawl.log: '" + line + "'. Was only "
255                    + parts.length + " fields. Should have been at least " + MIN_CRAWL_LOG_PARTS);
256        }
257
258        // Check the seed url (part 11 of the crawl-log-line).
259        // If equal to "-", the seed url is not written to the log,
260        // and this information is disregarded
261        // Note This information is disregarded if setting disregard_seed_url_information
262        // is enabled.
263
264        String seedURL = parts[10];
265
266        boolean sourceTagEnabled = true;
267        if (seedURL.equals("-") || disregardSeedUrlInfo) {
268            sourceTagEnabled = false;
269        }
270        String seedDomain = null;
271
272        if (sourceTagEnabled) {
273            try {
274                seedDomain = getDomainNameFromURIString(seedURL);
275                if (seedDomain != null) {
276                    // Transform any IDNA encoded seedDomain back to Unicode
277                    seedDomain = IDNA.toUnicode(seedDomain);
278                }
279            } catch (URIException e) {
280                log.debug("Unable to extract a domain from the seedURL found in field 11 of crawl.log: '{}'.", seedURL,
281                        e);
282            }
283        }
284
285        // Get the object domain name from the URL in the fourth field
286        String objectDomain = null;
287        String objectUrl = parts[3];
288
289        try {
290            objectDomain = getDomainNameFromURIString(objectUrl);
291            if (objectDomain != null) {
292                // Transform the any IDNA encoded domain back to Unicode
293                objectDomain = IDNA.toUnicode(objectDomain);
294            }
295        } catch (URIException e) {
296            log.debug("Unable to extract a domain from the object URL found in field 4 of crawl.log: '{}'.", objectUrl,
297                    e);
298        }
299
300        if (objectDomain == null && seedDomain == null) {
301            throw new ArgumentNotValid("Unable to find a domainName in the line: '" + line + "'.");
302        }
303
304        String domainName = null;
305
306        if (sourceTagEnabled && seedDomain != null) {
307            domainName = seedDomain;
308        } else if (objectDomain != null) {
309            domainName = objectDomain;
310        } else {
311            throw new ArgumentNotValid("Unable to find valid domainname");
312        }
313
314        // Get the response code for the URL in the second field
315        long response;
316        try {
317            response = Long.parseLong(parts[1]);
318        } catch (NumberFormatException e) {
319            throw new ArgumentNotValid("Unparsable response code in field 2 of crawl.log: '" + parts[1] + "'.");
320        }
321
322        // Get the byte count from annotation field "content-size"
323        // and the stop reason from annotation field if status code is -5003
324        StopReason stopReason = getDefaultStopReason();
325        long byteCounter = 0;
326        if (parts.length > MIN_CRAWL_LOG_PARTS) {
327            // test if any annotations exist
328            String[] annotations = parts[ANNOTATION_PART_INDEX].split(",");
329            for (String annotation : annotations) {
330                // ContentSizeAnnotationPostProcessor.CONTENT_SIZE_ANNOTATION_PREFIX
331                if (annotation.trim().startsWith(Heritrix1Constants.CONTENT_SIZE_ANNOTATION_PREFIX)) {
332                    try {
333                        // ContentSizeAnnotationPostProcessor.CONTENT_SIZE_ANNOTATION_PREFIX
334                        byteCounter = Long.parseLong(annotation
335                                .substring(Heritrix1Constants.CONTENT_SIZE_ANNOTATION_PREFIX.length()));
336                    } catch (NumberFormatException e) {
337                        throw new ArgumentNotValid("Unparsable annotation in field 12 of crawl.log: '"
338                                + parts[ANNOTATION_PART_INDEX] + "'.", e);
339                    }
340                }
341                if (response == Heritrix1Constants.CRAWLURI_S_BLOCKED_BY_QUOTA) {
342                    if (annotation.trim().equals(BYTE_LIMIT_REACHED_ANNOTATION)) {
343                        stopReason = StopReason.SIZE_LIMIT;
344                    } else if (annotation.trim().equals(OBJECT_LIMIT_REACHED_ANNOTATION)) {
345                        stopReason = StopReason.OBJECT_LIMIT;
346                    }
347                }
348            }
349        }
350
351        // Update stats for domain
352        DomainStats dhi = getOrCreateDomainStats(domainName);
353
354        // Only count harvested URIs
355        if (response >= 0) {
356            long oldObjectCount = dhi.getObjectCount();
357            dhi.setObjectCount(oldObjectCount + 1);
358            long oldByteCount = dhi.getByteCount();
359            dhi.setByteCount(oldByteCount + byteCounter);
360        }
361        // Only if reason not set
362        if (dhi.getStopReason() == defaultStopReason) {
363            dhi.setStopReason(stopReason);
364        }
365    }
366
367        /**
368     * Extract DomainName from URI string. Does not handle Danish characters in URI.
369     *
370     * @param uriAsString a given URI as string.
371     * @return the domainName if possible or null, if not possible
372     * @throws URIException If unable to create valid URI from the given string
373     */
374    private String getDomainNameFromURIString(String uriAsString) throws URIException {
375        UsableURI uuri = new FixedUURI(uriAsString, false);
376        String hostName = uuri.getReferencedHost();
377        if (hostName == null) {
378            log.debug("Not possible to extract domainname from URL: {}", uriAsString);
379            return null;
380        }
381        return DomainUtils.domainNameFromHostname(hostName);
382    }
383    
384    /**
385     * @return the default stop reason. 
386     */
387    public StopReason getDefaultStopReason() {
388        return defaultStopReason;
389    }
390    
391    /**
392     * @return the domainStatsMap generated from parsing the crawl-log.
393     */
394    public Map<String, DomainStats> getDomainStatsMap() {
395        return this.domainstats;
396    }
397    
398    /**
399     * @param files A set of Heritrix3 files used to produce a a HarvestReport.
400     * @return a DomainStatsReport for a specific H3 crawl.
401     */
402        public static DomainStatsReport getDomainStatsReport(Heritrix3Files files) {
403                HarvestReportGenerator hrg = new HarvestReportGenerator(files);
404                return new DomainStatsReport(hrg.getDomainStatsMap(), hrg.getDefaultStopReason());
405        }
406    
407}