/*
 * #%L
 * Netarchivesuite - harvester
 * %%
 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
 *             the National Library of France and the Austrian National Library.
 * %%
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation, either version 2.1 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Lesser Public License for more details.
 *
 * You should have received a copy of the GNU General Lesser Public
 * License along with this program.  If not, see
 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
 * #L%
 */
package dk.netarkivet.harvester.heritrix3.report;

import gnu.inet.encoding.IDNA;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

import org.apache.commons.httpclient.URIException;
import org.archive.url.UsableURI;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import dk.netarkivet.common.exceptions.ArgumentNotValid;
import dk.netarkivet.common.exceptions.IOFailure;
import dk.netarkivet.common.utils.DomainUtils;
import dk.netarkivet.common.utils.FileUtils;
import dk.netarkivet.common.utils.FixedUURI;
import dk.netarkivet.common.utils.Settings;
import dk.netarkivet.common.utils.StringUtils;
import dk.netarkivet.harvester.datamodel.StopReason;
import dk.netarkivet.harvester.harvesting.distribute.DomainStats;
import dk.netarkivet.harvester.harvesting.report.DomainStatsReport;
import dk.netarkivet.harvester.harvesting.report.Heritrix1Constants;
import dk.netarkivet.harvester.heritrix3.Heritrix3Files;
import dk.netarkivet.harvester.heritrix3.Heritrix3Settings;

/**
 * Builds per-domain harvest statistics (object count, byte count and stop reason) by parsing the crawl.log of a
 * Heritrix3 job, and derives a default stop reason from the last line of the progress-statistics log.
 */
public class HarvestReportGenerator {

    /** The logger for this class. */
    private static final Logger log = LoggerFactory.getLogger(HarvestReportGenerator.class);

    /** Annotation that, on a blocked-by-quota line, maps to {@link StopReason#SIZE_LIMIT}. */
    private static final String BYTE_LIMIT_REACHED_ANNOTATION = "Q:groupMaxAllKb";
    /** Annotation that, on a blocked-by-quota line, maps to {@link StopReason#OBJECT_LIMIT}. */
    private static final String OBJECT_LIMIT_REACHED_ANNOTATION = "Q:groupMaxFetchSuccesses";

    /** A legal crawl.log line has at least this many whitespace-separated parts. */
    private static final int MIN_CRAWL_LOG_PARTS = 11;
    /** Split limit: the mandatory parts plus one trailing part holding the optional annotations. */
    private static final int MAX_PARTS = 12;
    /** Zero-based index of the response code part. */
    private static final int RESPONSE_CODE_PART_INDEX = 1;
    /** Zero-based index of the object URL part. */
    private static final int OBJECT_URL_PART_INDEX = 3;
    /** Zero-based index of the seed URL part. */
    private static final int SEED_URL_PART_INDEX = 10;
    /** Zero-based index of the optional annotations part. */
    private static final int ANNOTATION_PART_INDEX = 11;

    /**
     * Strings found in the progress-statistics.log, used to devise the default stop reason for domains.
     */
    public static enum ProgressStatisticsConstants {

        /**
         * String that must be present in the last entry of the progress-statistics.log for the crawl to be
         * considered ended.
         */
        ORDERLY_FINISH("CRAWL ENDED"),

        /**
         * This is written when Heritrix terminates the job due to its timelimit (max-time-sec) being reached.
         */
        TIMELIMIT_EXCEEDED("Timelimit hit"),

        /**
         * String which shows that the harvest was deliberately aborted from the Heritrix GUI or forcibly stopped by
         * the Netarchive Suite software due to an inactivity timeout.
         */
        HARVEST_ABORTED("Ended by operator");

        /** The pattern associated with a given enum value. */
        private final String pattern;

        /**
         * Constructor for this enum class.
         *
         * @param pattern The pattern associated with a given enum value.
         */
        ProgressStatisticsConstants(String pattern) {
            this.pattern = pattern;
        }

    }

    /** Datastructure holding the domain-information contained in one harvest. */
    private final Map<String, DomainStats> domainstats = new HashMap<String, DomainStats>();

    /** The Heritrix3 files this report was generated from; stays null when the default constructor is used. */
    private Heritrix3Files heritrixFiles;

    /**
     * The default reason why we stopped harvesting this domain. This value is set by inspecting the last line of the
     * progress-statistics log (see {@link #findDefaultStopReason(File)}); it stays null when the default constructor
     * is used.
     */
    private StopReason defaultStopReason;

    /**
     * Default constructor that does nothing. The real construction is supposed to be done in the subclasses by
     * filling out the domainStats map with crawl results.
     */
    public HarvestReportGenerator() {
    }

    /**
     * Constructor from Heritrix report files. Subclasses might use a different set of Heritrix reports.
     * <p>
     * Determines the default stop reason from the progress-statistics log and then parses the crawl.log via
     * {@link #preProcess(Heritrix3Files)}.
     *
     * @param files the set of Heritrix reports.
     * @throws ArgumentNotValid if files is null.
     * @throws IOFailure if the crawl.log is not a readable file or cannot be read.
     */
    public HarvestReportGenerator(Heritrix3Files files) {
        ArgumentNotValid.checkNotNull(files, "files");
        this.heritrixFiles = files;
        this.defaultStopReason = findDefaultStopReason(heritrixFiles.getProgressStatisticsLog());
        preProcess(heritrixFiles);
    }

    /**
     * Pre-processing happens when the report is built just at the end of the crawl, before the ARC files upload.
     * Parses the crawl.log of the given job into the domain statistics map and logs how long that took.
     *
     * @param files the Heritrix3 files whose crawl.log is to be parsed.
     * @throws IOFailure if the crawl.log is not a file, is not readable, or reading it fails.
     */
    public void preProcess(Heritrix3Files files) {
        if (log.isInfoEnabled()) {
            log.info("Starting pre-processing of harvest report for job {}", files.getJobID());
        }
        long startTime = System.currentTimeMillis();

        File crawlLog = files.getCrawlLog();
        if (!crawlLog.isFile() || !crawlLog.canRead()) {
            String errorMsg = "Not a file or not readable: " + crawlLog.getAbsolutePath();
            throw new IOFailure(errorMsg);
        }
        parseCrawlLog(crawlLog);

        if (log.isInfoEnabled()) {
            long time = System.currentTimeMillis() - startTime;
            log.info("Finished pre-processing of harvest report for job {}, operation took {}", files.getJobID(),
                    StringUtils.formatDuration(time));
        }
    }

    /**
     * Attempts to get an already existing {@link DomainStats} object for that domain, and if not found creates one
     * with zero values and the default stop reason, and registers it in the map.
     *
     * @param domainName the name of the domain to get DomainStats for.
     * @return a DomainStats object for the given domain-name.
     */
    protected DomainStats getOrCreateDomainStats(String domainName) {
        DomainStats dhi = domainstats.get(domainName);
        if (dhi == null) {
            dhi = new DomainStats(0L, 0L, defaultStopReason);
            domainstats.put(domainName, dhi);
        }
        return dhi;
    }

    /**
     * Find out whether we stopped normally in progress statistics log.
     *
     * @param logFile A progress-statistics.log file.
     * @return StopReason.DOWNLOAD_UNFINISHED if the file does not exist, or if its last line lacks the CRAWL ENDED
     * marker or contains the "Ended by operator" marker; StopReason.TIME_LIMIT if the last line contains CRAWL ENDED
     * and "Timelimit hit"; StopReason.DOWNLOAD_COMPLETE for any other last line containing CRAWL ENDED.
     * @throws ArgumentNotValid if logFile is null.
     */
    public static StopReason findDefaultStopReason(File logFile) {
        ArgumentNotValid.checkNotNull(logFile, "File logFile");
        if (!logFile.exists()) {
            return StopReason.DOWNLOAD_UNFINISHED;
        }
        String lastLine = FileUtils.readLastLine(logFile);
        // NOTE(review): defensive guard in case readLastLine yields null (presumably for an empty log) -
        // confirm against FileUtils. Without a last line there is no evidence of an orderly finish.
        if (lastLine == null || !lastLine.contains(ProgressStatisticsConstants.ORDERLY_FINISH.pattern)) {
            return StopReason.DOWNLOAD_UNFINISHED;
        }
        // The crawl ended; the abort marker takes precedence over the time limit marker.
        if (lastLine.contains(ProgressStatisticsConstants.HARVEST_ABORTED.pattern)) {
            return StopReason.DOWNLOAD_UNFINISHED;
        }
        if (lastLine.contains(ProgressStatisticsConstants.TIMELIMIT_EXCEEDED.pattern)) {
            return StopReason.TIME_LIMIT;
        }
        return StopReason.DOWNLOAD_COMPLETE;
    }

    /**
     * Computes the domain-name/byte-count and domain-name/object-count and domain-name/stopreason maps for a
     * crawl.log. Lines rejected with an ArgumentNotValid are logged at debug level and skipped.
     *
     * @param file the local file to be processed
     * @throws IOFailure if there is problem reading the file
     */
    private void parseCrawlLog(File file) throws IOFailure {
        // read whether or not to disregard the SeedURL information
        // in the crawl.log
        boolean disregardSeedUrls = Settings.getBoolean(Heritrix3Settings.DISREGARD_SEEDURL_INFORMATION_IN_CRAWLLOG);
        log.info("DISREGARD_SEEDURL_INFORMATION_IN_CRAWLLOG: {}", disregardSeedUrls);
        BufferedReader in = null;

        try {
            in = new BufferedReader(new FileReader(file));
            String line;
            int lineCnt = 0;
            while ((line = in.readLine()) != null) {
                ++lineCnt;
                try {
                    processHarvestLine(line, disregardSeedUrls);
                } catch (ArgumentNotValid e) {
                    log.debug("Invalid line in '{}' line {}: '{}'. Ignoring due to reason: {}",
                            file.getAbsolutePath(), lineCnt, line, e.getMessage(), e);
                }
            }
        } catch (IOException e) {
            String msg = "Unable to open/read crawl.log file '" + file.getAbsolutePath() + "'.";
            log.warn(msg, e);
            throw new IOFailure(msg, e);
        } finally {
            if (in != null) {
                try {
                    in.close();
                } catch (IOException e) {
                    log.debug("Unable to close {}", file, e);
                    // Can't throw here, as would destroy the real exception
                }
            }
        }
    }

    /**
     * Processes a harvest-line, updating the object and byte maps.
     *
     * @param line the line to process.
     * @param disregardSeedUrlInfo Boolean saying whether or not to disregard SeedURL Information
     * @throws ArgumentNotValid if the line has too few fields, no domain can be derived from it, or the response
     * code or content-size annotation is not a number.
     */
    private void processHarvestLine(final String line, boolean disregardSeedUrlInfo) {
        // A legal crawl log line has at least 11 parts, + optional annotations
        String[] parts = line.split("\\s+", MAX_PARTS);
        if (parts.length < MIN_CRAWL_LOG_PARTS) {
            throw new ArgumentNotValid("Not enough fields for line in crawl.log: '" + line + "'. Was only "
                    + parts.length + " fields. Should have been at least " + MIN_CRAWL_LOG_PARTS);
        }

        // Check the seed url (part 11 of the crawl-log-line).
        // If equal to "-", the seed url is not written to the log,
        // and this information is disregarded
        // Note This information is disregarded if setting disregard_seed_url_information
        // is enabled.
        String seedURL = parts[SEED_URL_PART_INDEX];
        boolean sourceTagEnabled = !(seedURL.equals("-") || disregardSeedUrlInfo);

        String seedDomain = null;
        if (sourceTagEnabled) {
            try {
                seedDomain = getUnicodeDomainNameFromURIString(seedURL);
            } catch (URIException e) {
                log.debug("Unable to extract a domain from the seedURL found in field 11 of crawl.log: '{}'.",
                        seedURL, e);
            }
        }

        // Get the object domain name from the URL in the fourth field
        String objectDomain = null;
        String objectUrl = parts[OBJECT_URL_PART_INDEX];
        try {
            objectDomain = getUnicodeDomainNameFromURIString(objectUrl);
        } catch (URIException e) {
            log.debug("Unable to extract a domain from the object URL found in field 4 of crawl.log: '{}'.",
                    objectUrl, e);
        }

        if (objectDomain == null && seedDomain == null) {
            throw new ArgumentNotValid("Unable to find a domainName in the line: '" + line + "'.");
        }

        // seedDomain is only ever non-null when the source tag is enabled, and at least one of the two
        // domains is non-null at this point, so the seed domain simply takes precedence.
        String domainName = (seedDomain != null) ? seedDomain : objectDomain;

        // Get the response code for the URL in the second field
        long response;
        try {
            response = Long.parseLong(parts[RESPONSE_CODE_PART_INDEX]);
        } catch (NumberFormatException e) {
            throw new ArgumentNotValid("Unparsable response code in field 2 of crawl.log: '"
                    + parts[RESPONSE_CODE_PART_INDEX] + "'.", e);
        }

        // Get the byte count from annotation field "content-size"
        // and the stop reason from annotation field if the line was blocked by quota
        StopReason stopReason = getDefaultStopReason();
        long byteCounter = 0;
        if (parts.length > MIN_CRAWL_LOG_PARTS) {
            // test if any annotations exist
            for (String rawAnnotation : parts[ANNOTATION_PART_INDEX].split(",")) {
                // Trim once and use the trimmed value everywhere, so the prefix test and the value
                // extraction always look at the same string.
                String annotation = rawAnnotation.trim();
                // ContentSizeAnnotationPostProcessor.CONTENT_SIZE_ANNOTATION_PREFIX
                if (annotation.startsWith(Heritrix1Constants.CONTENT_SIZE_ANNOTATION_PREFIX)) {
                    try {
                        byteCounter = Long.parseLong(annotation.substring(
                                Heritrix1Constants.CONTENT_SIZE_ANNOTATION_PREFIX.length()).trim());
                    } catch (NumberFormatException e) {
                        throw new ArgumentNotValid("Unparsable annotation in field 12 of crawl.log: '"
                                + parts[ANNOTATION_PART_INDEX] + "'.", e);
                    }
                }
                if (response == Heritrix1Constants.CRAWLURI_S_BLOCKED_BY_QUOTA) {
                    if (annotation.equals(BYTE_LIMIT_REACHED_ANNOTATION)) {
                        stopReason = StopReason.SIZE_LIMIT;
                    } else if (annotation.equals(OBJECT_LIMIT_REACHED_ANNOTATION)) {
                        stopReason = StopReason.OBJECT_LIMIT;
                    }
                }
            }
        }

        // Update stats for domain
        DomainStats dhi = getOrCreateDomainStats(domainName);

        // Only count harvested URIs
        if (response >= 0) {
            dhi.setObjectCount(dhi.getObjectCount() + 1);
            dhi.setByteCount(dhi.getByteCount() + byteCounter);
        }
        // Only if reason not set
        if (dhi.getStopReason() == defaultStopReason) {
            dhi.setStopReason(stopReason);
        }
    }

    /**
     * Extracts the domain name from a URI string and transforms any IDNA encoded domain back to Unicode.
     *
     * @param uriAsString a given URI as string.
     * @return the domain name in Unicode form, or null if no domain name could be extracted
     * @throws URIException If unable to create valid URI from the given string
     */
    private String getUnicodeDomainNameFromURIString(String uriAsString) throws URIException {
        String domainName = getDomainNameFromURIString(uriAsString);
        if (domainName == null) {
            return null;
        }
        return IDNA.toUnicode(domainName);
    }

    /**
     * Extract DomainName from URI string. Does not handle Danish characters in URI.
     *
     * @param uriAsString a given URI as string.
     * @return the domainName if possible or null, if not possible
     * @throws URIException If unable to create valid URI from the given string
     */
    private String getDomainNameFromURIString(String uriAsString) throws URIException {
        UsableURI uuri = new FixedUURI(uriAsString, false);
        String hostName = uuri.getReferencedHost();
        if (hostName == null) {
            log.debug("Not possible to extract domainname from URL: {}", uriAsString);
            return null;
        }
        return DomainUtils.domainNameFromHostname(hostName);
    }

    /**
     * @return the default stop reason.
     */
    public StopReason getDefaultStopReason() {
        return defaultStopReason;
    }

    /**
     * @return the domainStatsMap generated from parsing the crawl-log. This is the internal map itself, not a copy.
     */
    public Map<String, DomainStats> getDomainStatsMap() {
        return this.domainstats;
    }

    /**
     * @param files A set of Heritrix3 files used to produce a HarvestReport.
     * @return a DomainStatsReport for a specific H3 crawl.
     */
    public static DomainStatsReport getDomainStatsReport(Heritrix3Files files) {
        HarvestReportGenerator hrg = new HarvestReportGenerator(files);
        return new DomainStatsReport(hrg.getDomainStatsMap(), hrg.getDefaultStopReason());
    }

}