/*
 * #%L
 * Netarchivesuite - harvester
 * %%
 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
 * the National Library of France and the Austrian National Library.
 * %%
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation, either version 2.1 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this program. If not, see
 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
 * #L%
 */
package dk.netarkivet.harvester.harvesting.report;

import gnu.inet.encoding.IDNA;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.net.URISyntaxException;
import java.util.HashMap;
import java.util.Map;

import org.jwat.common.Uri;
import org.jwat.common.UriProfile;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import dk.netarkivet.common.exceptions.ArgumentNotValid;
import dk.netarkivet.common.exceptions.IOFailure;
import dk.netarkivet.common.utils.DomainUtils;
import dk.netarkivet.common.utils.FileUtils;
import dk.netarkivet.common.utils.Settings;
import dk.netarkivet.common.utils.StringUtils;
import dk.netarkivet.harvester.HarvesterSettings;
import dk.netarkivet.harvester.datamodel.StopReason;
import dk.netarkivet.harvester.harvesting.HeritrixFiles;
import dk.netarkivet.harvester.harvesting.distribute.DomainStats;

/**
 * Base implementation for a harvest report. The constructor reads the data in a crawl.log file and parses it. The
 * crawl.log format is described in the Heritrix user manual, section 8.2.1:
 * http://crawler.archive.org/articles/user_manual/analysis.html#logs Note: invalid lines are logged and then
 * ignored.
 * <p>
 * Each URL listed in the file is assigned to a domain, and the total object count and byte count per domain are
 * calculated. Finally, a StopReason is found for each domain: when the response is CrawlURI.S_BLOCKED_BY_QUOTA
 * (currently = -5003), the StopReason is set to StopReason.SIZE_LIMIT if the annotation equals
 * "Q:group-max-all-kb", or to StopReason.OBJECT_LIMIT if the annotation equals "Q:group-max-fetch-successes".
 */
@SuppressWarnings({"serial"})
public class HarvestReportGenerator {

    /** The logger for this class. */
    private static final Logger log = LoggerFactory.getLogger(HarvestReportGenerator.class);

    /**
     * Strings found in the progress-statistics.log, used to devise the default stop reason for domains.
     */
    public static enum ProgressStatisticsConstants {

        /**
         * String that Heritrix writes as the last entry in the progress-statistics.log when a crawl ends in an
         * orderly fashion.
         */
        ORDERLY_FINISH("CRAWL ENDED"),

        /**
         * String written when Heritrix terminates the job because its time limit (max-time-sec) was reached.
         */
        TIMELIMIT_EXCEEDED("Timelimit hit"),

        /**
         * String which shows that the harvest was deliberately aborted from the Heritrix GUI, or forcibly stopped
         * by the NetarchiveSuite software due to an inactivity timeout.
         */
        HARVEST_ABORTED("Ended by operator");

        /** The pattern associated with a given enum value. */
        private final String pattern;

        /**
         * Constructor for this enum class.
         *
         * @param pattern The pattern associated with a given enum value.
         */
        ProgressStatisticsConstants(String pattern) {
            this.pattern = pattern;
        }

    }
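
    /*
     * For illustration only: the lines below paraphrase how a progress-statistics.log can end (they are not
     * verbatim Heritrix output) and show how the patterns above combine in findDefaultStopReason(File) further
     * down:
     *
     *   last line contains "CRAWL ENDED" only                     -> StopReason.DOWNLOAD_COMPLETE
     *   last line contains "CRAWL ENDED" and "Timelimit hit"      -> StopReason.TIME_LIMIT
     *   last line contains "CRAWL ENDED" and "Ended by operator"  -> StopReason.DOWNLOAD_UNFINISHED
     *   no "CRAWL ENDED", or the log file does not exist          -> StopReason.DOWNLOAD_UNFINISHED
     */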

    /** Data structure holding the domain information contained in one harvest. */
    private final Map<String, DomainStats> domainstats = new HashMap<String, DomainStats>();

    /** The Heritrix report files this report is generated from. */
    private HeritrixFiles heritrixFiles;

    /**
     * The default reason why we stopped harvesting this domain. This value is set by looking for a CRAWL ENDED
     * entry in the progress-statistics.log.
     */
    private StopReason defaultStopReason;

    /**
     * Default constructor that does nothing. The real construction is supposed to be done in the subclasses by
     * filling out the domainStats map with crawl results.
     */
    public HarvestReportGenerator() {
    }

    /**
     * Constructor from Heritrix report files. Subclasses might use a different set of Heritrix reports.
     *
     * @param files the set of Heritrix reports.
     */
    public HarvestReportGenerator(HeritrixFiles files) {
        ArgumentNotValid.checkNotNull(files, "files");
        this.heritrixFiles = files;
        this.defaultStopReason = findDefaultStopReason(heritrixFiles.getProgressStatisticsLog());
        preProcess(heritrixFiles);
    }

    /**
     * Pre-processing happens when the report is built just at the end of the crawl, before the ARC files are
     * uploaded.
     *
     * @param files the set of Heritrix reports containing the crawl.log to parse.
     * @throws IOFailure if the crawl.log is not a readable file.
     */
    public void preProcess(HeritrixFiles files) {
        if (log.isInfoEnabled()) {
            log.info("Starting pre-processing of harvest report for job {}", files.getJobID());
        }
        long startTime = System.currentTimeMillis();

        File crawlLog = files.getCrawlLog();
        if (!crawlLog.isFile() || !crawlLog.canRead()) {
            String errorMsg = "Not a file or not readable: " + crawlLog.getAbsolutePath();
            throw new IOFailure(errorMsg);
        }
        parseCrawlLog(files.getCrawlLog());

        if (log.isInfoEnabled()) {
            long time = System.currentTimeMillis() - startTime;
            log.info("Finished pre-processing of harvest report for job {}, operation took {}", files.getJobID(),
                    StringUtils.formatDuration(time));
        }
    }

    /**
     * Attempts to get an already existing {@link DomainStats} object for that domain, and if not found creates one
     * with zero values.
     *
     * @param domainName the name of the domain to get DomainStats for.
     * @return a DomainStats object for the given domain name.
     */
    protected DomainStats getOrCreateDomainStats(String domainName) {
        DomainStats dhi = domainstats.get(domainName);
        if (dhi == null) {
            dhi = new DomainStats(0L, 0L, defaultStopReason);
            domainstats.put(domainName, dhi);
        }

        return dhi;
    }

    /**
     * Find out whether we stopped normally, according to the progress-statistics log.
     *
     * @param logFile A progress-statistics.log file.
     * @return StopReason.DOWNLOAD_COMPLETE if the progress statistics end with CRAWL ENDED, StopReason.TIME_LIMIT
     * if the crawl hit its time limit, and StopReason.DOWNLOAD_UNFINISHED otherwise (including when the harvest
     * was aborted by an operator or the file does not exist).
     */
    public static StopReason findDefaultStopReason(File logFile) {
        ArgumentNotValid.checkNotNull(logFile, "File logFile");
        if (!logFile.exists()) {
            return StopReason.DOWNLOAD_UNFINISHED;
        }
        String lastLine = FileUtils.readLastLine(logFile);
        if (lastLine.contains(ProgressStatisticsConstants.ORDERLY_FINISH.pattern)) {
            if (lastLine.contains(ProgressStatisticsConstants.HARVEST_ABORTED.pattern)) {
                return StopReason.DOWNLOAD_UNFINISHED;
            } else if (lastLine.contains(ProgressStatisticsConstants.TIMELIMIT_EXCEEDED.pattern)) {
                return StopReason.TIME_LIMIT;
            } else {
                return StopReason.DOWNLOAD_COMPLETE;
            }
        } else {
            return StopReason.DOWNLOAD_UNFINISHED;
        }
    }
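
    /*
     * Usage sketch for findDefaultStopReason (the file path below is hypothetical):
     *
     *   File progressLog = new File("/harvests/job-42/logs/progress-statistics.log");
     *   StopReason reason = HarvestReportGenerator.findDefaultStopReason(progressLog);
     */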

    /**
     * Computes the domain-name/byte-count, domain-name/object-count and domain-name/stop-reason maps for a
     * crawl.log.
     *
     * @param file the local file to be processed
     * @throws IOFailure if there is a problem reading the file
     */
    private void parseCrawlLog(File file) throws IOFailure {
        // Read whether or not to disregard the SeedURL information in the crawl.log
        boolean disregardSeedUrls = Settings.getBoolean(HarvesterSettings.DISREGARD_SEEDURL_INFORMATION_IN_CRAWLLOG);
        BufferedReader in = null;

        try {
            in = new BufferedReader(new FileReader(file));
            String line;
            int lineCnt = 0;
            while ((line = in.readLine()) != null) {
                ++lineCnt;
                try {
                    processHarvestLine(line, disregardSeedUrls);
                } catch (ArgumentNotValid e) {
                    log.debug("Invalid line in '{}' line {}: '{}'. Ignoring due to reason: {}",
                            file.getAbsolutePath(), lineCnt, line, e.getMessage(), e);
                }
            }
        } catch (IOException e) {
            String msg = "Unable to open/read crawl.log file '" + file.getAbsolutePath() + "'.";
            log.warn(msg, e);
            throw new IOFailure(msg, e);
        } finally {
            if (in != null) {
                try {
                    in.close();
                } catch (IOException e) {
                    log.debug("Unable to close {}", file, e);
                    // Can't throw here, as it would hide the real exception
                }
            }
        }
    }
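
    /*
     * An illustrative crawl.log line, paraphrased rather than copied from real Heritrix output (shown wrapped over
     * two lines here). The whitespace-separated fields used by processHarvestLine below are field 2 (status code),
     * field 4 (object URL), field 11 (seed/source URL) and field 12 (annotations):
     *
     *   2014-02-01T12:00:00.000Z 200 12567 http://www.example.com/page.html LLE http://www.example.com/
     *       text/html #042 20140201120000000+123 sha1:EXAMPLEDIGEST http://www.example.com/ content-size:12567
     */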

    /**
     * Processes a harvest-line, updating the object and byte maps.
     *
     * @param line the line to process.
     * @param disregardSeedUrlInfo whether or not to disregard the SeedURL information
     * @throws ArgumentNotValid if the line is not a valid crawl.log line
     */
    private void processHarvestLine(final String line, boolean disregardSeedUrlInfo) {
        // A legal crawl.log line has at least 11 fields, plus an optional annotations field
        final int MIN_CRAWL_LOG_PARTS = 11;
        final int MAX_PARTS = 12;
        final int ANNOTATION_PART_INDEX = 11;
        String[] parts = line.split("\\s+", MAX_PARTS);
        if (parts.length < MIN_CRAWL_LOG_PARTS) {
            throw new ArgumentNotValid("Not enough fields for line in crawl.log: '" + line + "'. Was only "
                    + parts.length + " fields. Should have been at least " + MIN_CRAWL_LOG_PARTS);
        }

        // Check the seed url (field 11 of the crawl-log line). If it equals "-", the seed url was not written to
        // the log, and this information is disregarded. It is also disregarded if the setting
        // disregard_seed_url_information is enabled.
        String seedURL = parts[10];

        boolean sourceTagEnabled = true;
        if (seedURL.equals("-") || disregardSeedUrlInfo) {
            sourceTagEnabled = false;
        }
        String seedDomain = null;

        if (sourceTagEnabled) {
            try {
                seedDomain = getDomainNameFromURIString(seedURL);
                if (seedDomain != null) {
                    // Transform any IDNA encoded seedDomain back to Unicode
                    seedDomain = IDNA.toUnicode(seedDomain);
                }
            } catch (URISyntaxException e) {
                log.debug("Unable to extract a domain from the seedURL found in field 11 of crawl.log: '{}'.",
                        seedURL, e);
            }
        }

        // Get the object domain name from the URL in the fourth field
        String objectDomain = null;
        String objectUrl = parts[3];

        try {
            objectDomain = getDomainNameFromURIString(objectUrl);
            if (objectDomain != null) {
                // Transform any IDNA encoded domain back to Unicode
                objectDomain = IDNA.toUnicode(objectDomain);
            }
        } catch (URISyntaxException e) {
            log.debug("Unable to extract a domain from the object URL found in field 4 of crawl.log: '{}'.",
                    objectUrl, e);
        }

        if (objectDomain == null && seedDomain == null) {
            throw new ArgumentNotValid("Unable to find a domainName in the line: '" + line + "'.");
        }

        String domainName = null;

        if (sourceTagEnabled && seedDomain != null) {
            domainName = seedDomain;
        } else if (objectDomain != null) {
            domainName = objectDomain;
        } else {
            throw new ArgumentNotValid("Unable to find valid domainname");
        }

        // Get the response code for the URL in the second field
        long response;
        try {
            response = Long.parseLong(parts[1]);
        } catch (NumberFormatException e) {
            throw new ArgumentNotValid("Unparsable response code in field 2 of crawl.log: '" + parts[1] + "'.");
        }

        // Get the byte count from the "content-size" annotation, and the stop reason from the annotation field
        // if the status code is -5003
        StopReason stopReason = getDefaultStopReason();
        long byteCounter = 0;
        if (parts.length > MIN_CRAWL_LOG_PARTS) {
            // test if any annotations exist
            String[] annotations = parts[ANNOTATION_PART_INDEX].split(",");
            for (String annotation : annotations) {
                // ContentSizeAnnotationPostProcessor.CONTENT_SIZE_ANNOTATION_PREFIX
                if (annotation.trim().startsWith(Heritrix1Constants.CONTENT_SIZE_ANNOTATION_PREFIX)) {
                    try {
                        byteCounter = Long.parseLong(annotation
                                .substring(Heritrix1Constants.CONTENT_SIZE_ANNOTATION_PREFIX.length()));
                    } catch (NumberFormatException e) {
                        throw new ArgumentNotValid("Unparsable annotation in field 12 of crawl.log: '"
                                + parts[ANNOTATION_PART_INDEX] + "'.", e);
                    }
                }
                if (response == Heritrix1Constants.CRAWLURI_S_BLOCKED_BY_QUOTA) {
                    if (annotation.trim().equals("Q:group-max-all-kb")) {
                        stopReason = StopReason.SIZE_LIMIT;
                    } else if (annotation.trim().equals("Q:group-max-fetch-successes")) {
                        stopReason = StopReason.OBJECT_LIMIT;
                    }
                }
            }
        }

        // Update stats for domain
        DomainStats dhi = getOrCreateDomainStats(domainName);

        // Only count harvested URIs
        if (response >= 0) {
            long oldObjectCount = dhi.getObjectCount();
            dhi.setObjectCount(oldObjectCount + 1);
            long oldByteCount = dhi.getByteCount();
            dhi.setByteCount(oldByteCount + byteCounter);
        }
        // Only if the reason has not already been set
        if (dhi.getStopReason() == defaultStopReason) {
            dhi.setStopReason(stopReason);
        }
    }
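
    /*
     * Annotation handling, by example (the values are hypothetical): given status -5003
     * (Heritrix1Constants.CRAWLURI_S_BLOCKED_BY_QUOTA) and an annotation field of
     * "content-size:12567,Q:group-max-all-kb", processHarvestLine above parses 12567 into the byte counter (which
     * is only added to the domain totals when the status is non-negative) and sets the stop reason to
     * StopReason.SIZE_LIMIT; with "Q:group-max-fetch-successes" it sets StopReason.OBJECT_LIMIT instead.
     */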

    /**
     * Extract the domain name from a URI string. Does not handle Danish characters in the URI.
     *
     * @param uriAsString a given URI as string.
     * @return the domain name if possible, or null if not possible
     * @throws URISyntaxException if unable to create a valid URI from the given string
     */
    private String getDomainNameFromURIString(String uriAsString) throws URISyntaxException {
        Uri uri = new Uri(uriAsString, UriProfile.RFC3986_ABS_16BIT_LAX);
        String hostName;
        if ("dns".equals(uri.getScheme())) {
            hostName = uri.getPath();
        } else {
            hostName = uri.getHost();
        }
        if (hostName == null) {
            log.debug("Not possible to extract domainname from URL: {}", uriAsString);
            return null;
        }
        return DomainUtils.domainNameFromHostname(hostName);
    }

    /**
     * @return the default stopReason
     */
    public StopReason getDefaultStopReason() {
        return defaultStopReason;
    }

    /**
     * @return the map from domain name to the DomainStats collected for that domain.
     */
    public Map<String, DomainStats> getDomainStatsMap() {
        return this.domainstats;
    }

    /**
     * Generates a DomainStatsReport from the given Heritrix reports.
     *
     * @param files the set of Heritrix reports.
     * @return a DomainStatsReport holding the collected domain statistics and the default stop reason.
     */
    public static DomainStatsReport getDomainStatsReport(HeritrixFiles files) {
        HarvestReportGenerator hrg = new HarvestReportGenerator(files);
        return new DomainStatsReport(hrg.getDomainStatsMap(), hrg.getDefaultStopReason());
    }

}
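
/*
 * Illustrative usage of this class (a sketch; how the HeritrixFiles instance is obtained depends on the
 * surrounding harvester code and is elided here):
 *
 *   HeritrixFiles files = ...;
 *   HarvestReportGenerator hrg = new HarvestReportGenerator(files);
 *   StopReason defaultReason = hrg.getDefaultStopReason();
 *   for (Map.Entry<String, DomainStats> entry : hrg.getDomainStatsMap().entrySet()) {
 *       // one DomainStats per harvested domain: object count, byte count and stop reason
 *   }
 */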