/*
 * #%L
 * Netarchivesuite - harvester
 * %%
 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
 * the National Library of France and the Austrian National Library.
 * %%
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation, either version 2.1 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Lesser Public License for more details.
 *
 * You should have received a copy of the GNU General Lesser Public
 * License along with this program. If not, see
 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
 * #L%
 */
package dk.netarkivet.harvester.harvesting.report;

import gnu.inet.encoding.IDNA;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URISyntaxException;
import java.util.HashMap;
import java.util.Map;

import org.jwat.common.Uri;
import org.jwat.common.UriProfile;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import dk.netarkivet.common.exceptions.ArgumentNotValid;
import dk.netarkivet.common.exceptions.IOFailure;
import dk.netarkivet.common.utils.DomainUtils;
import dk.netarkivet.common.utils.FileUtils;
import dk.netarkivet.common.utils.Settings;
import dk.netarkivet.common.utils.StringUtils;
import dk.netarkivet.harvester.HarvesterSettings;
import dk.netarkivet.harvester.datamodel.StopReason;
import dk.netarkivet.harvester.harvesting.HeritrixFiles;
import dk.netarkivet.harvester.harvesting.distribute.DomainStats;

/**
 * Generates per-domain harvest statistics (object counts, byte counts and stop reasons) by parsing
 * the crawl.log and progress-statistics.log produced by a Heritrix crawl.
 *
 * <p>Not thread-safe: each instance parses one set of Heritrix files and accumulates results in a
 * private map.
 */
public class HarvestReportGenerator {

    /** The logger for this class. */
    private static final Logger log = LoggerFactory.getLogger(HarvestReportGenerator.class);

    /**
     * Strings found in the progress-statistics.log, used to devise the default stop reason for domains.
     */
    public static enum ProgressStatisticsConstants {

        /**
         * String in crawl.log, that Heritrix writes as the last entry in the progress-statistics.log.
         */
        ORDERLY_FINISH("CRAWL ENDED"),

        /**
         * This is written when Heritrix terminates the job due to its timelimit (max-time-sec) being reached.
         */
        TIMELIMIT_EXCEEDED("Timelimit hit"),

        /**
         * String which shows that the harvest was deliberately aborted from the Heritrix GUI or forcibly stopped by the
         * Netarchive Suite software due to an inactivity timeout.
         */
        HARVEST_ABORTED("Ended by operator");

        /** The pattern associated with a given enum value. */
        private final String pattern;

        /**
         * Constructor for this enum class.
         *
         * @param pattern The pattern associated with a given enum value.
         */
        ProgressStatisticsConstants(String pattern) {
            this.pattern = pattern;
        }

    }

    /** Datastructure holding the domain-information contained in one harvest, keyed by domain name. */
    private final Map<String, DomainStats> domainstats = new HashMap<String, DomainStats>();

    /** The set of Heritrix report files this generator reads; null when the no-arg constructor is used. */
    private HeritrixFiles heritrixFiles;

    /**
     * The default reason why we stopped harvesting this domain. This value is set by looking for a CRAWL ENDED in the
     * crawl.log.
     */
    private StopReason defaultStopReason;

    /**
     * Default constructor that does nothing. The real construction is supposed to be done in the subclasses by filling
     * out the domainStats map with crawl results.
     */
    public HarvestReportGenerator() {
    }

    /**
     * Constructor from Heritrix report files. Subclasses might use a different set of Heritrix reports.
     * Derives the default stop reason from the progress-statistics.log and immediately parses the crawl.log.
     *
     * @param files the set of Heritrix reports.
     * @throws ArgumentNotValid if files is null.
     * @throws IOFailure if the crawl.log is missing or unreadable.
     */
    public HarvestReportGenerator(HeritrixFiles files) {
        ArgumentNotValid.checkNotNull(files, "files");
        this.heritrixFiles = files;
        this.defaultStopReason = findDefaultStopReason(heritrixFiles.getProgressStatisticsLog());
        preProcess(heritrixFiles);
    }

    /**
     * Pre-processing happens when the report is built just at the end of the crawl, before the ARC files upload.
     * Parses the crawl.log and fills the domain statistics map.
     *
     * @param files the set of Heritrix reports whose crawl.log is parsed.
     * @throws IOFailure if the crawl.log is not an ordinary readable file.
     */
    public void preProcess(HeritrixFiles files) {
        if (log.isInfoEnabled()) {
            log.info("Starting pre-processing of harvest report for job {}", files.getJobID());
        }
        long startTime = System.currentTimeMillis();

        File crawlLog = files.getCrawlLog();
        if (!crawlLog.isFile() || !crawlLog.canRead()) {
            String errorMsg = "Not a file or not readable: " + crawlLog.getAbsolutePath();
            throw new IOFailure(errorMsg);
        }
        parseCrawlLog(files.getCrawlLog());

        if (log.isInfoEnabled()) {
            long time = System.currentTimeMillis() - startTime;
            log.info("Finished pre-processing of harvest report for job {}, operation took {}", files.getJobID(),
                    StringUtils.formatDuration(time));
        }
    }

    /**
     * Attempts to get an already existing {@link DomainStats} object for that domain, and if not found creates one with
     * zero values and the default stop reason.
     *
     * @param domainName the name of the domain to get DomainStats for.
     * @return a DomainStats object for the given domain-name.
     */
    protected DomainStats getOrCreateDomainStats(String domainName) {
        DomainStats dhi = domainstats.get(domainName);
        if (dhi == null) {
            dhi = new DomainStats(0L, 0L, defaultStopReason);
            domainstats.put(domainName, dhi);
        }

        return dhi;
    }

    /**
     * Find out whether we stopped normally in progress statistics log.
     *
     * @param logFile A progress-statistics.log file.
     * @return StopReason.DOWNLOAD_COMPLETE for progress statistics ending with CRAWL ENDED,
     * StopReason.DOWNLOAD_UNFINISHED otherwise or if file does not exist.
     * @throws ArgumentNotValid if logFile is null.
     */
    public static StopReason findDefaultStopReason(File logFile) {
        ArgumentNotValid.checkNotNull(logFile, "File logFile");
        if (!logFile.exists()) {
            return StopReason.DOWNLOAD_UNFINISHED;
        }
        String lastLine = FileUtils.readLastLine(logFile);
        if (lastLine.contains(ProgressStatisticsConstants.ORDERLY_FINISH.pattern)) {
            // "CRAWL ENDED" is also present when the crawl was aborted or timed out,
            // so check the more specific markers first.
            if (lastLine.contains(ProgressStatisticsConstants.HARVEST_ABORTED.pattern)) {
                return StopReason.DOWNLOAD_UNFINISHED;
            } else if (lastLine.contains(ProgressStatisticsConstants.TIMELIMIT_EXCEEDED.pattern)) {
                return StopReason.TIME_LIMIT;
            } else {
                return StopReason.DOWNLOAD_COMPLETE;
            }
        } else {
            return StopReason.DOWNLOAD_UNFINISHED;
        }
    }

    /**
     * Computes the domain-name/byte-count and domain-name/object-count and domain-name/stopreason maps for a crawl.log.
     * Invalid lines are logged and skipped; they do not abort the parse.
     *
     * @param file the local file to be processed
     * @throws IOFailure if there is problem reading the file
     */
    private void parseCrawlLog(File file) throws IOFailure {
        // read whether or not to disregard the SeedURL information
        // in the crawl.log
        boolean disregardSeedUrls = Settings.getBoolean(HarvesterSettings.DISREGARD_SEEDURL_INFORMATION_IN_CRAWLLOG);
        BufferedReader in = null;

        try {
            // Heritrix writes crawl.log in UTF-8, so decode explicitly instead of relying
            // on the platform default charset (which a plain FileReader would use).
            in = new BufferedReader(new InputStreamReader(new FileInputStream(file), "UTF-8"));
            String line;
            int lineCnt = 0;
            while ((line = in.readLine()) != null) {
                ++lineCnt;
                try {
                    processHarvestLine(line, disregardSeedUrls);
                } catch (ArgumentNotValid e) {
                    log.debug("Invalid line in '{}' line {}: '{}'. Ignoring due to reason: {}", file.getAbsolutePath(),
                            lineCnt, line, e.getMessage(), e);
                }
            }
        } catch (IOException e) {
            String msg = "Unable to open/read crawl.log file '" + file.getAbsolutePath() + "'.";
            log.warn(msg, e);
            throw new IOFailure(msg, e);
        } finally {
            if (in != null) {
                try {
                    in.close();
                } catch (IOException e) {
                    log.debug("Unable to close {}", file, e);
                    // Can't throw here, as would destroy the real exception
                }
            }
        }
    }

    /**
     * Processes a harvest-line, updating the object and byte maps.
     *
     * @param line the line to process.
     * @param disregardSeedUrlInfo Boolean saying whether or not to disregard SeedURL Information
     * @throws ArgumentNotValid if the line is malformed: too few fields, no extractable domain name,
     * unparsable response code, or an unparsable content-size annotation.
     */
    private void processHarvestLine(final String line, boolean disregardSeedUrlInfo) {
        // A legal crawl log line has at least 11 parts, + optional annotations

        final int MIN_CRAWL_LOG_PARTS = 11;
        final int MAX_PARTS = 12;
        final int ANNOTATION_PART_INDEX = 11;
        String[] parts = line.split("\\s+", MAX_PARTS);
        if (parts.length < MIN_CRAWL_LOG_PARTS) {
            throw new ArgumentNotValid("Not enough fields for line in crawl.log: '" + line + "'. Was only "
                    + parts.length + " fields. Should have been at least " + MIN_CRAWL_LOG_PARTS);
        }

        // Check the seed url (part 11 of the crawl-log-line).
        // If equal to "-", the seed url is not written to the log,
        // and this information is disregarded
        // Note This information is disregarded if setting disregard_seed_url_information
        // is enabled.

        String seedURL = parts[10];

        boolean sourceTagEnabled = true;
        if (seedURL.equals("-") || disregardSeedUrlInfo) {
            sourceTagEnabled = false;
        }
        String seedDomain = null;

        if (sourceTagEnabled) {
            try {
                seedDomain = getDomainNameFromURIString(seedURL);
                if (seedDomain != null) {
                    // Transform any IDNA encoded seedDomain back to Unicode
                    seedDomain = IDNA.toUnicode(seedDomain);
                }
            } catch (URISyntaxException e) {
                log.debug("Unable to extract a domain from the seedURL found in field 11 of crawl.log: '{}'.", seedURL,
                        e);
            }
        }

        // Get the object domain name from the URL in the fourth field
        String objectDomain = null;
        String objectUrl = parts[3];

        try {
            objectDomain = getDomainNameFromURIString(objectUrl);
            if (objectDomain != null) {
                // Transform the any IDNA encoded domain back to Unicode
                objectDomain = IDNA.toUnicode(objectDomain);
            }
        } catch (URISyntaxException e) {
            log.debug("Unable to extract a domain from the object URL found in field 4 of crawl.log: '{}'.", objectUrl,
                    e);
        }

        if (objectDomain == null && seedDomain == null) {
            throw new ArgumentNotValid("Unable to find a domainName in the line: '" + line + "'.");
        }

        String domainName = null;

        // Prefer the seed domain when seed-URL info is in use; fall back to the object domain.
        if (sourceTagEnabled && seedDomain != null) {
            domainName = seedDomain;
        } else if (objectDomain != null) {
            domainName = objectDomain;
        } else {
            throw new ArgumentNotValid("Unable to find valid domainname");
        }

        // Get the response code for the URL in the second field
        long response;
        try {
            response = Long.parseLong(parts[1]);
        } catch (NumberFormatException e) {
            throw new ArgumentNotValid("Unparsable response code in field 2 of crawl.log: '" + parts[1] + "'.");
        }

        // Get the byte count from annotation field "content-size"
        // and the stop reason from annotation field if status code is -5003
        StopReason stopReason = getDefaultStopReason();
        long byteCounter = 0;
        if (parts.length > MIN_CRAWL_LOG_PARTS) {
            // test if any annotations exist
            String[] annotations = parts[ANNOTATION_PART_INDEX].split(",");
            for (String annotation : annotations) {
                // ContentSizeAnnotationPostProcessor.CONTENT_SIZE_ANNOTATION_PREFIX
                if (annotation.trim().startsWith(Heritrix1Constants.CONTENT_SIZE_ANNOTATION_PREFIX)) {
                    try {
                        // ContentSizeAnnotationPostProcessor.CONTENT_SIZE_ANNOTATION_PREFIX
                        byteCounter = Long.parseLong(annotation
                                .substring(Heritrix1Constants.CONTENT_SIZE_ANNOTATION_PREFIX.length()));
                    } catch (NumberFormatException e) {
                        throw new ArgumentNotValid("Unparsable annotation in field 12 of crawl.log: '"
                                + parts[ANNOTATION_PART_INDEX] + "'.", e);
                    }
                }
                if (response == Heritrix1Constants.CRAWLURI_S_BLOCKED_BY_QUOTA) {
                    if (annotation.trim().equals("Q:group-max-all-kb")) {
                        stopReason = StopReason.SIZE_LIMIT;
                    } else if (annotation.trim().equals("Q:group-max-fetch-successes")) {
                        stopReason = StopReason.OBJECT_LIMIT;
                    }
                }
            }
        }

        // Update stats for domain
        DomainStats dhi = getOrCreateDomainStats(domainName);

        // Only count harvested URIs
        if (response >= 0) {
            long oldObjectCount = dhi.getObjectCount();
            dhi.setObjectCount(oldObjectCount + 1);
            long oldByteCount = dhi.getByteCount();
            dhi.setByteCount(oldByteCount + byteCounter);
        }
        // Only if reason not set
        if (dhi.getStopReason() == defaultStopReason) {
            dhi.setStopReason(stopReason);
        }
    }

    /**
     * Extract DomainName from URI string. Does not handle Danish characters in URI.
     *
     * @param uriAsString a given URI as string.
     * @return the domainName if possible or null, if not possible
     * @throws URISyntaxException If unable to create valid URI from the given string
     */
    private String getDomainNameFromURIString(String uriAsString) throws URISyntaxException {
        Uri uri = new Uri(uriAsString, UriProfile.RFC3986_ABS_16BIT_LAX);
        String hostName;
        if ("dns".equals(uri.getScheme())) {
            // For dns: URIs the looked-up host is in the path component, not the host component.
            hostName = uri.getPath();
        } else {
            hostName = uri.getHost();
        }
        if (hostName == null) {
            log.debug("Not possible to extract domainname from URL: {}", uriAsString);
            return null;
        }
        return DomainUtils.domainNameFromHostname(hostName);
    }

    /**
     * Gets the default stop reason derived from the progress-statistics.log.
     *
     * @return the default stop reason, or null if this generator was built with the no-arg constructor.
     */
    public StopReason getDefaultStopReason() {
        return defaultStopReason;
    }

    /**
     * Gets the accumulated per-domain statistics. The returned map is the live internal map, not a copy.
     *
     * @return a map from domain name to its {@link DomainStats}.
     */
    public Map<String, DomainStats> getDomainStatsMap() {
        return this.domainstats;
    }

    /**
     * Convenience factory: parses the given Heritrix files and bundles the result into a report.
     *
     * @param files the set of Heritrix reports to parse.
     * @return a {@link DomainStatsReport} holding the domain statistics and the default stop reason.
     */
    public static DomainStatsReport getDomainStatsReport(HeritrixFiles files) {
        HarvestReportGenerator hrg = new HarvestReportGenerator(files);
        return new DomainStatsReport(hrg.getDomainStatsMap(), hrg.getDefaultStopReason());
    }

}