001/* 002 * #%L 003 * Netarchivesuite - harvester 004 * %% 005 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library, 006 * the National Library of France and the Austrian National Library. 007 * %% 008 * This program is free software: you can redistribute it and/or modify 009 * it under the terms of the GNU Lesser General Public License as 010 * published by the Free Software Foundation, either version 2.1 of the 011 * License, or (at your option) any later version. 012 * 013 * This program is distributed in the hope that it will be useful, 014 * but WITHOUT ANY WARRANTY; without even the implied warranty of 015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 016 * GNU General Lesser Public License for more details. 017 * 018 * You should have received a copy of the GNU General Lesser Public 019 * License along with this program. If not, see 020 * <http://www.gnu.org/licenses/lgpl-2.1.html>. 021 * #L% 022 */ 023package dk.netarkivet.harvester.harvesting.frontier; 024 025import java.io.Serializable; 026 027import org.slf4j.Logger; 028import org.slf4j.LoggerFactory; 029 030import com.sleepycat.persist.model.Persistent; 031 032import dk.netarkivet.common.exceptions.ArgumentNotValid; 033 034/** 035 * Wraps a line of the frontier report. As of Heritrix 1.14.4 the format of a frontier report line sequentially lists 036 * the following tokens, separated by a whitespace : 037 * <p> 038 * <ol> 039 * <li>queue</li> 040 * <li>currentSize</li> 041 * <li>totalEnqueues</li> 042 * <li>sessionBalance</li> 043 * <li>lastCost(averageCost)</li> 044 * <li>lastDequeueTime</li> 045 * <li>wakeTime</li> 046 * <li>totalSpend/totalBudget</li> 047 * <li>errorCount</li> 048 * <li>lastPeekUri</li> 049 * <li>lastQueuedUri</li> 050 * </ol> 051 * <p>In Heritrix 3.2.0, there is a new field named <strong>precedence</strong> which comes after queue. 052 * More information here : https://webarchive.jira.com/wiki/display/Heritrix/Precedence+Feature+Notes 053 * </p> 054 * <p> 055 * This class implements a natural order : comparisons are made : - first by decreasing values of totalEnqueues - 056 * secondly by domain name (string natural order) 057 * <p> 058 * Thanks to Gordon Mohr at Internet Archive for explaining the exact semantics of the frontier report fields. 059 */ 060@Persistent 061@SuppressWarnings({"serial"}) 062public class FrontierReportLine implements Serializable, Comparable<FrontierReportLine>, FrontierReportLineOrderKey { 063 064 /** The logger for this class. */ 065 private static final Logger LOG = LoggerFactory.getLogger(FrontierReportLine.class); 066 067 /** 068 * Expected size of string array when we split the line token across "\\s+" for heritrix 1 frontier format. 069 */ 070 private static final int EXPECTED_SPLIT_SEGMENTS_H1 = 11; 071 072 /** 073 * Expected size of string array when we split the line token across "\\s+" for heritrix 3 frontier format. 074 */ 075 private static final int EXPECTED_SPLIT_SEGMENTS_H3 = 12; 076 077 /** 078 * Token used to signify an empty value. 079 */ 080 static final String EMPTY_VALUE_TOKEN = "-"; 081 082 /** 083 * The queue name, in our case the domain, as we use per domain queues. 084 */ 085 private String domainName; 086 087 /** Number of URIs currently in the queue. */ 088 private long currentSize; 089 090 /** 091 * new field in heritrix 3 frontier report 092 * @see https://webarchive.jira.com/wiki/display/Heritrix/Precedence+Feature+Notes 093 */ 094 private long precedence; 095 096 /** 097 * Count of total times a URI has been enqueued to this queue; a measure of the total number of URI instances ever 098 * put on this queue. This can be a larger number than the unique URIs, as some URIs (most notably DNS/robots when 099 * refetched, but possibly other things force-requeued under advanced usage) may be enqueued more than once. 100 */ 101 private long totalEnqueues; 102 103 /** 104 * When using the 'budget/rotation' functionality (a non-zero URI cost policy), this is the running 'balance' of a 105 * queue during its current 'active' session. This balance declines; when it hits zero, another queue (if any are 106 * waiting 'inactive') gets a chance to enter active crawling (as fast as politeness allows). 107 */ 108 private long sessionBalance; 109 110 /** 111 * The 'cost' of the last URI charged against the queue's budgets. If using a cost policy that makes some URIs more 112 * costly than others, this may indicate the queue has reached more-costly URIs. (Such larger-cost URIs will be 113 * inserted later in the queue, accelerate the depletion of the session balance, and accelerate progress towards the 114 * total queue budget, which could send the queue into 'retirement'. Thus higher-cost URIs mean a queue over time 115 * gets less of the crawler's cycles.) 116 */ 117 private double lastCost; 118 119 /** Average cost of a processed URI. */ 120 private double averageCost; 121 122 /** 123 * Timestamp of when the last URI came off this queue for processing. May give an indication of how long a queue has 124 * been empty/inactive. 125 */ 126 private String lastDequeueTime; 127 128 /** 129 * If the queue is in any sort of politeness- or connect-problem-'snooze' delay, this indicates when it will again 130 * be eligible to offer URIs to waiting threads. (When it wakes, it gets in line -- so actual wait before next URI 131 * is tried may be longer depending on the balance of threads and other active queues.) 132 */ 133 private String wakeTime; 134 135 /** 136 * The total of all URI costs charged against this queue. 137 */ 138 private long totalSpend; 139 140 /** 141 * The totalBudget above which the queue will be retired (made permanently inactive unless its totalBudget is 142 * raised). 143 */ 144 private long totalBudget; 145 146 /** 147 * The number of URIs from this queue that reached 'finished' status with an error code (non-retryable errors, or 148 * exhausted retries, or other errors). When nonzero and rising there may be special problems with the site(s) 149 * related to this queue. 150 */ 151 private long errorCount; 152 153 /** 154 * The last URI peeked/dequeued from the head of this queue. 155 */ 156 private String lastPeekUri; 157 158 /** 159 * The last URI enqueued to anywhere in this queue. 160 */ 161 private String lastQueuedUri; 162 163 /** 164 * Default empty constructor. 165 */ 166 public FrontierReportLine() { 167 168 } 169 170 /** 171 * Builds a cloned line. 172 * 173 * @param original the line to clone 174 */ 175 protected FrontierReportLine(FrontierReportLine original) { 176 this.averageCost = original.averageCost; 177 this.currentSize = original.currentSize; 178 this.domainName = original.domainName; 179 this.errorCount = original.errorCount; 180 this.lastCost = original.lastCost; 181 this.lastDequeueTime = original.lastDequeueTime; 182 this.lastPeekUri = original.lastPeekUri; 183 this.lastQueuedUri = original.lastQueuedUri; 184 this.sessionBalance = original.sessionBalance; 185 this.totalBudget = original.totalBudget; 186 this.totalEnqueues = original.totalEnqueues; 187 this.totalSpend = original.totalSpend; 188 this.wakeTime = original.wakeTime; 189 } 190 191 /** 192 * Parses the given string. 193 * Handle both heritrix 1 and heritrix 3 frontier report line format 194 * 195 * @param lineToken the string to parse. 196 */ 197 FrontierReportLine(String lineToken) { 198 199 String[] split = lineToken.split("\\s+"); 200 int heritrixVersion = 0; 201 202 if(split.length == EXPECTED_SPLIT_SEGMENTS_H1) { 203 heritrixVersion = 1; 204 } else if (split.length == EXPECTED_SPLIT_SEGMENTS_H3){ 205 heritrixVersion = 3; 206 } 207 else { 208 throw new ArgumentNotValid("Format of line token '" + lineToken + "' is not a valid frontier report line!"); 209 } 210 211 this.domainName = split[0]; 212 213 int fIndex = 0; 214 if(heritrixVersion == 3) { 215 fIndex = 1; 216 this.precedence = parseLong(split[1]); 217 } 218 219 try { 220 this.currentSize = parseLong(split[fIndex+1]); 221 } catch (NumberFormatException e) { 222 LOG.warn("Found incorrect formatted currentsize " + split[fIndex+1]); 223 } 224 this.totalEnqueues = parseLong(split[fIndex+2]); 225 this.sessionBalance = parseLong(split[fIndex+3]); 226 227 // Cost token is lastCost(averageCost) 228 String costToken = split[fIndex+4]; 229 int leftParenIdx = costToken.indexOf("("); 230 this.lastCost = parseDouble(costToken.substring(0, leftParenIdx)); 231 this.averageCost = parseDouble(costToken.substring(leftParenIdx + 1, costToken.indexOf(")"))); 232 this.lastDequeueTime = split[fIndex+5]; 233 this.wakeTime = split[fIndex+6]; 234 235 // Budget token is totalSpend/totalBudget 236 String[] budgetTokens = split[fIndex+7].split("/"); 237 if (budgetTokens.length != 2) { 238 LOG.warn("Found incorrect budget token '" + split[fIndex+7]); 239 } else { 240 this.totalSpend = parseLong(budgetTokens[0]); 241 this.totalBudget = parseLong(budgetTokens[1]); 242 } 243 244 this.errorCount = parseLong(split[fIndex+8]); 245 246 this.lastPeekUri = split[fIndex+9]; 247 this.lastQueuedUri = split[fIndex+10]; 248 249 } 250 251 /** 252 * @return the domainName 253 */ 254 public String getDomainName() { 255 return domainName; 256 } 257 258 /** 259 * @param domainName the domainName to set 260 */ 261 public void setDomainName(String domainName) { 262 this.domainName = domainName; 263 } 264 265 /** 266 * @return the currentSize 267 */ 268 public long getCurrentSize() { 269 return currentSize; 270 } 271 272 /** 273 * @param precedence the precedence to set 274 */ 275 public void setPrecedence(long precedence) { 276 this.precedence = precedence; 277 } 278 279 /** 280 * @return the precedence 281 */ 282 public long getPrecedence() { 283 return precedence; 284 } 285 286 /** 287 * @param currentSize the currentSize to set 288 */ 289 public void setCurrentSize(long currentSize) { 290 this.currentSize = currentSize; 291 } 292 293 /** 294 * @return the totalEnqueues 295 */ 296 public long getTotalEnqueues() { 297 return totalEnqueues; 298 } 299 300 /** 301 * @param totalEnqueues the totalEnqueues to set 302 */ 303 public void setTotalEnqueues(long totalEnqueues) { 304 this.totalEnqueues = totalEnqueues; 305 } 306 307 /** 308 * @return the sessionBalance 309 */ 310 public long getSessionBalance() { 311 return sessionBalance; 312 } 313 314 /** 315 * @param sessionBalance the sessionBalance to set 316 */ 317 public void setSessionBalance(long sessionBalance) { 318 this.sessionBalance = sessionBalance; 319 } 320 321 /** 322 * @return the lastCost 323 */ 324 public double getLastCost() { 325 return lastCost; 326 } 327 328 /** 329 * @param lastCost the lastCost to set 330 */ 331 public void setLastCost(double lastCost) { 332 this.lastCost = lastCost; 333 } 334 335 /** 336 * @return the averageCost 337 */ 338 public double getAverageCost() { 339 return averageCost; 340 } 341 342 /** 343 * @param averageCost the averageCost to set 344 */ 345 public void setAverageCost(double averageCost) { 346 this.averageCost = averageCost; 347 } 348 349 /** 350 * @return the lastDequeueTime 351 */ 352 public String getLastDequeueTime() { 353 return lastDequeueTime; 354 } 355 356 /** 357 * @param lastDequeueTime the lastDequeueTime to set 358 */ 359 public void setLastDequeueTime(String lastDequeueTime) { 360 this.lastDequeueTime = lastDequeueTime; 361 } 362 363 /** 364 * @return the wakeTime 365 */ 366 public String getWakeTime() { 367 return wakeTime; 368 } 369 370 /** 371 * @param wakeTime the wakeTime to set 372 */ 373 public void setWakeTime(String wakeTime) { 374 this.wakeTime = wakeTime; 375 } 376 377 /** 378 * @return the totalSpend 379 */ 380 public long getTotalSpend() { 381 return totalSpend; 382 } 383 384 /** 385 * @param totalSpend the totalSpend to set 386 */ 387 public void setTotalSpend(long totalSpend) { 388 this.totalSpend = totalSpend; 389 } 390 391 /** 392 * @return the totalBudget 393 */ 394 public long getTotalBudget() { 395 return totalBudget; 396 } 397 398 /** 399 * @param totalBudget the totalBudget to set 400 */ 401 public void setTotalBudget(long totalBudget) { 402 this.totalBudget = totalBudget; 403 } 404 405 /** 406 * @return the errorCount 407 */ 408 public long getErrorCount() { 409 return errorCount; 410 } 411 412 /** 413 * @param errorCount the errorCount to set 414 */ 415 public void setErrorCount(long errorCount) { 416 this.errorCount = errorCount; 417 } 418 419 /** 420 * @return the lastPeekUri 421 */ 422 public String getLastPeekUri() { 423 return lastPeekUri; 424 } 425 426 /** 427 * @param lastPeekUri the lastPeekUri to set 428 */ 429 public void setLastPeekUri(String lastPeekUri) { 430 this.lastPeekUri = lastPeekUri; 431 } 432 433 /** 434 * @return the lastQueuedUri 435 */ 436 public String getLastQueuedUri() { 437 return lastQueuedUri; 438 } 439 440 /** 441 * @param lastQueuedUri the lastQueuedUri to set 442 */ 443 public void setLastQueuedUri(String lastQueuedUri) { 444 this.lastQueuedUri = lastQueuedUri; 445 } 446 447 /** 448 * Default order relation is descending size of the queue (totalEnqueues). 449 */ 450 @Override 451 public int compareTo(FrontierReportLine l) { 452 return FrontierReportLineNaturalOrder.getInstance().compare(this, l); 453 } 454 455 /** 456 * There is one queue per domain, so equality is based on the domain name. 457 */ 458 @Override 459 public boolean equals(Object obj) { 460 if (obj instanceof FrontierReportLine) { 461 return domainName.equals(((FrontierReportLine) obj).getDomainName()); 462 } 463 return false; 464 } 465 466 /** 467 * There is one queue per domain, so hashcode is based on the domain name. 468 */ 469 @Override 470 public int hashCode() { 471 return domainName.hashCode(); 472 } 473 474 public String getQueueId() { 475 return domainName; 476 } 477 478 public long getQueueSize() { 479 return totalEnqueues; 480 } 481 482 /** 483 * Parses the token. 484 * 485 * @param longToken token to parse. 486 * @return parsed value or default value if value is empty or unparsable. 487 */ 488 private static long parseLong(String longToken) { 489 if (EMPTY_VALUE_TOKEN.equals(longToken)) { 490 return Long.MIN_VALUE; 491 } 492 try { 493 return Long.parseLong(longToken); 494 } catch (NumberFormatException e) { 495 // Strange data my occur here, but it's harmless 496 return Long.MIN_VALUE; 497 } 498 } 499 500 /** 501 * Parses the token. 502 * 503 * @param dblToken token to parse. 504 * @return parsed value or default value if value is empty or unparsable. 505 */ 506 private static double parseDouble(String dblToken) { 507 if (EMPTY_VALUE_TOKEN.equals(dblToken)) { 508 return Double.MIN_VALUE; 509 } 510 try { 511 return Double.parseDouble(dblToken); 512 } catch (NumberFormatException e) { 513 // Strange data my occur here, but it's harmless 514 return Double.MIN_VALUE; 515 } 516 } 517 518}