001/*
002 * #%L
003 * Netarchivesuite - harvester
004 * %%
005 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
006 *             the National Library of France and the Austrian National Library.
007 * %%
008 * This program is free software: you can redistribute it and/or modify
009 * it under the terms of the GNU Lesser General Public License as
010 * published by the Free Software Foundation, either version 2.1 of the
011 * License, or (at your option) any later version.
012 * 
013 * This program is distributed in the hope that it will be useful,
014 * but WITHOUT ANY WARRANTY; without even the implied warranty of
015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
016 * GNU General Lesser Public License for more details.
017 * 
018 * You should have received a copy of the GNU General Lesser Public
019 * License along with this program.  If not, see
020 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
021 * #L%
022 */
023package dk.netarkivet.harvester.harvesting.frontier;
024
025import java.io.Serializable;
026
027import org.slf4j.Logger;
028import org.slf4j.LoggerFactory;
029
030import com.sleepycat.persist.model.Persistent;
031
032import dk.netarkivet.common.exceptions.ArgumentNotValid;
033
034/**
035 * Wraps a line of the frontier report. As of Heritrix 1.14.4 the format of a frontier report line sequentially lists
036 * the following tokens, separated by a whitespace :
037 * <p>
038 * <ol>
039 * <li>queue</li>
040 * <li>currentSize</li>
041 * <li>totalEnqueues</li>
042 * <li>sessionBalance</li>
043 * <li>lastCost(averageCost)</li>
044 * <li>lastDequeueTime</li>
045 * <li>wakeTime</li>
046 * <li>totalSpend/totalBudget</li>
047 * <li>errorCount</li>
048 * <li>lastPeekUri</li>
049 * <li>lastQueuedUri</li>
050 * </ol>
051 * <p>In Heritrix 3.2.0, there is a new field named <strong>precedence</strong> which comes after queue.
052 * More information here : https://webarchive.jira.com/wiki/display/Heritrix/Precedence+Feature+Notes
053 * </p>
054 * <p>
055 * This class implements a natural order : comparisons are made : - first by decreasing values of totalEnqueues -
056 * secondly by domain name (string natural order)
057 * <p>
058 * Thanks to Gordon Mohr at Internet Archive for explaining the exact semantics of the frontier report fields.
059 */
060@Persistent
061@SuppressWarnings({"serial"})
062public class FrontierReportLine implements Serializable, Comparable<FrontierReportLine>, FrontierReportLineOrderKey {
063
064    /** The logger for this class. */
065        private static final Logger LOG = LoggerFactory.getLogger(FrontierReportLine.class);
066
067    /**
068     * Expected size of string array when we split the line token across "\\s+" for heritrix 1 frontier format.
069     */
070    private static final int EXPECTED_SPLIT_SEGMENTS_H1 = 11;
071    
072    /**
073     * Expected size of string array when we split the line token across "\\s+" for heritrix 3 frontier format.
074     */
075    private static final int EXPECTED_SPLIT_SEGMENTS_H3 = 12;
076
077    /**
078     * Token used to signify an empty value.
079     */
080    static final String EMPTY_VALUE_TOKEN = "-";
081
082    /**
083     * The queue name, in our case the domain, as we use per domain queues.
084     */
085    private String domainName;
086
087    /** Number of URIs currently in the queue. */
088    private long currentSize;
089    
090    /**
091     * new field in heritrix 3 frontier report
092     * @see https://webarchive.jira.com/wiki/display/Heritrix/Precedence+Feature+Notes
093     */
094    private long precedence;
095
096    /**
097     * Count of total times a URI has been enqueued to this queue; a measure of the total number of URI instances ever
098     * put on this queue. This can be a larger number than the unique URIs, as some URIs (most notably DNS/robots when
099     * refetched, but possibly other things force-requeued under advanced usage) may be enqueued more than once.
100     */
101    private long totalEnqueues;
102
103    /**
104     * When using the 'budget/rotation' functionality (a non-zero URI cost policy), this is the running 'balance' of a
105     * queue during its current 'active' session. This balance declines; when it hits zero, another queue (if any are
106     * waiting 'inactive') gets a chance to enter active crawling (as fast as politeness allows).
107     */
108    private long sessionBalance;
109
110    /**
111     * The 'cost' of the last URI charged against the queue's budgets. If using a cost policy that makes some URIs more
112     * costly than others, this may indicate the queue has reached more-costly URIs. (Such larger-cost URIs will be
113     * inserted later in the queue, accelerate the depletion of the session balance, and accelerate progress towards the
114     * total queue budget, which could send the queue into 'retirement'. Thus higher-cost URIs mean a queue over time
115     * gets less of the crawler's cycles.)
116     */
117    private double lastCost;
118
119    /** Average cost of a processed URI. */
120    private double averageCost;
121
122    /**
123     * Timestamp of when the last URI came off this queue for processing. May give an indication of how long a queue has
124     * been empty/inactive.
125     */
126    private String lastDequeueTime;
127
128    /**
129     * If the queue is in any sort of politeness- or connect-problem-'snooze' delay, this indicates when it will again
130     * be eligible to offer URIs to waiting threads. (When it wakes, it gets in line -- so actual wait before next URI
131     * is tried may be longer depending on the balance of threads and other active queues.)
132     */
133    private String wakeTime;
134
135    /**
136     * The total of all URI costs charged against this queue.
137     */
138    private long totalSpend;
139
140    /**
141     * The totalBudget above which the queue will be retired (made permanently inactive unless its totalBudget is
142     * raised).
143     */
144    private long totalBudget;
145
146    /**
147     * The number of URIs from this queue that reached 'finished' status with an error code (non-retryable errors, or
148     * exhausted retries, or other errors). When nonzero and rising there may be special problems with the site(s)
149     * related to this queue.
150     */
151    private long errorCount;
152
153    /**
154     * The last URI peeked/dequeued from the head of this queue.
155     */
156    private String lastPeekUri;
157
158    /**
159     * The last URI enqueued to anywhere in this queue.
160     */
161    private String lastQueuedUri;
162
163    /**
164     * Default empty constructor.
165     */
166    public FrontierReportLine() {
167
168    }
169
170    /**
171     * Builds a cloned line.
172     *
173     * @param original the line to clone
174     */
175    protected FrontierReportLine(FrontierReportLine original) {
176        this.averageCost = original.averageCost;
177        this.currentSize = original.currentSize;
178        this.domainName = original.domainName;
179        this.errorCount = original.errorCount;
180        this.lastCost = original.lastCost;
181        this.lastDequeueTime = original.lastDequeueTime;
182        this.lastPeekUri = original.lastPeekUri;
183        this.lastQueuedUri = original.lastQueuedUri;
184        this.sessionBalance = original.sessionBalance;
185        this.totalBudget = original.totalBudget;
186        this.totalEnqueues = original.totalEnqueues;
187        this.totalSpend = original.totalSpend;
188        this.wakeTime = original.wakeTime;
189    }
190
191    /**
192     * Parses the given string.
193     * Handle both heritrix 1 and heritrix 3 frontier report line format
194     *
195     * @param lineToken the string to parse.
196     */
197    FrontierReportLine(String lineToken) {
198
199        String[] split = lineToken.split("\\s+");
200        int heritrixVersion = 0;
201
202        if(split.length == EXPECTED_SPLIT_SEGMENTS_H1) {
203                heritrixVersion = 1;
204        } else if (split.length == EXPECTED_SPLIT_SEGMENTS_H3){
205                heritrixVersion = 3;
206        }
207        else {
208            throw new ArgumentNotValid("Format of line token '" + lineToken + "' is not a valid frontier report line!");
209        }
210
211        this.domainName = split[0];
212
213        int fIndex = 0;
214        if(heritrixVersion == 3) {
215                fIndex = 1;
216                this.precedence = parseLong(split[1]);
217        }
218        
219        try {
220            this.currentSize = parseLong(split[fIndex+1]);
221        } catch (NumberFormatException e) {
222            LOG.warn("Found incorrect formatted currentsize " + split[fIndex+1]);
223        }
224        this.totalEnqueues = parseLong(split[fIndex+2]);
225        this.sessionBalance = parseLong(split[fIndex+3]);
226
227        // Cost token is lastCost(averageCost)
228        String costToken = split[fIndex+4];
229        int leftParenIdx = costToken.indexOf("(");
230        this.lastCost = parseDouble(costToken.substring(0, leftParenIdx));
231        this.averageCost = parseDouble(costToken.substring(leftParenIdx + 1, costToken.indexOf(")")));
232        this.lastDequeueTime = split[fIndex+5];
233        this.wakeTime = split[fIndex+6];
234
235        // Budget token is totalSpend/totalBudget
236        String[] budgetTokens = split[fIndex+7].split("/");
237        if (budgetTokens.length != 2) {
238            LOG.warn("Found incorrect budget token '" + split[fIndex+7]);
239        } else {
240            this.totalSpend = parseLong(budgetTokens[0]);
241            this.totalBudget = parseLong(budgetTokens[1]);
242        }
243
244        this.errorCount = parseLong(split[fIndex+8]);
245
246        this.lastPeekUri = split[fIndex+9];
247        this.lastQueuedUri = split[fIndex+10];
248
249    }
250
251    /**
252     * @return the domainName
253     */
254    public String getDomainName() {
255        return domainName;
256    }
257
258    /**
259     * @param domainName the domainName to set
260     */
261    public void setDomainName(String domainName) {
262        this.domainName = domainName;
263    }
264
265    /**
266     * @return the currentSize
267     */
268    public long getCurrentSize() {
269        return currentSize;
270    }
271
272    /**
273     * @param precedence the precedence to set
274     */
275    public void setPrecedence(long precedence) {
276        this.precedence = precedence;
277    }
278
279    /**
280     * @return the precedence
281     */
282    public long getPrecedence() {
283        return precedence;
284    }
285
286    /**
287     * @param currentSize the currentSize to set
288     */
289    public void setCurrentSize(long currentSize) {
290        this.currentSize = currentSize;
291    }
292
293    /**
294     * @return the totalEnqueues
295     */
296    public long getTotalEnqueues() {
297        return totalEnqueues;
298    }
299
300    /**
301     * @param totalEnqueues the totalEnqueues to set
302     */
303    public void setTotalEnqueues(long totalEnqueues) {
304        this.totalEnqueues = totalEnqueues;
305    }
306
307    /**
308     * @return the sessionBalance
309     */
310    public long getSessionBalance() {
311        return sessionBalance;
312    }
313
314    /**
315     * @param sessionBalance the sessionBalance to set
316     */
317    public void setSessionBalance(long sessionBalance) {
318        this.sessionBalance = sessionBalance;
319    }
320
321    /**
322     * @return the lastCost
323     */
324    public double getLastCost() {
325        return lastCost;
326    }
327
328    /**
329     * @param lastCost the lastCost to set
330     */
331    public void setLastCost(double lastCost) {
332        this.lastCost = lastCost;
333    }
334
335    /**
336     * @return the averageCost
337     */
338    public double getAverageCost() {
339        return averageCost;
340    }
341
342    /**
343     * @param averageCost the averageCost to set
344     */
345    public void setAverageCost(double averageCost) {
346        this.averageCost = averageCost;
347    }
348
349    /**
350     * @return the lastDequeueTime
351     */
352    public String getLastDequeueTime() {
353        return lastDequeueTime;
354    }
355
356    /**
357     * @param lastDequeueTime the lastDequeueTime to set
358     */
359    public void setLastDequeueTime(String lastDequeueTime) {
360        this.lastDequeueTime = lastDequeueTime;
361    }
362
363    /**
364     * @return the wakeTime
365     */
366    public String getWakeTime() {
367        return wakeTime;
368    }
369
370    /**
371     * @param wakeTime the wakeTime to set
372     */
373    public void setWakeTime(String wakeTime) {
374        this.wakeTime = wakeTime;
375    }
376
377    /**
378     * @return the totalSpend
379     */
380    public long getTotalSpend() {
381        return totalSpend;
382    }
383
384    /**
385     * @param totalSpend the totalSpend to set
386     */
387    public void setTotalSpend(long totalSpend) {
388        this.totalSpend = totalSpend;
389    }
390
391    /**
392     * @return the totalBudget
393     */
394    public long getTotalBudget() {
395        return totalBudget;
396    }
397
398    /**
399     * @param totalBudget the totalBudget to set
400     */
401    public void setTotalBudget(long totalBudget) {
402        this.totalBudget = totalBudget;
403    }
404
405    /**
406     * @return the errorCount
407     */
408    public long getErrorCount() {
409        return errorCount;
410    }
411
412    /**
413     * @param errorCount the errorCount to set
414     */
415    public void setErrorCount(long errorCount) {
416        this.errorCount = errorCount;
417    }
418
419    /**
420     * @return the lastPeekUri
421     */
422    public String getLastPeekUri() {
423        return lastPeekUri;
424    }
425
426    /**
427     * @param lastPeekUri the lastPeekUri to set
428     */
429    public void setLastPeekUri(String lastPeekUri) {
430        this.lastPeekUri = lastPeekUri;
431    }
432
433    /**
434     * @return the lastQueuedUri
435     */
436    public String getLastQueuedUri() {
437        return lastQueuedUri;
438    }
439
440    /**
441     * @param lastQueuedUri the lastQueuedUri to set
442     */
443    public void setLastQueuedUri(String lastQueuedUri) {
444        this.lastQueuedUri = lastQueuedUri;
445    }
446
447    /**
448     * Default order relation is descending size of the queue (totalEnqueues).
449     */
450    @Override
451    public int compareTo(FrontierReportLine l) {
452        return FrontierReportLineNaturalOrder.getInstance().compare(this, l);
453    }
454
455    /**
456     * There is one queue per domain, so equality is based on the domain name.
457     */
458    @Override
459    public boolean equals(Object obj) {
460        if (obj instanceof FrontierReportLine) {
461            return domainName.equals(((FrontierReportLine) obj).getDomainName());
462        }
463        return false;
464    }
465
466    /**
467     * There is one queue per domain, so hashcode is based on the domain name.
468     */
469    @Override
470    public int hashCode() {
471        return domainName.hashCode();
472    }
473
474    public String getQueueId() {
475        return domainName;
476    }
477
478    public long getQueueSize() {
479        return totalEnqueues;
480    }
481
482    /**
483     * Parses the token.
484     *
485     * @param longToken token to parse.
486     * @return parsed value or default value if value is empty or unparsable.
487     */
488    private static long parseLong(String longToken) {
489        if (EMPTY_VALUE_TOKEN.equals(longToken)) {
490            return Long.MIN_VALUE;
491        }
492        try {
493            return Long.parseLong(longToken);
494        } catch (NumberFormatException e) {
495            // Strange data my occur here, but it's harmless
496            return Long.MIN_VALUE;
497        }
498    }
499
500    /**
501     * Parses the token.
502     *
503     * @param dblToken token to parse.
504     * @return parsed value or default value if value is empty or unparsable.
505     */
506    private static double parseDouble(String dblToken) {
507        if (EMPTY_VALUE_TOKEN.equals(dblToken)) {
508            return Double.MIN_VALUE;
509        }
510        try {
511            return Double.parseDouble(dblToken);
512        } catch (NumberFormatException e) {
513            // Strange data my occur here, but it's harmless
514            return Double.MIN_VALUE;
515        }
516    }
517
518}