001/*
002 * #%L
003 * Netarchivesuite - harvester
004 * %%
005 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
006 *             the National Library of France and the Austrian National Library.
007 * %%
008 * This program is free software: you can redistribute it and/or modify
009 * it under the terms of the GNU Lesser General Public License as
010 * published by the Free Software Foundation, either version 2.1 of the
011 * License, or (at your option) any later version.
012 * 
013 * This program is distributed in the hope that it will be useful,
014 * but WITHOUT ANY WARRANTY; without even the implied warranty of
015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
016 * GNU General Lesser Public License for more details.
017 * 
018 * You should have received a copy of the GNU General Lesser Public
019 * License along with this program.  If not, see
020 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
021 * #L%
022 */
023package dk.netarkivet.harvester.harvesting.monitor;
024
025import java.net.MalformedURLException;
026import java.net.URL;
027import java.text.MessageFormat;
028import java.text.ParseException;
029import java.util.Date;
030
031import org.slf4j.Logger;
032import org.slf4j.LoggerFactory;
033
034import dk.netarkivet.common.exceptions.ArgumentNotValid;
035import dk.netarkivet.common.utils.StringUtils;
036import dk.netarkivet.harvester.datamodel.HarvestDefinitionDAO;
037import dk.netarkivet.harvester.harvesting.distribute.CrawlProgressMessage;
038import dk.netarkivet.harvester.harvesting.distribute.CrawlProgressMessage.CrawlServiceInfo;
039import dk.netarkivet.harvester.harvesting.distribute.CrawlProgressMessage.CrawlServiceJobInfo;
040import dk.netarkivet.harvester.harvesting.distribute.CrawlProgressMessage.CrawlStatus;
041
042/**
043 * This class is a simple bean storing information about a started job.
044 * <p>
045 * This class is a persistent entity as per Berkeley DB JE DPL API.
046 */
047public class StartedJobInfo implements Comparable<StartedJobInfo> {
048
049    /** The class logger. */
050    private static final Logger log = LoggerFactory.getLogger(StartedJobInfo.class);
051
052    /** list of the compare criteria. */
053    public enum Criteria {
054        JOBID, HOST, PROGRESS, ELAPSED, QFILES, TOTALQ, ACTIVEQ, EXHAUSTEDQ
055    }
056
057    ;
058
059    /** current compare criteria. */
060    private StartedJobInfo.Criteria compareCriteria = StartedJobInfo.Criteria.JOBID;
061
062    private static final String NOT_AVAILABLE_STRING = "";
063    private static final long NOT_AVAILABLE_NUM = -1L;
064
065    /** A text format used to parse Heritrix's short frontier report. */
066    private static final MessageFormat FRONTIER_SHORT_FMT = new MessageFormat(
067            "{0} queues: {1} active ({2} in-process; {3} ready; {4} snoozed); {5} inactive; {6} retired; {7} exhausted");
068
069    /** The job identifier. */
070    private long jobId;
071
072    /** The name of the harvest this job belongs to. */
073    private String harvestName;
074
075    /** Creation date of the record. */
076    private Date timestamp;
077
078    /** URL to the Heritrix admin console. */
079    private String hostUrl;
080
081    /** A percentage indicating the crawl progress. */
082    private double progress;
083
084    /** The number of URIs queued, awaiting to be processed by Heritrix. */
085    private long queuedFilesCount;
086
087    /**
088     * The number of URIS harvested by Heritrix since the beginning of the crawl.
089     */
090    private long downloadedFilesCount;
091
092    /**
093     * The total number of queues in the frontier. Queues are per-domain.
094     */
095    private long totalQueuesCount;
096
097    /** The number of queues in process. */
098    private long activeQueuesCount;
099
100    /** The number of queues that have been retired when they hit their quota. */
101    private long retiredQueuesCount;
102    /** Number of queues entirely processed. */
103    private long exhaustedQueuesCount;
104
105    /** Time in seconds elapsed since Heritrix began crawling this job. */
106    private long elapsedSeconds;
107
108    /** Current download rate in KB/sec. */
109    private long currentProcessedKBPerSec;
110
111    /** Average download rate (over a period of 20 seconds) in KB/sec. */
112    private long processedKBPerSec;
113
114    /** Current download rate in URI/sec. */
115    private double currentProcessedDocsPerSec;
116
117    /** Average download rate (over a period of 20 seconds) in URI/sec. */
118    private double processedDocsPerSec;
119
120    /** Number of active Heritrix worker threads. */
121    private int activeToeCount;
122
123    /** Number of alerts raised by Heritrix since the crawl began. */
124    private long alertsCount;
125
126    /** Current job status. */
127    private CrawlStatus status;
128
129    /**
130     * Needed by BDB DPL.
131     */
132    public StartedJobInfo() {
133
134    }
135
136    /**
137     * Instantiates all readable fields with default values.
138     *
139     * @param harvestName the name of the harvest
140     * @param jobId the ID of the job
141     */
142    public StartedJobInfo(String harvestName, long jobId) {
143        this.timestamp = new Date(System.currentTimeMillis());
144        this.jobId = jobId;
145        this.harvestName = harvestName;
146        this.hostUrl = NOT_AVAILABLE_STRING;
147        this.progress = NOT_AVAILABLE_NUM;
148        this.queuedFilesCount = NOT_AVAILABLE_NUM;
149        this.totalQueuesCount = NOT_AVAILABLE_NUM;
150        this.activeQueuesCount = NOT_AVAILABLE_NUM;
151        this.retiredQueuesCount = NOT_AVAILABLE_NUM;
152        this.exhaustedQueuesCount = NOT_AVAILABLE_NUM;
153        this.elapsedSeconds = NOT_AVAILABLE_NUM;
154        this.alertsCount = NOT_AVAILABLE_NUM;
155        this.downloadedFilesCount = NOT_AVAILABLE_NUM;
156        this.currentProcessedKBPerSec = NOT_AVAILABLE_NUM;
157        this.processedKBPerSec = NOT_AVAILABLE_NUM;
158        this.currentProcessedDocsPerSec = NOT_AVAILABLE_NUM;
159        this.processedDocsPerSec = NOT_AVAILABLE_NUM;
160        this.activeToeCount = (int) NOT_AVAILABLE_NUM;
161        this.status = CrawlStatus.PRE_CRAWL;
162    }
163
164    /**
165     * @return the job ID.
166     */
167    public long getJobId() {
168        return jobId;
169    }
170
171    /**
172     * @return the harvest name.
173     */
174    public String getHarvestName() {
175        return harvestName;
176    }
177
178    /**
179     * @return the timestamp
180     */
181    public Date getTimestamp() {
182        return timestamp;
183    }
184
185    /**
186     * @return the name of the host on which Heritrix is crawling this job.
187     */
188    public String getHostName() {
189        if (NOT_AVAILABLE_STRING.equals(hostUrl)) {
190            return NOT_AVAILABLE_STRING;
191        }
192        try {
193            return new URL(hostUrl).getHost();
194        } catch (MalformedURLException e) {
195            return NOT_AVAILABLE_STRING;
196        }
197    }
198
199    /**
200     * @return the URL of the Heritrix admin console for the instance crawling this job.
201     */
202    public String getHostUrl() {
203        return hostUrl;
204    }
205
206    /**
207     * @return the crawl progress as a numeric percentage.
208     */
209    public double getProgress() {
210        return progress;
211    }
212
213    /**
214     * @return the number of queued files reported by Heritrix.
215     */
216    public long getQueuedFilesCount() {
217        return queuedFilesCount;
218    }
219
220    /**
221     * @return the number of queues reported by Heritrix.
222     */
223    public long getTotalQueuesCount() {
224        return totalQueuesCount;
225    }
226
227    /**
228     * @return the number of active queues reported by Heritrix.
229     */
230    public long getActiveQueuesCount() {
231        return activeQueuesCount;
232    }
233
234    /**
235     * @return the number of retired heritrix queues.
236     */
237    public long getRetiredQueuesCount() {
238        return retiredQueuesCount;
239    }
240
241    /**
242     * @return the number of exhausted queues reported by Heritrix.
243     */
244    public long getExhaustedQueuesCount() {
245        return exhaustedQueuesCount;
246    }
247
248    /**
249     * @return the formatted duration of the crawl.
250     */
251    public String getElapsedTime() {
252        return StringUtils.formatDuration(elapsedSeconds);
253    }
254
255    /**
256     * @return the duration of the crawl so far.
257     */
258    public Long getElapsedSeconds() {
259        return elapsedSeconds;
260    }
261
262    /**
263     * @return the number of alerts raised by Heritrix.
264     */
265    public long getAlertsCount() {
266        return alertsCount;
267    }
268
269    /**
270     * @return the number of downloaded URIs reported by Heritrix.
271     */
272    public long getDownloadedFilesCount() {
273        return downloadedFilesCount;
274    }
275
276    /**
277     * @return the current download rate in KB/sec reported by Heritrix.
278     */
279    public long getCurrentProcessedKBPerSec() {
280        return currentProcessedKBPerSec;
281    }
282
283    /**
284     * @return the average download rate in KB/sec reported by Heritrix.
285     */
286    public long getProcessedKBPerSec() {
287        return processedKBPerSec;
288    }
289
290    /**
291     * @return the current download rate in URI/sec reported by Heritrix.
292     */
293    public double getCurrentProcessedDocsPerSec() {
294        return currentProcessedDocsPerSec;
295    }
296
297    /**
298     * @return the average download rate in URI/sec reported by Heritrix.
299     */
300    public double getProcessedDocsPerSec() {
301        return processedDocsPerSec;
302    }
303
304    /**
305     * @return the number of active processor threads reported by Heritrix.
306     */
307    public int getActiveToeCount() {
308        return activeToeCount;
309    }
310
311    /**
312     * @return the job status
313     * @see CrawlStatus
314     */
315    public CrawlStatus getStatus() {
316        return status;
317    }
318
319    @Override
320    public int compareTo(StartedJobInfo o) throws NullPointerException {
321
322        if (o == null) {
323            throw new NullPointerException("StartedJobInfo o can't be null");
324        }
325
326        if (compareCriteria == StartedJobInfo.Criteria.HOST) {
327            return hostUrl.compareTo(o.hostUrl);
328        }
329        if (compareCriteria == StartedJobInfo.Criteria.PROGRESS) {
330            return new Double(progress).compareTo(new Double(o.progress));
331        }
332        if (compareCriteria == StartedJobInfo.Criteria.ELAPSED) {
333            return Long.valueOf(elapsedSeconds).compareTo(Long.valueOf(o.elapsedSeconds));
334        }
335        if (compareCriteria == StartedJobInfo.Criteria.QFILES) {
336            return Long.valueOf(queuedFilesCount).compareTo(Long.valueOf(o.queuedFilesCount));
337        }
338        if (compareCriteria == StartedJobInfo.Criteria.TOTALQ) {
339            return Long.valueOf(totalQueuesCount).compareTo(Long.valueOf(o.totalQueuesCount));
340        }
341        if (compareCriteria == StartedJobInfo.Criteria.ACTIVEQ) {
342            return Long.valueOf(activeQueuesCount).compareTo(Long.valueOf(o.activeQueuesCount));
343        }
344        if (compareCriteria == StartedJobInfo.Criteria.EXHAUSTEDQ) {
345            return Long.valueOf(exhaustedQueuesCount).compareTo(Long.valueOf(o.exhaustedQueuesCount));
346        }
347        return Long.valueOf(jobId).compareTo(Long.valueOf(o.jobId));
348    }
349
350    /**
351     * set the criteria used in the compareTo method that way we can decide how to sort StartedJobInfo.
352     *
353     * @param criteria the criteria we want to use
354     */
355    public void chooseCompareCriteria(StartedJobInfo.Criteria criteria) {
356        ArgumentNotValid.checkNotNull(criteria, "criteria can't be null");
357        compareCriteria = criteria;
358    }
359
360    @Override
361    public String toString() {
362        return harvestName + " - " + jobId + " {" + "\n\tstatus=" + status.name() + "\n\telapsedSeconds="
363                + elapsedSeconds + "\n\thostUrl=" + hostUrl + "\n\tprogress=" + progress + "\n\tactiveToeCount="
364                + activeToeCount + "\n\talertsCount=" + alertsCount + "\n\tcurrentProcessedKBPerSec="
365                + currentProcessedKBPerSec + "\n\tprocessedKBPerSec=" + processedKBPerSec
366                + "\n\tcurrentProcessedDocsPerSec=" + currentProcessedDocsPerSec + "\n\tprocessedDocsPerSec="
367                + processedDocsPerSec + "\n\tdownloadedFilesCount=" + downloadedFilesCount + "\n\tqueuedFilesCount="
368                + queuedFilesCount + "\n\tactiveQueuesCount=" + activeQueuesCount + "\n\texhaustedQueuesCount="
369                + exhaustedQueuesCount + "\n\ttotalQueuesCount=" + totalQueuesCount + "\n}";
370    }
371
372    /**
373     * Updates the members from a {@link CrawlProgressMessage} instance.
374     *
375     * @param msg the {@link CrawlProgressMessage} to process.
376     * @return jobinfo based on the contents of the message.
377     */
378    public static StartedJobInfo build(CrawlProgressMessage msg) {
379        ArgumentNotValid.checkNotNull(msg, "CrawlProgressMessage msg");
380        String harvestName = HarvestDefinitionDAO.getInstance().getHarvestName(msg.getHarvestID());
381
382        StartedJobInfo sji = new StartedJobInfo(harvestName, msg.getJobID());
383
384        CrawlServiceInfo heritrixInfo = msg.getHeritrixStatus();
385        CrawlServiceJobInfo jobInfo = msg.getJobStatus();
386
387        CrawlStatus newStatus = msg.getStatus();
388        switch (newStatus) {
389        case PRE_CRAWL:
390            // Initialize statistics-variables before starting the crawl.
391            sji.activeQueuesCount = 0;
392            sji.activeToeCount = 0;
393            sji.alertsCount = 0;
394            sji.currentProcessedDocsPerSec = 0;
395            sji.currentProcessedKBPerSec = 0;
396            sji.downloadedFilesCount = 0;
397            sji.elapsedSeconds = 0;
398            sji.hostUrl = "";
399            sji.processedDocsPerSec = 0;
400            sji.processedKBPerSec = 0;
401            sji.progress = 0;
402            sji.queuedFilesCount = 0;
403            sji.totalQueuesCount = 0;
404            break;
405        case CRAWLER_ACTIVE:
406        case CRAWLER_PAUSING:
407        case CRAWLER_PAUSED:
408            // Update statistics for the crawl
409            double discoveredCount = jobInfo.getDiscoveredFilesCount();
410            double downloadedCount = jobInfo.getDownloadedFilesCount();
411            sji.progress = 100 * downloadedCount / discoveredCount;
412
413            String frontierShortReport = jobInfo.getFrontierShortReport();
414            if (frontierShortReport != null) {
415                try {
416                    Object[] params = FRONTIER_SHORT_FMT.parse(frontierShortReport);
417                    sji.totalQueuesCount = Long.parseLong((String) params[0]);
418                    sji.activeQueuesCount = Long.parseLong((String) params[1]);
419                    sji.retiredQueuesCount = Long.parseLong((String) params[6]);
420                    sji.exhaustedQueuesCount = Long.parseLong((String) params[7]);
421                } catch (ParseException e) {
422                    throw new ArgumentNotValid(frontierShortReport, e);
423                }
424            }
425
426            sji.activeToeCount = jobInfo.getActiveToeCount();
427            sji.alertsCount = heritrixInfo.getAlertCount();
428            sji.currentProcessedDocsPerSec = jobInfo.getCurrentProcessedDocsPerSec();
429            sji.currentProcessedKBPerSec = jobInfo.getCurrentProcessedKBPerSec();
430            sji.downloadedFilesCount = jobInfo.getDownloadedFilesCount();
431            sji.elapsedSeconds = jobInfo.getElapsedSeconds();
432            sji.hostUrl = msg.getHostUrl();
433            sji.processedDocsPerSec = jobInfo.getProcessedDocsPerSec();
434            sji.processedKBPerSec = jobInfo.getProcessedKBPerSec();
435            sji.queuedFilesCount = jobInfo.getQueuedUriCount();
436            break;
437        case CRAWLING_FINISHED:
438            // Set progress to 100 %, and reset the other values .
439            sji.progress = 100;
440            sji.hostUrl = "";
441            sji.activeQueuesCount = 0;
442            sji.activeToeCount = 0;
443            sji.currentProcessedDocsPerSec = 0;
444            sji.currentProcessedKBPerSec = 0;
445            sji.processedDocsPerSec = 0;
446            sji.processedKBPerSec = 0;
447            sji.queuedFilesCount = 0;
448            sji.totalQueuesCount = 0;
449            break;
450        default:
451            log.debug("Nothing to do for state: {}", newStatus);
452            break;
453        }
454        sji.status = newStatus;
455
456        return sji;
457    }
458
459    /**
460     * @param hostUrl the hostUrl to set
461     */
462    public void setHostUrl(String hostUrl) {
463        this.hostUrl = hostUrl;
464    }
465
466    /**
467     * @param progress the progress to set
468     */
469    public void setProgress(double progress) {
470        this.progress = progress;
471    }
472
473    /**
474     * @param queuedFilesCount the queuedFilesCount to set
475     */
476    public void setQueuedFilesCount(long queuedFilesCount) {
477        this.queuedFilesCount = queuedFilesCount;
478    }
479
480    /**
481     * @param downloadedFilesCount the downloadedFilesCount to set
482     */
483    public void setDownloadedFilesCount(long downloadedFilesCount) {
484        this.downloadedFilesCount = downloadedFilesCount;
485    }
486
487    /**
488     * @param totalQueuesCount the totalQueuesCount to set
489     */
490    public void setTotalQueuesCount(long totalQueuesCount) {
491        this.totalQueuesCount = totalQueuesCount;
492    }
493
494    /**
495     * @param activeQueuesCount the activeQueuesCount to set
496     */
497    public void setActiveQueuesCount(long activeQueuesCount) {
498        this.activeQueuesCount = activeQueuesCount;
499    }
500
501    /**
502     * @param exhaustedQueuesCount the exhaustedQueuesCount to set
503     */
504    public void setExhaustedQueuesCount(long exhaustedQueuesCount) {
505        this.exhaustedQueuesCount = exhaustedQueuesCount;
506    }
507
508    /**
509     * @param elapsedSeconds the elapsedSeconds to set
510     */
511    public void setElapsedSeconds(long elapsedSeconds) {
512        this.elapsedSeconds = elapsedSeconds;
513    }
514
515    /**
516     * @param currentProcessedKBPerSec the currentProcessedKBPerSec to set
517     */
518    public void setCurrentProcessedKBPerSec(long currentProcessedKBPerSec) {
519        this.currentProcessedKBPerSec = currentProcessedKBPerSec;
520    }
521
522    /**
523     * @param processedKBPerSec the processedKBPerSec to set
524     */
525    public void setProcessedKBPerSec(long processedKBPerSec) {
526        this.processedKBPerSec = processedKBPerSec;
527    }
528
529    /**
530     * @param currentProcessedDocsPerSec the currentProcessedDocsPerSec to set
531     */
532    public void setCurrentProcessedDocsPerSec(double currentProcessedDocsPerSec) {
533        this.currentProcessedDocsPerSec = currentProcessedDocsPerSec;
534    }
535
536    /**
537     * @param processedDocsPerSec the processedDocsPerSec to set
538     */
539    public void setProcessedDocsPerSec(double processedDocsPerSec) {
540        this.processedDocsPerSec = processedDocsPerSec;
541    }
542
543    /**
544     * @param activeToeCount the activeToeCount to set
545     */
546    public void setActiveToeCount(int activeToeCount) {
547        this.activeToeCount = activeToeCount;
548    }
549
550    /**
551     * @param alertsCount the alertsCount to set
552     */
553    public void setAlertsCount(long alertsCount) {
554        this.alertsCount = alertsCount;
555    }
556
557    /**
558     * @param status the status to set
559     */
560    public void setStatus(CrawlStatus status) {
561        this.status = status;
562    }
563
564    /**
565     * @param timestamp the timestamp to set
566     */
567    public void setTimestamp(Date timestamp) {
568        this.timestamp = timestamp;
569    }
570
571    /**
572     * @param retiredQueuesCount the retiredQueuesCount to set
573     */
574    public void setRetiredQueuesCount(long retiredQueuesCount) {
575        this.retiredQueuesCount = retiredQueuesCount;
576    }
577
578}