001/*
002 * #%L
003 * Netarchivesuite - harvester
004 * %%
005 * Copyright (C) 2005 - 2018 The Royal Danish Library, 
006 *             the National Library of France and the Austrian National Library.
007 * %%
008 * This program is free software: you can redistribute it and/or modify
009 * it under the terms of the GNU Lesser General Public License as
010 * published by the Free Software Foundation, either version 2.1 of the
011 * License, or (at your option) any later version.
012 * 
013 * This program is distributed in the hope that it will be useful,
014 * but WITHOUT ANY WARRANTY; without even the implied warranty of
015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
016 * GNU General Lesser Public License for more details.
017 * 
018 * You should have received a copy of the GNU General Lesser Public
019 * License along with this program.  If not, see
020 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
021 * #L%
022 */
023package dk.netarkivet.harvester.harvesting.monitor;
024
025import java.net.MalformedURLException;
026import java.net.URL;
027import java.text.MessageFormat;
028import java.text.ParseException;
029import java.util.Date;
030
031import org.slf4j.Logger;
032import org.slf4j.LoggerFactory;
033
034import dk.netarkivet.common.exceptions.ArgumentNotValid;
035import dk.netarkivet.common.utils.StringUtils;
036import dk.netarkivet.harvester.datamodel.HarvestDefinitionDAO;
037import dk.netarkivet.harvester.harvesting.distribute.CrawlProgressMessage;
038import dk.netarkivet.harvester.harvesting.distribute.CrawlProgressMessage.CrawlServiceInfo;
039import dk.netarkivet.harvester.harvesting.distribute.CrawlProgressMessage.CrawlServiceJobInfo;
040import dk.netarkivet.harvester.harvesting.distribute.CrawlProgressMessage.CrawlStatus;
041
042/**
043 * This class is a simple bean storing information about a started job.
044 * <p>
045 * This class is a persistent entity as per Berkeley DB JE DPL API.
046 */
047public class StartedJobInfo implements Comparable<StartedJobInfo> {
048
049    /** The class logger. */
050    private static final Logger log = LoggerFactory.getLogger(StartedJobInfo.class);
051
052    /** list of the compare criteria. */
053    public enum Criteria {
054        JOBID, HOST, PROGRESS, ELAPSED, QFILES, TOTALQ, ACTIVEQ, INACTIVEQ, EXHAUSTEDQ
055    }
056
057    ;
058
059    /** current compare criteria. */
060    private StartedJobInfo.Criteria compareCriteria = StartedJobInfo.Criteria.JOBID;
061
062    private static final String NOT_AVAILABLE_STRING = "";
063    private static final long NOT_AVAILABLE_NUM = -1L;
064
065    /** A text format used to parse Heritrix's short frontier report. */
066    private static final MessageFormat FRONTIER_SHORT_FMT = new MessageFormat(
067            "{0} queues: {1} active ({2} in-process; {3} ready; {4} snoozed); {5} inactive; {6} retired; {7} exhausted");
068
069    /** The job identifier. */
070    private long jobId;
071
072    /** The name of the harvest this job belongs to. */
073    private String harvestName;
074
075    /** Creation date of the record. */
076    private Date timestamp;
077
078    /** URL to the Heritrix admin console. */
079    private String hostUrl;
080
081    /** A percentage indicating the crawl progress. */
082    private double progress;
083
084    /** The number of URIs queued, awaiting to be processed by Heritrix. */
085    private long queuedFilesCount;
086
087    /**
088     * The number of URIS harvested by Heritrix since the beginning of the crawl.
089     */
090    private long downloadedFilesCount;
091
092    /**
093     * The total number of queues in the frontier. Queues are per-domain.
094     */
095    private long totalQueuesCount;
096
097    /** The number of queues in process. */
098    private long activeQueuesCount;
099    
100    /** The number of inactive queues in process. */
101    private long inactiveQueuesCount;
102
103    /** The number of queues that have been retired when they hit their quota. */
104    private long retiredQueuesCount;
105    /** Number of queues entirely processed. */
106    private long exhaustedQueuesCount;
107
108    /** Time in seconds elapsed since Heritrix began crawling this job. */
109    private long elapsedSeconds;
110
111    /** Current download rate in KB/sec. */
112    private long currentProcessedKBPerSec;
113
114    /** Average download rate (over a period of 20 seconds) in KB/sec. */
115    private long processedKBPerSec;
116
117    /** Current download rate in URI/sec. */
118    private double currentProcessedDocsPerSec;
119
120    /** Average download rate (over a period of 20 seconds) in URI/sec. */
121    private double processedDocsPerSec;
122
123    /** Number of active Heritrix worker threads. */
124    private int activeToeCount;
125
126    /** Number of alerts raised by Heritrix since the crawl began. */
127    private long alertsCount;
128
129    /** Current job status. */
130    private CrawlStatus status;
131
132    /**
133     * Needed by BDB DPL.
134     */
135    public StartedJobInfo() {
136
137    }
138
139    /**
140     * Instantiates all readable fields with default values.
141     *
142     * @param harvestName the name of the harvest
143     * @param jobId the ID of the job
144     */
145    public StartedJobInfo(String harvestName, long jobId) {
146        this.timestamp = new Date(System.currentTimeMillis());
147        this.jobId = jobId;
148        this.harvestName = harvestName;
149        this.hostUrl = NOT_AVAILABLE_STRING;
150        this.progress = NOT_AVAILABLE_NUM;
151        this.queuedFilesCount = NOT_AVAILABLE_NUM;
152        this.totalQueuesCount = NOT_AVAILABLE_NUM;
153        this.activeQueuesCount = NOT_AVAILABLE_NUM;
154        this.inactiveQueuesCount = NOT_AVAILABLE_NUM;
155        this.retiredQueuesCount = NOT_AVAILABLE_NUM;
156        this.exhaustedQueuesCount = NOT_AVAILABLE_NUM;
157        this.elapsedSeconds = NOT_AVAILABLE_NUM;
158        this.alertsCount = NOT_AVAILABLE_NUM;
159        this.downloadedFilesCount = NOT_AVAILABLE_NUM;
160        this.currentProcessedKBPerSec = NOT_AVAILABLE_NUM;
161        this.processedKBPerSec = NOT_AVAILABLE_NUM;
162        this.currentProcessedDocsPerSec = NOT_AVAILABLE_NUM;
163        this.processedDocsPerSec = NOT_AVAILABLE_NUM;
164        this.activeToeCount = (int) NOT_AVAILABLE_NUM;
165        this.status = CrawlStatus.PRE_CRAWL;
166    }
167
168    /**
169     * @return the job ID.
170     */
171    public long getJobId() {
172        return jobId;
173    }
174
175    /**
176     * @return the harvest name.
177     */
178    public String getHarvestName() {
179        return harvestName;
180    }
181
182    /**
183     * @return the timestamp
184     */
185    public Date getTimestamp() {
186        return timestamp;
187    }
188
189    /**
190     * @return the name of the host on which Heritrix is crawling this job.
191     */
192    public String getHostName() {
193        if (NOT_AVAILABLE_STRING.equals(hostUrl)) {
194            return NOT_AVAILABLE_STRING;
195        }
196        try {
197            return new URL(hostUrl).getHost();
198        } catch (MalformedURLException e) {
199            return NOT_AVAILABLE_STRING;
200        }
201    }
202
203    /**
204     * @return the URL of the Heritrix admin console for the instance crawling this job.
205     */
206    public String getHostUrl() {
207        return hostUrl;
208    }
209
210    /**
211     * @return the crawl progress as a numeric percentage.
212     */
213    public double getProgress() {
214        return progress;
215    }
216
217    /**
218     * @return the number of queued files reported by Heritrix.
219     */
220    public long getQueuedFilesCount() {
221        return queuedFilesCount;
222    }
223
224    /**
225     * @return the number of queues reported by Heritrix.
226     */
227    public long getTotalQueuesCount() {
228        return totalQueuesCount;
229    }
230
231    /**
232     * @return the number of active queues reported by Heritrix.
233     */
234    public long getActiveQueuesCount() {
235        return activeQueuesCount;
236    }
237
238    /**
239     * @return the number of retired heritrix queues.
240     */
241    public long getRetiredQueuesCount() {
242        return retiredQueuesCount;
243    }
244
245    /**
246     * @return the number of exhausted queues reported by Heritrix.
247     */
248    public long getExhaustedQueuesCount() {
249        return exhaustedQueuesCount;
250    }
251
252    /**
253     * @return the formatted duration of the crawl.
254     */
255    public String getElapsedTime() {
256        return StringUtils.formatDuration(elapsedSeconds);
257    }
258
259    /**
260     * @return the duration of the crawl so far.
261     */
262    public Long getElapsedSeconds() {
263        return elapsedSeconds;
264    }
265
266    /**
267     * @return the number of alerts raised by Heritrix.
268     */
269    public long getAlertsCount() {
270        return alertsCount;
271    }
272
273    /**
274     * @return the number of downloaded URIs reported by Heritrix.
275     */
276    public long getDownloadedFilesCount() {
277        return downloadedFilesCount;
278    }
279
280    /**
281     * @return the current download rate in KB/sec reported by Heritrix.
282     */
283    public long getCurrentProcessedKBPerSec() {
284        return currentProcessedKBPerSec;
285    }
286
287    /**
288     * @return the average download rate in KB/sec reported by Heritrix.
289     */
290    public long getProcessedKBPerSec() {
291        return processedKBPerSec;
292    }
293
294    /**
295     * @return the current download rate in URI/sec reported by Heritrix.
296     */
297    public double getCurrentProcessedDocsPerSec() {
298        return currentProcessedDocsPerSec;
299    }
300
301    /**
302     * @return the average download rate in URI/sec reported by Heritrix.
303     */
304    public double getProcessedDocsPerSec() {
305        return processedDocsPerSec;
306    }
307
308    /**
309     * @return the number of active processor threads reported by Heritrix.
310     */
311    public int getActiveToeCount() {
312        return activeToeCount;
313    }
314
315    /**
316     * @return the job status
317     * @see CrawlStatus
318     */
319    public CrawlStatus getStatus() {
320        return status;
321    }
322
323    @Override
324    public int compareTo(StartedJobInfo o) throws NullPointerException {
325
326        if (o == null) {
327            throw new NullPointerException("StartedJobInfo o can't be null");
328        }
329
330        if (compareCriteria == StartedJobInfo.Criteria.HOST) {
331            return hostUrl.compareTo(o.hostUrl);
332        }
333        if (compareCriteria == StartedJobInfo.Criteria.PROGRESS) {
334            return new Double(progress).compareTo(new Double(o.progress));
335        }
336        if (compareCriteria == StartedJobInfo.Criteria.ELAPSED) {
337            return Long.valueOf(elapsedSeconds).compareTo(Long.valueOf(o.elapsedSeconds));
338        }
339        if (compareCriteria == StartedJobInfo.Criteria.QFILES) {
340            return Long.valueOf(queuedFilesCount).compareTo(Long.valueOf(o.queuedFilesCount));
341        }
342        if (compareCriteria == StartedJobInfo.Criteria.TOTALQ) {
343            return Long.valueOf(totalQueuesCount).compareTo(Long.valueOf(o.totalQueuesCount));
344        }
345        if (compareCriteria == StartedJobInfo.Criteria.ACTIVEQ) {
346            return Long.valueOf(activeQueuesCount).compareTo(Long.valueOf(o.activeQueuesCount));
347        }
348        if (compareCriteria == StartedJobInfo.Criteria.INACTIVEQ) {
349            return Long.valueOf(inactiveQueuesCount).compareTo(Long.valueOf(o.inactiveQueuesCount));
350        }
351        if (compareCriteria == StartedJobInfo.Criteria.EXHAUSTEDQ) {
352            return Long.valueOf(exhaustedQueuesCount).compareTo(Long.valueOf(o.exhaustedQueuesCount));
353        }
354        return Long.valueOf(jobId).compareTo(Long.valueOf(o.jobId));
355    }
356
357    /**
358     * set the criteria used in the compareTo method that way we can decide how to sort StartedJobInfo.
359     *
360     * @param criteria the criteria we want to use
361     */
362    public void chooseCompareCriteria(StartedJobInfo.Criteria criteria) {
363        ArgumentNotValid.checkNotNull(criteria, "criteria can't be null");
364        compareCriteria = criteria;
365    }
366
367    @Override
368    public String toString() {
369        return harvestName + " - " + jobId + " {" + "\n\tstatus=" + status.name() + "\n\telapsedSeconds="
370                + elapsedSeconds + "\n\thostUrl=" + hostUrl + "\n\tprogress=" + progress + "\n\tactiveToeCount="
371                + activeToeCount + "\n\talertsCount=" + alertsCount + "\n\tcurrentProcessedKBPerSec="
372                + currentProcessedKBPerSec + "\n\tprocessedKBPerSec=" + processedKBPerSec
373                + "\n\tcurrentProcessedDocsPerSec=" + currentProcessedDocsPerSec + "\n\tprocessedDocsPerSec="
374                + processedDocsPerSec + "\n\tdownloadedFilesCount=" + downloadedFilesCount + "\n\tqueuedFilesCount="
375                + queuedFilesCount + "\n\tactiveQueuesCount=" + activeQueuesCount + "\n\tinactiveQueuesCount="
376                + inactiveQueuesCount + "\n\texhaustedQueuesCount="
377                + exhaustedQueuesCount + "\n\ttotalQueuesCount=" + totalQueuesCount + "\n}";
378    }
379
380    /**
381     * Updates the members from a {@link CrawlProgressMessage} instance.
382     *
383     * @param msg the {@link CrawlProgressMessage} to process.
384     * @return jobinfo based on the contents of the message.
385     */
386    public static StartedJobInfo build(CrawlProgressMessage msg) {
387        ArgumentNotValid.checkNotNull(msg, "CrawlProgressMessage msg");
388        String harvestName = HarvestDefinitionDAO.getInstance().getHarvestName(msg.getHarvestID());
389
390        StartedJobInfo sji = new StartedJobInfo(harvestName, msg.getJobID());
391
392        CrawlServiceInfo heritrixInfo = msg.getHeritrixStatus();
393        CrawlServiceJobInfo jobInfo = msg.getJobStatus();
394
395        CrawlStatus newStatus = msg.getStatus();
396        switch (newStatus) {
397        case PRE_CRAWL:
398            // Initialize statistics-variables before starting the crawl.
399            sji.activeQueuesCount = 0;
400            sji.inactiveQueuesCount = 0;
401            sji.activeToeCount = 0;
402            sji.alertsCount = 0;
403            sji.currentProcessedDocsPerSec = 0;
404            sji.currentProcessedKBPerSec = 0;
405            sji.downloadedFilesCount = 0;
406            sji.elapsedSeconds = 0;
407            sji.hostUrl = "";
408            sji.processedDocsPerSec = 0;
409            sji.processedKBPerSec = 0;
410            sji.progress = 0;
411            sji.queuedFilesCount = 0;
412            sji.totalQueuesCount = 0;
413            break;
414        case CRAWLER_ACTIVE:
415        case CRAWLER_PAUSING:
416        case CRAWLER_PAUSED:
417            // Update statistics for the crawl
418            double discoveredCount = jobInfo.getDiscoveredFilesCount();
419            double downloadedCount = jobInfo.getDownloadedFilesCount();
420            sji.progress = 100 * downloadedCount / discoveredCount;
421
422            String frontierShortReport = jobInfo.getFrontierShortReport();
423            if (frontierShortReport != null) {
424                try {
425                    Object[] params = FRONTIER_SHORT_FMT.parse(frontierShortReport);
426                    sji.totalQueuesCount = Long.parseLong((String) params[0]);
427                    sji.activeQueuesCount = Long.parseLong((String) params[1]);
428                    sji.inactiveQueuesCount = Long.parseLong((String) params[5]);
429                    //FIXME : delete retiredQueuesCount to keep only inactiveQueuesCount
430                    sji.retiredQueuesCount = Long.parseLong((String) params[5]);
431                    sji.exhaustedQueuesCount = Long.parseLong((String) params[7]);
432                } catch (ParseException e) {
433                    throw new ArgumentNotValid(frontierShortReport, e);
434                }
435            }
436
437            sji.activeToeCount = jobInfo.getActiveToeCount();
438            sji.alertsCount = heritrixInfo.getAlertCount();
439            sji.currentProcessedDocsPerSec = jobInfo.getCurrentProcessedDocsPerSec();
440            sji.currentProcessedKBPerSec = jobInfo.getCurrentProcessedKBPerSec();
441            sji.downloadedFilesCount = jobInfo.getDownloadedFilesCount();
442            sji.elapsedSeconds = jobInfo.getElapsedSeconds();
443            sji.hostUrl = msg.getHostUrl();
444            sji.processedDocsPerSec = jobInfo.getProcessedDocsPerSec();
445            sji.processedKBPerSec = jobInfo.getProcessedKBPerSec();
446            sji.queuedFilesCount = jobInfo.getQueuedUriCount();
447            break;
448        case CRAWLING_FINISHED:
449            // Set progress to 100 %, and reset the other values .
450            sji.progress = 100;
451            sji.hostUrl = "";
452            sji.activeQueuesCount = 0;
453            sji.inactiveQueuesCount = 0;
454            sji.activeToeCount = 0;
455            sji.currentProcessedDocsPerSec = 0;
456            sji.currentProcessedKBPerSec = 0;
457            sji.processedDocsPerSec = 0;
458            sji.processedKBPerSec = 0;
459            sji.queuedFilesCount = 0;
460            sji.totalQueuesCount = 0;
461            break;
462        default:
463            log.debug("Nothing to do for state: {}", newStatus);
464            break;
465        }
466        sji.status = newStatus;
467
468        //FIXME : save inactiveQueuesCount in database
469        return sji;
470    }
471
472    /**
473     * @param hostUrl the hostUrl to set
474     */
475    public void setHostUrl(String hostUrl) {
476
477        this.hostUrl = hostUrl;
478    }
479
480    /**
481     * @param progress the progress to set
482     */
483    public void setProgress(double progress) {
484        this.progress = progress;
485    }
486
487    /**
488     * @param queuedFilesCount the queuedFilesCount to set
489     */
490    public void setQueuedFilesCount(long queuedFilesCount) {
491        this.queuedFilesCount = queuedFilesCount;
492    }
493
494    /**
495     * @param downloadedFilesCount the downloadedFilesCount to set
496     */
497    public void setDownloadedFilesCount(long downloadedFilesCount) {
498        this.downloadedFilesCount = downloadedFilesCount;
499    }
500
501    /**
502     * @param totalQueuesCount the totalQueuesCount to set
503     */
504    public void setTotalQueuesCount(long totalQueuesCount) {
505        this.totalQueuesCount = totalQueuesCount;
506    }
507
508    /**
509     * @param activeQueuesCount the activeQueuesCount to set
510     */
511    public void setActiveQueuesCount(long activeQueuesCount) {
512        this.activeQueuesCount = activeQueuesCount;
513    }
514
515    /**
516     * @param exhaustedQueuesCount the exhaustedQueuesCount to set
517     */
518    public void setExhaustedQueuesCount(long exhaustedQueuesCount) {
519        this.exhaustedQueuesCount = exhaustedQueuesCount;
520    }
521
522    /**
523     * @param elapsedSeconds the elapsedSeconds to set
524     */
525    public void setElapsedSeconds(long elapsedSeconds) {
526        this.elapsedSeconds = elapsedSeconds;
527    }
528
529    /**
530     * @param currentProcessedKBPerSec the currentProcessedKBPerSec to set
531     */
532    public void setCurrentProcessedKBPerSec(long currentProcessedKBPerSec) {
533        this.currentProcessedKBPerSec = currentProcessedKBPerSec;
534    }
535
536    /**
537     * @param processedKBPerSec the processedKBPerSec to set
538     */
539    public void setProcessedKBPerSec(long processedKBPerSec) {
540        this.processedKBPerSec = processedKBPerSec;
541    }
542
543    /**
544     * @param currentProcessedDocsPerSec the currentProcessedDocsPerSec to set
545     */
546    public void setCurrentProcessedDocsPerSec(double currentProcessedDocsPerSec) {
547        this.currentProcessedDocsPerSec = currentProcessedDocsPerSec;
548    }
549
550    /**
551     * @param processedDocsPerSec the processedDocsPerSec to set
552     */
553    public void setProcessedDocsPerSec(double processedDocsPerSec) {
554        this.processedDocsPerSec = processedDocsPerSec;
555    }
556
557    /**
558     * @param activeToeCount the activeToeCount to set
559     */
560    public void setActiveToeCount(int activeToeCount) {
561        this.activeToeCount = activeToeCount;
562    }
563
564    /**
565     * @param alertsCount the alertsCount to set
566     */
567    public void setAlertsCount(long alertsCount) {
568        this.alertsCount = alertsCount;
569    }
570
571    /**
572     * @param status the status to set
573     */
574    public void setStatus(CrawlStatus status) {
575        this.status = status;
576    }
577
578    /**
579     * @param timestamp the timestamp to set
580     */
581    public void setTimestamp(Date timestamp) {
582        this.timestamp = timestamp;
583    }
584
585    /**
586     * @param retiredQueuesCount the retiredQueuesCount to set
587     */
588    public void setRetiredQueuesCount(long retiredQueuesCount) {
589        this.retiredQueuesCount = retiredQueuesCount;
590    }
591
592        public long getInactiveQueuesCount() {
593                //FIXME : delete retiredQueuesCount from code and database
594                //and use only inactiveQueuesCount
595                //return inactiveQueuesCount;
596                return retiredQueuesCount;
597        }
598
599        public void setInactiveQueuesCount(long inactiveQueuesCount) {
600                this.inactiveQueuesCount = inactiveQueuesCount;
601        }
602
603}