001/*
002 * #%L
003 * NetarchiveSuite - harvester - core
004 * %%
005 * Copyright (C) 2005 - 2018 The Royal Danish Library, the National Library of France and the Austrian National Library.
006 * %%
007 * This program is free software: you can redistribute it and/or modify
008 * it under the terms of the GNU Lesser General Public License as
009 * published by the Free Software Foundation, either version 2.1 of the
010 * License, or (at your option) any later version.
011 * 
012 * This program is distributed in the hope that it will be useful,
013 * but WITHOUT ANY WARRANTY; without even the implied warranty of
014 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
015 * GNU General Lesser Public License for more details.
016 * 
017 * You should have received a copy of the GNU General Lesser Public
018 * License along with this program.  If not, see
019 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
020 * #L%
021 */
022package dk.netarkivet.harvester.harvesting.distribute;
023
024import java.io.Serializable;
025
026import org.slf4j.Logger;
027import org.slf4j.LoggerFactory;
028
029import dk.netarkivet.common.distribute.Channels;
030import dk.netarkivet.harvester.distribute.HarvesterMessage;
031import dk.netarkivet.harvester.distribute.HarvesterMessageVisitor;
032import dk.netarkivet.harvester.harvesting.monitor.HarvestMonitor;
033import dk.netarkivet.harvester.harvesting.report.Heritrix1Constants;
034
035/**
036 * This class wraps information stored in the Heritrix MBeans, CrawlService and CrawlService.Job, and represents the
037 * crawl progress.
038 * <p>
039 * Additionally this object extends {@link HarvesterMessage} so that it can be sent on the JMS bus to be processed by
040 * {@link HarvestMonitor}.
041 */
042@SuppressWarnings({"serial"})
043public class CrawlProgressMessage extends HarvesterMessage implements Serializable {
044
045        /** The logger for this class. */
046        
047    private static final Logger log = LoggerFactory.getLogger(CrawlProgressMessage.class);
048        
049    /**
050     * The general status of a job in NAS.
051     */
052    public static enum CrawlStatus {
053        /**
054         * Initial status of a job: Heritrix has not yet started crawling.
055         */
056        PRE_CRAWL,
057        /**
058         * Heritrix is actively crawling.
059         */
060        CRAWLER_ACTIVE,
061        /**
062         * Heritrix is active but is crawling nothing.
063         */
064        CRAWLER_EMPTY,
065        /**
066         * Heritrix is crawling but is currently pausing.
067         */
068        CRAWLER_PAUSING,
069        /**
070         * Heritrix is crawling but has been paused by the user.
071         */
072        CRAWLER_PAUSED,
073        /**
074         * Heritrix has finished crawling, post processing of metadata and ARC files remains to be done.
075         */
076        CRAWLING_FINISHED
077    }
078
079    /**
080     * Wraps CrawlService MBean attributes.
081     */
082    public class CrawlServiceInfo implements Serializable {
083
084        /** The number of alerts raised by Heritrix. */
085        private int alertCount;
086
087        /** Flag is set to true when Heritrix is crawling or paused. */
088        private boolean isCrawling;
089
090        /** Contains the UID of the current job. */
091        private String currentJob;
092
093        public int getAlertCount() {
094            return alertCount;
095        }
096
097        public void setAlertCount(int alertCount) {
098            this.alertCount = alertCount;
099        }
100
101        public boolean isCrawling() {
102            return isCrawling;
103        }
104
105        public void setCrawling(boolean isCrawling) {
106            this.isCrawling = isCrawling;
107        }
108
109        public String getCurrentJob() {
110            return currentJob;
111        }
112
113        public void setCurrentJob(String currentJob) {
114            this.currentJob = currentJob;
115        }
116    }
117
118    /**
119     * Wraps CrawlService.Job MBean attributes.
120     */
121    public class CrawlServiceJobInfo implements Serializable {
122
123        /** The number of URIs currently discovered. */
124        private long discoveredFilesCount;
125
126        /** The number of URIs currently harvested. */
127        private long downloadedFilesCount;
128
129        /** A summary of the frontier queues. */
130        private String frontierShortReport;
131
132        /** The time in seconds elapsed since the crawl began. */
133        private long elapsedSeconds;
134
135        /** The current download rate in KB/sec. */
136        private long currentProcessedKBPerSec;
137
138        /** The average download rate in KB/sec. */
139        private long processedKBPerSec;
140
141        /** The current download rate in URI/sec. */
142        private double currentProcessedDocsPerSec;
143
144        /** The average download rate in URI/sec. */
145        private double processedDocsPerSec;
146
147        /** The number of active toe threads for this job. */
148        private int activeToeCount;
149
150        /** A textual summary of the crawler activity. */
151        private String progressStatistics;
152
153        /** The job status. */
154        private String status;
155
156        public long getDiscoveredFilesCount() {
157            return discoveredFilesCount;
158        }
159
160        public void setDiscoveredFilesCount(long discoveredFilesCount) {
161            this.discoveredFilesCount = discoveredFilesCount;
162        }
163
164        public long getDownloadedFilesCount() {
165            return downloadedFilesCount;
166        }
167
168        public void setDownloadedFilesCount(long downloadedFilesCount) {
169            this.downloadedFilesCount = downloadedFilesCount;
170        }
171
172        public String getFrontierShortReport() {
173            return frontierShortReport;
174        }
175
176        public void setFrontierShortReport(String frontierShortReport) {
177            this.frontierShortReport = frontierShortReport;
178        }
179
180        public long getElapsedSeconds() {
181            return elapsedSeconds;
182        }
183
184        public void setElapsedSeconds(long elapsedSeconds) {
185            this.elapsedSeconds = elapsedSeconds;
186        }
187
188        public long getCurrentProcessedKBPerSec() {
189            return currentProcessedKBPerSec;
190        }
191
192        public void setCurrentProcessedKBPerSec(long currentProcessedKBPerSec) {
193            this.currentProcessedKBPerSec = currentProcessedKBPerSec;
194        }
195
196        public long getProcessedKBPerSec() {
197            return processedKBPerSec;
198        }
199
200        public void setProcessedKBPerSec(long processedKBPerSec) {
201            this.processedKBPerSec = processedKBPerSec;
202        }
203
204        public double getCurrentProcessedDocsPerSec() {
205            return currentProcessedDocsPerSec;
206        }
207
208        public void setCurrentProcessedDocsPerSec(double currentProcessedDocsPerSec) {
209            this.currentProcessedDocsPerSec = currentProcessedDocsPerSec;
210        }
211
212        public double getProcessedDocsPerSec() {
213            return processedDocsPerSec;
214        }
215
216        public void setProcessedDocsPerSec(double processedDocsPerSec) {
217            this.processedDocsPerSec = processedDocsPerSec;
218        }
219
220        public int getActiveToeCount() {
221            return activeToeCount;
222        }
223
224        public void setActiveToeCount(int activeToeCount) {
225            this.activeToeCount = activeToeCount;
226        }
227
228        public String getProgressStatistics() {
229            return progressStatistics;
230        }
231
232        public void setProgressStatistics(String progressStatistics) {
233            this.progressStatistics = progressStatistics;
234        }
235
236        /**
237         * Helper method that approximates the number of queued URIs.
238         *
239         * @return the number of queued URIs
240         */
241        public long getQueuedUriCount() {
242            return discoveredFilesCount - downloadedFilesCount;
243        }
244
245        public String getStatus() {
246            return status;
247        }
248
249        public void setStatus(String status) {
250            this.status = status;
251        }
252
253    }
254
255    /** The unique identifier of the job. */
256    private final long jobID;
257
258    /** The unique identifier of the associated harvest definition. */
259    private long harvestID;
260
261    /** The URL to the host Heritrix admin UI. */
262    private String hostUrl;
263
264    /** The job's status. */
265    private CrawlStatus status;
266
267    /** A legend, fetched only once, for the {@link CrawlServiceJobInfo#progressStatistics} property. */
268    private final String progressStatisticsLegend;
269
270    /** The information provided by the CrawlService MBean. */
271    private CrawlServiceInfo heritrixStatus = new CrawlServiceInfo();
272
273    /** The information provided by the CrawlService.Job MBean. */
274    private CrawlServiceJobInfo jobStatus = new CrawlServiceJobInfo();
275
276    /**
277     * Builds an empty message. MBean wrapper values are not set and the appropriate getters should be used to do so.
278     *
279     * @param harvestID the harvest definition ID
280     * @param jobId the job ID
281     * @param progressStatisticsLegend the legend of the progress statistics summary string
282     * @see CrawlProgressMessage#progressStatisticsLegend
283     */
284    public CrawlProgressMessage(long harvestID, long jobId, String progressStatisticsLegend) {
285        super(HarvestMonitor.HARVEST_MONITOR_CHANNEL_ID, Channels.getError());
286        this.harvestID = harvestID;
287        this.jobID = jobId;
288        this.status = CrawlStatus.PRE_CRAWL;
289        this.progressStatisticsLegend = progressStatisticsLegend;
290    }
291
292    /**
293     * Builds an empty message. MBean wrapper values are not set and the appropriate getters should be used to do so.
294     * The progressStatisticsLegend is set to the empty string.
295     *
296     * @param harvestID the harvest definition ID
297     * @param jobId the job ID
298     */
299    public CrawlProgressMessage(long harvestID, long jobId) {
300        this(harvestID, jobId, "");
301    }
302
303    public long getHarvestID() {
304        return harvestID;
305    }
306
307    public String getHostUrl() {
308        return hostUrl;
309    }
310
311    public void setHostUrl(String hostUrl) {
312        this.hostUrl = hostUrl;
313    }
314
315    public CrawlStatus getStatus() {
316        return status;
317    }
318
319    public void setStatus(CrawlStatus status) {
320        this.status = status;
321    }
322
323    public long getJobID() {
324        return jobID;
325    }
326
327    public String getProgressStatisticsLegend() {
328        return progressStatisticsLegend;
329    }
330
331    public CrawlServiceInfo getHeritrixStatus() {
332        return heritrixStatus;
333    }
334
335    public CrawlServiceJobInfo getJobStatus() {
336        return jobStatus;
337    }
338
339    @Override
340    public void accept(HarvesterMessageVisitor v) {
341        v.visit(this);
342    }
343
344    /**
345     * Returns true if the crawler has been paused, and thus not supposed to fetch anything. Heritrix may still be
346     * fetching stuff, as it takes some time for it to go into full pause mode. This method can be used as an indicator
347     * that we should not be worried if Heritrix appears to be idle.
348     *
349     * @return True if the crawler has been paused, e.g. by using the Heritrix GUI.
350     */
351    public boolean isPaused() {
352        return CrawlStatus.CRAWLER_PAUSED.equals(status);
353    }
354
355    /**
356     * Checks whether Heritrix has finished crawling the job.
357     *
358     * @return true if Heritrix has finished crawling the job, false otherwise.
359     */
360    public boolean crawlIsFinished() {
361        // Evidently heritrixStatus.currentJob is set to "", if no job is crawling
362        boolean jobInProgress = heritrixStatus.isCrawling() && !heritrixStatus.getCurrentJob().isEmpty();
363
364        if (!jobInProgress) {
365                // FIXME does this work for H3 as well (If not modify the above logic)
366                log.info("Job {} seems to be no longer in progress. ", jobID);
367            return true;
368        }
369        
370        String statusAsString = getJobStatus().getStatus();
371        
372        if (statusAsString != null) {
373                // FIXME probably only works for H1 equals to the String "FINISHED"
374                log.info("StatusAsString = '{}'", statusAsString);
375            return statusAsString.equals(Heritrix1Constants.CRAWLCONTROLLER_FINISHED);
376        } 
377        // statusAsString is null
378        log.info("statusAsString is null for job {}. Considering the crawl to be not finished", jobID);
379        
380        return false;
381    }
382
383}