001/*
002 * #%L
003 * Netarchivesuite - harvester
004 * %%
005 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
006 *             the National Library of France and the Austrian National Library.
007 * %%
008 * This program is free software: you can redistribute it and/or modify
009 * it under the terms of the GNU Lesser General Public License as
010 * published by the Free Software Foundation, either version 2.1 of the
011 * License, or (at your option) any later version.
012 * 
013 * This program is distributed in the hope that it will be useful,
014 * but WITHOUT ANY WARRANTY; without even the implied warranty of
015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
016 * GNU General Lesser Public License for more details.
017 * 
018 * You should have received a copy of the GNU General Lesser Public
019 * License along with this program.  If not, see
020 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
021 * #L%
022 */
023package dk.netarkivet.harvester.harvesting.distribute;
024
025import java.io.Serializable;
026
027import org.slf4j.Logger;
028import org.slf4j.LoggerFactory;
029
030import dk.netarkivet.common.distribute.Channels;
031import dk.netarkivet.harvester.distribute.HarvesterMessage;
032import dk.netarkivet.harvester.distribute.HarvesterMessageVisitor;
033import dk.netarkivet.harvester.harvesting.monitor.HarvestMonitor;
034import dk.netarkivet.harvester.harvesting.report.Heritrix1Constants;
035
036/**
037 * This class wraps information stored in the Heritrix MBeans, CrawlService and CrawlService.Job, and represents the
038 * crawl progress.
039 * <p>
040 * Additionally this object extends {@link HarvesterMessage} so that it can be sent on the JMS bus to be processed by
041 * {@link HarvestMonitor}.
042 */
043@SuppressWarnings({"serial"})
044public class CrawlProgressMessage extends HarvesterMessage implements Serializable {
045
046        /** The logger for this class. */
047        
048    private static final Logger log = LoggerFactory.getLogger(CrawlProgressMessage.class);
049        
050    /**
051     * The general status of a job in NAS.
052     */
053    public static enum CrawlStatus {
054        /**
055         * Initial status of a job: Heritrix has not yet started crawling.
056         */
057        PRE_CRAWL,
058        /**
059         * Heritrix is actively crawling.
060         */
061        CRAWLER_ACTIVE,
062        /**
063         * Heritrix is crawling but is currently pausing.
064         */
065        CRAWLER_PAUSING,
066        /**
067         * Heritrix is crawling but has been paused by the user.
068         */
069        CRAWLER_PAUSED,
070        /**
071         * Heritrix has finished crawling, post processing of metadata and ARC files remains to be done.
072         */
073        CRAWLING_FINISHED
074    }
075
076    /**
077     * Wraps CrawlService MBean attributes.
078     */
079    public class CrawlServiceInfo implements Serializable {
080
081        /** The number of alerts raised by Heritrix. */
082        private int alertCount;
083
084        /** Flag is set to true when Heritrix is crawling or paused. */
085        private boolean isCrawling;
086
087        /** Contains the UID of the current job. */
088        private String currentJob;
089
090        public int getAlertCount() {
091            return alertCount;
092        }
093
094        public void setAlertCount(int alertCount) {
095            this.alertCount = alertCount;
096        }
097
098        public boolean isCrawling() {
099            return isCrawling;
100        }
101
102        public void setCrawling(boolean isCrawling) {
103            this.isCrawling = isCrawling;
104        }
105
106        public String getCurrentJob() {
107            return currentJob;
108        }
109
110        public void setCurrentJob(String currentJob) {
111            this.currentJob = currentJob;
112        }
113    }
114
115    /**
116     * Wraps CrawlService.Job MBean attributes.
117     */
118    public class CrawlServiceJobInfo implements Serializable {
119
120        /** The number of URIs currently discovered. */
121        private long discoveredFilesCount;
122
123        /** The number of URIs currently harvested. */
124        private long downloadedFilesCount;
125
126        /** A summary of the frontier queues. */
127        private String frontierShortReport;
128
129        /** The time in seconds elapsed since the crawl began. */
130        private long elapsedSeconds;
131
132        /** The current download rate in KB/sec. */
133        private long currentProcessedKBPerSec;
134
135        /** The average download rate in KB/sec. */
136        private long processedKBPerSec;
137
138        /** The current download rate in URI/sec. */
139        private double currentProcessedDocsPerSec;
140
141        /** The average download rate in URI/sec. */
142        private double processedDocsPerSec;
143
144        /** The number of active toe threads for this job. */
145        private int activeToeCount;
146
147        /** A textual summary of the crawler activity. */
148        private String progressStatistics;
149
150        /** The job status. */
151        private String status;
152
153        public long getDiscoveredFilesCount() {
154            return discoveredFilesCount;
155        }
156
157        public void setDiscoveredFilesCount(long discoveredFilesCount) {
158            this.discoveredFilesCount = discoveredFilesCount;
159        }
160
161        public long getDownloadedFilesCount() {
162            return downloadedFilesCount;
163        }
164
165        public void setDownloadedFilesCount(long downloadedFilesCount) {
166            this.downloadedFilesCount = downloadedFilesCount;
167        }
168
169        public String getFrontierShortReport() {
170            return frontierShortReport;
171        }
172
173        public void setFrontierShortReport(String frontierShortReport) {
174            this.frontierShortReport = frontierShortReport;
175        }
176
177        public long getElapsedSeconds() {
178            return elapsedSeconds;
179        }
180
181        public void setElapsedSeconds(long elapsedSeconds) {
182            this.elapsedSeconds = elapsedSeconds;
183        }
184
185        public long getCurrentProcessedKBPerSec() {
186            return currentProcessedKBPerSec;
187        }
188
189        public void setCurrentProcessedKBPerSec(long currentProcessedKBPerSec) {
190            this.currentProcessedKBPerSec = currentProcessedKBPerSec;
191        }
192
193        public long getProcessedKBPerSec() {
194            return processedKBPerSec;
195        }
196
197        public void setProcessedKBPerSec(long processedKBPerSec) {
198            this.processedKBPerSec = processedKBPerSec;
199        }
200
201        public double getCurrentProcessedDocsPerSec() {
202            return currentProcessedDocsPerSec;
203        }
204
205        public void setCurrentProcessedDocsPerSec(double currentProcessedDocsPerSec) {
206            this.currentProcessedDocsPerSec = currentProcessedDocsPerSec;
207        }
208
209        public double getProcessedDocsPerSec() {
210            return processedDocsPerSec;
211        }
212
213        public void setProcessedDocsPerSec(double processedDocsPerSec) {
214            this.processedDocsPerSec = processedDocsPerSec;
215        }
216
217        public int getActiveToeCount() {
218            return activeToeCount;
219        }
220
221        public void setActiveToeCount(int activeToeCount) {
222            this.activeToeCount = activeToeCount;
223        }
224
225        public String getProgressStatistics() {
226            return progressStatistics;
227        }
228
229        public void setProgressStatistics(String progressStatistics) {
230            this.progressStatistics = progressStatistics;
231        }
232
233        /**
234         * Helper method that approximates the number of queued URIs.
235         *
236         * @return the number of queued URIs
237         */
238        public long getQueuedUriCount() {
239            return discoveredFilesCount - downloadedFilesCount;
240        }
241
242        public String getStatus() {
243            return status;
244        }
245
246        public void setStatus(String status) {
247            this.status = status;
248        }
249
250    }
251
252    /** The unique identifier of the job. */
253    private final long jobID;
254
255    /** The unique identifier of the associated harvest definition. */
256    private long harvestID;
257
258    /** The URL to the host Heritrix admin UI. */
259    private String hostUrl;
260
261    /** The job's status. */
262    private CrawlStatus status;
263
264    /** A legend, fetched only once, for the {@link CrawlServiceJobInfo#progressStatistics} property. */
265    private final String progressStatisticsLegend;
266
267    /** The information provided by the CrawlService MBean. */
268    private CrawlServiceInfo heritrixStatus = new CrawlServiceInfo();
269
270    /** The information provided by the CrawlService.Job MBean. */
271    private CrawlServiceJobInfo jobStatus = new CrawlServiceJobInfo();
272
273    /**
274     * Builds an empty message. MBean wrapper values are not set and the appropriate getters should be used to do so.
275     *
276     * @param harvestID the harvest definition ID
277     * @param jobId the job ID
278     * @param progressStatisticsLegend the legend of the progress statistics summary string
279     * @see CrawlProgressMessage#progressStatisticsLegend
280     */
281    public CrawlProgressMessage(long harvestID, long jobId, String progressStatisticsLegend) {
282        super(HarvestMonitor.HARVEST_MONITOR_CHANNEL_ID, Channels.getError());
283        this.harvestID = harvestID;
284        this.jobID = jobId;
285        this.status = CrawlStatus.PRE_CRAWL;
286        this.progressStatisticsLegend = progressStatisticsLegend;
287    }
288
289    /**
290     * Builds an empty message. MBean wrapper values are not set and the appropriate getters should be used to do so.
291     * The progressStatisticsLegend is set to the empty string.
292     *
293     * @param harvestID the harvest definition ID
294     * @param jobId the job ID
295     */
296    public CrawlProgressMessage(long harvestID, long jobId) {
297        this(harvestID, jobId, "");
298    }
299
300    public long getHarvestID() {
301        return harvestID;
302    }
303
304    public String getHostUrl() {
305        return hostUrl;
306    }
307
308    public void setHostUrl(String hostUrl) {
309        this.hostUrl = hostUrl;
310    }
311
312    public CrawlStatus getStatus() {
313        return status;
314    }
315
316    public void setStatus(CrawlStatus status) {
317        this.status = status;
318    }
319
320    public long getJobID() {
321        return jobID;
322    }
323
324    public String getProgressStatisticsLegend() {
325        return progressStatisticsLegend;
326    }
327
328    public CrawlServiceInfo getHeritrixStatus() {
329        return heritrixStatus;
330    }
331
332    public CrawlServiceJobInfo getJobStatus() {
333        return jobStatus;
334    }
335
336    @Override
337    public void accept(HarvesterMessageVisitor v) {
338        v.visit(this);
339    }
340
341    /**
342     * Returns true if the crawler has been paused, and thus not supposed to fetch anything. Heritrix may still be
343     * fetching stuff, as it takes some time for it to go into full pause mode. This method can be used as an indicator
344     * that we should not be worried if Heritrix appears to be idle.
345     *
346     * @return True if the crawler has been paused, e.g. by using the Heritrix GUI.
347     */
348    public boolean isPaused() {
349        return CrawlStatus.CRAWLER_PAUSED.equals(status);
350    }
351
352    /**
353     * Checks whether Heritrix has finished crawling the job.
354     *
355     * @return true if Heritrix has finished crawling the job, false otherwise.
356     */
357    public boolean crawlIsFinished() {
358        // Evidently heritrixStatus.currentJob is set to "", if no job is crawling
359        boolean jobInProgress = heritrixStatus.isCrawling() && !heritrixStatus.getCurrentJob().isEmpty();
360
361        if (!jobInProgress) {
362                // FIXME does this work for H3 as well (If not modify the above logic)
363                log.info("Job {} seems to be no longer in progress. ", jobID);
364            return true;
365        }
366        
367        String statusAsString = getJobStatus().getStatus();
368        
369        if (statusAsString != null) {
370                // FIXME probably only works for H1 equals to the String "FINISHED"
371                log.info("StatusAsString = '{}'", statusAsString);
372            return statusAsString.equals(Heritrix1Constants.CRAWLCONTROLLER_FINISHED);
373        } 
374        // statusAsString is null
375        log.info("statusAsString is null for job {}. Considering the crawl to be not finished", jobID);
376        
377        return false;
378    }
379
380}