001/*
002 * #%L
003 * Netarchivesuite - heritrix 3 monitor
004 * %%
005 * Copyright (C) 2005 - 2018 The Royal Danish Library, 
006 *             the National Library of France and the Austrian National Library.
007 * %%
008 * This program is free software: you can redistribute it and/or modify
009 * it under the terms of the GNU Lesser General Public License as
010 * published by the Free Software Foundation, either version 2.1 of the
011 * License, or (at your option) any later version.
012 * 
013 * This program is distributed in the hope that it will be useful,
014 * but WITHOUT ANY WARRANTY; without even the implied warranty of
015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
016 * GNU General Lesser Public License for more details.
017 * 
018 * You should have received a copy of the GNU General Lesser Public
019 * License along with this program.  If not, see
020 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
021 * #L%
022 */
023
024package dk.netarkivet.heritrix3.monitor;
025
026import java.io.File;
027import java.io.IOException;
028import java.io.RandomAccessFile;
029import java.util.HashMap;
030import java.util.Iterator;
031import java.util.List;
032import java.util.Map;
033
034import org.apache.commons.io.IOUtils;
035import org.netarchivesuite.heritrix3wrapper.ByteRange;
036import org.netarchivesuite.heritrix3wrapper.Heritrix3Wrapper;
037import org.netarchivesuite.heritrix3wrapper.JobResult;
038import org.netarchivesuite.heritrix3wrapper.StreamResult;
039import org.slf4j.Logger;
040import org.slf4j.LoggerFactory;
041
042import dk.netarkivet.harvester.datamodel.Job;
043import dk.netarkivet.harvester.harvesting.monitor.StartedJobInfo;
044
045public class Heritrix3JobMonitor implements Pageable {
046
047    /** The logger for this class. */
048    private static final Logger LOG = LoggerFactory.getLogger(Heritrix3JobMonitorThread.class);
049
050    protected NASEnvironment environment;
051
052    public boolean bActive = true;
053
054    public boolean bPull = false;
055
056    public boolean bInitialized;
057
058    public long jobId;
059
060    public Job job;
061
062    public Heritrix3Wrapper h3wrapper;
063
064    public String h3HostnamePort;
065
066    public String hostUrl;
067
068    public String jobname;
069
070    public JobResult jobResult;
071
072    public String crawlLogFilePath;
073
074    public File logFile;
075
076    public RandomAccessFile logRaf;
077
078    public File idxFile;
079
080    public RandomAccessFile idxRaf;
081
082    public long lastIndexed = 0;
083    
084    public long totalCachedLines = 0;
085
086    protected Heritrix3JobMonitor() {
087    }
088
089    public static Heritrix3JobMonitor getInstance(Long jobId, NASEnvironment environment) throws IOException {
090        Heritrix3JobMonitor jobmonitor = new Heritrix3JobMonitor();
091        jobmonitor.environment = environment;
092        jobmonitor.jobId = jobId;
093        jobmonitor.logFile = new File(environment.tempPath, "crawllog-" + jobId + ".log");
094        jobmonitor.idxFile = new File(environment.tempPath, "crawllog-" + jobId + ".idx");
095        jobmonitor.init();
096        return jobmonitor;
097    }
098
099    public synchronized void init() {
100        try {
101            if (bActive && !bInitialized) {
102                if (job == null) {
103                    job = Heritrix3JobMonitorThread.jobDAO.read(jobId);
104                }
105                if (h3wrapper == null) {
106                    StartedJobInfo startedInfo = Heritrix3JobMonitorThread.runningJobsInfoDAO.getMostRecentByJobId(jobId);
107                    if (startedInfo != null) {
108                        hostUrl = startedInfo.getHostUrl();
109                        if (hostUrl != null && hostUrl.length() > 0) {
110                            h3wrapper = Heritrix3WrapperManager.getHeritrix3Wrapper(hostUrl, environment.h3AdminName, environment.h3AdminPassword);
111                        }
112                    }
113                }
114                if (jobname == null && h3wrapper != null) {
115                    jobname = Heritrix3WrapperManager.getJobname(h3wrapper, jobId);
116                }
117                if ((jobResult == null || jobResult.job == null) && jobname != null) {
118                    jobResult = h3wrapper.job(jobname);
119                }
120                if (jobResult != null && jobResult.job != null) {
121                    crawlLogFilePath = jobResult.job.crawlLogFilePath;
122                }
123                if (crawlLogFilePath != null) {
124                    logRaf = new RandomAccessFile(logFile, "rw");
125                    idxRaf = new RandomAccessFile(idxFile, "rw");
126                    if (idxRaf.length() == 0) {
127                        idxRaf.writeLong(0);
128                    } else {
129                        idxRaf.seek(idxRaf.length() - 8);
130                        lastIndexed = idxRaf.readLong();
131                        totalCachedLines = (idxRaf.length() / 8) - 1;
132                    }
133                    idxRaf.seek(idxRaf.length());
134                    logRaf.seek(logRaf.length());
135                    bInitialized = true;
136                }
137            }
138        } catch (Throwable t) {
139        }
140    }
141
142    public synchronized void update() {
143        try {
144            if (job != null) {
145                Job tmpJob = job = Heritrix3JobMonitorThread.jobDAO.read(jobId);
146                if (tmpJob != null) {
147                    job = tmpJob;
148                }
149            }
150            if (jobResult != null && jobResult.job != null && jobname != null) {
151                JobResult tmpJobResult = h3wrapper.job(jobname);
152                if (tmpJobResult != null) {
153                    jobResult = tmpJobResult;
154                }
155            }
156        } catch (Throwable t) {
157        }
158    }
159
160    public synchronized void updateCrawlLog(byte[] tmpBuf) {
161        long pos;
162        long to;
163        int idx;
164        boolean bLoop;
165        ByteRange byteRange;
166        try {
167            if (bActive && !bInitialized) {
168                init();
169            }
170            if (bActive && bInitialized) {
171                bLoop = true;
172                while (bLoop) {
173                    idxRaf.seek(idxRaf.length());
174                    pos = logRaf.length();
175                    to = pos;
176                    if (jobResult != null && jobResult.job != null && jobResult.job.crawlLogFilePath != null) {
177                        long rangeFrom = pos;
178                        long rangeTo = pos + tmpBuf.length - 1;
179                        StreamResult anypathResult = h3wrapper.anypath(jobResult.job.crawlLogFilePath, null, null, true);
180                        if (anypathResult != null && rangeFrom < anypathResult.contentLength) {
181                            LOG.info("Crawllog length for job {}={}.", jobId, anypathResult.contentLength);
182                                if (rangeTo >= anypathResult.contentLength) {
183                                        rangeTo = anypathResult.contentLength - 1;
184                                }
185                                anypathResult = h3wrapper.anypath(jobResult.job.crawlLogFilePath, rangeFrom, rangeTo);
186                            LOG.info("Crawllog byterange download for job {}. ({}-{})", jobId, rangeFrom, rangeTo);
187                            if (anypathResult != null && anypathResult.byteRange != null && anypathResult.in != null) {
188                                byteRange = anypathResult.byteRange;
189                                if (byteRange.contentLength > 0) {
190                                    logRaf.seek(pos);
191                                    int read;
192                                    try {
193                                        while ((read = anypathResult.in.read(tmpBuf)) != -1) {
194                                            logRaf.write(tmpBuf, 0, read);
195                                            to += read;
196                                            idx = 0;
197                                            while (read > 0) {
198                                                ++pos;
199                                                --read;
200                                                if (tmpBuf[idx++] == '\n') {
201                                                    idxRaf.writeLong(pos);
202                                                    lastIndexed = pos;
203                                                    totalCachedLines++;
204                                                }
205                                            }
206                                        }
207                                    }
208                                    catch (IOException e) {
209                                        e.printStackTrace();
210                                    }
211                                    IOUtils.closeQuietly(anypathResult);
212                                    if (byteRange.contentLength == to) {
213                                        bLoop = false;
214                                    }
215                                } else {
216                                    bLoop = false;
217                                }
218                            } else {
219                                bLoop = false;
220                            }
221                        } else {
222                            bLoop = false;
223                        }
224                    } else {
225                        bLoop = false;
226                    }
227                }
228            }
229        } catch (Throwable t) {
230        }
231    }
232
233    public synchronized void cleanup(List<File> oldFilesList) {
234        try {
235            bActive = false;
236            bInitialized = false;
237            hostUrl = null;
238            h3wrapper = null;
239            jobname = null;
240            jobResult = null;
241            crawlLogFilePath = null;
242            totalCachedLines = 0;
243            IOUtils.closeQuietly(logRaf);
244            IOUtils.closeQuietly(idxRaf);
245            oldFilesList.add(logFile);
246            oldFilesList.add(idxFile);
247            Iterator<SearchResult> srIter = qSearchResultMap.values().iterator();
248            SearchResult sr;
249            while (srIter.hasNext()) {
250                sr = srIter.next();
251                oldFilesList.add(sr.srIdxFile);
252                oldFilesList.add(sr.srLogFile);
253                sr.cleanup();
254            }
255            qSearchResultMap.clear();
256        } catch (Throwable t) {
257        }
258    }
259
260    @Override
261    public synchronized long getIndexSize() {
262        return idxFile.length();
263    }
264
265    @Override
266    public long getLastIndexed() {
267        return lastIndexed;
268    }
269    
270    public long getTotalCachedLines() {
271        return totalCachedLines;
272    }
273
274    @Override
275    public synchronized byte[] readPage(long page, long itemsPerPage, boolean descending) throws IOException {
276        return StringIndexFile.readPage(idxRaf, logRaf, page, itemsPerPage, descending);
277    }
278
279    public synchronized boolean isReady() {
280        return (bActive && bInitialized);
281    }
282
283    protected Map<String, SearchResult> qSearchResultMap = new HashMap<String, SearchResult>();
284
285    protected int searchResultNr = 1;
286
287    public synchronized SearchResult getSearchResult(String q) throws IOException {
288        SearchResult searchResult = qSearchResultMap.get(q);
289        if (searchResult == null) {
290            searchResult = new SearchResult(environment, this, q, searchResultNr++);
291            qSearchResultMap.put(q, searchResult);
292        }
293        return searchResult;
294    }
295
296    /**
297     * Set the file path to the crawl log
298     *
299     * @param crawlLogFilePath File path to the crawl log
300     */
301    public void setCrawlLogFilePath(String crawlLogFilePath) {
302        this.crawlLogFilePath = crawlLogFilePath;
303    }
304}