/*
 * #%L
 * Netarchivesuite - heritrix 3 monitor
 * %%
 * Copyright (C) 2005 - 2018 The Royal Danish Library,
 * the National Library of France and the Austrian National Library.
 * %%
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation, either version 2.1 of the
 * License, or (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Lesser Public License for more details.
 * 
 * You should have received a copy of the GNU General Lesser Public
 * License along with this program. If not, see
 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
 * #L%
 */

package dk.netarkivet.heritrix3.monitor;

import java.io.File;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

import org.apache.commons.io.IOUtils;
import org.netarchivesuite.heritrix3wrapper.ByteRange;
import org.netarchivesuite.heritrix3wrapper.Heritrix3Wrapper;
import org.netarchivesuite.heritrix3wrapper.JobResult;
import org.netarchivesuite.heritrix3wrapper.StreamResult;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import dk.netarkivet.harvester.datamodel.Job;
import dk.netarkivet.harvester.harvesting.monitor.StartedJobInfo;

/**
 * Monitor for a single running Heritrix 3 job.
 *
 * <p>Lazily resolves the H3 REST wrapper and job name for the job, then incrementally
 * downloads the remote crawl log into a local cache file ({@code crawllog-<jobId>.log})
 * while maintaining a companion index file ({@code crawllog-<jobId>.idx}) of longs,
 * one per cached crawl-log line, so pages of the log can be served via {@link Pageable}.
 *
 * <p>All state-mutating methods are {@code synchronized} on this instance; the public
 * fields are nevertheless exposed directly, so external readers should treat them as
 * read-mostly snapshots.
 */
public class Heritrix3JobMonitor implements Pageable {

    /** The logger for this class.
     *  Fixed: previously obtained for {@code Heritrix3JobMonitorThread.class}, which
     *  mis-attributed this class's log events to the thread class. */
    private static final Logger LOG = LoggerFactory.getLogger(Heritrix3JobMonitor.class);

    /** Shared environment (temp path, H3 admin credentials, etc.). */
    protected NASEnvironment environment;

    /** False once {@link #cleanup(List)} has run; gates all further work. */
    public boolean bActive = true;

    /** Flag toggled externally; semantics not visible here — presumably
     *  "pull updates on next cycle". NOTE(review): confirm against callers. */
    public boolean bPull = false;

    /** True once {@link #init()} has resolved the job and opened the cache files. */
    public boolean bInitialized;

    /** NetarchiveSuite job id this monitor tracks. */
    public long jobId;

    /** Job definition read from {@code jobDAO}; may be null until {@link #init()} succeeds. */
    public Job job;

    /** REST wrapper for the Heritrix 3 instance running the job. */
    public Heritrix3Wrapper h3wrapper;

    /** "hostname:port" of the H3 instance (not read in this class; kept for external users). */
    public String h3HostnamePort;

    /** Host URL reported by the most recent {@link StartedJobInfo}. */
    public String hostUrl;

    /** H3-side job name resolved from the job id. */
    public String jobname;

    /** Most recent H3 job status; refreshed by {@link #update()}. */
    public JobResult jobResult;

    /** Remote path of the crawl log, as reported by H3. */
    public String crawlLogFilePath;

    /** Local cache of the crawl log. */
    public File logFile;

    /** Open handle onto {@link #logFile}; positioned at EOF while initialized. */
    public RandomAccessFile logRaf;

    /** Local line index: sequence of longs, each the byte offset just past a cached '\n'. */
    public File idxFile;

    /** Open handle onto {@link #idxFile}; positioned at EOF while initialized. */
    public RandomAccessFile idxRaf;

    /** Byte offset in the crawl log up to which lines have been indexed. */
    public long lastIndexed = 0;

    /** Number of complete crawl-log lines currently cached locally. */
    public long totalCachedLines = 0;

    /** Use {@link #getInstance(Long, NASEnvironment)} instead. */
    protected Heritrix3JobMonitor() {
    }

    /**
     * Create and initialize a monitor for the given job.
     *
     * @param jobId id of the running harvest job
     * @param environment shared environment supplying temp path and H3 credentials
     * @return a new monitor; {@link #isReady()} tells whether init fully succeeded
     * @throws IOException declared for compatibility; {@link #init()} itself
     *         suppresses (now logs) its errors
     */
    public static Heritrix3JobMonitor getInstance(Long jobId, NASEnvironment environment) throws IOException {
        Heritrix3JobMonitor jobmonitor = new Heritrix3JobMonitor();
        jobmonitor.environment = environment;
        jobmonitor.jobId = jobId;
        jobmonitor.logFile = new File(environment.tempPath, "crawllog-" + jobId + ".log");
        jobmonitor.idxFile = new File(environment.tempPath, "crawllog-" + jobId + ".idx");
        jobmonitor.init();
        return jobmonitor;
    }

    /**
     * Idempotent, step-wise initialization: each call fills in whatever is still
     * missing (job, wrapper, job name, job result, cache files) and sets
     * {@code bInitialized} once the crawl-log path is known and both cache files
     * are open. Safe to call repeatedly until it succeeds.
     */
    public synchronized void init() {
        try {
            if (bActive && !bInitialized) {
                if (job == null) {
                    job = Heritrix3JobMonitorThread.jobDAO.read(jobId);
                }
                if (h3wrapper == null) {
                    StartedJobInfo startedInfo = Heritrix3JobMonitorThread.runningJobsInfoDAO.getMostRecentByJobId(jobId);
                    if (startedInfo != null) {
                        hostUrl = startedInfo.getHostUrl();
                        if (hostUrl != null && hostUrl.length() > 0) {
                            h3wrapper = Heritrix3WrapperManager.getHeritrix3Wrapper(hostUrl, environment.h3AdminName, environment.h3AdminPassword);
                        }
                    }
                }
                if (jobname == null && h3wrapper != null) {
                    jobname = Heritrix3WrapperManager.getJobname(h3wrapper, jobId);
                }
                if ((jobResult == null || jobResult.job == null) && jobname != null) {
                    jobResult = h3wrapper.job(jobname);
                }
                if (jobResult != null && jobResult.job != null) {
                    crawlLogFilePath = jobResult.job.crawlLogFilePath;
                }
                if (crawlLogFilePath != null) {
                    logRaf = new RandomAccessFile(logFile, "rw");
                    idxRaf = new RandomAccessFile(idxFile, "rw");
                    if (idxRaf.length() == 0) {
                        // Fresh index: write a sentinel offset 0 for "start of log".
                        idxRaf.writeLong(0);
                    } else {
                        // Resume from an existing cache: last long is the byte offset
                        // of the last indexed line end; entry count excludes the sentinel.
                        idxRaf.seek(idxRaf.length() - 8);
                        lastIndexed = idxRaf.readLong();
                        totalCachedLines = (idxRaf.length() / 8) - 1;
                    }
                    idxRaf.seek(idxRaf.length());
                    logRaf.seek(logRaf.length());
                    bInitialized = true;
                }
            }
        } catch (Throwable t) {
            // Best effort by design: initialization is retried on later calls.
            // Previously swallowed silently; now at least logged.
            LOG.warn("Error initializing monitor for job {}.", jobId, t);
        }
    }

    /**
     * Refresh the cached {@link Job} and {@link JobResult}, keeping the previous
     * value whenever a refresh returns null.
     */
    public synchronized void update() {
        try {
            if (job != null) {
                // Fixed: the original assigned the DAO result straight to 'job'
                // ("tmpJob = job = ..."), so a null read clobbered 'job' and the
                // null-guard below never protected anything.
                Job tmpJob = Heritrix3JobMonitorThread.jobDAO.read(jobId);
                if (tmpJob != null) {
                    job = tmpJob;
                }
            }
            if (jobResult != null && jobResult.job != null && jobname != null) {
                JobResult tmpJobResult = h3wrapper.job(jobname);
                if (tmpJobResult != null) {
                    jobResult = tmpJobResult;
                }
            }
        } catch (Throwable t) {
            // Best effort: stale state is acceptable; next cycle retries.
            LOG.warn("Error updating state for job {}.", jobId, t);
        }
    }

    /**
     * Pull any new bytes of the remote crawl log into the local cache, appending
     * to {@link #logRaf} and recording the end offset of every completed line in
     * {@link #idxRaf}.
     *
     * @param tmpBuf reusable transfer buffer; its length is also the byte-range
     *        chunk size requested per round trip
     */
    public synchronized void updateCrawlLog(byte[] tmpBuf) {
        long pos;
        long to;
        int idx;
        boolean bLoop;
        ByteRange byteRange;
        try {
            if (bActive && !bInitialized) {
                init();
            }
            if (bActive && bInitialized) {
                bLoop = true;
                while (bLoop) {
                    idxRaf.seek(idxRaf.length());
                    pos = logRaf.length();
                    to = pos;
                    if (jobResult != null && jobResult.job != null && jobResult.job.crawlLogFilePath != null) {
                        long rangeFrom = pos;
                        long rangeTo = pos + tmpBuf.length - 1;
                        // HEAD-style probe for the current remote crawl-log length.
                        StreamResult anypathResult = h3wrapper.anypath(jobResult.job.crawlLogFilePath, null, null, true);
                        if (anypathResult != null && rangeFrom < anypathResult.contentLength) {
                            LOG.info("Crawllog length for job {}={}.", jobId, anypathResult.contentLength);
                            if (rangeTo >= anypathResult.contentLength) {
                                rangeTo = anypathResult.contentLength - 1;
                            }
                            anypathResult = h3wrapper.anypath(jobResult.job.crawlLogFilePath, rangeFrom, rangeTo);
                            LOG.info("Crawllog byterange download for job {}. ({}-{})", jobId, rangeFrom, rangeTo);
                            if (anypathResult != null && anypathResult.byteRange != null && anypathResult.in != null) {
                                byteRange = anypathResult.byteRange;
                                if (byteRange.contentLength > 0) {
                                    logRaf.seek(pos);
                                    int read;
                                    try {
                                        while ((read = anypathResult.in.read(tmpBuf)) != -1) {
                                            logRaf.write(tmpBuf, 0, read);
                                            to += read;
                                            idx = 0;
                                            // Index the end offset of each completed line.
                                            while (read > 0) {
                                                ++pos;
                                                --read;
                                                if (tmpBuf[idx++] == '\n') {
                                                    idxRaf.writeLong(pos);
                                                    lastIndexed = pos;
                                                    totalCachedLines++;
                                                }
                                            }
                                        }
                                    }
                                    catch (IOException e) {
                                        // Partial download; loop decides below whether to continue.
                                        LOG.error("I/O error downloading crawllog byterange for job {}.", jobId, e);
                                    }
                                    IOUtils.closeQuietly(anypathResult);
                                    // NOTE(review): compares the range's contentLength against the
                                    // absolute file position 'to'; looks like it only terminates on
                                    // the first chunk starting at offset 0 — confirm ByteRange
                                    // semantics before changing. Preserved as-is.
                                    if (byteRange.contentLength == to) {
                                        bLoop = false;
                                    }
                                } else {
                                    bLoop = false;
                                }
                            } else {
                                bLoop = false;
                            }
                        } else {
                            bLoop = false;
                        }
                    } else {
                        bLoop = false;
                    }
                }
            }
        } catch (Throwable t) {
            // Best effort: cache stays at its last consistent state; retried next cycle.
            LOG.warn("Error updating crawllog cache for job {}.", jobId, t);
        }
    }

    /**
     * Deactivate the monitor, close the cache files, hand all cache/search files
     * to the caller for deferred deletion, and clear the search-result cache.
     *
     * @param oldFilesList receives the files that should eventually be deleted
     */
    public synchronized void cleanup(List<File> oldFilesList) {
        try {
            bActive = false;
            bInitialized = false;
            hostUrl = null;
            h3wrapper = null;
            jobname = null;
            jobResult = null;
            crawlLogFilePath = null;
            totalCachedLines = 0;
            IOUtils.closeQuietly(logRaf);
            IOUtils.closeQuietly(idxRaf);
            oldFilesList.add(logFile);
            oldFilesList.add(idxFile);
            Iterator<SearchResult> srIter = qSearchResultMap.values().iterator();
            SearchResult sr;
            while (srIter.hasNext()) {
                sr = srIter.next();
                oldFilesList.add(sr.srIdxFile);
                oldFilesList.add(sr.srLogFile);
                sr.cleanup();
            }
            qSearchResultMap.clear();
        } catch (Throwable t) {
            // Best effort teardown; never propagate from cleanup.
            LOG.warn("Error cleaning up monitor for job {}.", jobId, t);
        }
    }

    /** @return current size in bytes of the index file (8 bytes per entry incl. sentinel). */
    @Override
    public synchronized long getIndexSize() {
        return idxFile.length();
    }

    /** @return byte offset in the crawl log up to which lines have been indexed. */
    @Override
    public long getLastIndexed() {
        return lastIndexed;
    }

    /** @return number of complete crawl-log lines currently cached locally. */
    public long getTotalCachedLines() {
        return totalCachedLines;
    }

    /**
     * Read one page of cached crawl-log lines via the index.
     *
     * @param page 1-based(?) page number — delegated to {@link StringIndexFile}
     * @param itemsPerPage lines per page
     * @param descending newest-first when true
     * @return raw bytes of the requested lines
     * @throws IOException on cache-file read errors
     */
    @Override
    public synchronized byte[] readPage(long page, long itemsPerPage, boolean descending) throws IOException {
        return StringIndexFile.readPage(idxRaf, logRaf, page, itemsPerPage, descending);
    }

    /** @return true when the monitor is active and fully initialized. */
    public synchronized boolean isReady() {
        return (bActive && bInitialized);
    }

    /** Cached search results keyed by query string; cleared by {@link #cleanup(List)}. */
    protected Map<String, SearchResult> qSearchResultMap = new HashMap<String, SearchResult>();

    /** Monotonic counter used to number new search results. */
    protected int searchResultNr = 1;

    /**
     * Return the cached search result for a query, creating (and caching) it on
     * first use.
     *
     * @param q query string
     * @return cached or newly created search result
     * @throws IOException if creating the search result's backing files fails
     */
    public synchronized SearchResult getSearchResult(String q) throws IOException {
        SearchResult searchResult = qSearchResultMap.get(q);
        if (searchResult == null) {
            searchResult = new SearchResult(environment, this, q, searchResultNr++);
            qSearchResultMap.put(q, searchResult);
        }
        return searchResult;
    }

    /**
     * Set the file path to the crawl log.
     *
     * <p>NOTE(review): unlike the other mutators this is not {@code synchronized};
     * kept as-is to preserve the original locking behavior.
     *
     * @param crawlLogFilePath File path to the crawl log
     */
    public void setCrawlLogFilePath(String crawlLogFilePath) {
        this.crawlLogFilePath = crawlLogFilePath;
    }
}