/*
 * #%L
 * Netarchivesuite - harvester
 * %%
 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
 * the National Library of France and the Austrian National Library.
 * %%
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation, either version 2.1 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Lesser Public License for more details.
 *
 * You should have received a copy of the GNU General Lesser Public
 * License along with this program. If not, see
 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
 * #L%
 */

package dk.netarkivet.harvester.harvesting;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import dk.netarkivet.common.CommonSettings;
import dk.netarkivet.common.Constants;
import dk.netarkivet.common.exceptions.ArgumentNotValid;
import dk.netarkivet.common.exceptions.IOFailure;
import dk.netarkivet.common.utils.FileUtils;
import dk.netarkivet.common.utils.Settings;
import dk.netarkivet.common.utils.StreamUtils;
import dk.netarkivet.harvester.datamodel.HeritrixTemplate;

/**
 * This class encapsulates all the files that Heritrix gets from our system, and all files we read from Heritrix.
 */
public class HeritrixFiles {

    /** The logger. */
    private static final Logger log = LoggerFactory.getLogger(HeritrixFiles.class);

    /** The directory that crawls are performed in. */
    private final File crawlDir;
    /** The job ID this object represents files for. */
    private final Long jobID;
    /** The ID of the harvest definition this object represents files for. */
    private final Long harvestID;

    /** The prefix we put on generated ARC or WARC files. */
    private final String arcFilePrefix;

    /** The JMX password file to be used by Heritrix 1.X (null when running Heritrix 3). */
    private final File jmxPasswordFile;
    /** The JMX access file to be used by Heritrix 1.X (null when running Heritrix 3). */
    private final File jmxAccessFile;

    /** The name of the order.xml file. */
    private static final String ORDER_XML_FILENAME = "order.xml";

    /** The name of the seeds.txt file. */
    private static final String SEEDS_TXT_FILENAME = "seeds.txt";

    /** The name of the recoverBackup.gz file. */
    private static final String RECOVERBACKUP_GZ_FILENAME = "recoverBackup.gz";

    /** The deduplication index directory; null until {@link #setIndexDir(File)} is called. */
    private File indexDir;

    /** The name of the progress statistics log. */
    private static final String PROGRESS_STATISTICS_LOG_FILENAME = "progress-statistics.log";
    /** The name of the crawl log. */
    private static final String CRAWL_LOG_FILENAME = "crawl.log";
    /** The name of the stdout/stderr file from Heritrix. */
    private static final String OUTPUT_FILENAME = "heritrix.out";

    /** The version of Heritrix these files belong to. Never reassigned after construction. */
    private final Version version;

    /**
     * Create a new HeritrixFiles object for a job, implicitly for Heritrix 1.
     *
     * @param crawlDir The dir, where the crawl-files are placed. Assumes, that crawlDir exists already.
     * @param harvestJob The harvestjob behind this instance of HeritrixFiles
     * @param jmxPasswordFile The jmx password file to be used by Heritrix 1. The existence of this file is checked
     * another place.
     * @param jmxAccessFile The JMX access file to be used by Heritrix 1. The existence of this file is checked another
     * place.
     * @throws ArgumentNotValid if any argument is null.
     */
    public HeritrixFiles(File crawlDir, JobInfo harvestJob, File jmxPasswordFile, File jmxAccessFile) {
        ArgumentNotValid.checkNotNull(crawlDir, "crawlDir");
        ArgumentNotValid.checkNotNull(harvestJob, "harvestJob");
        ArgumentNotValid.checkNotNull(jmxPasswordFile, "jmxPasswordFile");
        ArgumentNotValid.checkNotNull(jmxAccessFile, "jmxAccessFile");
        this.crawlDir = crawlDir;
        this.jobID = harvestJob.getJobID();
        this.harvestID = harvestJob.getOrigHarvestDefinitionID();
        this.arcFilePrefix = harvestJob.getHarvestFilenamePrefix();
        this.jmxPasswordFile = jmxPasswordFile;
        this.jmxAccessFile = jmxAccessFile;
        this.version = Version.HERITRIX_1;
    }

    /**
     * Create a new HeritrixFiles object for a job for a specific Heritrix version.
     * <p>
     * Note: the JMX files are deliberately NOT null-checked here, because Heritrix 3 does not use them
     * (see {@link #getH3HeritrixFiles(File, JobInfo)}, which passes null for both).
     *
     * @param crawlDir The dir, where the crawl-files are placed. Assumes, that crawlDir exists already.
     * @param harvestJob The harvestjob behind this instance of HeritrixFiles
     * @param jmxPasswordFile The jmx password file to be used by Heritrix 1 (may be null for Heritrix 3).
     * @param jmxAccessFile The JMX access file to be used by Heritrix 1 (may be null for Heritrix 3).
     * @param version The version of Heritrix in use.
     * @throws ArgumentNotValid if crawlDir or harvestJob is null.
     */
    public HeritrixFiles(File crawlDir, JobInfo harvestJob, File jmxPasswordFile, File jmxAccessFile,
            Version version) {
        ArgumentNotValid.checkNotNull(crawlDir, "crawlDir");
        ArgumentNotValid.checkNotNull(harvestJob, "harvestJob");
        this.crawlDir = crawlDir;
        this.jobID = harvestJob.getJobID();
        this.harvestID = harvestJob.getOrigHarvestDefinitionID();
        this.arcFilePrefix = harvestJob.getHarvestFilenamePrefix();
        this.jmxPasswordFile = jmxPasswordFile;
        this.jmxAccessFile = jmxAccessFile;
        this.version = version;
    }

    /**
     * Factory for a Heritrix 1 instance, reading the JMX password and access files from the current settings.
     *
     * @param crawlDir The dir, where the crawl-files are placed.
     * @param harvestJob The harvestjob behind this instance of HeritrixFiles.
     * @return a HeritrixFiles for Heritrix 1 with JMX files taken from settings.
     */
    public static HeritrixFiles getH1HeritrixFilesWithDefaultJmxFiles(File crawlDir, JobInfo harvestJob) {
        return new HeritrixFiles(crawlDir, harvestJob,
                new File(Settings.get(CommonSettings.JMX_PASSWORD_FILE)),
                new File(Settings.get(CommonSettings.JMX_ACCESS_FILE)), Version.HERITRIX_1);
    }

    /**
     * Factory for a Heritrix 3 instance. Heritrix 3 does not use the JMX password/access files, so both are null.
     *
     * @param crawlDir The dir, where the crawl-files are placed.
     * @param harvestJob The harvestjob behind this instance of HeritrixFiles.
     * @return a HeritrixFiles for Heritrix 3.
     */
    public static HeritrixFiles getH3HeritrixFiles(File crawlDir, JobInfo harvestJob) {
        return new HeritrixFiles(crawlDir, harvestJob, null, null, Version.HERITRIX_3);
    }

    /**
     * Returns the directory that crawls are performed inside.
     *
     * @return A directory (that is created as part of harvest setup) that all of Heritrix' files live in.
     */
    public File getCrawlDir() {
        return crawlDir;
    }

    /**
     * Returns the prefix used to generate Archive files (ARC or WARC).
     *
     * @return The archive file prefix, currently jobID-harvestID.
     */
    public String getArchiveFilePrefix() {
        return this.arcFilePrefix;
    }

    /**
     * Returns the order.xml file object.
     *
     * @return A file object for the order.xml file (which may not have been written yet).
     */
    public File getOrderXmlFile() {
        return new File(crawlDir, ORDER_XML_FILENAME);
    }

    /**
     * Returns the seeds.txt file object.
     *
     * @return A file object for the seeds.txt file (which may not have been written yet).
     */
    public File getSeedsTxtFile() {
        return new File(crawlDir, SEEDS_TXT_FILENAME);
    }

    /**
     * Returns the recoverbackup file object.
     *
     * @return A file object for the recoverBackup.gz file (which may or may not exist).
     */
    public File getRecoverBackupGzFile() {
        return new File(crawlDir, RECOVERBACKUP_GZ_FILENAME);
    }

    /**
     * Try to write the recover-backup file.
     *
     * @param recoverlog The recoverlog in the form of an InputStream
     * @return true, if operation succeeds, otherwise false
     */
    public boolean writeRecoverBackupfile(InputStream recoverlog) {
        // try-with-resources guarantees the stream is closed even if the copy fails.
        try (OutputStream os = new FileOutputStream(getRecoverBackupGzFile())) {
            StreamUtils.copyInputStreamToOutputStream(recoverlog, os);
        } catch (IOException e) {
            log.debug("The writing of the recoverlog failed: ", e);
            return false;
        }
        return true;
    }

    /**
     * Writes the given content to the seeds.txt file.
     *
     * @param seeds The intended content of seeds.txt
     * @throws ArgumentNotValid if seeds is null or empty
     */
    public void writeSeedsTxt(String seeds) {
        ArgumentNotValid.checkNotNullOrEmpty(seeds, "String seeds");
        log.debug("Writing seeds to disk as file: {}", getSeedsTxtFile().getAbsolutePath());
        // NOTE(review): seeds.getBytes() uses the platform default charset; if non-ASCII seeds are
        // possible, an explicit charset (e.g. UTF-8) should be confirmed against what Heritrix expects.
        FileUtils.writeBinaryFile(getSeedsTxtFile(), seeds.getBytes());
    }

    /**
     * Writes the given order.xml content to the order.xml file.
     *
     * @param doc The intended content of order.xml
     * @throws ArgumentNotValid if doc is null or empty
     */
    public void writeOrderXml(HeritrixTemplate doc) {
        ArgumentNotValid.checkNotNull(doc, "Document doc");
        ArgumentNotValid.checkTrue(doc.hasContent(), "HeritrixTemplate document must not be empty");
        log.debug("Writing order-file to disk as file: {}", getOrderXmlFile().getAbsolutePath());
        doc.writeToFile(getOrderXmlFile());
    }

    /**
     * Get the file that contains output from Heritrix on stdout/stderr.
     *
     * @return File that contains output from Heritrix on stdout/stderr.
     */
    public File getHeritrixOutput() {
        return new File(crawlDir, OUTPUT_FILENAME);
    }

    /**
     * Set the deduplicate index dir.
     *
     * @param indexDir the cache dir containing unzipped files
     * @throws ArgumentNotValid if indexDir is not a directory or is null
     */
    public void setIndexDir(File indexDir) {
        ArgumentNotValid.checkNotNull(indexDir, "File indexDir");
        ArgumentNotValid.checkTrue(indexDir.isDirectory(), "indexDir '" + indexDir + "' should be a directory");
        this.indexDir = indexDir;
        log.debug("Setting deduplication index dir '{}'", indexDir);
    }

    /**
     * Returns the index directory, if one has been set.
     *
     * @return the index directory or null if no index has been set.
     */
    public File getIndexDir() {
        return indexDir;
    }

    /**
     * Return a list of disposable heritrix-files. Currently the list consists of the File "state.job", and the
     * directories: "checkpoints", "state", "scratch".
     *
     * @return a list of disposable heritrix-files.
     */
    public File[] getDisposableFiles() {
        return new File[] {new File(crawlDir, "state.job"), new File(crawlDir, "state"),
                new File(crawlDir, "checkpoints"), new File(crawlDir, "scratch")};
    }

    /**
     * Retrieve the crawlLog as a File object.
     *
     * @return the crawlLog as a File object.
     */
    public File getCrawlLog() {
        File logDir = new File(crawlDir, "logs");
        return new File(logDir, CRAWL_LOG_FILENAME);
    }

    /**
     * Retrieve the progress statistics log as a File object.
     *
     * @return the progress statistics log as a File object.
     */
    public File getProgressStatisticsLog() {
        File logDir = new File(crawlDir, "logs");
        return new File(logDir, PROGRESS_STATISTICS_LOG_FILENAME);
    }

    /**
     * Get the job ID.
     *
     * @return Job ID this heritrix files object is for.
     */
    public Long getJobID() {
        return jobID;
    }

    /**
     * Get the harvest ID.
     *
     * @return Harvest ID this heritrix files object is for.
     */
    public Long getHarvestID() {
        return harvestID;
    }

    /**
     * Delete statefile etc. and move crawl directory to oldjobs.
     *
     * @param oldJobsDir Directory to move the rest of any existing files to.
     */
    public void cleanUpAfterHarvest(File oldJobsDir) {
        // delete disposable files
        for (File disposable : getDisposableFiles()) {
            if (disposable.exists()) {
                try {
                    FileUtils.removeRecursively(disposable);
                } catch (IOFailure e) {
                    // Log harmless trouble
                    log.debug("Couldn't delete leftover file '{}'", disposable.getAbsolutePath(), e);
                }
            }
        }
        // move the rest to oldjobs
        FileUtils.createDir(oldJobsDir);
        File destDir = new File(oldJobsDir, crawlDir.getName());
        boolean success = crawlDir.renameTo(destDir);
        if (!success) {
            log.warn("Failed to rename jobdir '{}' to '{}'", crawlDir, destDir);
        }
    }

    /**
     * Helper method to delete the crawl.log and progress statistics log. Will log errors but otherwise continue.
     */
    public void deleteFinalLogs() {
        try {
            FileUtils.remove(getCrawlLog());
        } catch (IOFailure e) {
            // Log harmless trouble
            log.debug("Couldn't delete crawl log file.", e);
        }
        try {
            FileUtils.remove(getProgressStatisticsLog());
        } catch (IOFailure e) {
            // Log harmless trouble
            log.debug("Couldn't delete progress statistics log file.", e);
        }
    }

    /**
     * Return the directory, where Heritrix writes its arcfiles.
     *
     * @return the directory, where Heritrix writes its arcfiles.
     */
    public File getArcsDir() {
        return new File(crawlDir, Constants.ARCDIRECTORY_NAME);
    }

    /**
     * Return the directory, where Heritrix writes its warcfiles.
     *
     * @return the directory, where Heritrix writes its warcfiles.
     */
    public File getWarcsDir() {
        return new File(crawlDir, Constants.WARCDIRECTORY_NAME);
    }

    /**
     * Method for retrieving the jmxremote.password file.
     *
     * @return the jmxPasswordFile.
     */
    public File getJmxPasswordFile() {
        return jmxPasswordFile;
    }

    /**
     * Method for retrieving the jmxremote.access file.
     *
     * @return the jmxAccessFile.
     */
    public File getJmxAccessFile() {
        return jmxAccessFile;
    }

    /** The supported Heritrix versions. (Nested enums are implicitly static.) */
    public enum Version {
        HERITRIX_1,
        HERITRIX_3;
    }
}