/*
 * #%L
 * Netarchivesuite - harvester
 * %%
 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
 *             the National Library of France and the Austrian National Library.
 * %%
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation, either version 2.1 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Lesser Public License for more details.
 *
 * You should have received a copy of the GNU General Lesser Public
 * License along with this program.  If not, see
 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
 * #L%
 */
package dk.netarkivet.harvester.heritrix3;

import java.io.File;
import java.nio.charset.StandardCharsets;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import dk.netarkivet.common.exceptions.ArgumentNotValid;
import dk.netarkivet.common.exceptions.IOFailure;
import dk.netarkivet.common.exceptions.UnknownID;
import dk.netarkivet.common.utils.FileUtils;
import dk.netarkivet.common.utils.Settings;
import dk.netarkivet.harvester.HarvesterSettings;
import dk.netarkivet.harvester.datamodel.HeritrixTemplate;
import dk.netarkivet.harvester.datamodel.Job;
import dk.netarkivet.harvester.harvesting.PersistentJobData;

/**
 * This class encapsulates the information generated by Heritrix3 or delivered to Heritrix3 before a crawl.
 * <p>
 * Instances are created through one of the {@code getH3HeritrixFiles} factory methods, which derive all
 * file locations from the crawl directory and the harvester settings.
 */
public class Heritrix3Files {

    /** The logger for this class. */
    private static final Logger LOG = LoggerFactory.getLogger(Heritrix3Files.class);

    /** Name of the directory (relative to the crawl directory) used as the Heritrix3 base directory. */
    private static final String HERITRIX_UNPACKDIR = "heritrix3/";

    /** The crawl directory all other locations are derived from. */
    private File crawlDir;
    /** ID of the harvest definition the job originates from. */
    private Long harvestID;
    /** ID of the job. */
    private Long jobID;
    /** The file the order template was last written to; null until {@link #writeOrderXml} has been called. */
    private File orderXML;
    /** The index directory; null until {@link #setIndexDir} has been called. */
    private File indexDir;
    /** Prefix used for the archive files. */
    private String archiveFilePrefix;
    /** The Heritrix3 bundle as given by the settings. */
    private File h3ZipBall;
    /** The Heritrix3 certificate file; null if the setting is not defined. */
    private File h3CerticateFile;
    /** The Heritrix3 base directory inside the crawl directory. */
    private File h3BaseDir;
    /** The Heritrix3 job directory: {@code <h3BaseDir>/jobs/<jobName>}. */
    private File h3JobDir;
    /** The job name, taken from the name of the crawl directory. */
    private String jobName;
    /** The {@code logs} directory inside the Heritrix3 job directory. */
    private File h3LogDir;
    /** The {@code seeds.txt} file inside the crawl directory. */
    private File seedsFile;
    /** The {@code crawler-beans.cxml} file inside the crawl directory. */
    private File orderFile;

    /** Private constructor; use the static factory methods instead. */
    private Heritrix3Files() {
    }

    /**
     * Create a Heritrix3Files object from a crawl directory and persisted job data.
     *
     * @param crawldir the crawl directory
     * @param harvestInfo the persisted job data supplying job ID, harvest ID and archive file prefix
     * @return a fully initialised Heritrix3Files object
     * @throws ArgumentNotValid if crawldir or harvestInfo is null
     * @throws IOFailure if the configured Heritrix3 bundle or certificate is not a proper file
     */
    public static Heritrix3Files getH3HeritrixFiles(File crawldir, PersistentJobData harvestInfo) {
        ArgumentNotValid.checkNotNull(crawldir, "File crawldir");
        ArgumentNotValid.checkNotNull(harvestInfo, "PersistentJobData harvestInfo");
        return create(crawldir, harvestInfo.getJobID(), harvestInfo.getOrigHarvestDefinitionID(),
                harvestInfo.getHarvestFilenamePrefix());
    }

    /**
     * Create a Heritrix3Files object from a crawl directory and a job.
     *
     * @param crawldir the crawl directory
     * @param job the job supplying job ID, harvest ID and archive file prefix
     * @return a fully initialised Heritrix3Files object
     * @throws ArgumentNotValid if crawldir or job is null
     * @throws IOFailure if the configured Heritrix3 bundle or certificate is not a proper file
     */
    public static Heritrix3Files getH3HeritrixFiles(File crawldir, Job job) {
        ArgumentNotValid.checkNotNull(crawldir, "File crawldir");
        ArgumentNotValid.checkNotNull(job, "Job job");
        return create(crawldir, job.getJobID(), job.getOrigHarvestDefinitionID(), job.getHarvestFilenamePrefix());
    }

    /**
     * Shared initialisation for both factory methods, so they cannot drift apart.
     *
     * @param crawldir the crawl directory (not null)
     * @param jobId the job ID
     * @param harvestId the ID of the originating harvest definition
     * @param archivePrefix the archive file prefix
     * @return the initialised object
     */
    private static Heritrix3Files create(File crawldir, Long jobId, Long harvestId, String archivePrefix) {
        Heritrix3Files files = new Heritrix3Files();
        files.setCrawldir(crawldir);
        files.setJobId(jobId);
        files.setHarvestID(harvestId);
        files.setArchivePrefix(archivePrefix);
        files.setHeritrixZip();
        files.setCertificateFile();
        // The job directory is derived from the base directory, so the base directory must be set first.
        files.setHeritrixBaseDir();
        files.setHeritrixJobDir();
        return files;
    }

    private void setHarvestID(Long origHarvestDefinitionID) {
        this.harvestID = origHarvestDefinitionID;
    }

    /** Derive job name, job directory and log directory from the crawl directory and the base directory. */
    private void setHeritrixJobDir() {
        jobName = crawlDir.getName();
        h3JobDir = new File(h3BaseDir, "jobs/" + jobName);
        h3LogDir = new File(h3JobDir, "logs");
    }

    private void setHeritrixBaseDir() {
        h3BaseDir = new File(crawlDir, HERITRIX_UNPACKDIR);
    }

    /**
     * Read the location of the Heritrix3 bundle from the settings.
     *
     * @throws IOFailure if the configured path is not an existing regular file
     */
    private void setHeritrixZip() {
        h3ZipBall = Settings.getFile(HarvesterSettings.HERITRIX3_BUNDLE);
        if (!h3ZipBall.isFile()) {
            throw new IOFailure("The path to the heritrix3 zipfile '"
                    + h3ZipBall.getAbsolutePath() + "' does not represent a proper file");
        }
    }

    private void setArchivePrefix(String harvestFilenamePrefix) {
        this.archiveFilePrefix = harvestFilenamePrefix;
    }

    private void setJobId(Long jobID) {
        this.jobID = jobID;
    }

    /** Store the crawl directory and derive the seeds and order file locations inside it. */
    private void setCrawldir(File crawldir) {
        this.crawlDir = crawldir;
        this.seedsFile = new File(crawldir, "seeds.txt");
        this.orderFile = new File(crawldir, "crawler-beans.cxml");
    }

    /**
     * Read the location of the Heritrix3 certificate from the settings. If the setting is not defined, the
     * certificate file is left as null.
     *
     * @throws IOFailure if a certificate is configured but is not an existing regular file
     */
    private void setCertificateFile() {
        try {
            h3CerticateFile = Settings.getFile(HarvesterSettings.HERITRIX3_CERTIFICATE);
        } catch (UnknownID unknownID) {
            LOG.debug("No heritrix3 certificate defined in settings, using default");
            return;
        }
        if (h3CerticateFile != null && !h3CerticateFile.isFile()) {
            throw new IOFailure("The path to the heritrix3 certificate '"
                    + h3CerticateFile.getAbsolutePath() + "' does not represent a proper file");
        }
    }

    /** @return the crawl directory */
    public File getCrawlDir() {
        return this.crawlDir;
    }

    /**
     * Write the seed list to the {@code seeds.txt} file in the crawl directory.
     *
     * @param seedListAsString the seeds to write
     * @throws ArgumentNotValid if seedListAsString is null or empty
     */
    public void writeSeedsTxt(String seedListAsString) {
        ArgumentNotValid.checkNotNullOrEmpty(seedListAsString, "String seedListAsString");
        LOG.debug("Writing seeds to disk as file: {}", seedsFile.getAbsolutePath());
        // Encode explicitly as UTF-8; the platform default charset would corrupt non-ASCII seeds
        // on hosts where it is not UTF-8.
        FileUtils.writeBinaryFile(seedsFile, seedListAsString.getBytes(StandardCharsets.UTF_8));
    }

    /** @return the {@code seeds.txt} file located in the crawl directory */
    public File getSeedsFile() {
        return this.seedsFile;
    }

    /** @return the {@code crawler-beans.cxml} file located in the crawl directory */
    public File getOrderFile() {
        return this.orderFile;
    }

    /**
     * Set the index directory.
     *
     * @param indexDir an existing directory
     * @throws ArgumentNotValid if indexDir is not an existing directory
     */
    public void setIndexDir(File indexDir) {
        ArgumentNotValid.checkExistsDirectory(indexDir, "File indexDir");
        this.indexDir = indexDir;
    }

    /**
     * Write the given template to the order file ({@code crawler-beans.cxml} in the crawl directory) and
     * remember that file as the one returned by {@link #getOrderXmlFile()}.
     *
     * @param orderXMLdoc the template to write
     * @throws ArgumentNotValid if orderXMLdoc is null
     */
    public void writeOrderXml(HeritrixTemplate orderXMLdoc) {
        ArgumentNotValid.checkNotNull(orderXMLdoc, "HeritrixTemplate orderXMLdoc");
        File destination = this.orderFile;
        orderXMLdoc.writeToFile(destination);
        this.orderXML = destination;
    }

    /** @return the {@code progress-statistics.log} file in the Heritrix3 log directory */
    public File getProgressStatisticsLog() {
        return new File(h3LogDir, "progress-statistics.log");
    }

    /** @return the job ID */
    public Long getJobID() {
        return this.jobID;
    }

    /** @return the file written by {@link #writeOrderXml}, or null if it has not been called yet */
    public File getOrderXmlFile() {
        return this.orderXML;
    }

    /**
     * @return the {@code seeds.txt} file inside the Heritrix3 job directory (not the one in the crawl
     *         directory, which is returned by {@link #getSeedsFile()})
     */
    public File getSeedsTxtFile() {
        return new File(h3JobDir, "seeds.txt");
    }

    /** @return the ID of the originating harvest definition */
    public Long getHarvestID() {
        return this.harvestID;
    }

    /** @return the archive file prefix */
    public String getArchiveFilePrefix() {
        return this.archiveFilePrefix;
    }

    /** @return the index directory, or null if {@link #setIndexDir} has not been called */
    public File getIndexDir() {
        return this.indexDir;
    }

    /** @return the {@code crawl.log} file in the Heritrix3 log directory */
    public File getCrawlLog() {
        return new File(h3LogDir, "crawl.log");
    }

    /** @return the Heritrix3 bundle file given by the settings */
    public File getHeritrixZip() {
        return this.h3ZipBall;
    }

    /** @return the Heritrix3 certificate file, or null if none is defined in the settings */
    public File getCertificateFile() {
        return h3CerticateFile;
    }

    /** @return the {@code heritrix_out.log} file in the crawl directory */
    public File getHeritrixOutput() {
        return new File(crawlDir, "heritrix_out.log");
    }

    /** @return the {@code heritrix3_err.log} file in the crawl directory */
    public File getHeritrixStderrLog() {
        return new File(crawlDir, "heritrix3_err.log");
    }

    /** @return the {@code heritrix3_out.log} file in the crawl directory */
    public File getHeritrixStdoutLog() {
        return new File(crawlDir, "heritrix3_out.log");
    }

    /** @return the Heritrix3 job directory */
    public File getHeritrixJobDir() {
        return h3JobDir;
    }

    /** @return the Heritrix3 base directory */
    public File getHeritrixBaseDir() {
        return h3BaseDir;
    }

    /** @return the job name, i.e. the name of the crawl directory */
    public String getJobname() {
        return this.jobName;
    }

    /**
     * Delete the crawl log and the progress statistics log. Failure to delete either file is only logged.
     */
    public void deleteFinalLogs() {
        try {
            FileUtils.remove(getCrawlLog());
        } catch (IOFailure e) {
            // Log harmless trouble
            LOG.debug("Couldn't delete crawl log file.", e);
        }
        try {
            FileUtils.remove(getProgressStatisticsLog());
        } catch (IOFailure e) {
            // Log harmless trouble
            LOG.debug("Couldn't delete progress statistics log file.", e);
        }
    }

    /**
     * Remove the disposable files (see {@link #getDisposableFiles()}) and move the crawl directory into the
     * given directory, keeping its name. Failures are logged, not thrown.
     *
     * @param oldJobsDir the directory to move the crawl directory into; created if necessary
     */
    public void cleanUpAfterHarvest(File oldJobsDir) {
        // delete disposable files
        for (File disposable : getDisposableFiles()) {
            if (disposable.exists()) {
                try {
                    FileUtils.removeRecursively(disposable);
                } catch (IOFailure e) {
                    // Log harmless trouble
                    LOG.debug("Couldn't delete leftover file '{}'", disposable.getAbsolutePath(), e);
                }
            }
        }
        // move the rest to oldjobs
        FileUtils.createDir(oldJobsDir);
        File destDir = new File(oldJobsDir, crawlDir.getName());
        boolean success = crawlDir.renameTo(destDir);
        if (!success) {
            LOG.warn("Failed to rename jobdir '{}' to '{}'", crawlDir, destDir);
        }
    }

    /**
     * @return the files and directories removed by {@link #cleanUpAfterHarvest}: {@code state} and
     *         {@code scratch} in the Heritrix3 job directory, and {@code checkpoints} in the crawl directory
     */
    public File[] getDisposableFiles() {
        return new File[] {
                new File(h3JobDir, "state"),
                new File(crawlDir, "checkpoints"),
                new File(h3JobDir, "scratch")
        };
    }
}