001package dk.netarkivet.harvester.heritrix3; 002 003import java.io.File; 004import java.io.InputStream; 005 006import org.slf4j.Logger; 007import org.slf4j.LoggerFactory; 008 009import dk.netarkivet.common.exceptions.ArgumentNotValid; 010import dk.netarkivet.common.exceptions.IOFailure; 011import dk.netarkivet.common.exceptions.UnknownID; 012import dk.netarkivet.common.utils.FileUtils; 013import dk.netarkivet.common.utils.Settings; 014import dk.netarkivet.harvester.datamodel.HeritrixTemplate; 015import dk.netarkivet.harvester.datamodel.Job; 016import dk.netarkivet.harvester.harvesting.PersistentJobData; 017 018/** 019 * This class encapsulates the information generated by Heritrix3 or delivered to Heritrix3 before a crawl. 020 * @author svc 021 * 022 * TODO implementing recoverlog handling 023 */ 024public class Heritrix3Files { 025 026 /** The logger for this class. */ 027 //private static final Log LOG = LogFactory.getLog(Heritrix3Files.class); 028 private static final Logger LOG = LoggerFactory.getLogger(Heritrix3Files.class); 029 030 031 private static final String HERITRIX_UNPACKDIR = "heritrix3/"; 032 033 private File crawlDir; 034 private Long harvestID; 035 private Long jobID; 036 private File orderXML; 037 private File indexDir; 038 private String archiveFilePrefix; 039 private File h3ZipBall; 040 private File h3CerticateFile; 041 private File h3BaseDir; 042 private File h3JobDir; 043 private String jobName; 044 private File h3LogDir; 045 046 private File seedsFile; 047 048 private File orderFile; 049 050 public static Heritrix3Files getH3HeritrixFiles(File crawldir, PersistentJobData harvestInfo) { 051 Heritrix3Files files = new Heritrix3Files(); 052 files.setCrawldir(crawldir); 053 files.setJobId(harvestInfo.getJobID()); 054 files.setHarvestID(harvestInfo.getOrigHarvestDefinitionID()); 055 files.setArchivePrefix(harvestInfo.getHarvestFilenamePrefix()); 056 files.setHeritrixZip(); 057 files.setCertificateFile(); 058 files.setHeritrixBaseDir(); 059 files.setHeritrixJobDir(); 060 return files; 061 } 062 063 public static Heritrix3Files getH3HeritrixFiles(File crawldir, Job job) { 064 Heritrix3Files files = new Heritrix3Files(); 065 files.setCrawldir(crawldir); 066 files.setJobId(job.getJobID()); 067 files.setHarvestID(job.getOrigHarvestDefinitionID()); 068 files.setArchivePrefix(job.getHarvestFilenamePrefix()); 069 files.setHeritrixZip(); 070 files.setCertificateFile(); 071 files.setHeritrixBaseDir(); 072 files.setHeritrixJobDir(); 073 return files; 074 } 075 076 private void setHarvestID(Long origHarvestDefinitionID) { 077 this.harvestID = origHarvestDefinitionID; 078 } 079 080 private void setHeritrixJobDir() { 081 jobName = crawlDir.getName(); 082 h3JobDir = new File(h3BaseDir, "jobs/" + jobName); 083 h3LogDir = new File(h3JobDir, "logs"); 084 } 085 086 private void setHeritrixBaseDir() { 087 h3BaseDir = new File(crawlDir, HERITRIX_UNPACKDIR); 088 } 089 090 private void setHeritrixZip() { 091 h3ZipBall = Settings.getFile(Heritrix3Settings.HERITRIX3_BUNDLE); 092 if (!h3ZipBall.isFile()) { 093 throw new IOFailure("The path to the heritrix3 zipfile '" 094 + h3ZipBall.getAbsolutePath() + "' does not represent a proper file"); 095 } 096 } 097 098 private void setArchivePrefix(String harvestFilenamePrefix) { 099 this.archiveFilePrefix = harvestFilenamePrefix; 100 101 } 102 103 private void setJobId(Long jobID) { 104 this.jobID = jobID; 105 } 106 107 private void setCrawldir(File crawldir) { 108 this.crawlDir = crawldir; 109 this.seedsFile = new File(crawldir, "seeds.txt"); 110 this.orderFile = new File(crawldir, "crawler-beans.cxml"); 111 } 112 113 private Heritrix3Files(){ 114 } 115 116 public File getCrawlDir() { 117 return this.crawlDir; 118 } 119 120 public void writeSeedsTxt(String seedListAsString) { 121 ArgumentNotValid.checkNotNullOrEmpty(seedListAsString, "String seedListAsString"); 122 LOG.debug("Writing seeds to disk as file: " + seedsFile.getAbsolutePath()); 123 FileUtils.writeBinaryFile(seedsFile, seedListAsString.getBytes()); 124 } 125 126 public File getSeedsFile() { 127 return this.seedsFile; 128 } 129 130 public File getOrderFile() { 131 return this.orderFile; 132 } 133 134 135 public void setIndexDir(File indexDir) { 136 ArgumentNotValid.checkExistsDirectory(indexDir, "File indexDir"); 137 this.indexDir = indexDir;; 138 139 } 140 public void writeOrderXml(HeritrixTemplate orderXMLdoc) { 141 File destination = this.orderFile; 142 143 orderXMLdoc.writeToFile(destination); 144 this.orderXML = destination; 145 } 146 147 public File getProgressStatisticsLog() { 148 return new File(h3LogDir, "progress-statistics.log"); 149 } 150 151 public Long getJobID() { 152 return this.jobID; 153 } 154 155 public File getOrderXmlFile() { 156 return this.orderXML; 157 } 158 public File getSeedsTxtFile() { 159 return new File(h3JobDir, "seeds.txt"); 160 } 161 162 public Long getHarvestID() { 163 return this.harvestID; 164 } 165 166 public String getArchiveFilePrefix() { 167 return this.archiveFilePrefix; 168 } 169 170 public File getIndexDir() { 171 return this.indexDir; 172 } 173 174 public File getCrawlLog() { 175 return new File(h3LogDir, "crawl.log"); 176 } 177 178 public File getHeritrixZip() { 179 return this.h3ZipBall; 180 } 181 182 public File getCertificateFile() { 183 return h3CerticateFile; 184 } 185 186 private void setCertificateFile() { 187 try { 188 h3CerticateFile = Settings.getFile(Heritrix3Settings.HERITRIX3_CERTIFICATE); 189 } catch (UnknownID unknownID) { 190 LOG.debug("No heritrix3 certificate defined in settings, using default"); 191 return; 192 } 193 if (h3CerticateFile != null && !h3CerticateFile.isFile()) { 194 throw new IOFailure("The path to the heritrix3 certificate '" 195 + h3CerticateFile.getAbsolutePath() + "' does not represent a proper file"); 196 } 197 } 198 199 public File getHeritrixOutput() { 200 return new File(crawlDir, "heritrix_out.log"); 201 } 202 203 public File getHeritrixStderrLog() { 204 return new File(crawlDir, "heritrix3_err.log"); 205 } 206 207 public File getHeritrixStdoutLog() { 208 return new File(crawlDir, "heritrix3_out.log"); 209 } 210 211 public File getHeritrixJobDir() { 212 return h3JobDir; 213 } 214 215 216 public File getHeritrixBaseDir() { 217 return h3BaseDir; 218 } 219 220 public String getJobname() { 221 return this.jobName; 222 } 223 224 public void deleteFinalLogs() { 225 try { 226 FileUtils.remove(getCrawlLog()); 227 } catch (IOFailure e) { 228 // Log harmless trouble 229 LOG.debug("Couldn't delete crawl log file.", e); 230 } 231 try { 232 FileUtils.remove(getProgressStatisticsLog()); 233 } catch (IOFailure e) { 234 // Log harmless trouble 235 LOG.debug("Couldn't delete progress statistics log file.", e); 236 } 237 } 238 239 public void cleanUpAfterHarvest(File oldJobsDir) { 240 // delete disposable files 241 for (File disposable : getDisposableFiles()) { 242 if (disposable.exists()) { 243 try { 244 FileUtils.removeRecursively(disposable); 245 } catch (IOFailure e) { 246 // Log harmless trouble 247 LOG.debug("Couldn't delete leftover file '{}'", disposable.getAbsolutePath(), e); 248 } 249 } 250 } 251 // move the rest to oldjobs 252 FileUtils.createDir(oldJobsDir); 253 File destDir = new File(oldJobsDir, crawlDir.getName()); 254 boolean success = crawlDir.renameTo(destDir); 255 if (!success) { 256 LOG.warn("Failed to rename jobdir '{}' to '{}'", crawlDir, destDir); 257 } 258 } 259 260 public File[] getDisposableFiles() { 261 return new File[] {new File(h3JobDir, "state"), new File(crawlDir, "checkpoints"), new File(h3JobDir, "scratch")}; 262 } 263 264 ////////////////////// UNIMPLEMENTED METHODS /////////////////////////////// 265 266 // FIXME Handling of the recoverLog is postponed 267 268 public boolean writeRecoverBackupfile(InputStream data) { 269 // TODO Auto-generated method stub 270 return false; 271 } 272 273 public File getRecoverBackupGzFile() { 274 // TODO Auto-generated method stub 275 return null; 276 } 277}