/*
 * #%L
 * Netarchivesuite - harvester
 * %%
 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
 * the National Library of France and the Austrian National Library.
 * %%
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation, either version 2.1 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Lesser Public License for more details.
 *
 * You should have received a copy of the GNU General Lesser Public
 * License along with this program. If not, see
 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
 * #L%
 */
package dk.netarkivet.harvester.heritrix3;

import java.io.File;
import java.io.FilenameFilter;
import java.util.Arrays;
import java.util.LinkedList;
import java.util.List;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import dk.netarkivet.common.Constants;
import dk.netarkivet.common.exceptions.ArgumentNotValid;
import dk.netarkivet.common.exceptions.IOFailure;
import dk.netarkivet.common.exceptions.IllegalState;
import dk.netarkivet.common.exceptions.PermissionDenied;
import dk.netarkivet.common.utils.FileUtils;
import dk.netarkivet.harvester.harvesting.metadata.MetadataFileWriter;

/**
 * Encapsulation of files to be ingested into the archive. These files are presently placed subdirectories under the
 * crawldir.
 */
public class IngestableFiles {

    private static final Logger log = LoggerFactory.getLogger(IngestableFiles.class);

    /** Subdir with final metadata file in it. */
    protected static final String METADATA_SUB_DIR = "metadata";

    /** Subdir with temporary metadata file in it. */
    private static final String TMP_SUB_DIR = "tmp-meta";

    /** Suffix Heritrix appends to archive files it is still writing to. */
    private static final String OPEN_SUFFIX = ".open";

    /** jobId for present harvestjob. */
    private long jobId;

    /** crawlDir for present harvestjob. */
    private File crawlDir;

    /**
     * Writer to this jobs metadatafile. This is closed when the metadata is marked as ready.
     */
    private MetadataFileWriter writer = null;

    /** Whether we've had an error in metadata generation. */
    private boolean error = false;

    /** The archive file prefix for this harvest job. */
    private String harvestnamePrefix;

    /** The harvest definition ID for this harvest job. */
    private Long harvestId;

    /** The Heritrix 3 job directory; archive and report files live below {@code <heritrixJobDir>/latest}. */
    private File heritrixJobDir;

    /**
     * Constructor for this class. HeritrixFiles contains information about crawlDir, jobId, and harvestnameprefix for a
     * specific finished harvestjob.
     *
     * @param files An instance of Heritrix3Files
     * @throws ArgumentNotValid if null-arguments are given; if jobID &lt; 1; if crawlDir does not exist
     */
    public IngestableFiles(Heritrix3Files files) {
        ArgumentNotValid.checkNotNull(files, "files");
        ArgumentNotValid.checkNotNull(files.getCrawlDir(), "crawlDir");
        ArgumentNotValid.checkPositive(files.getJobID(), "jobID");
        ArgumentNotValid.checkNotNullOrEmpty(files.getArchiveFilePrefix(), "harvestnamePrefix");
        this.heritrixJobDir = files.getHeritrixJobDir();
        this.crawlDir = files.getCrawlDir();
        if (!crawlDir.exists()) {
            throw new ArgumentNotValid("The given crawlDir (" + crawlDir.getAbsolutePath() + ") does not exist");
        }
        this.jobId = files.getJobID();
        this.harvestnamePrefix = files.getArchiveFilePrefix();
        this.harvestId = files.getHarvestID();
        // Create subdir 'metadata' if not already exists.
        FileUtils.createDir(getMetadataDir());
        // Create/scratch subdir 'tmp-meta' so stale temporary metadata from a previous run never leaks in.
        FileUtils.removeRecursively(getTmpMetadataDir());
        FileUtils.createDir(getTmpMetadataDir());
    }

    /**
     * Check, if the metadatafile already exists. If this is true, metadata has been successfully generated. If false,
     * either metadata has not finished being generated, or there was an error generating them.
     *
     * @return true, if it does exist; false otherwise.
     */
    public boolean isMetadataReady() {
        return getMetadataFile().isFile();
    }

    /**
     * Return true if the metadata generation process is known to have failed.
     *
     * @return True if metadata generation is finished without success, false if generation is still ongoing or has been
     * successfully done.
     */
    public boolean isMetadataFailed() {
        return error;
    }

    /**
     * Marks generated metadata as final, closes the writer, and moves the temporary metadata file to its final
     * position, if successful.
     *
     * @param success True if metadata was successfully generated, false otherwise.
     * @throws PermissionDenied If the metadata has already been marked as ready, or if no metadata file exists upon
     * success.
     * @throws IOFailure if there is an error marking the metadata as ready, e.g. if the temporary metadata file cannot
     * be moved to its final location.
     */
    public void setMetadataGenerationSucceeded(boolean success) {
        if (isMetadataReady()) {
            throw new PermissionDenied("Metadata file " + getMetadataFile().getAbsolutePath() + " already exists");
        }

        if (success) {
            // Guard against NPE: the writer is only non-null if getMetadataWriter() was called.
            if (writer != null) {
                writer.close(); // close writer down
            }
            if (!getTmpMetadataFile().exists()) {
                String message = "No metadata was generated despite claims that metadata generation was successful.";
                throw new PermissionDenied(message);
            }
            // renameTo returns false on failure instead of throwing; ignoring it would silently lose the metadata.
            if (!getTmpMetadataFile().renameTo(getMetadataFile())) {
                throw new IOFailure("Failed to rename '" + getTmpMetadataFile().getAbsolutePath() + "' to '"
                        + getMetadataFile().getAbsolutePath() + "'");
            }
        } else {
            error = true;
        }
    }

    /**
     * Get a MetaDatafileWriter for the temporary metadata file. Successive calls to this method on the same object will
     * return the same writer. Once the metadata have been finalized, calling this method will fail.
     *
     * @return a MetaDatafileWriter for the temporary metadata file.
     * @throws PermissionDenied if metadata generation is already finished.
     */
    public MetadataFileWriter getMetadataWriter() {
        if (isMetadataReady()) {
            throw new PermissionDenied("Metadata file " + getMetadataFile().getAbsolutePath() + " already exists");
        }
        if (isMetadataFailed()) {
            throw new PermissionDenied("Metadata generation of file " + getMetadataFile().getAbsolutePath()
                    + " has already failed.");
        }
        // Lazily create the writer so repeated calls share the same instance.
        if (writer == null) {
            writer = MetadataFileWriter.createWriter(getTmpMetadataFile());
        }
        return writer;
    }

    /**
     * Gets the files containing the metadata.
     *
     * @return the files in the metadata dir
     * @throws IllegalState if the metadata file is not ready, either because generation is still going on or there
     * was an error generating the metadata.
     */
    public List<File> getMetadataArcFiles() {
        // Our one known metadata file must exist.
        if (!isMetadataReady()) {
            throw new IllegalState("Metadata file " + getMetadataFile().getAbsolutePath() + " does not exist");
        }
        return Arrays.asList(new File[] {getMetadataFile()});
    }

    /**
     * Constructs the metadata subdir from the crawlDir.
     *
     * @return The metadata subdir as a File
     */
    private File getMetadataDir() {
        return new File(crawlDir, METADATA_SUB_DIR);
    }

    /**
     * Constructs the single metadata arc file from the crawlDir and the jobID.
     *
     * @return metadata arc file as a File
     */
    protected File getMetadataFile() {
        return new File(getMetadataDir(), MetadataFileWriter.getMetadataArchiveFileName(Long.toString(jobId)));
    }

    /**
     * Constructs the TEMPORARY metadata subdir from the crawlDir.
     *
     * @return The tmp-metadata subdir as a File
     */
    public File getTmpMetadataDir() {
        return new File(crawlDir, TMP_SUB_DIR);
    }

    /**
     * Constructs the TEMPORARY metadata arc file from the crawlDir and the jobID.
     *
     * @return tmp-metadata arc file as a File
     */
    private File getTmpMetadataFile() {
        return new File(getTmpMetadataDir(), MetadataFileWriter.getMetadataArchiveFileName(Long.toString(jobId)));
    }

    /**
     * Get a list of all ARC files that should get ingested. Any open files should be closed with closeOpenFiles first.
     *
     * @return The ARC files that are ready to get ingested.
     * @throws IOFailure if the arcs dir exists but is not a directory, or if it cannot be listed.
     */
    public List<File> getArcFiles() {
        File arcsdir = getArcsDir();
        if (arcsdir.exists()) {
            if (!arcsdir.isDirectory()) {
                throw new IOFailure(arcsdir.getPath() + " is not a directory");
            }
            // listFiles() returns null on I/O error or if the dir vanished after the exists() check.
            File[] arcFiles = arcsdir.listFiles(FileUtils.ARCS_FILTER);
            if (arcFiles == null) {
                throw new IOFailure("Error listing files in directory " + arcsdir.getPath());
            }
            return Arrays.asList(arcFiles);
        } else {
            return new LinkedList<File>();
        }
    }

    /**
     * @return the arcs dir in the our crawl directory.
     */
    public File getArcsDir() {
        return new File(heritrixJobDir, "latest/" + Constants.ARCDIRECTORY_NAME);
    }

    /**
     * @return the warcs dir in the our crawl directory.
     */
    public File getWarcsDir() {
        return new File(heritrixJobDir, "latest/" + Constants.WARCDIRECTORY_NAME);
    }

    /**
     * @return the reports dir in the our crawl directory.
     */
    public File getReportsDir() {
        return new File(heritrixJobDir, "latest/reports");
    }

    /**
     * Get a list of all WARC files that should get ingested. Any open files should be closed with closeOpenFiles first.
     *
     * @return The WARC files that are ready to get ingested.
     * @throws IOFailure if the warcs dir exists but is not a directory, or if it cannot be listed.
     */
    public List<File> getWarcFiles() {
        File warcsdir = getWarcsDir();
        if (warcsdir.exists()) {
            if (!warcsdir.isDirectory()) {
                throw new IOFailure(warcsdir.getPath() + " is not a directory");
            }
            // listFiles() returns null on I/O error or if the dir vanished after the exists() check.
            File[] warcFiles = warcsdir.listFiles(FileUtils.WARCS_FILTER);
            if (warcFiles == null) {
                throw new IOFailure("Error listing files in directory " + warcsdir.getPath());
            }
            return Arrays.asList(warcFiles);
        } else {
            return new LinkedList<File>();
        }
    }

    /**
     * @return the Heritrix 3 job directory of the harvest job being processed.
     */
    public File getHeritrix3JobDir() {
        return this.heritrixJobDir;
    }

    /**
     * Close any ".open" files left by a crashed Heritrix. ARC and/or WARC files ending in .open indicate that Heritrix
     * is still writing to them. If Heritrix has died, we can just rename them before we upload. This must not be done
     * while harvesting is still in progress.
     *
     * @param waitSeconds How many seconds to wait before closing files. This may be done in order to allow Heritrix to
     * finish writing before we close the files.
     */
    public void closeOpenFiles(int waitSeconds) {
        // wait for Heritrix threads to create and close last arc or warc files
        try {
            Thread.sleep(waitSeconds * 1000L);
        } catch (InterruptedException e) {
            log.debug("Thread woken prematurely from sleep.", e);
            // Restore the interrupt status so callers can still observe the interruption.
            Thread.currentThread().interrupt();
        }

        closeOpenFiles(Constants.ARCDIRECTORY_NAME, FileUtils.OPEN_ARCS_FILTER);
        closeOpenFiles(Constants.WARCDIRECTORY_NAME, FileUtils.OPEN_WARCS_FILTER);
    }

    /**
     * Given an archive sub-directory name and a filter to match against this method tries to rename the matched files.
     * Files that can not be renamed generate a log message. The filter should always match files that end with ".open"
     * as a minimum.
     *
     * @param archiveDirName archive directory name, currently "arc" or "warc"
     * @param filter filename filter used to select ".open" files to rename
     */
    protected void closeOpenFiles(String archiveDirName, FilenameFilter filter) {
        File arcsdir = new File(crawlDir, archiveDirName);
        log.debug("Trying to close open archive files in directory {}", arcsdir);
        File[] files = arcsdir.listFiles(filter);
        if (files != null) {
            for (File file : files) {
                final String fname = file.getAbsolutePath();
                // The filter guarantees a trailing ".open"; strip it to obtain the closed-file name.
                File tofile = new File(fname.substring(0, fname.length() - OPEN_SUFFIX.length()));
                if (!file.renameTo(tofile)) {
                    log.warn("Failed to rename '{}' to '{}'", file.getAbsolutePath(), tofile.getAbsolutePath());
                }
            }
        }
    }

    /**
     * Remove any temporary files.
     */
    public void cleanup() {
        log.debug("Removing the directory '{}'", getTmpMetadataDir());
        FileUtils.removeRecursively(getTmpMetadataDir());
        writer = null;
    }

    /**
     * @return the jobID of the harvest job being processed.
     */
    public long getJobId() {
        return this.jobId;
    }

    /**
     * @return the harvestID of the harvest job being processed.
     */
    public long getHarvestID() {
        return this.harvestId;
    }

    /**
     * @return the harvestnamePrefix of the harvest job being processed.
     */
    public String getHarvestnamePrefix() {
        return this.harvestnamePrefix;
    }

    /**
     * @return the crawlDir of the harvest job being processed.
     */
    public File getCrawlDir() {
        return this.crawlDir;
    }

}