/*
 * #%L
 * Netarchivesuite - harvester
 * %%
 * Copyright (C) 2005 - 2018 The Royal Danish Library,
 * the National Library of France and the Austrian National Library.
 * %%
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation, either version 2.1 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Lesser Public License for more details.
 *
 * You should have received a copy of the GNU General Lesser Public
 * License along with this program. If not, see
 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
 * #L%
 */
package dk.netarkivet.harvester.heritrix3;

import java.io.File;
import java.io.FilenameFilter;
import java.util.Arrays;
import java.util.LinkedList;
import java.util.List;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import dk.netarkivet.common.Constants;
import dk.netarkivet.common.exceptions.ArgumentNotValid;
import dk.netarkivet.common.exceptions.IOFailure;
import dk.netarkivet.common.exceptions.PermissionDenied;
import dk.netarkivet.common.exceptions.IllegalState;
import dk.netarkivet.common.utils.FileUtils;
import dk.netarkivet.common.utils.Settings;
import dk.netarkivet.harvester.HarvesterSettings;
import dk.netarkivet.harvester.harvesting.metadata.MetadataFileWriter;

/**
 * Encapsulation of files to be ingested into the archive. These files are presently placed in subdirectories under the
 * crawldir.
 */
public class IngestableFiles {

    private static final Logger log = LoggerFactory.getLogger(IngestableFiles.class);

    /** Subdir with final metadata file in it. */
    protected static final String METADATA_SUB_DIR = "metadata";

    /** Subdir with temporary metadata file in it. */
    private static final String TMP_SUB_DIR = "tmp-meta";

    /** jobId for present harvestjob. */
    private long jobId;

    /** crawlDir for present harvestjob. */
    private File crawlDir;

    /**
     * Writer to this jobs metadatafile. This is closed when the metadata is marked as ready.
     */
    private MetadataFileWriter writer = null;

    /** Whether we've had an error in metadata generation. */
    private boolean error = false;

    /** Archive file prefix for this harvest, taken from Heritrix3Files.getArchiveFilePrefix(). */
    private String harvestnamePrefix;

    /** Filename format for metadata files, read once from settings. */
    public static final String METADATA_FILENAME_FORMAT = Settings.get(HarvesterSettings.METADATA_FILENAME_FORMAT);

    /** harvestId for present harvestjob. */
    private Long harvestId;

    /** The Heritrix3 job directory; archive and report dirs live under its "latest" subdir. */
    private File heritrixJobDir;

    /**
     * Constructor for this class. Heritrix3Files contains information about crawlDir, jobId, and harvestnamePrefix for
     * a specific finished harvestjob.
     *
     * <p>Creates the 'metadata' subdir of the crawldir if missing, and creates a fresh (scratched) 'tmp-meta' subdir.
     *
     * @param files An instance of Heritrix3Files
     * @throws ArgumentNotValid if null-arguments are given; if jobID &lt; 1; if crawlDir does not exist
     */
    public IngestableFiles(Heritrix3Files files) {
        ArgumentNotValid.checkNotNull(files, "files");
        ArgumentNotValid.checkNotNull(files.getCrawlDir(), "crawlDir");
        ArgumentNotValid.checkPositive(files.getJobID(), "jobID");
        ArgumentNotValid.checkNotNullOrEmpty(files.getArchiveFilePrefix(), "harvestnamePrefix");
        this.heritrixJobDir = files.getHeritrixJobDir();
        this.crawlDir = files.getCrawlDir();
        if (!crawlDir.exists()) {
            throw new ArgumentNotValid("The given crawlDir (" + crawlDir.getAbsolutePath() + ") does not exist");
        }
        this.jobId = files.getJobID();
        this.harvestnamePrefix = files.getArchiveFilePrefix();
        this.harvestId = files.getHarvestID();
        // Create subdir 'metadata' if not already exists.
        FileUtils.createDir(getMetadataDir());
        // Create/scratch subdir 'tmp-meta': any leftovers from a previous run are removed first.
        if (getTmpMetadataDir().isDirectory()) {
            FileUtils.removeRecursively(getTmpMetadataDir());
            log.warn("Removed directory {} with contents", getTmpMetadataDir());
        }
        FileUtils.createDir(getTmpMetadataDir());
    }

    /**
     * Check, if the metadatafile already exists. If this is true, metadata has been successfully generated. If false,
     * either metadata has not finished being generated, or there was an error generating them.
     *
     * @return true, if it does exist; false otherwise.
     */
    public boolean isMetadataReady() {
        return getMetadataFile().isFile();
    }

    /**
     * Return true if the metadata generation process is known to have failed.
     *
     * @return True if metadata generation is finished without success, false if generation is still ongoing or has been
     * successfully done.
     */
    public boolean isMetadataFailed() {
        return error;
    }

    /**
     * Marks generated metadata as final, closes the writer, and moves the temporary metadata file to its final
     * position.
     *
     * @throws PermissionDenied If the metadata has already been marked as ready, or if no metadata file exists upon
     * success.
     * @throws IOFailure if there is an error marking the metadata as ready, e.g. the rename of the temporary file
     * to its final position fails.
     */
    public void closeMetadataFile() {
        if (isMetadataReady()) {
            throw new PermissionDenied("Metadata file " + getMetadataFile().getAbsolutePath() + " already exists");
        }
        // FIX: guard against NPE when no writer was ever obtained via getMetadataWriter();
        // the missing-tmp-file check below then reports the real problem instead.
        if (writer != null) {
            writer.close(); // close writer down
        }
        if (!getTmpMetadataFile().exists()) {
            String message = "No metadata was generated despite claims that metadata generation was successful.";
            throw new PermissionDenied(message);
        }
        // FIX: File.renameTo() reports failure via its boolean return (e.g. on cross-device moves);
        // the original ignored it, silently leaving the metadata in the tmp dir.
        if (!getTmpMetadataFile().renameTo(getMetadataFile())) {
            throw new IOFailure("Failed to rename temporary metadata file '" + getTmpMetadataFile().getAbsolutePath()
                    + "' to final metadata file '" + getMetadataFile().getAbsolutePath() + "'");
        }
    }

    /**
     * Set error state.
     *
     * @param isError True, if error, otherwise false;
     */
    public void setErrorState(boolean isError) {
        error = isError;
    }

    /**
     * Get a MetaDatafileWriter for the temporary metadata file. Successive calls to this method on the same object will
     * return the same writer. Once the metadata have been finalized, calling this method will fail.
     *
     * @return a MetaDatafileWriter for the temporary metadata file.
     * @throws PermissionDenied if metadata generation is already finished or has failed.
     */
    public MetadataFileWriter getMetadataWriter() {
        if (isMetadataReady()) {
            throw new PermissionDenied("Metadata file " + getMetadataFile().getAbsolutePath() + " already exists");
        }
        if (isMetadataFailed()) {
            throw new PermissionDenied("Metadata generation of file " + getMetadataFile().getAbsolutePath()
                    + " has already failed.");
        }
        if (writer == null) {
            writer = MetadataFileWriter.createWriter(getTmpMetadataFile());
        }
        return writer;
    }

    /**
     * Gets the files containing the metadata.
     *
     * @return the files in the metadata dir
     * @throws IllegalState if the metadata file is not ready, either because generation is still going on or there was
     * an error generating the metadata.
     */
    public List<File> getMetadataArcFiles() {
        // Our one known metadata file must exist.
        if (!isMetadataReady()) {
            throw new IllegalState("Metadata file " + getMetadataFile().getAbsolutePath() + " does not exist");
        }
        return Arrays.asList(new File[] {getMetadataFile()});
    }

    /**
     * Constructs the metadata subdir from the crawlDir.
     *
     * @return The metadata subdir as a File
     */
    private File getMetadataDir() {
        return new File(crawlDir, METADATA_SUB_DIR);
    }

    /**
     * Constructs the single metadata arc file from the crawlDir and the jobID.
     *
     * @return metadata arc file as a File
     */
    protected File getMetadataFile() {
        return new File(getMetadataDir(), MetadataFileWriter.getMetadataArchiveFileName(Long.toString(jobId), harvestId));
    }

    /**
     * Constructs the TEMPORARY metadata subdir from the crawlDir.
     *
     * @return The tmp-metadata subdir as a File
     */
    public File getTmpMetadataDir() {
        return new File(crawlDir, TMP_SUB_DIR);
    }

    /**
     * Constructs the TEMPORARY metadata arc file from the crawlDir and the jobID.
     *
     * @return tmp-metadata arc file as a File
     */
    private File getTmpMetadataFile() {
        return new File(getTmpMetadataDir(), MetadataFileWriter.getMetadataArchiveFileName(Long.toString(jobId), harvestId));
    }

    /**
     * Get a list of all ARC files that should get ingested. Any open files should be closed with closeOpenFiles first.
     *
     * @return The ARC files that are ready to get ingested.
     * @throws IOFailure if the arcs dir is not a directory or cannot be listed.
     */
    public List<File> getArcFiles() {
        File arcsdir = getArcsDir();
        if (arcsdir.exists()) {
            if (!arcsdir.isDirectory()) {
                throw new IOFailure(arcsdir.getPath() + " is not a directory");
            }
            // FIX: listFiles() returns null on an I/O error; the original fed that straight to
            // Arrays.asList, producing an opaque NullPointerException.
            File[] arcFiles = arcsdir.listFiles(FileUtils.ARCS_FILTER);
            if (arcFiles == null) {
                throw new IOFailure("Failed to list files in " + arcsdir.getPath());
            }
            return Arrays.asList(arcFiles);
        } else {
            return new LinkedList<File>();
        }
    }

    /**
     * @return the arcs dir in the our crawl directory.
     */
    public File getArcsDir() {
        return new File(heritrixJobDir, "latest/" + Constants.ARCDIRECTORY_NAME);
    }

    /**
     * @return the warcs dir in the our crawl directory.
     */
    public File getWarcsDir() {
        return new File(heritrixJobDir, "latest/" + Constants.WARCDIRECTORY_NAME);
    }

    /**
     * @return the reports dir in the our crawl directory.
     */
    public File getReportsDir() {
        return new File(heritrixJobDir, "latest/reports");
    }

    /**
     * Get a list of all WARC files that should get ingested. Any open files should be closed with closeOpenFiles first.
     *
     * @return The WARC files that are ready to get ingested.
     * @throws IOFailure if the warcs dir is not a directory or cannot be listed.
     */
    public List<File> getWarcFiles() {
        File warcsdir = getWarcsDir();
        if (warcsdir.exists()) {
            if (!warcsdir.isDirectory()) {
                throw new IOFailure(warcsdir.getPath() + " is not a directory");
            }
            // FIX: same null-guard as getArcFiles(); listFiles() returns null on an I/O error.
            File[] warcFiles = warcsdir.listFiles(FileUtils.WARCS_FILTER);
            if (warcFiles == null) {
                throw new IOFailure("Failed to list files in " + warcsdir.getPath());
            }
            return Arrays.asList(warcFiles);
        } else {
            return new LinkedList<File>();
        }
    }

    /**
     * @return the Heritrix3 job directory given in the constructor.
     */
    public File getHeritrix3JobDir() {
        return this.heritrixJobDir;
    }

    /**
     * Close any ".open" files left by a crashed Heritrix. ARC and/or WARC files ending in .open indicate that Heritrix
     * is still writing to them. If Heritrix has died, we can just rename them before we upload. This must not be done
     * while harvesting is still in progress.
     *
     * @param waitSeconds How many seconds to wait before closing files. This may be done in order to allow Heritrix to
     * finish writing before we close the files.
     */
    public void closeOpenFiles(int waitSeconds) {
        // wait for Heritrix threads to create and close last arc or warc files
        try {
            Thread.sleep(waitSeconds * 1000L);
        } catch (InterruptedException e) {
            log.debug("Thread woken prematurely from sleep.", e);
            // FIX: restore the interrupt status so callers up the stack can still observe it.
            Thread.currentThread().interrupt();
        }

        closeOpenFiles(Constants.ARCDIRECTORY_NAME, FileUtils.OPEN_ARCS_FILTER);
        closeOpenFiles(Constants.WARCDIRECTORY_NAME, FileUtils.OPEN_WARCS_FILTER);
    }

    /**
     * Given an archive sub-directory name and a filter to match against this method tries to rename the matched files.
     * Files that can not be renamed generate a log message. The filter should always match files that end with ".open"
     * as a minimum.
     *
     * <p>NOTE(review): this looks in a subdir of crawlDir, whereas getArcsDir()/getWarcsDir() look under
     * heritrixJobDir/"latest" — confirm both locations are intended.
     *
     * @param archiveDirName archive directory name, currently "arc" or "warc"
     * @param filter filename filter used to select ".open" files to rename
     */
    protected void closeOpenFiles(String archiveDirName, FilenameFilter filter) {
        File arcsdir = new File(crawlDir, archiveDirName);
        log.debug("Trying to close open archive files in directory {}", arcsdir);
        File[] files = arcsdir.listFiles(filter);
        if (files != null) {
            for (File file : files) {
                final String fname = file.getAbsolutePath();
                // Note: Due to regexp we know filename is at least 5 characters,
                // so stripping the last 5 characters removes the ".open" suffix.
                File tofile = new File(fname.substring(0, fname.length() - 5));
                if (!file.renameTo(tofile)) {
                    log.warn("Failed to rename '{}' to '{}'", file.getAbsolutePath(), tofile.getAbsolutePath());
                }
            }
        }
    }

    /**
     * Remove any temporary files.
     */
    public void cleanup() {
        log.debug("Removing the directory '{}'", getTmpMetadataDir());
        FileUtils.removeRecursively(getTmpMetadataDir());
        writer = null;
    }

    /**
     * @return the jobID of the harvest job being processed.
     */
    public long getJobId() {
        return this.jobId;
    }

    /**
     * @return the harvestID of the harvest job being processed.
     */
    public long getHarvestID() {
        // NOTE(review): harvestId is a Long; unboxing here NPEs if Heritrix3Files ever supplied null — confirm.
        return this.harvestId;
    }

    /**
     * @return the harvestnamePrefix of the harvest job being processed.
     */
    public String getHarvestnamePrefix() {
        return this.harvestnamePrefix;
    }

    /**
     * @return the crawlDir of the harvest job being processed.
     */
    public File getCrawlDir() {
        return this.crawlDir;
    }

}