001/* 002 * #%L 003 * Netarchivesuite - harvester 004 * %% 005 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library, 006 * the National Library of France and the Austrian National Library. 007 * %% 008 * This program is free software: you can redistribute it and/or modify 009 * it under the terms of the GNU Lesser General Public License as 010 * published by the Free Software Foundation, either version 2.1 of the 011 * License, or (at your option) any later version. 012 * 013 * This program is distributed in the hope that it will be useful, 014 * but WITHOUT ANY WARRANTY; without even the implied warranty of 015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 016 * GNU General Lesser Public License for more details. 017 * 018 * You should have received a copy of the GNU General Lesser Public 019 * License along with this program. If not, see 020 * <http://www.gnu.org/licenses/lgpl-2.1.html>. 021 * #L% 022 */ 023package dk.netarkivet.harvester.harvesting; 024 025import java.io.File; 026import java.io.FilenameFilter; 027import java.util.Arrays; 028import java.util.LinkedList; 029import java.util.List; 030 031import org.slf4j.Logger; 032import org.slf4j.LoggerFactory; 033 034import dk.netarkivet.common.Constants; 035import dk.netarkivet.common.exceptions.ArgumentNotValid; 036import dk.netarkivet.common.exceptions.IOFailure; 037import dk.netarkivet.common.exceptions.PermissionDenied; 038import dk.netarkivet.common.utils.FileUtils; 039import dk.netarkivet.harvester.harvesting.metadata.MetadataFileWriter; 040 041/** 042 * Encapsulation of files to be ingested into the archive. These files are presently placed subdirectories under the 043 * crawldir. 044 */ 045public class IngestableFiles { 046 047 private static final Logger log = LoggerFactory.getLogger(IngestableFiles.class); 048 049 /** Subdir with final metadata file in it. */ 050 protected static final String METADATA_SUB_DIR = "metadata"; 051 052 /** Subdir with temporary metadata file in it. */ 053 private static final String TMP_SUB_DIR = "tmp-meta"; 054 055 /** jobId for present harvestjob. */ 056 private long jobId; 057 058 /** crawlDir for present harvestjob. */ 059 private File crawlDir; 060 061 /** 062 * Writer to this jobs metadatafile. This is closed when the metadata is marked as ready. 063 */ 064 private MetadataFileWriter writer = null; 065 066 /** Whether we've had an error in metadata generation. */ 067 private boolean error = false; 068 069 private String harvestnamePrefix; 070 071 private Long harvestId; 072 073 /** 074 * Constructor for this class. HeritrixFiles contains information about crawlDir, jobId, and harvestnameprefix for a 075 * specific finished harvestjob. 076 * 077 * @param files An instance of HeritrixFiles 078 * @throws ArgumentNotValid if null-arguments are given; if jobID < 1; if crawlDir does not exist 079 */ 080 public IngestableFiles(HeritrixFiles files) { 081 ArgumentNotValid.checkNotNull(files, "files"); 082 ArgumentNotValid.checkNotNull(files.getCrawlDir(), "crawlDir"); 083 ArgumentNotValid.checkPositive(files.getJobID(), "jobID"); 084 ArgumentNotValid.checkNotNullOrEmpty(files.getArchiveFilePrefix(), "harvestnamePrefix"); 085 this.crawlDir = files.getCrawlDir(); 086 if (!crawlDir.exists()) { 087 throw new ArgumentNotValid("The given crawlDir (" + crawlDir.getAbsolutePath() + ") does not exist"); 088 } 089 this.jobId = files.getJobID(); 090 this.harvestnamePrefix = files.getArchiveFilePrefix(); 091 this.harvestId = files.getHarvestID(); 092 // Create subdir 'metadata' if not already exists. 093 FileUtils.createDir(getMetadataDir()); 094 // Create/scratch subdir 'tmp-meta' 095 FileUtils.removeRecursively(getTmpMetadataDir()); 096 FileUtils.createDir(getTmpMetadataDir()); 097 } 098 099 /** 100 * Check, if the metadatafile already exists. If this is true, metadata has been successfully generated. If false, 101 * either metadata has not finished being generated, or there was an error generating them. 102 * 103 * @return true, if it does exist; false otherwise. 104 */ 105 public boolean isMetadataReady() { 106 return getMetadataFile().isFile(); 107 } 108 109 /** 110 * Return true if the metadata generation process is known to have failed. 111 * 112 * @return True if metadata generation is finished without success, false if generation is still ongoing or has been 113 * successfully done. 114 */ 115 public boolean isMetadataFailed() { 116 return error; 117 } 118 119 /** 120 * Marks generated metadata as final, closes the writer, and moves the temporary metadata file to its final 121 * position, if successful. 122 * 123 * @param success True if metadata was successfully generated, false otherwise. 124 * @throws PermissionDenied If the metadata has already been marked as ready, or if no metadata file exists upon 125 * success. 126 * @throws IOFailure if there is an error marking the metadata as ready. 127 */ 128 public void setMetadataGenerationSucceeded(boolean success) { 129 if (isMetadataReady()) { 130 throw new PermissionDenied("Metadata file " + getMetadataFile().getAbsolutePath() + " already exists"); 131 } 132 133 if (success) { 134 writer.close(); // close writer down 135 if (!getTmpMetadataFile().exists()) { 136 String message = "No metadata was generated despite claims that metadata generation was successfull."; 137 throw new PermissionDenied(message); 138 } 139 getTmpMetadataFile().renameTo(getMetadataFile()); 140 } else { 141 error = true; 142 } 143 } 144 145 /** 146 * Get a MetaDatafileWriter for the temporary metadata file. Successive calls to this method on the same object will 147 * return the same writer. Once the metadata have been finalized, calling this method will fail. 148 * 149 * @return a MetaDatafileWriter for the temporary metadata file. 150 * @throws PermissionDenied if metadata generation is already finished. 151 */ 152 public MetadataFileWriter getMetadataWriter() { 153 if (isMetadataReady()) { 154 throw new PermissionDenied("Metadata file " + getMetadataFile().getAbsolutePath() + " already exists"); 155 } 156 if (isMetadataFailed()) { 157 throw new PermissionDenied("Metadata generation of file " + getMetadataFile().getAbsolutePath() 158 + " has already failed."); 159 } 160 if (writer == null) { 161 writer = MetadataFileWriter.createWriter(getTmpMetadataFile()); 162 } 163 return writer; 164 } 165 166 /** 167 * Gets the files containing the metadata. 168 * 169 * @return the files in the metadata dir 170 * @throws PermissionDenied if the metadata file is not ready, either because generation is still going on or there 171 * was an error generating the metadata. 172 */ 173 public List<File> getMetadataArcFiles() { 174 // Our one known metadata file must exist. 175 if (!isMetadataReady()) { 176 throw new PermissionDenied("Metadata file " + getMetadataFile().getAbsolutePath() + " does not exist"); 177 } 178 return Arrays.asList(new File[] {getMetadataFile()}); 179 } 180 181 /** 182 * Constructs the metadata subdir from the crawlDir. 183 * 184 * @return The metadata subdir as a File 185 */ 186 private File getMetadataDir() { 187 return new File(crawlDir, METADATA_SUB_DIR); 188 } 189 190 /** 191 * Constructs the single metadata arc file from the crawlDir and the jobID. 192 * 193 * @return metadata arc file as a File 194 */ 195 protected File getMetadataFile() { 196 return new File(getMetadataDir(), MetadataFileWriter.getMetadataArchiveFileName(Long.toString(jobId))); 197 } 198 199 /** 200 * Constructs the TEMPORARY metadata subdir from the crawlDir. 201 * 202 * @return The tmp-metadata subdir as a File 203 */ 204 public File getTmpMetadataDir() { 205 return new File(crawlDir, TMP_SUB_DIR); 206 } 207 208 /** 209 * Constructs the TEMPORARY metadata arc file from the crawlDir and the jobID. 210 * 211 * @return tmp-metadata arc file as a File 212 */ 213 private File getTmpMetadataFile() { 214 return new File(getTmpMetadataDir(), MetadataFileWriter.getMetadataArchiveFileName(Long.toString(jobId))); 215 } 216 217 /** 218 * Get a list of all ARC files that should get ingested. Any open files should be closed with closeOpenFiles first. 219 * 220 * @return The ARC files that are ready to get ingested. 221 */ 222 public List<File> getArcFiles() { 223 File arcsdir = getArcsDir(); 224 if (arcsdir.exists()) { 225 if (!arcsdir.isDirectory()) { 226 throw new IOFailure(arcsdir.getPath() + " is not a directory"); 227 } 228 return Arrays.asList(arcsdir.listFiles(FileUtils.ARCS_FILTER)); 229 } else { 230 return new LinkedList<File>(); 231 } 232 } 233 234 /** 235 * @return the arcs dir in the our crawl directory. 236 */ 237 public File getArcsDir() { 238 return new File(crawlDir, Constants.ARCDIRECTORY_NAME); 239 } 240 241 /** 242 * @return the warcs dir in the our crawl directory. 243 */ 244 public File getWarcsDir() { 245 return new File(crawlDir, Constants.WARCDIRECTORY_NAME); 246 } 247 248 /** 249 * Get a list of all WARC files that should get ingested. Any open files should be closed with closeOpenFiles first. 250 * 251 * @return The WARC files that are ready to get ingested. 252 */ 253 public List<File> getWarcFiles() { 254 File warcsdir = getWarcsDir(); 255 if (warcsdir.exists()) { 256 if (!warcsdir.isDirectory()) { 257 throw new IOFailure(warcsdir.getPath() + " is not a directory"); 258 } 259 return Arrays.asList(warcsdir.listFiles(FileUtils.WARCS_FILTER)); 260 } else { 261 return new LinkedList<File>(); 262 } 263 } 264 265 /** 266 * Close any ".open" files left by a crashed Heritrix. ARC and/or WARC files ending in .open indicate that Heritrix 267 * is still writing to them. If Heritrix has died, we can just rename them before we upload. This must not be done 268 * while harvesting is still in progress. 269 * 270 * @param waitSeconds How many seconds to wait before closing files. This may be done in order to allow Heritrix to 271 * finish writing before we close the files. 272 */ 273 public void closeOpenFiles(int waitSeconds) { 274 // wait for Heritrix threads to create and close last arc or warc files 275 try { 276 Thread.sleep(waitSeconds * 1000L); 277 } catch (InterruptedException e) { 278 log.debug("Thread woken prematurely from sleep.", e); 279 } 280 281 closeOpenFiles(Constants.ARCDIRECTORY_NAME, FileUtils.OPEN_ARCS_FILTER); 282 closeOpenFiles(Constants.WARCDIRECTORY_NAME, FileUtils.OPEN_WARCS_FILTER); 283 } 284 285 /** 286 * Given an archive sub-directory name and a filter to match against this method tries to rename the matched files. 287 * Files that can not be renamed generate a log message. The filter should always match files that end with ".open" 288 * as a minimum. 289 * 290 * @param archiveDirName archive directory name, currently "arc" or "warc" 291 * @param filter filename filter used to select ".open" files to rename 292 */ 293 protected void closeOpenFiles(String archiveDirName, FilenameFilter filter) { 294 File arcsdir = new File(crawlDir, archiveDirName); 295 File[] files = arcsdir.listFiles(filter); 296 if (files != null) { 297 for (File file : files) { 298 final String fname = file.getAbsolutePath(); 299 // Note: Due to regexp we know filename is at least 5 characters 300 File tofile = new File(fname.substring(0, fname.length() - 5)); 301 if (!file.renameTo(tofile)) { 302 log.warn("Failed to rename '{}' to '{}'", file.getAbsolutePath(), tofile.getAbsolutePath()); 303 } 304 } 305 } 306 } 307 308 /** 309 * Remove any temporary files. 310 */ 311 public void cleanup() { 312 FileUtils.removeRecursively(getTmpMetadataDir()); 313 writer = null; 314 } 315 316 /** 317 * @return the jobID of the harvest job being processed. 318 */ 319 public long getJobId() { 320 return this.jobId; 321 } 322 323 /** 324 * @return the harvestID of the harvest job being processed. 325 */ 326 public long getHarvestID() { 327 return this.harvestId; 328 } 329 330 /** 331 * @return the harvestnamePrefix of the harvest job being processed. 332 */ 333 public String getHarvestnamePrefix() { 334 return this.harvestnamePrefix; 335 } 336 337 /** 338 * @return the crawlDir of the harvest job being processed. 339 */ 340 public File getCrawlDir() { 341 return this.crawlDir; 342 } 343 344}