/*
 * #%L
 * Netarchivesuite - harvester
 * %%
 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
 * the National Library of France and the Austrian National Library.
 * %%
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation, either version 2.1 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this program. If not, see
 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
 * #L%
 */

package dk.netarkivet.harvester.heritrix3;

import java.io.File;
import java.io.FileFilter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Date;
import java.util.Iterator;
import java.util.List;
import java.util.TreeSet;

import org.jwat.common.ANVLRecord;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import dk.netarkivet.common.Constants;
import dk.netarkivet.common.exceptions.ArgumentNotValid;
import dk.netarkivet.common.exceptions.IOFailure;
import dk.netarkivet.common.exceptions.PermissionDenied;
import dk.netarkivet.common.utils.FileUtils;
import dk.netarkivet.common.utils.Settings;
import dk.netarkivet.common.utils.SystemUtils;
import dk.netarkivet.common.utils.archive.ArchiveProfile;
import dk.netarkivet.common.utils.cdx.CDXUtils;
import dk.netarkivet.harvester.harvesting.PersistentJobData;
import dk.netarkivet.harvester.harvesting.metadata.MetadataEntry;
import dk.netarkivet.harvester.harvesting.metadata.MetadataFile;
import dk.netarkivet.harvester.harvesting.metadata.MetadataFileWriter;
import dk.netarkivet.harvester.harvesting.metadata.MetadataFileWriterWarc;

/**
 * This class contains code for documenting an H3 harvest. Metadata is read from the directories associated with a
 * given harvest-job-attempt (i.e. one DoCrawlMessage sent to a harvest server). The collected metadata are written to
 * a new metadata file that is managed by IngestableFiles. Temporary metadata files are deleted after this metadata
 * file has been written.
 */
public class HarvestDocumentation {

    private static final Logger log = LoggerFactory.getLogger(HarvestDocumentation.class);

    /**
     * Documents the harvest under the given dir in a packaged metadata arc file in a directory 'metadata' under the
     * current dir. Only documents the files belonging to the given jobID; the rest are moved to oldjobs.
     * <p>
     * In the current implementation, the documentation consists of CDX indices over all ARC files (with one CDX
     * record per harvested ARC file), plus packaging of log files.
     * <p>
     * If this method finishes without an exception, it is guaranteed that metadata is ready for upload.
     * <p>
     * TODO Place preharvestmetadata in IngestableFiles-defined area
     * TODO This method may be a good place to copy deduplicate information from the crawl log to the cdx file.
     *
     * @param ingestables Information about the finished crawl (crawldir, jobId, harvestID).
     * @throws ArgumentNotValid if ingestables is null.
     * @throws IOFailure if reading the ARC/WARC files or the temporary files fails, or if writing a file to the
     * metadata archive fails.
     */
    public static void documentHarvest(IngestableFiles ingestables) throws IOFailure {
        ArgumentNotValid.checkNotNull(ingestables, "ingestables");

        File crawlDir = ingestables.getCrawlDir();
        Long jobID = ingestables.getJobId();
        Long harvestID = ingestables.getHarvestID();

        // Prepare metadata-arcfile for ingestion of metadata, and enumerate
        // items to ingest.

        // If the metadata-arcfile already exists, we are done.
        // See bug 722.
        if (ingestables.isMetadataReady()) {
            log.warn("The metadata-file '{}' already exists, so we don't make another one!", ingestables
                    .getMetadataFile().getAbsolutePath());
            return;
        }
        List<File> filesAddedAndNowDeletable = null;

        try {
            MetadataFileWriter mdfw = ingestables.getMetadataWriter();

            if (mdfw instanceof MetadataFileWriterWarc) {
                // Add a warcinfo record describing the writing application.
                ANVLRecord infoPayload = new ANVLRecord();
                infoPayload.addLabelValue("software",
                        "NetarchiveSuite/" + dk.netarkivet.common.Constants.getVersionString() + "/"
                                + dk.netarkivet.common.Constants.PROJECT_WEBSITE);
                infoPayload.addLabelValue("ip", SystemUtils.getLocalIP());
                infoPayload.addLabelValue("hostname", SystemUtils.getLocalHostName());
                infoPayload.addLabelValue("conformsTo",
                        "http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf");

                PersistentJobData psj = new PersistentJobData(crawlDir);
                infoPayload.addLabelValue("isPartOf", "" + psj.getJobID());
                MetadataFileWriterWarc mfww = (MetadataFileWriterWarc) mdfw;
                mfww.insertInfoRecord(infoPayload);
            }

            // Fetch any serialized preharvest metadata objects, if they exist.
            List<MetadataEntry> storedMetadata = getStoredMetadata(crawlDir);
            try {
                for (MetadataEntry m : storedMetadata) {
                    mdfw.write(m.getURL(), m.getMimeType(), SystemUtils.getLocalIP(), System.currentTimeMillis(),
                            m.getData());
                }
            } catch (IOException e) {
                log.warn("Unable to write pre-metadata to metadata archivefile", e);
            }

            // Insert the harvestdetails into the metadata archivefile.
            filesAddedAndNowDeletable = writeHarvestDetails(jobID, harvestID, ingestables, mdfw,
                    Constants.getHeritrix3VersionString());
            // All the files just added to the metadata archivefile can now be deleted,
            // except for the files we need for later processing:
            // - crawl.log is needed to create the domainharvestreport later
            // - harvestInfo.xml is needed to upload stored data after
            //   crashes/stops on the harvesters
            // - progress-statistics.log is needed to find out if the crawl ended due
            //   to hitting a size limit, or due to other completion

            Iterator<File> iterator = filesAddedAndNowDeletable.iterator();
            while (iterator.hasNext()) {
                File f = iterator.next();
                if (f.getName().equals("crawl.log") || f.getName().equals("harvestInfo.xml")
                        || f.getName().equals("progress-statistics.log")) {
                    iterator.remove();
                }
            }

            boolean cdxGenerationSucceeded = false;

            // Try to create CDXes over the ARC and WARC files.
            File arcFilesDir = ingestables.getArcsDir();
            File warcFilesDir = ingestables.getWarcsDir();

            if (arcFilesDir.isDirectory() && FileUtils.hasFiles(arcFilesDir)) {
                addCDXes(ingestables, arcFilesDir, mdfw, ArchiveProfile.ARC_PROFILE);
                cdxGenerationSucceeded = true;
            }
            if (warcFilesDir.isDirectory() && FileUtils.hasFiles(warcFilesDir)) {
                addCDXes(ingestables, warcFilesDir, mdfw, ArchiveProfile.WARC_PROFILE);
                cdxGenerationSucceeded = true;
            }

            if (cdxGenerationSucceeded) {
                // This indicates that either the files in the arcsdir or in the warcsdir
                // have now been CDX-processed.
                //
                // TODO refactor, as this call has too many side effects
                ingestables.setMetadataGenerationSucceeded(true);
            } else {
                log.warn("Found no archive directory with ARC or WARC files. Looked for dirs '{}' and '{}'.",
                        arcFilesDir.getAbsolutePath(), warcFilesDir.getAbsolutePath());
            }
        } finally {
            // If at this point metadata is not ready, an error occurred.
            if (!ingestables.isMetadataReady()) {
                ingestables.setMetadataGenerationSucceeded(false);
            } else {
                for (File fileAdded : filesAddedAndNowDeletable) {
                    FileUtils.remove(fileAdded);
                }
                ingestables.cleanup();
            }
        }
    }
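
    // A minimal usage sketch for documentHarvest above (the caller is hypothetical;
    // obtaining the IngestableFiles instance for a finished Heritrix 3 crawl is the
    // harvest controller's job, not this class's):
    //
    //   IngestableFiles ingestables = ...; // provided by the harvest controller
    //   HarvestDocumentation.documentHarvest(ingestables);
    //   // On normal completion, ingestables.getMetadataFile() is ready for upload.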

    private static void addCDXes(IngestableFiles files, File archiveDir, MetadataFileWriter writer,
            ArchiveProfile profile) {
        moveAwayForeignFiles(profile, archiveDir, files);
        File cdxFilesDir = FileUtils.createUniqueTempDir(files.getTmpMetadataDir(), "cdx");
        CDXUtils.generateCDX(profile, archiveDir, cdxFilesDir);
        writer.insertFiles(cdxFilesDir, FileUtils.CDX_FILE_FILTER, Constants.CDX_MIME_TYPE,
                files.getHarvestID(), files.getJobId());
    }
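
    // For orientation: each CDX line generated by addCDXes above maps a harvested
    // URI to its location inside a (W)ARC file. The exact field layout depends on
    // the CDX format used by CDXUtils, but a record is roughly of the form:
    //
    //   <url> <timestamp> <mime-type> <http-status> <digest> <offset> <(w)arc-filename>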

    /**
     * Restore serialized MetadataEntry objects from the "metadata" subdirectory of the crawldir.
     *
     * @param crawlDir the given crawl directory
     * @return a list of deserialized MetadataEntry objects
     */
    private static List<MetadataEntry> getStoredMetadata(File crawlDir) {
        File metadataDir = new File(crawlDir, IngestableFiles.METADATA_SUB_DIR);
        if (!metadataDir.isDirectory()) {
            log.warn("Expected a metadata directory '{}', but it does not exist", metadataDir.getAbsolutePath());
            return new ArrayList<MetadataEntry>();
        } else {
            return MetadataEntry.getMetadataFromDisk(metadataDir);
        }
    }

    /**
     * Iterates over the (W)ARC files in the given dir and moves files that do not belong to the given job into a
     * "lost-files" directory under oldjobs, named with a timestamp.
     *
     * @param archiveProfile archive profile including filters, patterns, etc.
     * @param dir A directory containing one or more (W)ARC files.
     * @param files Information about the files produced by heritrix (jobId and harvestnamePrefix)
     */
    private static void moveAwayForeignFiles(ArchiveProfile archiveProfile, File dir, IngestableFiles files) {
        File[] archiveFiles = dir.listFiles(archiveProfile.filename_filter);
        File oldJobsDir = new File(Settings.get(Heritrix3Settings.HARVEST_CONTROLLER_OLDJOBSDIR));
        File lostfilesDir = new File(oldJobsDir, "lost-files-" + new Date().getTime());
        List<File> movedFiles = new ArrayList<File>();
        log.info("Looking for files not having harvestprefix '{}'", files.getHarvestnamePrefix());
        for (File archiveFile : archiveFiles) {
            if (!(archiveFile.getName().startsWith(files.getHarvestnamePrefix()))) {
                // Move the unidentified file to the lostfiles directory.
                log.info("Moving unidentified file {} to lostfiles directory", archiveFile.getAbsolutePath());
                try {
                    if (!lostfilesDir.exists()) {
                        FileUtils.createDir(lostfilesDir);
                    }
                    File moveTo = new File(lostfilesDir, archiveFile.getName());
                    if (archiveFile.renameTo(moveTo)) {
                        movedFiles.add(moveTo);
                    } else {
                        log.warn("Failed to move unidentified file '{}' to '{}'", archiveFile.getAbsolutePath(),
                                moveTo.getAbsolutePath());
                    }
                } catch (PermissionDenied e) {
                    log.warn("Not allowed to make oldjobs dir '{}'", lostfilesDir.getAbsolutePath(), e);
                }
            }
        }
        if (!movedFiles.isEmpty()) {
            log.warn("Found files not belonging to job {}; the following files have been stored for later: {}",
                    files.getJobId(), movedFiles);
        }
    }
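
    // Note: File.renameTo is used in moveAwayForeignFiles above because it is cheap,
    // but it can return false instead of throwing (for example when oldjobs lives on
    // a different filesystem than the crawl dir), which is why its result is checked
    // and logged rather than assumed to succeed.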

    /**
     * Write harvestdetails to the archive file(s). This includes the order.xml, seeds.txt, specific settings.xml for
     * certain domains, the harvestInfo.xml, all available reports (subset of HeritrixFiles.HERITRIX_REPORTS), and
     * all available logs (subset of HeritrixFiles.HERITRIX_LOGS).
     *
     * @param jobID the given job Id
     * @param harvestID the id of the harvestdefinition which created this job
     * @param ingestableFiles information about the finished crawl, including the directory where the crawljob took place
     * @param mdfw a MetadataFileWriter used to store the harvest configuration, and harvest logs and reports.
     * @param heritrixVersion the heritrix version used by the harvest.
     * @return a list of files added to the archive file.
     * @throws ArgumentNotValid If null arguments occur
     */
    private static List<File> writeHarvestDetails(long jobID, long harvestID, IngestableFiles ingestableFiles,
            MetadataFileWriter mdfw, String heritrixVersion) {
        List<File> filesAdded = new ArrayList<File>();

        // We will sort the files by URL.
        TreeSet<MetadataFile> files = new TreeSet<MetadataFile>();

        // Look for files in the crawldir, ${heritrix3jobdir}, and ${heritrix3jobdir}/latest:
        // - reports are relative to ${heritrix3jobdir}/latest/
        // - logs are relative to ${heritrix3jobdir}
        File crawlDir = ingestableFiles.getCrawlDir();
        File jobsDir = ingestableFiles.getHeritrix3JobDir();
        File reportsDir = ingestableFiles.getReportsDir();

        log.info("Looking for heritrix files in the following directories: {}, {}, {}",
                crawlDir.getAbsolutePath(), jobsDir.getAbsolutePath(), reportsDir.getAbsolutePath());

        // The same filename filter applies to all three directories.
        FileFilter heritrixFileFilter = new FileFilter() {
            @Override
            public boolean accept(File f) {
                return f.isFile() && f.getName().matches(MetadataFile.HERITRIX_FILE_PATTERN);
            }
        };

        // Find and add Heritrix files in the crawl directory.
        File[] heritrixFiles = crawlDir.listFiles(heritrixFileFilter);
        for (File hf : heritrixFiles) {
            files.add(new MetadataFile(hf, harvestID, jobID, heritrixVersion));
        }

        // Find and add Heritrix files in the heritrixjobdir (if it exists).
        if (jobsDir.exists()) {
            File[] heritrixFilesJobDir = jobsDir.listFiles(heritrixFileFilter);
            for (File hf : heritrixFilesJobDir) {
                files.add(new MetadataFile(hf, harvestID, jobID, heritrixVersion));
            }
        } else {
            log.warn("The directory {} does not exist", jobsDir.getAbsolutePath());
        }

        // Find and add Heritrix files in the heritrixReportsDir (if it exists).
        if (reportsDir.exists()) {
            File[] heritrixFilesReports = reportsDir.listFiles(heritrixFileFilter);
            for (File hf : heritrixFilesReports) {
                files.add(new MetadataFile(hf, harvestID, jobID, heritrixVersion));
            }
        } else {
            log.warn("The directory {} does not exist", reportsDir.getAbsolutePath());
        }
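
        // At this point 'files' holds every matching file from the three
        // directories, de-duplicated and ordered by metadata URL (the TreeSet
        // relies on the Comparable implementation of MetadataFile for this).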

        // Generate an arcfiles-report.txt if configured to do so.
        // FIXME This is not possible to extract from the crawl.log (Is this list available in any other way?)
        boolean genArcFilesReport = Settings.getBoolean(Heritrix3Settings.METADATA_GENERATE_ARCHIVE_FILES_REPORT);
        if (genArcFilesReport) {
            log.debug("Arcfiles-report.txt generation is not currently supported with Heritrix3");
            /*
            log.debug("Creating an arcfiles-report.txt");
            files.add(new MetadataFile(new ArchiveFilesReportGenerator(crawlDir).generateReport(), harvestID, jobID,
                    heritrixVersion));
            */
        } else {
            log.debug("Creation of the arcfiles-report.txt has been disabled by the setting '{}'!",
                    Heritrix3Settings.METADATA_GENERATE_ARCHIVE_FILES_REPORT);
        }

        // Add log files.
        File logDir = new File(jobsDir, "logs");
        if (logDir.exists()) {
            File[] heritrixLogFiles = logDir.listFiles(new FileFilter() {
                @Override
                public boolean accept(File f) {
                    return f.isFile() && f.getName().matches(MetadataFile.LOG_FILE_PATTERN);
                }
            });
            for (File logFile : heritrixLogFiles) {
                files.add(new MetadataFile(logFile, harvestID, jobID, heritrixVersion));
                log.info("Found Heritrix log file {}", logFile.getName());
            }
        } else {
            log.debug("No logs dir found in Heritrix3 job dir: {}", jobsDir.getAbsolutePath());
        }

        // Write the files, in order, to the metadata archive file.
        for (MetadataFile mdf : files) {
            File heritrixFile = mdf.getHeritrixFile();
            String heritrixFileName = heritrixFile.getName();
            String mimeType = (heritrixFileName.endsWith(".xml") ? "text/xml" : "text/plain");
            if (mdfw.writeTo(heritrixFile, mdf.getUrl(), mimeType)) {
                filesAdded.add(heritrixFile);
            } else {
                log.warn("The Heritrix file '{}' was not included in the metadata archivefile due to some error.",
                        heritrixFile.getAbsolutePath());
            }
        }

        return filesAdded;
    }

}