/*
 * #%L
 * Netarchivesuite - harvester
 * %%
 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
 * the National Library of France and the Austrian National Library.
 * %%
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation, either version 2.1 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this program. If not, see
 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
 * #L%
 */

package dk.netarkivet.harvester.heritrix3;

import java.io.File;
import java.io.FileFilter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Date;
import java.util.Iterator;
import java.util.List;
import java.util.TreeSet;

import org.jwat.common.ANVLRecord;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import dk.netarkivet.common.Constants;
import dk.netarkivet.common.exceptions.ArgumentNotValid;
import dk.netarkivet.common.exceptions.IOFailure;
import dk.netarkivet.common.exceptions.PermissionDenied;
import dk.netarkivet.common.utils.FileUtils;
import dk.netarkivet.common.utils.Settings;
import dk.netarkivet.common.utils.SystemUtils;
import dk.netarkivet.common.utils.archive.ArchiveProfile;
import dk.netarkivet.common.utils.cdx.CDXUtils;
import dk.netarkivet.harvester.HarvesterSettings;
import dk.netarkivet.harvester.harvesting.PersistentJobData;
import dk.netarkivet.harvester.harvesting.metadata.MetadataEntry;
import dk.netarkivet.harvester.harvesting.metadata.MetadataFile;
import dk.netarkivet.harvester.harvesting.metadata.MetadataFileWriter;
import dk.netarkivet.harvester.harvesting.metadata.MetadataFileWriterWarc;

/**
 * This class contains code for documenting an H3 harvest. Metadata is read from the directories associated with a
 * given harvest-job-attempt (i.e. one DoCrawlMessage sent to a harvest server). The collected metadata are written to
 * a new metadata file that is managed by IngestableFiles. Temporary metadata files are deleted after this metadata
 * file has been written.
 */
public class HarvestDocumentation {

    private static final Logger log = LoggerFactory.getLogger(HarvestDocumentation.class);
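
    /*
     * Usage sketch (illustrative only; how the IngestableFiles instance is obtained
     * is assumed here, not prescribed by this class):
     *
     *   IngestableFiles ingestables = ...; // wraps the crawlDir, jobId and harvestID
     *   HarvestDocumentation.documentHarvest(ingestables);
     *   // Returning without an exception guarantees the metadata is ready for upload.
     */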

    /**
     * Documents the harvest under the given dir in a packaged metadata arc file in a directory 'metadata' under the
     * current dir. Only documents the files belonging to the given jobID; the rest are moved to oldjobs.
     * <p>
     * In the current implementation, the documentation consists of CDX indices over all ARC files (with one CDX record
     * per harvested ARC file), plus packaging of log files.
     * <p>
     * If this method finishes without an exception, it is guaranteed that metadata is ready for upload.
     * <p>
     * TODO Place preharvestmetadata in IngestableFiles-defined area
     * TODO This method may be a good place to copy deduplicate information from the crawl log to the cdx file.
     *
     * @param ingestables Information about the finished crawl (crawldir, jobId, harvestID).
     * @throws ArgumentNotValid if ingestables is null.
     * @throws IOFailure if reading ARC files or temporary files fails, or if writing a file to arcFilesDir fails.
     */
    public static void documentHarvest(IngestableFiles ingestables) throws IOFailure {
        ArgumentNotValid.checkNotNull(ingestables, "ingestables");

        File crawlDir = ingestables.getCrawlDir();
        Long jobID = ingestables.getJobId();
        Long harvestID = ingestables.getHarvestID();

        // Prepare the metadata archive file for ingestion of metadata, and enumerate the items to ingest.

        // If the metadata archive file already exists, we are done; see bug 722.
        if (ingestables.isMetadataReady()) {
            log.warn("The metadata file '{}' already exists, so we don't make another one!", ingestables
                    .getMetadataFile().getAbsolutePath());
            return;
        }
        List<File> filesAddedAndNowDeletable = null;

        try {
            MetadataFileWriter mdfw = ingestables.getMetadataWriter();

            if (mdfw instanceof MetadataFileWriterWarc) {
                // Add a warcinfo record describing this installation.
                ANVLRecord infoPayload = new ANVLRecord();
                infoPayload.addLabelValue("software",
                        "NetarchiveSuite/" + dk.netarkivet.common.Constants.getVersionString() + "/"
                                + dk.netarkivet.common.Constants.PROJECT_WEBSITE);
                infoPayload.addLabelValue("ip", SystemUtils.getLocalIP());
                infoPayload.addLabelValue("hostname", SystemUtils.getLocalHostName());
                infoPayload.addLabelValue("conformsTo",
                        "http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf");

                PersistentJobData psj = new PersistentJobData(crawlDir);
                infoPayload.addLabelValue("isPartOf", "" + psj.getJobID());
                MetadataFileWriterWarc mfww = (MetadataFileWriterWarc) mdfw;
                mfww.insertInfoRecord(infoPayload);
            }

            // Fetch any serialized preharvest metadata objects, if they exist.
            List<MetadataEntry> storedMetadata = getStoredMetadata(crawlDir);
            try {
                for (MetadataEntry m : storedMetadata) {
                    mdfw.write(m.getURL(), m.getMimeType(), SystemUtils.getLocalIP(), System.currentTimeMillis(),
                            m.getData());
                }
            } catch (IOException e) {
                log.warn("Unable to write pre-metadata to metadata archive file", e);
            }

            // Insert the harvest details into the metadata archive file.
            filesAddedAndNowDeletable = writeHarvestDetails(jobID, harvestID, ingestables, mdfw,
                    Constants.getHeritrix3VersionString());
            // All the files just added to the metadata archive file can now be deleted,
            // except for the files we need for later processing:
            // - crawl.log is needed to create the domain harvest report later
            // - harvestInfo.xml is needed to upload stored data after
            //   crashes/stops on the harvesters
            // - progress-statistics.log is needed to find out if the crawl ended due
            //   to hitting a size limit, or due to other completion

            Iterator<File> iterator = filesAddedAndNowDeletable.iterator();
            while (iterator.hasNext()) {
                File f = iterator.next();
                if (f.getName().equals("crawl.log") || f.getName().equals("harvestInfo.xml")
                        || f.getName().equals("progress-statistics.log")) {
                    iterator.remove();
                }
            }

            boolean cdxGenerationSucceeded = false;

            // Try to create CDXes over the ARC and WARC files.
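            // (A CDX index contains one line per record in an archive file, locating the
            // record by offset within its (W)ARC container. The exact fields are defined
            // by the ArchiveProfile handed to CDXUtils.generateCDX below; the layout
            // sketched here is illustrative only:
            //   <url> <timestamp> <mime-type> <checksum> <offset> <filename> )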
            File arcFilesDir = ingestables.getArcsDir();
            File warcFilesDir = ingestables.getWarcsDir();

            if (arcFilesDir.isDirectory() && FileUtils.hasFiles(arcFilesDir)) {
                addCDXes(ingestables, arcFilesDir, mdfw, ArchiveProfile.ARC_PROFILE);
                cdxGenerationSucceeded = true;
            }
            if (warcFilesDir.isDirectory() && FileUtils.hasFiles(warcFilesDir)) {
                addCDXes(ingestables, warcFilesDir, mdfw, ArchiveProfile.WARC_PROFILE);
                cdxGenerationSucceeded = true;
            }

            if (cdxGenerationSucceeded) {
                // This indicates that either the files in the arcsdir or in the warcsdir
                // have now been CDX-processed.
                //
                // TODO refactor, as this call has too many side effects
                ingestables.setMetadataGenerationSucceeded(true);
            } else {
                log.warn("Found no archive directory with ARC or WARC files. Looked for dirs '{}' and '{}'.",
                        arcFilesDir.getAbsolutePath(), warcFilesDir.getAbsolutePath());
            }
        } finally {
            // If at this point metadata is not ready, an error occurred.
            if (!ingestables.isMetadataReady()) {
                ingestables.setMetadataGenerationSucceeded(false);
            } else {
                for (File fileAdded : filesAddedAndNowDeletable) {
                    FileUtils.remove(fileAdded);
                }
                ingestables.cleanup();
            }
        }
    }

    private static void addCDXes(IngestableFiles files, File archiveDir, MetadataFileWriter writer,
            ArchiveProfile profile) {
        moveAwayForeignFiles(profile, archiveDir, files);
        File cdxFilesDir = FileUtils.createUniqueTempDir(files.getTmpMetadataDir(), "cdx");
        CDXUtils.generateCDX(profile, archiveDir, cdxFilesDir);
        writer.insertFiles(cdxFilesDir, FileUtils.CDX_FILE_FILTER, Constants.CDX_MIME_TYPE,
                files.getHarvestID(), files.getJobId());
    }

    /**
     * Restore serialized MetadataEntry objects from the "metadata" subdirectory of the crawldir.
     *
     * @param crawlDir the given crawl directory
     * @return a list of deserialized MetadataEntry objects
     */
    private static List<MetadataEntry> getStoredMetadata(File crawlDir) {
        File metadataDir = new File(crawlDir, IngestableFiles.METADATA_SUB_DIR);
        if (!metadataDir.isDirectory()) {
            log.warn("Expected a metadata directory '{}', but none was found", metadataDir.getAbsolutePath());
            return new ArrayList<MetadataEntry>();
        } else {
            return MetadataEntry.getMetadataFromDisk(metadataDir);
        }
    }
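
    /*
     * Example of the layout produced by moveAwayForeignFiles below; the timestamp
     * and file name are illustrative only:
     *
     *   <oldjobsdir>/lost-files-1400000000000/other-job-1.warc.gz
     *
     * Foreign files are moved rather than deleted, so an operator can inspect or
     * recover them later.
     */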

    /**
     * Iterates over the (W)ARC files in the given dir and moves files that do not belong to the given job into a
     * "lost-files" directory under oldjobs, named with a timestamp.
     *
     * @param archiveProfile archive profile including filters, patterns, etc.
     * @param dir A directory containing one or more (W)ARC files.
     * @param files Information about the files produced by Heritrix (jobId and harvestnamePrefix).
     */
    private static void moveAwayForeignFiles(ArchiveProfile archiveProfile, File dir, IngestableFiles files) {
        File[] archiveFiles = dir.listFiles(archiveProfile.filename_filter);
        File oldJobsDir = new File(Settings.get(HarvesterSettings.HARVEST_CONTROLLER_OLDJOBSDIR));
        File lostfilesDir = new File(oldJobsDir, "lost-files-" + new Date().getTime());
        List<File> movedFiles = new ArrayList<File>();
        log.info("Looking for files not having harvestprefix '{}'", files.getHarvestnamePrefix());
        for (File archiveFile : archiveFiles) {
            if (!(archiveFile.getName().startsWith(files.getHarvestnamePrefix()))) {
                // Move the unidentified file to the lost-files directory.
                log.info("Moving unidentified file {} to '{}'", archiveFile.getAbsolutePath(),
                        lostfilesDir.getAbsolutePath());
                try {
                    if (!lostfilesDir.exists()) {
                        FileUtils.createDir(lostfilesDir);
                    }
                    File moveTo = new File(lostfilesDir, archiveFile.getName());
                    if (archiveFile.renameTo(moveTo)) {
                        movedFiles.add(moveTo);
                    } else {
                        log.warn("Failed to move unidentified file '{}' to '{}'", archiveFile.getAbsolutePath(),
                                moveTo.getAbsolutePath());
                    }
                } catch (PermissionDenied e) {
                    log.warn("Not allowed to make oldjobs dir '{}'", lostfilesDir.getAbsolutePath(), e);
                }
            }
        }
        if (!movedFiles.isEmpty()) {
            log.warn("Found files not belonging to job {}; the following files have been stored for later: {}",
                    files.getJobId(), movedFiles);
        }
    }
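
    /*
     * Note on ordering: writeHarvestDetails below collects candidates in a
     * TreeSet<MetadataFile>, which sorts by the generated metadata URL. One
     * consequence (assumed here from MetadataFile's natural ordering) is that two
     * files mapping to the same metadata URL, e.g. the same report found in more
     * than one of the searched directories, are only written once.
     */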

    /**
     * Write harvest details to the archive file(s). This includes the order.xml, seeds.txt, specific settings.xml for
     * certain domains, the harvestInfo.xml, all available reports (subset of HeritrixFiles.HERITRIX_REPORTS), and all
     * available logs (subset of HeritrixFiles.HERITRIX_LOGS).
     *
     * @param jobID the given job Id
     * @param harvestID the id of the harvestdefinition which created this job
     * @param ingestableFiles the IngestableFiles object for the crawl, giving access to the crawldir, the Heritrix3
     *            job dir, and the reports dir
     * @param mdfw a MetadataFileWriter used to store the harvest configuration, and harvest logs and reports.
     * @param heritrixVersion the Heritrix version used by the harvest.
     * @return a list of files added to the archive file.
     * @throws ArgumentNotValid If null arguments occur
     */
    private static List<File> writeHarvestDetails(long jobID, long harvestID, IngestableFiles ingestableFiles,
            MetadataFileWriter mdfw, String heritrixVersion) {
        List<File> filesAdded = new ArrayList<File>();

        // We will sort the files by URL.
        TreeSet<MetadataFile> files = new TreeSet<MetadataFile>();

        // Look for files in the crawldir, in ${heritrix3jobdir}, and in ${heritrix3jobdir}/latest:
        // - reports are relative to ${heritrix3jobdir}/latest/
        // - logs are relative to ${heritrix3jobdir}
        File crawlDir = ingestableFiles.getCrawlDir();
        File jobsDir = ingestableFiles.getHeritrix3JobDir();
        File reportsDir = ingestableFiles.getReportsDir();

        log.info("Looking for Heritrix files in the following directories: {}, {}, {}",
                crawlDir.getAbsolutePath(), jobsDir.getAbsolutePath(), reportsDir.getAbsolutePath());

        // The same filename filter is used for all three directories.
        FileFilter heritrixFileFilter = new FileFilter() {
            @Override
            public boolean accept(File f) {
                return f.isFile() && f.getName().matches(MetadataFile.HERITRIX_FILE_PATTERN);
            }
        };

        // Find and add Heritrix files in the crawl directory.
        File[] heritrixFiles = crawlDir.listFiles(heritrixFileFilter);
        for (File hf : heritrixFiles) {
            files.add(new MetadataFile(hf, harvestID, jobID, heritrixVersion));
        }

        // Find and add Heritrix files in the Heritrix3 job dir (if it exists).
        if (jobsDir.exists()) {
            File[] heritrixFilesJobDir = jobsDir.listFiles(heritrixFileFilter);
            for (File hf : heritrixFilesJobDir) {
                files.add(new MetadataFile(hf, harvestID, jobID, heritrixVersion));
            }
        } else {
            log.warn("The directory {} does not exist", jobsDir.getAbsolutePath());
        }

        // Find and add Heritrix files in the reports dir (if it exists).
        if (reportsDir.exists()) {
            File[] heritrixFilesReports = reportsDir.listFiles(heritrixFileFilter);
            for (File hf : heritrixFilesReports) {
                files.add(new MetadataFile(hf, harvestID, jobID, heritrixVersion));
            }
        } else {
            log.warn("The directory {} does not exist", reportsDir.getAbsolutePath());
        }

        // Generate an arcfiles-report.txt if configured to do so. The report cannot be extracted
        // from the crawl.log, so it is generated by listing the files harvested by Heritrix3 instead.
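        // (Roughly, the report lists one line per (W)ARC file produced by the job;
        // the exact columns are determined by ArchiveFilesReportGenerator.)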
        boolean genArcFilesReport = Settings.getBoolean(Heritrix3Settings.METADATA_GENERATE_ARCHIVE_FILES_REPORT);
        if (genArcFilesReport) {
            log.debug("Creating an arcfiles-report.txt");
            files.add(new MetadataFile(new ArchiveFilesReportGenerator(ingestableFiles).generateReport(), harvestID,
                    jobID, heritrixVersion));
        } else {
            log.debug("Creation of the arcfiles-report.txt has been disabled by the setting '{}'!",
                    Heritrix3Settings.METADATA_GENERATE_ARCHIVE_FILES_REPORT);
        }

        // Add log files.
        File logDir = new File(jobsDir, "logs");
        if (logDir.exists()) {
            File[] heritrixLogFiles = logDir.listFiles(new FileFilter() {
                @Override
                public boolean accept(File f) {
                    return f.isFile() && f.getName().matches(MetadataFile.LOG_FILE_PATTERN);
                }
            });
            for (File logFile : heritrixLogFiles) {
                files.add(new MetadataFile(logFile, harvestID, jobID, heritrixVersion));
                log.info("Found Heritrix log file {}", logFile.getName());
            }
        } else {
            log.debug("No logs dir found in Heritrix3 job dir: {}", jobsDir.getAbsolutePath());
        }

        // Write the files, in order, to the metadata archive file.
        for (MetadataFile mdf : files) {
            File heritrixFile = mdf.getHeritrixFile();
            String heritrixFileName = heritrixFile.getName();
            String mimeType = (heritrixFileName.endsWith(".xml") ? "text/xml" : "text/plain");
            if (mdfw.writeTo(heritrixFile, mdf.getUrl(), mimeType)) {
                filesAdded.add(heritrixFile);
            } else {
                log.warn("The Heritrix file '{}' was not included in the metadata archive file due to some error.",
                        heritrixFile.getAbsolutePath());
            }
        }

        return filesAdded;
    }

}