/*
 * #%L
 * Netarchivesuite - harvester
 * %%
 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
 * the National Library of France and the Austrian National Library.
 * %%
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation, either version 2.1 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this program. If not, see
 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
 * #L%
 */

package dk.netarkivet.harvester.harvesting;

import java.io.File;
import java.io.FileFilter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.TreeSet;

import org.jwat.common.ANVLRecord;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import dk.netarkivet.common.Constants;
import dk.netarkivet.common.exceptions.ArgumentNotValid;
import dk.netarkivet.common.exceptions.IOFailure;
import dk.netarkivet.common.exceptions.PermissionDenied;
import dk.netarkivet.common.utils.FileUtils;
import dk.netarkivet.common.utils.Settings;
import dk.netarkivet.common.utils.SystemUtils;
import dk.netarkivet.common.utils.archive.ArchiveProfile;
import dk.netarkivet.common.utils.cdx.CDXUtils;
import dk.netarkivet.harvester.HarvesterSettings;
import dk.netarkivet.harvester.harvesting.metadata.MetadataEntry;
import dk.netarkivet.harvester.harvesting.metadata.MetadataFile;
import dk.netarkivet.harvester.harvesting.metadata.MetadataFileWriter;
import dk.netarkivet.harvester.harvesting.metadata.MetadataFileWriterWarc;

/**
 * This class contains code for documenting a harvest. Metadata is read from the directories associated with a given
 * harvest-job-attempt (i.e. one DoCrawlMessage sent to a harvest server). The collected metadata are written to a new
 * metadata file that is managed by IngestableFiles. Temporary metadata files will be deleted after this metadata file
 * has been written.
 */
public class HarvestDocumentation {

    private static final Logger log = LoggerFactory.getLogger(HarvestDocumentation.class);

    /**
     * Documents the harvest under the given dir in a packaged metadata (W)ARC file in a directory 'metadata' under
     * the current dir. Only documents the files belonging to the given jobID; the rest are moved to oldjobs.
     * <p>
     * In the current implementation, the documentation consists of CDX indices over all ARC files (with one CDX
     * record per harvested ARC file), plus packaging of log files.
     * <p>
     * If this method finishes without an exception, it is guaranteed that metadata is ready for upload.
     * <p>
     * TODO Place preharvest metadata in an IngestableFiles-defined area.
     * TODO This method may be a good place to copy deduplicate information from the crawl log to the cdx file.
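     * <p>
     * Typical usage, as a minimal sketch (the {@code IngestableFiles} instance is normally constructed by the
     * harvest controller around the finished crawl directory):
     * <pre>
     * IngestableFiles ingestables = ...; // describes crawlDir, jobId and harvestID of the finished crawl
     * HarvestDocumentation.documentHarvest(ingestables);
     * </pre>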
     *
     * @param ingestables Information about the finished crawl (crawldir, jobId, harvestID).
     * @throws ArgumentNotValid if crawlDir is null or does not exist, or if jobID or harvestID is negative.
     * @throws IOFailure if reading ARC files or temporary files fails, or if writing a file to arcFilesDir fails.
     */
    public static void documentHarvest(IngestableFiles ingestables) throws IOFailure {
        ArgumentNotValid.checkNotNull(ingestables, "ingestables");

        File crawlDir = ingestables.getCrawlDir();
        Long jobID = ingestables.getJobId();
        Long harvestID = ingestables.getHarvestID();

        // Prepare metadata-arcfile for ingestion of metadata, and enumerate
        // items to ingest.

        // If the metadata-arcfile already exists, we are done.
        // See bug 722.
        if (ingestables.isMetadataReady()) {
            log.warn("The metadata-file '{}' already exists, so we don't make another one!", ingestables
                    .getMetadataFile().getAbsolutePath());
            return;
        }
        List<File> filesAddedAndNowDeletable = null;

        try {
            MetadataFileWriter mdfw = ingestables.getMetadataWriter();

            if (mdfw instanceof MetadataFileWriterWarc) {
                // Add a warcinfo record.
                ANVLRecord infoPayload = new ANVLRecord();
                infoPayload.addLabelValue("software",
                        "NetarchiveSuite/" + dk.netarkivet.common.Constants.getVersionString() + "/"
                                + dk.netarkivet.common.Constants.PROJECT_WEBSITE);
                infoPayload.addLabelValue("ip", SystemUtils.getLocalIP());
                infoPayload.addLabelValue("hostname", SystemUtils.getLocalHostName());
                infoPayload.addLabelValue("conformsTo",
                        "http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf");

                PersistentJobData psj = new PersistentJobData(crawlDir);
                infoPayload.addLabelValue("isPartOf", "" + psj.getJobID());
                MetadataFileWriterWarc mfww = (MetadataFileWriterWarc) mdfw;
                mfww.insertInfoRecord(infoPayload);
            }

            // Fetch any serialized preharvest metadata objects, if they exist.
            List<MetadataEntry> storedMetadata = getStoredMetadata(crawlDir);
            try {
                for (MetadataEntry m : storedMetadata) {
                    mdfw.write(m.getURL(), m.getMimeType(), SystemUtils.getLocalIP(), System.currentTimeMillis(),
                            m.getData());
                }
            } catch (IOException e) {
                log.warn("Unable to write pre-metadata to metadata archivefile", e);
            }

            // Insert the harvestdetails into the metadata archivefile.
            filesAddedAndNowDeletable = writeHarvestDetails(jobID, harvestID, crawlDir, mdfw,
                    Constants.getHeritrixVersionString());
            // All the files just added to the metadata archivefile can now be deleted,
            // except for the files we need for later processing:
            // - crawl.log is needed to create the domainharvestreport later
            // - harvestInfo.xml is needed to upload stored data after
            //   crashes/stops on the harvesters
            // - progress-statistics.log is needed to find out if the crawl ended due
            //   to hitting a size limit, or due to other completion

            Iterator<File> iterator = filesAddedAndNowDeletable.iterator();
            while (iterator.hasNext()) {
                File f = iterator.next();
                if (f.getName().equals("crawl.log") || f.getName().equals("harvestInfo.xml")
                        || f.getName().equals("progress-statistics.log")) {
                    iterator.remove();
                }
            }

            boolean cdxGenerationSucceeded = false;

            // Try to create CDXes over ARC and WARC files.
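            // (A CDX index holds one line per archived record, recording among other
            // things the URI, timestamp, MIME type and the offset of the record within
            // its (W)ARC file, so records can be located without unpacking the archive.)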
            File arcFilesDir = ingestables.getArcsDir();
            File warcFilesDir = ingestables.getWarcsDir();

            if (arcFilesDir.isDirectory() && FileUtils.hasFiles(arcFilesDir)) {
                addCDXes(ingestables, arcFilesDir, mdfw, ArchiveProfile.ARC_PROFILE);
                cdxGenerationSucceeded = true;
            }
            if (warcFilesDir.isDirectory() && FileUtils.hasFiles(warcFilesDir)) {
                addCDXes(ingestables, warcFilesDir, mdfw, ArchiveProfile.WARC_PROFILE);
                cdxGenerationSucceeded = true;
            }

            if (cdxGenerationSucceeded) {
                // This indicates that either the files in the arcsdir or in the warcsdir
                // have now been CDX-processed.
                //
                // TODO refactor, as this call has too many side effects
                ingestables.setMetadataGenerationSucceeded(true);
            } else {
                log.warn("Found no archive directory with ARC or WARC files. Looked for dirs '{}' and '{}'.",
                        arcFilesDir.getAbsolutePath(), warcFilesDir.getAbsolutePath());
            }
        } finally {
            // If at this point metadata is not ready, an error occurred.
            if (!ingestables.isMetadataReady()) {
                ingestables.setMetadataGenerationSucceeded(false);
            } else {
                for (File fileAdded : filesAddedAndNowDeletable) {
                    FileUtils.remove(fileAdded);
                }
                ingestables.cleanup();
            }
        }
    }

    private static void addCDXes(IngestableFiles files, File archiveDir, MetadataFileWriter writer,
            ArchiveProfile profile) {
        moveAwayForeignFiles(profile, archiveDir, files);
        File cdxFilesDir = FileUtils.createUniqueTempDir(files.getTmpMetadataDir(), "cdx");
        CDXUtils.generateCDX(profile, archiveDir, cdxFilesDir);
        writer.insertFiles(cdxFilesDir, FileUtils.CDX_FILE_FILTER, Constants.CDX_MIME_TYPE,
                files.getHarvestID(), files.getJobId());
    }

    /**
     * Restore serialized MetadataEntry objects from the "metadata" subdirectory of the crawldir.
     *
     * @param crawlDir the given crawl directory
     * @return a list of deserialized MetadataEntry objects
     */
    private static List<MetadataEntry> getStoredMetadata(File crawlDir) {
        File metadataDir = new File(crawlDir, IngestableFiles.METADATA_SUB_DIR);
        if (!metadataDir.isDirectory()) {
            log.warn("Expected a metadata directory '{}', but it wasn't there", metadataDir.getAbsolutePath());
            return new ArrayList<MetadataEntry>();
        } else {
            return MetadataEntry.getMetadataFromDisk(metadataDir);
        }
    }

    /**
     * Iterates over the (W)ARC files in the given dir and moves away files that do not belong to the given job into
     * a "lost-files" directory under oldjobs named with a timestamp.
     *
     * @param archiveProfile archive profile including filters, patterns, etc.
     * @param dir A directory containing one or more (W)ARC files.
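     *            (in this class always the result of {@code IngestableFiles.getArcsDir()} or
     *            {@code IngestableFiles.getWarcsDir()})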
     * @param files Information about the files produced by heritrix (jobId and harvestnamePrefix)
     */
    private static void moveAwayForeignFiles(ArchiveProfile archiveProfile, File dir, IngestableFiles files) {
        File[] archiveFiles = dir.listFiles(archiveProfile.filename_filter);
        File oldJobsDir = new File(Settings.get(HarvesterSettings.HARVEST_CONTROLLER_OLDJOBSDIR));
        File lostfilesDir = new File(oldJobsDir, "lost-files-" + new Date().getTime());
        List<File> movedFiles = new ArrayList<File>();
        log.info("Looking for files not having harvestprefix '{}'", files.getHarvestnamePrefix());
        for (File archiveFile : archiveFiles) {
            if (!(archiveFile.getName().startsWith(files.getHarvestnamePrefix()))) {
                // Move the unidentified file to the lostfiles directory.
                log.info("Moving unidentified file {} to the lostfiles directory", archiveFile.getAbsolutePath());
                try {
                    if (!lostfilesDir.exists()) {
                        FileUtils.createDir(lostfilesDir);
                    }
                    File moveTo = new File(lostfilesDir, archiveFile.getName());
                    archiveFile.renameTo(moveTo);
                    movedFiles.add(moveTo);
                } catch (PermissionDenied e) {
                    log.warn("Not allowed to make oldjobs dir '{}'", lostfilesDir.getAbsolutePath(), e);
                }
            }
        }
        if (!movedFiles.isEmpty()) {
            log.warn("Found files not belonging to job {}; the following files have been stored for later: {}",
                    files.getJobId(), movedFiles);
        }
    }

    /**
     * Write harvestdetails to archive file(s). This includes the order.xml, seeds.txt, specific settings.xml files
     * for certain domains, the harvestInfo.xml, all available reports (subset of HeritrixFiles.HERITRIX_REPORTS),
     * and all available logs (subset of HeritrixFiles.HERITRIX_LOGS).
     *
     * @param jobID the given job Id
     * @param harvestID the id for the harvestdefinition which created this job
     * @param crawlDir the directory where the crawljob took place
     * @param mdfw a MetadataFileWriter used to store the harvest configuration, and harvest logs and reports.
     * @param heritrixVersion the heritrix version used by the harvest.
     * @return a list of files added to the archive file.
     * @throws ArgumentNotValid If null arguments occur
     */
    private static List<File> writeHarvestDetails(long jobID, long harvestID, File crawlDir, MetadataFileWriter mdfw,
            String heritrixVersion) {
        List<File> filesAdded = new ArrayList<File>();

        // We will sort the files by URL.
        TreeSet<MetadataFile> files = new TreeSet<MetadataFile>();

        // List heritrix files in the crawl directory.
        File[] heritrixFiles = crawlDir.listFiles(new FileFilter() {
            @Override
            public boolean accept(File f) {
                return (f.isFile() && f.getName().matches(MetadataFile.HERITRIX_FILE_PATTERN));
            }
        });

        // Add files in the crawl directory.
        for (File hf : heritrixFiles) {
            files.add(new MetadataFile(hf, harvestID, jobID, heritrixVersion));
        }
        // Generate an arcfiles-report.txt if configured to do so.
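        // (The generated report presumably enumerates the (W)ARC files produced by the
        // crawl; it is wrapped as a MetadataFile and written together with the other
        // metadata files below.)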
        boolean genArcFilesReport = Settings.getBoolean(HarvesterSettings.METADATA_GENERATE_ARCHIVE_FILES_REPORT);
        if (genArcFilesReport) {
            log.debug("Creating an arcfiles-report.txt");
            files.add(new MetadataFile(new ArchiveFilesReportGenerator(crawlDir).generateReport(), harvestID, jobID,
                    heritrixVersion));
        } else {
            log.debug("Creation of the arcfiles-report.txt has been disabled by the setting '{}'!",
                    HarvesterSettings.METADATA_GENERATE_ARCHIVE_FILES_REPORT);
        }

        // Add log files.
        File logDir = new File(crawlDir, "logs");
        if (logDir.exists()) {
            File[] heritrixLogFiles = logDir.listFiles(new FileFilter() {
                @Override
                public boolean accept(File f) {
                    return (f.isFile() && f.getName().matches(MetadataFile.LOG_FILE_PATTERN));
                }
            });
            for (File logFile : heritrixLogFiles) {
                files.add(new MetadataFile(logFile, harvestID, jobID, heritrixVersion));
                log.info("Found Heritrix log file {}", logFile.getName());
            }
        } else {
            log.debug("No logs dir found in crawldir: {}", crawlDir.getAbsolutePath());
        }

        // Check whether a settings directory (domain-specific settings) exists;
        // if so, add any settings.xml hiding in this directory.
        // TODO Delete any settings-files found in the settings directory.
        File settingsDir = new File(crawlDir, "settings");
        if (settingsDir.isDirectory()) {
            Map<File, String> domainSettingsFiles = findDomainSpecificSettings(settingsDir);
            for (Map.Entry<File, String> entry : domainSettingsFiles.entrySet()) {
                File dsf = entry.getKey();
                String domain = entry.getValue();
                files.add(new MetadataFile(dsf, harvestID, jobID, heritrixVersion, domain));
            }
        } else {
            log.debug("No settings directory found in crawldir: {}", crawlDir.getAbsolutePath());
        }

        // Write the files, in order, to the metadata archive file.
        for (MetadataFile mdf : files) {
            File heritrixFile = mdf.getHeritrixFile();
            String heritrixFileName = heritrixFile.getName();
            String mimeType = (heritrixFileName.endsWith(".xml") ? "text/xml" : "text/plain");
            if (mdfw.writeTo(heritrixFile, mdf.getUrl(), mimeType)) {
                filesAdded.add(heritrixFile);
            } else {
                log.warn("The Heritrix file '{}' was not included in the metadata archivefile due to some error.",
                        heritrixFile.getAbsolutePath());
            }
        }

        return filesAdded;
    }

    /**
     * Finds domain-specific configurations in the settings subdirectory of the crawl directory.
     *
     * @param settingsDir the given settings directory
     * @return the settings files paired with their domains.
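     *         For example (illustrative), a file {@code settings/dk/netarkivet/settings.xml} is returned mapped to
     *         the domain {@code netarkivet.dk}.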
     */
    private static Map<File, String> findDomainSpecificSettings(File settingsDir) {
        // Find any domain-specific configurations (settings.xml).
        List<String> reversedDomainsWithSettings = findAllDomainsWithSettings(settingsDir, "");

        Map<File, String> settingsFileToDomain = new HashMap<File, String>();
        if (reversedDomainsWithSettings.isEmpty()) {
            log.debug("No settings/<domain> directories exist: no domain-specific configurations available");
        } else {
            for (String reversedDomain : reversedDomainsWithSettings) {
                String domain = reverseDomainString(reversedDomain);
                // Note: reversedDomain starts with a dot, so the replacement below yields
                // a path like <settingsDir>/dk/netarkivet. (replace is used rather than
                // replaceAll, since File.separator is not a safe regex replacement on Windows.)
                File settingsXmlFile = new File(settingsDir + reversedDomain.replace('.', File.separatorChar),
                        MetadataFile.DOMAIN_SETTINGS_FILE);
                if (!settingsXmlFile.isFile()) {
                    log.debug("File settings/{}/{} does not exist.", domain, MetadataFile.DOMAIN_SETTINGS_FILE);
                } else {
                    settingsFileToDomain.put(settingsXmlFile, domain);
                }
            }
        }
        return settingsFileToDomain;
    }

    /**
     * Find all domains which have a settings.xml file in the given directory.
     *
     * @param directory a given directory
     * @param domainReversed the domain reversed
     * @return a list of domains (in reverse) which contained a file with the given filename
     */
    private static List<String> findAllDomainsWithSettings(File directory, String domainReversed) {
        if (!directory.isDirectory()) {
            return new ArrayList<String>(0);
        }
        // List to hold the (reversed) domains found so far.
        List<String> filesToReturn = new ArrayList<String>();

        for (File fileInDir : directory.listFiles()) {
            // If the given file is a dir, call the method recursively.
            if (fileInDir.isDirectory()) {
                List<String> resultList = findAllDomainsWithSettings(fileInDir,
                        domainReversed + "." + fileInDir.getName());
                if (!resultList.isEmpty()) {
                    filesToReturn.addAll(resultList);
                }
            } else {
                if (fileInDir.getName().equals(MetadataFile.DOMAIN_SETTINGS_FILE)) {
                    // Store the domain, so that we can find the file later.
                    filesToReturn.add(domainReversed);
                }
            }
        }
        return filesToReturn;
    }

    /**
     * Un-reverses a reversed domain string as produced by findAllDomainsWithSettings (i.e. with a leading dot),
     * e.g. turns ".com.amazon" into "amazon.com".
     *
     * @param reversedDomain the reversed domain name, starting with a dot
     * @return the un-reversed domain string
     */
    private static String reverseDomainString(String reversedDomain) {
        String domain = "";
        String remaining = reversedDomain;
        int lastDotIndex = remaining.lastIndexOf(".");
        while (lastDotIndex != -1) {
            domain += remaining.substring(lastDotIndex + 1) + ".";
            remaining = remaining.substring(0, lastDotIndex);
            lastDotIndex = remaining.lastIndexOf(".");
        }
        return domain.substring(0, domain.length() - 1);
    }

}