/*
 * #%L
 * Netarchivesuite - harvester
 * %%
 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
 * the National Library of France and the Austrian National Library.
 * %%
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation, either version 2.1 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this program. If not, see
 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
 * #L%
 */

package dk.netarkivet.harvester.heritrix3;

import java.io.File;
import java.io.FileFilter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Date;
import java.util.Iterator;
import java.util.List;
import java.util.TreeSet;

import org.jwat.common.ANVLRecord;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import dk.netarkivet.common.Constants;
import dk.netarkivet.common.exceptions.ArgumentNotValid;
import dk.netarkivet.common.exceptions.IOFailure;
import dk.netarkivet.common.exceptions.PermissionDenied;
import dk.netarkivet.common.utils.FileUtils;
import dk.netarkivet.common.utils.Settings;
import dk.netarkivet.common.utils.SystemUtils;
import dk.netarkivet.common.utils.archive.ArchiveProfile;
import dk.netarkivet.common.utils.cdx.CDXUtils;
import dk.netarkivet.harvester.HarvesterSettings;
import dk.netarkivet.harvester.harvesting.PersistentJobData;
import dk.netarkivet.harvester.harvesting.metadata.MetadataEntry;
import dk.netarkivet.harvester.harvesting.metadata.MetadataFile;
import dk.netarkivet.harvester.harvesting.metadata.MetadataFileWriter;
import dk.netarkivet.harvester.harvesting.metadata.MetadataFileWriterWarc;

/**
 * This class contains code for documenting an H3 harvest. Metadata is read from the directories associated with a
 * given harvest-job-attempt (i.e. one DoCrawlMessage sent to a harvest server). The collected metadata are written to
 * a new metadata file that is managed by IngestableFiles. Temporary metadata files are deleted after this metadata
 * file has been written.
 */
public class HarvestDocumentation {

    private static final Logger log = LoggerFactory.getLogger(HarvestDocumentation.class);
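
    /*
     * Usage sketch (illustrative only; how the IngestableFiles instance is obtained
     * is assumed here, not prescribed by this class):
     *
     *   IngestableFiles ingestables = ...; // wraps the crawlDir, jobId and harvestID
     *   HarvestDocumentation.documentHarvest(ingestables);
     *   // Returning without an exception guarantees the metadata is ready for upload.
     */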

    /**
     * Documents the harvest under the given dir in a packaged metadata arc file in a directory 'metadata' under the
     * current dir. Only documents the files belonging to the given jobID; the rest are moved to oldjobs.
     * <p>
     * In the current implementation, the documentation consists of CDX indices over all ARC files (with one CDX record
     * per harvested ARC file), plus packaging of log files.
     * <p>
     * If this method finishes without an exception, it is guaranteed that metadata is ready for upload.
     * <p>
     * TODO Place preharvestmetadata in IngestableFiles-defined area
     * TODO This method may be a good place to copy deduplicate information from the crawl log to the cdx file.
     *
     * @param ingestables Information about the finished crawl (crawldir, jobId, harvestID).
     * @throws ArgumentNotValid if ingestables is null.
     * @throws IOFailure if reading ARC files or temporary files fails, or if writing a file to arcFilesDir fails.
     */
    public static void documentHarvest(IngestableFiles ingestables) throws IOFailure {
        ArgumentNotValid.checkNotNull(ingestables, "ingestables");

        File crawlDir = ingestables.getCrawlDir();
        Long jobID = ingestables.getJobId();
        Long harvestID = ingestables.getHarvestID();

        // Prepare the metadata archive file for ingestion of metadata, and enumerate the items to ingest.

        // If the metadata archive file already exists, we are done; see bug 722.
        if (ingestables.isMetadataReady()) {
            log.warn("The metadata file '{}' already exists, so we don't make another one!", ingestables
                    .getMetadataFile().getAbsolutePath());
            return;
        }
        List<File> filesAddedAndNowDeletable = null;

        try {
            MetadataFileWriter mdfw = ingestables.getMetadataWriter();

            if (mdfw instanceof MetadataFileWriterWarc) {
                // Add a warcinfo record describing this installation.
                ANVLRecord infoPayload = new ANVLRecord();
                infoPayload.addLabelValue("software",
                        "NetarchiveSuite/" + dk.netarkivet.common.Constants.getVersionString() + "/"
                                + dk.netarkivet.common.Constants.PROJECT_WEBSITE);
                infoPayload.addLabelValue("ip", SystemUtils.getLocalIP());
                infoPayload.addLabelValue("hostname", SystemUtils.getLocalHostName());
                infoPayload.addLabelValue("conformsTo",
                        "http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf");

                PersistentJobData psj = new PersistentJobData(crawlDir);
                infoPayload.addLabelValue("isPartOf", "" + psj.getJobID());
                MetadataFileWriterWarc mfww = (MetadataFileWriterWarc) mdfw;
                mfww.insertInfoRecord(infoPayload);
            }

            // Fetch any serialized preharvest metadata objects, if they exist.
            List<MetadataEntry> storedMetadata = getStoredMetadata(crawlDir);
            try {
                for (MetadataEntry m : storedMetadata) {
                    mdfw.write(m.getURL(), m.getMimeType(), SystemUtils.getLocalIP(), System.currentTimeMillis(),
                            m.getData());
                }
            } catch (IOException e) {
                log.warn("Unable to write pre-metadata to metadata archive file", e);
            }

            // Insert the harvest details into the metadata archive file.
            filesAddedAndNowDeletable = writeHarvestDetails(jobID, harvestID, ingestables, mdfw,
                    Constants.getHeritrix3VersionString());
            // All the files just added to the metadata archive file can now be deleted,
            // except for the files we need for later processing:
            // - crawl.log is needed to create the domain harvest report later
            // - harvestInfo.xml is needed to upload stored data after
            //   crashes/stops on the harvesters
            // - progress-statistics.log is needed to find out if the crawl ended due
            //   to hitting a size limit, or due to other completion

            Iterator<File> iterator = filesAddedAndNowDeletable.iterator();
            while (iterator.hasNext()) {
                File f = iterator.next();
                if (f.getName().equals("crawl.log") || f.getName().equals("harvestInfo.xml")
                        || f.getName().equals("progress-statistics.log")) {
                    iterator.remove();
                }
            }

            boolean cdxGenerationSucceeded = false;

            // Try to create CDXes over the ARC and WARC files.
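            // (A CDX index contains one line per record in an archive file, locating the
            // record by offset within its (W)ARC container. The exact fields are defined
            // by the ArchiveProfile handed to CDXUtils.generateCDX below; the layout
            // sketched here is illustrative only:
            //   <url> <timestamp> <mime-type> <checksum> <offset> <filename> )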
            File arcFilesDir = ingestables.getArcsDir();
            File warcFilesDir = ingestables.getWarcsDir();

            if (arcFilesDir.isDirectory() && FileUtils.hasFiles(arcFilesDir)) {
                addCDXes(ingestables, arcFilesDir, mdfw, ArchiveProfile.ARC_PROFILE);
                cdxGenerationSucceeded = true;
            }
            if (warcFilesDir.isDirectory() && FileUtils.hasFiles(warcFilesDir)) {
                addCDXes(ingestables, warcFilesDir, mdfw, ArchiveProfile.WARC_PROFILE);
                cdxGenerationSucceeded = true;
            }

            if (cdxGenerationSucceeded) {
                // This indicates that either the files in the arcsdir or in the warcsdir
                // have now been CDX-processed.
                //
                // TODO refactor, as this call has too many side effects
                ingestables.setMetadataGenerationSucceeded(true);
            } else {
                log.warn("Found no archive directory with ARC or WARC files. Looked for dirs '{}' and '{}'.",
                        arcFilesDir.getAbsolutePath(), warcFilesDir.getAbsolutePath());
            }
        } finally {
            // If at this point metadata is not ready, an error occurred.
            if (!ingestables.isMetadataReady()) {
                ingestables.setMetadataGenerationSucceeded(false);
            } else {
                for (File fileAdded : filesAddedAndNowDeletable) {
                    FileUtils.remove(fileAdded);
                }
                ingestables.cleanup();
            }
        }
    }

    private static void addCDXes(IngestableFiles files, File archiveDir, MetadataFileWriter writer,
            ArchiveProfile profile) {
        moveAwayForeignFiles(profile, archiveDir, files);
        File cdxFilesDir = FileUtils.createUniqueTempDir(files.getTmpMetadataDir(), "cdx");
        CDXUtils.generateCDX(profile, archiveDir, cdxFilesDir);
        writer.insertFiles(cdxFilesDir, FileUtils.CDX_FILE_FILTER, Constants.CDX_MIME_TYPE,
                files.getHarvestID(), files.getJobId());
    }

    /**
     * Restore serialized MetadataEntry objects from the "metadata" subdirectory of the crawldir.
     *
     * @param crawlDir the given crawl directory
     * @return a list of deserialized MetadataEntry objects
     */
    private static List<MetadataEntry> getStoredMetadata(File crawlDir) {
        File metadataDir = new File(crawlDir, IngestableFiles.METADATA_SUB_DIR);
        if (!metadataDir.isDirectory()) {
            log.warn("Expected a metadata directory '{}', but none was found", metadataDir.getAbsolutePath());
            return new ArrayList<MetadataEntry>();
        } else {
            return MetadataEntry.getMetadataFromDisk(metadataDir);
        }
    }
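
    /*
     * Example of the layout produced by moveAwayForeignFiles below; the timestamp
     * and file name are illustrative only:
     *
     *   <oldjobsdir>/lost-files-1400000000000/other-job-1.warc.gz
     *
     * Foreign files are moved rather than deleted, so an operator can inspect or
     * recover them later.
     */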

    /**
     * Iterates over the (W)ARC files in the given dir and moves files that do not belong to the given job into a
     * "lost-files" directory under oldjobs, named with a timestamp.
     *
     * @param archiveProfile archive profile including filters, patterns, etc.
     * @param dir A directory containing one or more (W)ARC files.
     * @param files Information about the files produced by Heritrix (jobId and harvestnamePrefix).
     */
    private static void moveAwayForeignFiles(ArchiveProfile archiveProfile, File dir, IngestableFiles files) {
        File[] archiveFiles = dir.listFiles(archiveProfile.filename_filter);
        File oldJobsDir = new File(Settings.get(HarvesterSettings.HARVEST_CONTROLLER_OLDJOBSDIR));
        File lostfilesDir = new File(oldJobsDir, "lost-files-" + new Date().getTime());
        List<File> movedFiles = new ArrayList<File>();
        log.info("Looking for files not having harvestprefix '{}'", files.getHarvestnamePrefix());
        for (File archiveFile : archiveFiles) {
            if (!(archiveFile.getName().startsWith(files.getHarvestnamePrefix()))) {
                // Move the unidentified file to the lost-files directory.
                log.info("Moving unidentified file {} to '{}'", archiveFile.getAbsolutePath(),
                        lostfilesDir.getAbsolutePath());
                try {
                    if (!lostfilesDir.exists()) {
                        FileUtils.createDir(lostfilesDir);
                    }
                    File moveTo = new File(lostfilesDir, archiveFile.getName());
                    if (archiveFile.renameTo(moveTo)) {
                        movedFiles.add(moveTo);
                    } else {
                        log.warn("Failed to move unidentified file '{}' to '{}'", archiveFile.getAbsolutePath(),
                                moveTo.getAbsolutePath());
                    }
                } catch (PermissionDenied e) {
                    log.warn("Not allowed to make oldjobs dir '{}'", lostfilesDir.getAbsolutePath(), e);
                }
            }
        }
        if (!movedFiles.isEmpty()) {
            log.warn("Found files not belonging to job {}; the following files have been stored for later: {}",
                    files.getJobId(), movedFiles);
        }
    }
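
    /*
     * Note on ordering: writeHarvestDetails below collects candidates in a
     * TreeSet<MetadataFile>, which sorts by the generated metadata URL. One
     * consequence (assumed here from MetadataFile's natural ordering) is that two
     * files mapping to the same metadata URL, e.g. the same report found in more
     * than one of the searched directories, are only written once.
     */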

    /**
     * Write harvest details to the archive file(s). This includes the order.xml, seeds.txt, specific settings.xml for
     * certain domains, the harvestInfo.xml, all available reports (subset of HeritrixFiles.HERITRIX_REPORTS), and all
     * available logs (subset of HeritrixFiles.HERITRIX_LOGS).
     *
     * @param jobID the given job Id
     * @param harvestID the id of the harvestdefinition which created this job
     * @param ingestableFiles the IngestableFiles object for the crawl, giving access to the crawldir, the Heritrix3
     *            job dir, and the reports dir
     * @param mdfw a MetadataFileWriter used to store the harvest configuration, and harvest logs and reports.
     * @param heritrixVersion the Heritrix version used by the harvest.
     * @return a list of files added to the archive file.
     * @throws ArgumentNotValid If null arguments occur
     */
    private static List<File> writeHarvestDetails(long jobID, long harvestID, IngestableFiles ingestableFiles,
            MetadataFileWriter mdfw, String heritrixVersion) {
        List<File> filesAdded = new ArrayList<File>();

        // We will sort the files by URL.
        TreeSet<MetadataFile> files = new TreeSet<MetadataFile>();

        // Look for files in the crawldir, in ${heritrix3jobdir}, and in ${heritrix3jobdir}/latest:
        // - reports are relative to ${heritrix3jobdir}/latest/
        // - logs are relative to ${heritrix3jobdir}
        File crawlDir = ingestableFiles.getCrawlDir();
        File jobsDir = ingestableFiles.getHeritrix3JobDir();
        File reportsDir = ingestableFiles.getReportsDir();

        log.info("Looking for Heritrix files in the following directories: {}, {}, {}",
                crawlDir.getAbsolutePath(), jobsDir.getAbsolutePath(), reportsDir.getAbsolutePath());

        // The same filename filter is used for all three directories.
        FileFilter heritrixFileFilter = new FileFilter() {
            @Override
            public boolean accept(File f) {
                return f.isFile() && f.getName().matches(MetadataFile.HERITRIX_FILE_PATTERN);
            }
        };

        // Find and add Heritrix files in the crawl directory.
        File[] heritrixFiles = crawlDir.listFiles(heritrixFileFilter);
        for (File hf : heritrixFiles) {
            files.add(new MetadataFile(hf, harvestID, jobID, heritrixVersion));
        }

        // Find and add Heritrix files in the Heritrix3 job dir (if it exists).
        if (jobsDir.exists()) {
            File[] heritrixFilesJobDir = jobsDir.listFiles(heritrixFileFilter);
            for (File hf : heritrixFilesJobDir) {
                files.add(new MetadataFile(hf, harvestID, jobID, heritrixVersion));
            }
        } else {
            log.warn("The directory {} does not exist", jobsDir.getAbsolutePath());
        }

        // Find and add Heritrix files in the reports dir (if it exists).
        if (reportsDir.exists()) {
            File[] heritrixFilesReports = reportsDir.listFiles(heritrixFileFilter);
            for (File hf : heritrixFilesReports) {
                files.add(new MetadataFile(hf, harvestID, jobID, heritrixVersion));
            }
        } else {
            log.warn("The directory {} does not exist", reportsDir.getAbsolutePath());
        }

        // Generate an arcfiles-report.txt if configured to do so. The report cannot be extracted
        // from the crawl.log, so it is generated by listing the files harvested by Heritrix3 instead.
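        // (Roughly, the report lists one line per (W)ARC file produced by the job;
        // the exact columns are determined by ArchiveFilesReportGenerator.)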
        boolean genArcFilesReport = Settings.getBoolean(Heritrix3Settings.METADATA_GENERATE_ARCHIVE_FILES_REPORT);
        if (genArcFilesReport) {
            log.debug("Creating an arcfiles-report.txt");
            files.add(new MetadataFile(new ArchiveFilesReportGenerator(ingestableFiles).generateReport(), harvestID,
                    jobID, heritrixVersion));
        } else {
            log.debug("Creation of the arcfiles-report.txt has been disabled by the setting '{}'!",
                    Heritrix3Settings.METADATA_GENERATE_ARCHIVE_FILES_REPORT);
        }

        // Add log files.
        File logDir = new File(jobsDir, "logs");
        if (logDir.exists()) {
            File[] heritrixLogFiles = logDir.listFiles(new FileFilter() {
                @Override
                public boolean accept(File f) {
                    return f.isFile() && f.getName().matches(MetadataFile.LOG_FILE_PATTERN);
                }
            });
            for (File logFile : heritrixLogFiles) {
                files.add(new MetadataFile(logFile, harvestID, jobID, heritrixVersion));
                log.info("Found Heritrix log file {}", logFile.getName());
            }
        } else {
            log.debug("No logs dir found in Heritrix3 job dir: {}", jobsDir.getAbsolutePath());
        }

        // Write the files, in order, to the metadata archive file.
        for (MetadataFile mdf : files) {
            File heritrixFile = mdf.getHeritrixFile();
            String heritrixFileName = heritrixFile.getName();
            String mimeType = (heritrixFileName.endsWith(".xml") ? "text/xml" : "text/plain");
            if (mdfw.writeTo(heritrixFile, mdf.getUrl(), mimeType)) {
                filesAdded.add(heritrixFile);
            } else {
                log.warn("The Heritrix file '{}' was not included in the metadata archive file due to some error.",
                        heritrixFile.getAbsolutePath());
            }
        }

        return filesAdded;
    }

}