/*
 * #%L
 * Netarchivesuite - harvester
 * %%
 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
 *             the National Library of France and the Austrian National Library.
 * %%
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation, either version 2.1 of the
 * License, or (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 * 
 * You should have received a copy of the GNU Lesser General Public
 * License along with this program.  If not, see
 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
 * #L%
 */

package dk.netarkivet.harvester.heritrix3;

import java.io.File;
import java.io.FileFilter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Date;
import java.util.Iterator;
import java.util.List;
import java.util.TreeSet;

import org.jwat.common.ANVLRecord;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import dk.netarkivet.common.Constants;
import dk.netarkivet.common.exceptions.ArgumentNotValid;
import dk.netarkivet.common.exceptions.IOFailure;
import dk.netarkivet.common.exceptions.PermissionDenied;
import dk.netarkivet.common.utils.FileUtils;
import dk.netarkivet.common.utils.Settings;
import dk.netarkivet.common.utils.SystemUtils;
import dk.netarkivet.common.utils.archive.ArchiveProfile;
import dk.netarkivet.common.utils.cdx.CDXUtils;
import dk.netarkivet.harvester.harvesting.PersistentJobData;
import dk.netarkivet.harvester.harvesting.metadata.MetadataEntry;
import dk.netarkivet.harvester.harvesting.metadata.MetadataFile;
import dk.netarkivet.harvester.harvesting.metadata.MetadataFileWriter;
import dk.netarkivet.harvester.harvesting.metadata.MetadataFileWriterWarc;
/**
 * This class contains code for documenting an H3 harvest. Metadata is read from the directories associated with a
 * given harvest-job-attempt (i.e. one DoCrawlMessage sent to a harvest server). The collected metadata is written to a
 * new metadata file that is managed by IngestableFiles. Temporary metadata files are deleted after this metadata file
 * has been written.
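 * <p>
 * Typical use, given an {@link IngestableFiles} instance describing a finished crawl (a minimal sketch; constructing
 * the instance is the job of the surrounding harvest controller):
 * <pre>{@code
 * // 'ingestables' describes the finished crawl (crawldir, jobId, harvestID).
 * HarvestDocumentation.documentHarvest(ingestables);
 * // If no IOFailure was thrown, the metadata file is now ready for upload.
 * }</pre>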
 */
public class HarvestDocumentation {

    private static final Logger log = LoggerFactory.getLogger(HarvestDocumentation.class);

    /**
     * Documents the harvest under the given crawl directory in a packaged metadata archive file placed in a
     * 'metadata' subdirectory of that directory. Only the files belonging to the given jobID are documented; the rest
     * are moved to oldjobs.
     * <p>
     * In the current implementation, the documentation consists of CDX indices over all archive files (with one CDX
     * record per archived URI), plus packaging of log files.
     * <p>
     * If this method finishes without an exception, it is guaranteed that the metadata is ready for upload.
     * <p>
     * TODO Place preharvestmetadata in an IngestableFiles-defined area.
     * TODO This method may be a good place to copy deduplication information from the crawl log to the CDX file.
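     * <p>
     * A minimal sketch of the post-condition (both accessors are also used internally by this method):
     * <pre>{@code
     * HarvestDocumentation.documentHarvest(ingestables);
     * // No IOFailure was thrown, so the metadata file exists and is ready:
     * assert ingestables.isMetadataReady();
     * File metadataFile = ingestables.getMetadataFile();
     * }</pre>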
     *
     * @param ingestables Information about the finished crawl (crawldir, jobId, harvestID).
     * @throws ArgumentNotValid if ingestables is null.
     * @throws IOFailure if reading the archive files or temporary files fails, or if writing to the metadata archive
     * file fails.
     */
    public static void documentHarvest(IngestableFiles ingestables) throws IOFailure {
        ArgumentNotValid.checkNotNull(ingestables, "ingestables");

        File crawlDir = ingestables.getCrawlDir();
        Long jobID = ingestables.getJobId();
        Long harvestID = ingestables.getHarvestID();

        // Prepare the metadata archive file for ingestion of metadata, and enumerate
        // items to ingest.

        // If the metadata archive file already exists, we are done.
        // See bug 722.
        if (ingestables.isMetadataReady()) {
            log.warn("The metadata file '{}' already exists, so we don't make another one!", ingestables
                    .getMetadataFile().getAbsolutePath());
            return;
        }
        List<File> filesAddedAndNowDeletable = null;

        try {
            MetadataFileWriter mdfw = ingestables.getMetadataWriter();

            if (mdfw instanceof MetadataFileWriterWarc) {
                // Add a warcinfo record.
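                // The ANVL record built below becomes the payload of the WARC 'warcinfo' record
                // that heads the metadata WARC file. It records the producing software and
                // version, the writing host (ip/hostname), the WARC specification draft the file
                // conforms to, and (via 'isPartOf') the job the file belongs to.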
                ANVLRecord infoPayload = new ANVLRecord();
                infoPayload.addLabelValue("software",
                        "NetarchiveSuite/" + dk.netarkivet.common.Constants.getVersionString() + "/"
                                + dk.netarkivet.common.Constants.PROJECT_WEBSITE);
                infoPayload.addLabelValue("ip", SystemUtils.getLocalIP());
                infoPayload.addLabelValue("hostname", SystemUtils.getLocalHostName());
                infoPayload.addLabelValue("conformsTo",
                        "http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf");

                PersistentJobData psj = new PersistentJobData(crawlDir);
                infoPayload.addLabelValue("isPartOf", "" + psj.getJobID());
                MetadataFileWriterWarc mfww = (MetadataFileWriterWarc) mdfw;
                mfww.insertInfoRecord(infoPayload);
            }

            // Fetch any serialized preharvest metadata objects, if they exist.
            List<MetadataEntry> storedMetadata = getStoredMetadata(crawlDir);
            try {
                for (MetadataEntry m : storedMetadata) {
                    mdfw.write(m.getURL(), m.getMimeType(), SystemUtils.getLocalIP(), System.currentTimeMillis(),
                            m.getData());
                }
            } catch (IOException e) {
                log.warn("Unable to write pre-metadata to the metadata archive file", e);
            }

            // Insert the harvest details into the metadata archive file.
            filesAddedAndNowDeletable = writeHarvestDetails(jobID, harvestID, ingestables, mdfw, Constants.getHeritrix3VersionString());
            // All the files just added to the metadata archive file can now be deleted,
            // except for the files we need for later processing:
            // - crawl.log is needed to create the domain harvest report later
            // - harvestInfo.xml is needed to upload stored data after
            //   crashes/stops on the harvesters
            // - progress-statistics.log is needed to find out if the crawl ended due
            //   to hitting a size limit, or due to other completion

            Iterator<File> iterator = filesAddedAndNowDeletable.iterator();
            while (iterator.hasNext()) {
                File f = iterator.next();
                if (f.getName().equals("crawl.log") || f.getName().equals("harvestInfo.xml")
                        || f.getName().equals("progress-statistics.log")) {
                    iterator.remove();
                }
            }

            boolean cdxGenerationSucceeded = false;

            // Try to create CDXes over ARC and WARC files.
            File arcFilesDir = ingestables.getArcsDir();
            File warcFilesDir = ingestables.getWarcsDir();

            if (arcFilesDir.isDirectory() && FileUtils.hasFiles(arcFilesDir)) {
                addCDXes(ingestables, arcFilesDir, mdfw, ArchiveProfile.ARC_PROFILE);
                cdxGenerationSucceeded = true;
            }
            if (warcFilesDir.isDirectory() && FileUtils.hasFiles(warcFilesDir)) {
                addCDXes(ingestables, warcFilesDir, mdfw, ArchiveProfile.WARC_PROFILE);
                cdxGenerationSucceeded = true;
            }

            if (cdxGenerationSucceeded) {
                // This indicates that either the files in the arcsdir or in the warcsdir
                // have now been CDX-processed.
                //
                // TODO refactor, as this call has too many side effects
                ingestables.setMetadataGenerationSucceeded(true);
            } else {
                log.warn("Found no archive directory with ARC or WARC files. Looked for dirs '{}' and '{}'.",
                        arcFilesDir.getAbsolutePath(), warcFilesDir.getAbsolutePath());
            }
        } finally {
            // If at this point metadata is not ready, an error occurred.
            if (!ingestables.isMetadataReady()) {
                ingestables.setMetadataGenerationSucceeded(false);
            } else {
                for (File fileAdded : filesAddedAndNowDeletable) {
                    FileUtils.remove(fileAdded);
                }
                ingestables.cleanup();
            }
        }
    }

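    /**
     * Move away files that do not belong to this job, generate CDX files for the (W)ARC files in the given
     * directory, and insert the resulting CDX files into the metadata archive file.
     *
     * @param files Information about the finished crawl.
     * @param archiveDir A directory containing the (W)ARC files to index.
     * @param writer The writer managing the metadata archive file.
     * @param profile The archive profile (ARC or WARC) matching the files in archiveDir.
     */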
    private static void addCDXes(IngestableFiles files, File archiveDir, MetadataFileWriter writer,
            ArchiveProfile profile) {
        moveAwayForeignFiles(profile, archiveDir, files);
        File cdxFilesDir = FileUtils.createUniqueTempDir(files.getTmpMetadataDir(), "cdx");
        CDXUtils.generateCDX(profile, archiveDir, cdxFilesDir);
        writer.insertFiles(cdxFilesDir, FileUtils.CDX_FILE_FILTER, Constants.CDX_MIME_TYPE,
                files.getHarvestID(), files.getJobId());
    }

    /**
     * Restore serialized MetadataEntry objects from the "metadata" subdirectory of the crawldir.
     *
     * @param crawlDir the given crawl directory
     * @return a list of deserialized MetadataEntry objects
     */
    private static List<MetadataEntry> getStoredMetadata(File crawlDir) {
        File metadataDir = new File(crawlDir, IngestableFiles.METADATA_SUB_DIR);
        if (!metadataDir.isDirectory()) {
            log.warn("Expected a metadata directory '{}', but it was not found", metadataDir.getAbsolutePath());
            return new ArrayList<MetadataEntry>();
        } else {
            return MetadataEntry.getMetadataFromDisk(metadataDir);
        }
    }

    /**
     * Iterates over the (W)ARC files in the given dir and moves files that do not belong to the given job into a
     * timestamped "lost-files" directory under oldjobs.
     *
     * @param archiveProfile archive profile including filters, patterns, etc.
     * @param dir A directory containing one or more (W)ARC files.
     * @param files Information about the files produced by Heritrix (jobId and harvestnamePrefix).
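     * <p>
     * Foreign files end up under {@code <oldjobsdir>/lost-files-<timestamp>/}, where the oldjobs directory is taken
     * from the {@link Heritrix3Settings#HARVEST_CONTROLLER_OLDJOBSDIR} setting.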
     */
    private static void moveAwayForeignFiles(ArchiveProfile archiveProfile, File dir, IngestableFiles files) {
        File[] archiveFiles = dir.listFiles(archiveProfile.filename_filter);
        File oldJobsDir = new File(Settings.get(Heritrix3Settings.HARVEST_CONTROLLER_OLDJOBSDIR));
        File lostfilesDir = new File(oldJobsDir, "lost-files-" + new Date().getTime());
        List<File> movedFiles = new ArrayList<File>();
        log.info("Looking for files not having harvestprefix '{}'", files.getHarvestnamePrefix());
        for (File archiveFile : archiveFiles) {
            if (!(archiveFile.getName().startsWith(files.getHarvestnamePrefix()))) {
                // Move the unidentified file to the lost-files directory.
                log.info("Moving unidentified file {} to {}", archiveFile.getAbsolutePath(),
                        lostfilesDir.getAbsolutePath());
                try {
                    if (!lostfilesDir.exists()) {
                        FileUtils.createDir(lostfilesDir);
                    }
                    File moveTo = new File(lostfilesDir, archiveFile.getName());
                    if (archiveFile.renameTo(moveTo)) {
                        movedFiles.add(moveTo);
                    } else {
                        log.warn("Failed to move '{}' to '{}'", archiveFile.getAbsolutePath(),
                                moveTo.getAbsolutePath());
                    }
                } catch (PermissionDenied e) {
                    log.warn("Not allowed to make oldjobs dir '{}'", lostfilesDir.getAbsolutePath(), e);
                }
            }
        }
        if (!movedFiles.isEmpty()) {
            log.warn("Found files not belonging to job {}; the following files have been stored for later: {}",
                    files.getJobId(), movedFiles);
        }
    }

    /**
     * Write the harvest details to the metadata archive file(s). This includes the order.xml, seeds.txt, specific
     * settings.xml files for certain domains, the harvestInfo.xml, all available reports (a subset of
     * HeritrixFiles.HERITRIX_REPORTS), and all available logs (a subset of HeritrixFiles.HERITRIX_LOGS).
     *
     * @param jobID the given job Id
     * @param harvestID the id of the harvestdefinition which created this job
     * @param ingestableFiles Information about the finished crawl, including the crawl directory
     * @param mdfw a MetadataFileWriter used to store the harvest configuration, and harvest logs and reports
     * @param heritrixVersion the Heritrix version used by the harvest
     * @return a list of files added to the archive file.
     * @throws ArgumentNotValid If null arguments occur
     */
    private static List<File> writeHarvestDetails(long jobID, long harvestID, IngestableFiles ingestableFiles,
            MetadataFileWriter mdfw, String heritrixVersion) {
        List<File> filesAdded = new ArrayList<File>();

        // Sort the files by URL; a TreeSet keeps them in MetadataFile's natural (URL) order.
        TreeSet<MetadataFile> files = new TreeSet<MetadataFile>();

        // Look for files in the crawldir, ${heritrix3jobdir}, and ${heritrix3jobdir}/latest:
        // - reports are relative to ${heritrix3jobdir}/latest/
        // - logs are relative to ${heritrix3jobdir}
        File crawlDir = ingestableFiles.getCrawlDir();
        File jobsDir = ingestableFiles.getHeritrix3JobDir();
        File reportsDir = ingestableFiles.getReportsDir();

        log.info("Looking for Heritrix files in the following directories: {}, {}, {}",
                crawlDir.getAbsolutePath(), jobsDir.getAbsolutePath(), reportsDir.getAbsolutePath());

        // The same filename pattern matches Heritrix configuration and report files in all three
        // directories, so one filter serves all of them.
        FileFilter heritrixFileFilter = new FileFilter() {
            @Override
            public boolean accept(File f) {
                return (f.isFile() && f.getName().matches(MetadataFile.HERITRIX_FILE_PATTERN));
            }
        };

        // Find and add Heritrix files in the crawl directory.
        for (File hf : crawlDir.listFiles(heritrixFileFilter)) {
            files.add(new MetadataFile(hf, harvestID, jobID, heritrixVersion));
        }

        // Find and add Heritrix files in the Heritrix3 job directory (if it exists).
        if (jobsDir.exists()) {
            for (File hf : jobsDir.listFiles(heritrixFileFilter)) {
                files.add(new MetadataFile(hf, harvestID, jobID, heritrixVersion));
            }
        } else {
            log.warn("The directory {} does not exist", jobsDir.getAbsolutePath());
        }

        // Find and add Heritrix files in the reports directory (if it exists).
        if (reportsDir.exists()) {
            for (File hf : reportsDir.listFiles(heritrixFileFilter)) {
                files.add(new MetadataFile(hf, harvestID, jobID, heritrixVersion));
            }
        } else {
            log.warn("The directory {} does not exist", reportsDir.getAbsolutePath());
        }

        // Generate an arcfiles-report.txt if configured to do so.
        // FIXME It is not possible to extract this from the crawl.log. (Is this list available in any other way?)

        boolean genArcFilesReport = Settings.getBoolean(Heritrix3Settings.METADATA_GENERATE_ARCHIVE_FILES_REPORT);
        if (genArcFilesReport) {
            log.debug("arcfiles-report.txt generation is not currently supported with Heritrix3");
            /*
            log.debug("Creating an arcfiles-report.txt");
            files.add(new MetadataFile(new ArchiveFilesReportGenerator(crawlDir).generateReport(), harvestID, jobID,
                    heritrixVersion));
            */
        } else {
            log.debug("Creation of the arcfiles-report.txt has been disabled by the setting '{}'!",
                    Heritrix3Settings.METADATA_GENERATE_ARCHIVE_FILES_REPORT);
        }

        // Add log files.
        File logDir = new File(jobsDir, "logs");
        if (logDir.exists()) {
            File[] heritrixLogFiles = logDir.listFiles(new FileFilter() {
                @Override
                public boolean accept(File f) {
                    return (f.isFile() && f.getName().matches(MetadataFile.LOG_FILE_PATTERN));
                }
            });
            for (File logFile : heritrixLogFiles) {
                files.add(new MetadataFile(logFile, harvestID, jobID, heritrixVersion));
                log.info("Found Heritrix log file {}", logFile.getName());
            }
        } else {
            log.debug("No logs dir found in Heritrix3 job dir: {}", jobsDir.getAbsolutePath());
        }

        // Write the files, in sorted order, to the metadata archive file.
        for (MetadataFile mdf : files) {
            File heritrixFile = mdf.getHeritrixFile();
            String heritrixFileName = heritrixFile.getName();
            String mimeType = (heritrixFileName.endsWith(".xml") ? "text/xml" : "text/plain");
            if (mdfw.writeTo(heritrixFile, mdf.getUrl(), mimeType)) {
                filesAdded.add(heritrixFile);
            } else {
                log.warn("The Heritrix file '{}' was not included in the metadata archive file due to some error.",
                        heritrixFile.getAbsolutePath());
            }
        }

        return filesAdded;
    }

}