/*
 * #%L
 * Netarchivesuite - harvester
 * %%
 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
 * the National Library of France and the Austrian National Library.
 * %%
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation, either version 2.1 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this program. If not, see
 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
 * #L%
 */

package dk.netarkivet.harvester.harvesting;

import java.io.File;
import java.io.FileFilter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.TreeSet;

import org.jwat.common.ANVLRecord;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import dk.netarkivet.common.Constants;
import dk.netarkivet.common.exceptions.ArgumentNotValid;
import dk.netarkivet.common.exceptions.IOFailure;
import dk.netarkivet.common.exceptions.PermissionDenied;
import dk.netarkivet.common.utils.FileUtils;
import dk.netarkivet.common.utils.Settings;
import dk.netarkivet.common.utils.SystemUtils;
import dk.netarkivet.common.utils.archive.ArchiveProfile;
import dk.netarkivet.common.utils.cdx.CDXUtils;
import dk.netarkivet.harvester.HarvesterSettings;
import dk.netarkivet.harvester.harvesting.metadata.MetadataEntry;
import dk.netarkivet.harvester.harvesting.metadata.MetadataFile;
import dk.netarkivet.harvester.harvesting.metadata.MetadataFileWriter;
import dk.netarkivet.harvester.harvesting.metadata.MetadataFileWriterWarc;

/**
 * This class contains code for documenting a harvest. Metadata is read from the directories associated with a given
 * harvest-job-attempt (i.e. one DoCrawlMessage sent to a harvest server). The collected metadata are written to a new
 * metadata file that is managed by IngestableFiles. Temporary metadata files will be deleted after this metadata file
 * has been written.
 */
public class HarvestDocumentation {

    private static final Logger log = LoggerFactory.getLogger(HarvestDocumentation.class);

    /**
     * Documents the harvest under the given dir in a packaged metadata (W)ARC file in a directory 'metadata' under
     * the current dir. Only documents the files belonging to the given jobID; the rest are moved to oldjobs.
     * <p>
     * In the current implementation, the documentation consists of CDX indices over all ARC files (with one CDX
     * record per harvested ARC file), plus packaging of log files.
     * <p>
     * If this method finishes without an exception, it is guaranteed that metadata is ready for upload.
     * <p>
     * TODO Place preharvest metadata in an IngestableFiles-defined area.
     * TODO This method may be a good place to copy deduplicate information from the crawl log to the cdx file.
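     * <p>
     * Typical usage, as a minimal sketch (the {@code IngestableFiles} instance is normally constructed by the
     * harvest controller around the finished crawl directory):
     * <pre>
     * IngestableFiles ingestables = ...; // describes crawlDir, jobId and harvestID of the finished crawl
     * HarvestDocumentation.documentHarvest(ingestables);
     * </pre>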
     *
     * @param ingestables Information about the finished crawl (crawldir, jobId, harvestID).
     * @throws ArgumentNotValid if crawlDir is null or does not exist, or if jobID or harvestID is negative.
     * @throws IOFailure if reading ARC files or temporary files fails, or if writing a file to arcFilesDir fails.
     */
    public static void documentHarvest(IngestableFiles ingestables) throws IOFailure {
        ArgumentNotValid.checkNotNull(ingestables, "ingestables");

        File crawlDir = ingestables.getCrawlDir();
        Long jobID = ingestables.getJobId();
        Long harvestID = ingestables.getHarvestID();

        // Prepare metadata-arcfile for ingestion of metadata, and enumerate
        // items to ingest.

        // If the metadata-arcfile already exists, we are done.
        // See bug 722.
        if (ingestables.isMetadataReady()) {
            log.warn("The metadata-file '{}' already exists, so we don't make another one!", ingestables
                    .getMetadataFile().getAbsolutePath());
            return;
        }
        List<File> filesAddedAndNowDeletable = null;

        try {
            MetadataFileWriter mdfw = ingestables.getMetadataWriter();

            if (mdfw instanceof MetadataFileWriterWarc) {
                // Add a warcinfo record.
                ANVLRecord infoPayload = new ANVLRecord();
                infoPayload.addLabelValue("software",
                        "NetarchiveSuite/" + dk.netarkivet.common.Constants.getVersionString() + "/"
                                + dk.netarkivet.common.Constants.PROJECT_WEBSITE);
                infoPayload.addLabelValue("ip", SystemUtils.getLocalIP());
                infoPayload.addLabelValue("hostname", SystemUtils.getLocalHostName());
                infoPayload.addLabelValue("conformsTo",
                        "http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf");

                PersistentJobData psj = new PersistentJobData(crawlDir);
                infoPayload.addLabelValue("isPartOf", "" + psj.getJobID());
                MetadataFileWriterWarc mfww = (MetadataFileWriterWarc) mdfw;
                mfww.insertInfoRecord(infoPayload);
            }

            // Fetch any serialized preharvest metadata objects, if they exist.
            List<MetadataEntry> storedMetadata = getStoredMetadata(crawlDir);
            try {
                for (MetadataEntry m : storedMetadata) {
                    mdfw.write(m.getURL(), m.getMimeType(), SystemUtils.getLocalIP(), System.currentTimeMillis(),
                            m.getData());
                }
            } catch (IOException e) {
                log.warn("Unable to write pre-metadata to metadata archivefile", e);
            }

            // Insert the harvestdetails into the metadata archivefile.
            filesAddedAndNowDeletable = writeHarvestDetails(jobID, harvestID, crawlDir, mdfw,
                    Constants.getHeritrixVersionString());
            // All the files just added to the metadata archivefile can now be deleted,
            // except for the files we need for later processing:
            // - crawl.log is needed to create the domainharvestreport later
            // - harvestInfo.xml is needed to upload stored data after
            //   crashes/stops on the harvesters
            // - progress-statistics.log is needed to find out if the crawl ended due
            //   to hitting a size limit, or due to other completion

            Iterator<File> iterator = filesAddedAndNowDeletable.iterator();
            while (iterator.hasNext()) {
                File f = iterator.next();
                if (f.getName().equals("crawl.log") || f.getName().equals("harvestInfo.xml")
                        || f.getName().equals("progress-statistics.log")) {
                    iterator.remove();
                }
            }

            boolean cdxGenerationSucceeded = false;

            // Try to create CDXes over ARC and WARC files.
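            // (A CDX index holds one line per archived record, recording among other
            // things the URI, timestamp, MIME type and the offset of the record within
            // its (W)ARC file, so records can be located without unpacking the archive.)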
            File arcFilesDir = ingestables.getArcsDir();
            File warcFilesDir = ingestables.getWarcsDir();

            if (arcFilesDir.isDirectory() && FileUtils.hasFiles(arcFilesDir)) {
                addCDXes(ingestables, arcFilesDir, mdfw, ArchiveProfile.ARC_PROFILE);
                cdxGenerationSucceeded = true;
            }
            if (warcFilesDir.isDirectory() && FileUtils.hasFiles(warcFilesDir)) {
                addCDXes(ingestables, warcFilesDir, mdfw, ArchiveProfile.WARC_PROFILE);
                cdxGenerationSucceeded = true;
            }

            if (cdxGenerationSucceeded) {
                // This indicates that either the files in the arcsdir or in the warcsdir
                // have now been CDX-processed.
                //
                // TODO refactor, as this call has too many side effects
                ingestables.setMetadataGenerationSucceeded(true);
            } else {
                log.warn("Found no archive directory with ARC or WARC files. Looked for dirs '{}' and '{}'.",
                        arcFilesDir.getAbsolutePath(), warcFilesDir.getAbsolutePath());
            }
        } finally {
            // If at this point metadata is not ready, an error occurred.
            if (!ingestables.isMetadataReady()) {
                ingestables.setMetadataGenerationSucceeded(false);
            } else {
                for (File fileAdded : filesAddedAndNowDeletable) {
                    FileUtils.remove(fileAdded);
                }
                ingestables.cleanup();
            }
        }
    }

    private static void addCDXes(IngestableFiles files, File archiveDir, MetadataFileWriter writer,
            ArchiveProfile profile) {
        moveAwayForeignFiles(profile, archiveDir, files);
        File cdxFilesDir = FileUtils.createUniqueTempDir(files.getTmpMetadataDir(), "cdx");
        CDXUtils.generateCDX(profile, archiveDir, cdxFilesDir);
        writer.insertFiles(cdxFilesDir, FileUtils.CDX_FILE_FILTER, Constants.CDX_MIME_TYPE,
                files.getHarvestID(), files.getJobId());
    }

    /**
     * Restore serialized MetadataEntry objects from the "metadata" subdirectory of the crawldir.
     *
     * @param crawlDir the given crawl directory
     * @return a list of deserialized MetadataEntry objects
     */
    private static List<MetadataEntry> getStoredMetadata(File crawlDir) {
        File metadataDir = new File(crawlDir, IngestableFiles.METADATA_SUB_DIR);
        if (!metadataDir.isDirectory()) {
            log.warn("Expected a metadata directory '{}', but it wasn't there", metadataDir.getAbsolutePath());
            return new ArrayList<MetadataEntry>();
        } else {
            return MetadataEntry.getMetadataFromDisk(metadataDir);
        }
    }

    /**
     * Iterates over the (W)ARC files in the given dir and moves away files that do not belong to the given job into
     * a "lost-files" directory under oldjobs named with a timestamp.
     *
     * @param archiveProfile archive profile including filters, patterns, etc.
     * @param dir A directory containing one or more (W)ARC files.
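     *            (in this class always the result of {@code IngestableFiles.getArcsDir()} or
     *            {@code IngestableFiles.getWarcsDir()})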
     * @param files Information about the files produced by heritrix (jobId and harvestnamePrefix)
     */
    private static void moveAwayForeignFiles(ArchiveProfile archiveProfile, File dir, IngestableFiles files) {
        File[] archiveFiles = dir.listFiles(archiveProfile.filename_filter);
        File oldJobsDir = new File(Settings.get(HarvesterSettings.HARVEST_CONTROLLER_OLDJOBSDIR));
        File lostfilesDir = new File(oldJobsDir, "lost-files-" + new Date().getTime());
        List<File> movedFiles = new ArrayList<File>();
        log.info("Looking for files not having harvestprefix '{}'", files.getHarvestnamePrefix());
        for (File archiveFile : archiveFiles) {
            if (!(archiveFile.getName().startsWith(files.getHarvestnamePrefix()))) {
                // Move the unidentified file to the lostfiles directory.
                log.info("Moving unidentified file {} to the lostfiles directory", archiveFile.getAbsolutePath());
                try {
                    if (!lostfilesDir.exists()) {
                        FileUtils.createDir(lostfilesDir);
                    }
                    File moveTo = new File(lostfilesDir, archiveFile.getName());
                    archiveFile.renameTo(moveTo);
                    movedFiles.add(moveTo);
                } catch (PermissionDenied e) {
                    log.warn("Not allowed to make oldjobs dir '{}'", lostfilesDir.getAbsolutePath(), e);
                }
            }
        }
        if (!movedFiles.isEmpty()) {
            log.warn("Found files not belonging to job {}; the following files have been stored for later: {}",
                    files.getJobId(), movedFiles);
        }
    }

    /**
     * Write harvestdetails to archive file(s). This includes the order.xml, seeds.txt, specific settings.xml files
     * for certain domains, the harvestInfo.xml, all available reports (subset of HeritrixFiles.HERITRIX_REPORTS),
     * and all available logs (subset of HeritrixFiles.HERITRIX_LOGS).
     *
     * @param jobID the given job Id
     * @param harvestID the id for the harvestdefinition which created this job
     * @param crawlDir the directory where the crawljob took place
     * @param mdfw a MetadataFileWriter used to store the harvest configuration, and harvest logs and reports.
     * @param heritrixVersion the heritrix version used by the harvest.
     * @return a list of files added to the archive file.
     * @throws ArgumentNotValid If null arguments occur
     */
    private static List<File> writeHarvestDetails(long jobID, long harvestID, File crawlDir, MetadataFileWriter mdfw,
            String heritrixVersion) {
        List<File> filesAdded = new ArrayList<File>();

        // We will sort the files by URL.
        TreeSet<MetadataFile> files = new TreeSet<MetadataFile>();

        // List heritrix files in the crawl directory.
        File[] heritrixFiles = crawlDir.listFiles(new FileFilter() {
            @Override
            public boolean accept(File f) {
                return (f.isFile() && f.getName().matches(MetadataFile.HERITRIX_FILE_PATTERN));
            }
        });

        // Add files in the crawl directory.
        for (File hf : heritrixFiles) {
            files.add(new MetadataFile(hf, harvestID, jobID, heritrixVersion));
        }
        // Generate an arcfiles-report.txt if configured to do so.
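        // (The generated report presumably enumerates the (W)ARC files produced by the
        // crawl; it is wrapped as a MetadataFile and written together with the other
        // metadata files below.)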
        boolean genArcFilesReport = Settings.getBoolean(HarvesterSettings.METADATA_GENERATE_ARCHIVE_FILES_REPORT);
        if (genArcFilesReport) {
            log.debug("Creating an arcfiles-report.txt");
            files.add(new MetadataFile(new ArchiveFilesReportGenerator(crawlDir).generateReport(), harvestID, jobID,
                    heritrixVersion));
        } else {
            log.debug("Creation of the arcfiles-report.txt has been disabled by the setting '{}'!",
                    HarvesterSettings.METADATA_GENERATE_ARCHIVE_FILES_REPORT);
        }

        // Add log files.
        File logDir = new File(crawlDir, "logs");
        if (logDir.exists()) {
            File[] heritrixLogFiles = logDir.listFiles(new FileFilter() {
                @Override
                public boolean accept(File f) {
                    return (f.isFile() && f.getName().matches(MetadataFile.LOG_FILE_PATTERN));
                }
            });
            for (File logFile : heritrixLogFiles) {
                files.add(new MetadataFile(logFile, harvestID, jobID, heritrixVersion));
                log.info("Found Heritrix log file {}", logFile.getName());
            }
        } else {
            log.debug("No logs dir found in crawldir: {}", crawlDir.getAbsolutePath());
        }

        // Check whether a settings directory (domain-specific settings) exists;
        // if so, add any settings.xml hiding in this directory.
        // TODO Delete any settings-files found in the settings directory.
        File settingsDir = new File(crawlDir, "settings");
        if (settingsDir.isDirectory()) {
            Map<File, String> domainSettingsFiles = findDomainSpecificSettings(settingsDir);
            for (Map.Entry<File, String> entry : domainSettingsFiles.entrySet()) {
                File dsf = entry.getKey();
                String domain = entry.getValue();
                files.add(new MetadataFile(dsf, harvestID, jobID, heritrixVersion, domain));
            }
        } else {
            log.debug("No settings directory found in crawldir: {}", crawlDir.getAbsolutePath());
        }

        // Write the files, in order, to the metadata archive file.
        for (MetadataFile mdf : files) {
            File heritrixFile = mdf.getHeritrixFile();
            String heritrixFileName = heritrixFile.getName();
            String mimeType = (heritrixFileName.endsWith(".xml") ? "text/xml" : "text/plain");
            if (mdfw.writeTo(heritrixFile, mdf.getUrl(), mimeType)) {
                filesAdded.add(heritrixFile);
            } else {
                log.warn("The Heritrix file '{}' was not included in the metadata archivefile due to some error.",
                        heritrixFile.getAbsolutePath());
            }
        }

        return filesAdded;
    }

    /**
     * Finds domain-specific configurations in the settings subdirectory of the crawl directory.
     *
     * @param settingsDir the given settings directory
     * @return the settings files paired with their domains.
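     *         For example (illustrative), a file {@code settings/dk/netarkivet/settings.xml} is returned mapped to
     *         the domain {@code netarkivet.dk}.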
     */
    private static Map<File, String> findDomainSpecificSettings(File settingsDir) {
        // Find any domain-specific configurations (settings.xml).
        List<String> reversedDomainsWithSettings = findAllDomainsWithSettings(settingsDir, "");

        Map<File, String> settingsFileToDomain = new HashMap<File, String>();
        if (reversedDomainsWithSettings.isEmpty()) {
            log.debug("No settings/<domain> directories exist: no domain-specific configurations available");
        } else {
            for (String reversedDomain : reversedDomainsWithSettings) {
                String domain = reverseDomainString(reversedDomain);
                // Note: reversedDomain starts with a dot, so the replacement below yields
                // a path like <settingsDir>/dk/netarkivet. (replace is used rather than
                // replaceAll, since File.separator is not a safe regex replacement on Windows.)
                File settingsXmlFile = new File(settingsDir + reversedDomain.replace('.', File.separatorChar),
                        MetadataFile.DOMAIN_SETTINGS_FILE);
                if (!settingsXmlFile.isFile()) {
                    log.debug("File settings/{}/{} does not exist.", domain, MetadataFile.DOMAIN_SETTINGS_FILE);
                } else {
                    settingsFileToDomain.put(settingsXmlFile, domain);
                }
            }
        }
        return settingsFileToDomain;
    }

    /**
     * Find all domains which have a settings.xml file in the given directory.
     *
     * @param directory a given directory
     * @param domainReversed the domain reversed
     * @return a list of domains (in reverse) which contained a file with the given filename
     */
    private static List<String> findAllDomainsWithSettings(File directory, String domainReversed) {
        if (!directory.isDirectory()) {
            return new ArrayList<String>(0);
        }
        // List to hold the (reversed) domains found so far.
        List<String> filesToReturn = new ArrayList<String>();

        for (File fileInDir : directory.listFiles()) {
            // If the given file is a dir, call the method recursively.
            if (fileInDir.isDirectory()) {
                List<String> resultList = findAllDomainsWithSettings(fileInDir,
                        domainReversed + "." + fileInDir.getName());
                if (!resultList.isEmpty()) {
                    filesToReturn.addAll(resultList);
                }
            } else {
                if (fileInDir.getName().equals(MetadataFile.DOMAIN_SETTINGS_FILE)) {
                    // Store the domain, so that we can find the file later.
                    filesToReturn.add(domainReversed);
                }
            }
        }
        return filesToReturn;
    }

    /**
     * Un-reverses a reversed domain string as produced by findAllDomainsWithSettings (i.e. with a leading dot),
     * e.g. turns ".com.amazon" into "amazon.com".
     *
     * @param reversedDomain the reversed domain name, starting with a dot
     * @return the un-reversed domain string
     */
    private static String reverseDomainString(String reversedDomain) {
        String domain = "";
        String remaining = reversedDomain;
        int lastDotIndex = remaining.lastIndexOf(".");
        while (lastDotIndex != -1) {
            domain += remaining.substring(lastDotIndex + 1) + ".";
            remaining = remaining.substring(0, lastDotIndex);
            lastDotIndex = remaining.lastIndexOf(".");
        }
        return domain.substring(0, domain.length() - 1);
    }

}