/*
 * #%L
 * Netarchivesuite - harvester
 * %%
 * Copyright (C) 2005 - 2018 The Royal Danish Library,
 * the National Library of France and the Austrian National Library.
 * %%
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation, either version 2.1 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this program. If not, see
 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
 * #L%
 */
package dk.netarkivet.viewerproxy.webinterface;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.UUID;

import org.apache.commons.io.IOUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import dk.netarkivet.common.CommonSettings;
import dk.netarkivet.common.distribute.arcrepository.ArcRepositoryClientFactory;
import dk.netarkivet.common.distribute.arcrepository.BatchStatus;
import dk.netarkivet.common.exceptions.ArgumentNotValid;
import dk.netarkivet.common.exceptions.IOFailure;
import dk.netarkivet.common.utils.FileUtils;
import dk.netarkivet.common.utils.Settings;
import dk.netarkivet.common.utils.batch.ArchiveBatchFilter;
import dk.netarkivet.common.utils.batch.FileBatchJob;
import dk.netarkivet.common.utils.batch.FileListJob;
import dk.netarkivet.common.utils.cdx.ArchiveExtractCDXJob;
import dk.netarkivet.common.utils.cdx.CDXRecord;

/**
 * Methods for generating the batch results needed by the QA pages.
 */
@SuppressWarnings({"serial"})
public class Reporting {

    private static final Logger log = LoggerFactory.getLogger(Reporting.class);

    /**
     * Utility class, do not initialise.
     */
    private Reporting() {
    }

    /** Regex suffix matching the data arc/warc files produced by Heritrix; appended to a harvest prefix. */
    static final String archivefile_suffix = ".*\\.(w)?arc(\\.gz)?";

    /** Regex suffix matching the arc/warc metadata files created by NetarchiveSuite. */
    static final String metadatafile_suffix = "-metadata-[0-9]+\\.(w)?arc(\\.gz)?";
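
    // Illustrative examples of what the two patterns match (the concrete file names below are
    // made up for illustration, assuming typical NetarchiveSuite/Heritrix naming): with jobid 42
    // and harvestprefix "42-117", the archive pattern matches e.g.
    // "42-117-20180101120000-00000.warc.gz", while the metadata pattern built by
    // getMetadataFilePatternForJobId below matches e.g. "42-metadata-1.warc.gz".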

    /**
     * Submit a batch job to list all files for a job, and report the result as a sorted list.
     *
     * @param jobid The job to get files for.
     * @param harvestprefix The harvest prefix for the files produced by Heritrix.
     * @return A sorted list of files.
     * @throws ArgumentNotValid If jobid is 0 or negative.
     * @throws IOFailure On trouble generating the file list.
     */
    public static List<String> getFilesForJob(long jobid, String harvestprefix) {
        ArgumentNotValid.checkPositive(jobid, "jobid");
        FileBatchJob fileListJob = new FileListJob();
        List<String> acceptedPatterns = new ArrayList<String>();
        final String metadataFilePatternForJobId = getMetadataFilePatternForJobId(jobid);
        log.debug("Finding files matching pattern {}.", metadataFilePatternForJobId);
        acceptedPatterns.add(metadataFilePatternForJobId);
        acceptedPatterns.add(harvestprefix + archivefile_suffix);
        fileListJob.processOnlyFilesMatching(acceptedPatterns);

        File f;
        try {
            f = File.createTempFile(jobid + "-files", ".txt", FileUtils.getTempDir());
        } catch (IOException e) {
            throw new IOFailure("Could not create temporary file", e);
        }
        BatchStatus status = ArcRepositoryClientFactory.getViewerInstance().batch(fileListJob,
                Settings.get(CommonSettings.USE_REPLICA_ID));
        status.getResultFile().copyTo(f);
        // Deduplicate the returned file names, then sort them.
        List<String> lines = new ArrayList<String>(new HashSet<String>(FileUtils.readListFromFile(f)));
        FileUtils.remove(f);
        Collections.sort(lines);
        return lines;
    }

    /**
     * Submit a batch job to generate cdx for all metadata files for a job, and report the result as a list.
     *
     * @param jobid The job to get cdx for.
     * @return A list of cdx records.
     * @throws ArgumentNotValid If jobid is 0 or negative.
     * @throws IOFailure On trouble generating the cdx.
     */
    public static List<CDXRecord> getMetadataCDXRecordsForJob(long jobid) {
        ArgumentNotValid.checkPositive(jobid, "jobid");
        FileBatchJob cdxJob = new ArchiveExtractCDXJob(false) {
            @Override
            public ArchiveBatchFilter getFilter() {
                return ArchiveBatchFilter.EXCLUDE_NON_WARCINFO_RECORDS;
            }
        };
        String metadataFileSearchPattern = getMetadataFilePatternForJobId(jobid);
        cdxJob.processOnlyFilesMatching(metadataFileSearchPattern);

        File f;
        try {
            f = File.createTempFile(jobid + "-reports", ".cdx", FileUtils.getTempDir());
        } catch (IOException e) {
            throw new IOFailure("Could not create temporary file", e);
        }
        BatchStatus status = ArcRepositoryClientFactory.getViewerInstance().batch(cdxJob,
                Settings.get(CommonSettings.USE_REPLICA_ID));
        status.getResultFile().copyTo(f);
        List<CDXRecord> records;
        BufferedReader reader = null;
        try {
            reader = new BufferedReader(new FileReader(f));
            records = new ArrayList<CDXRecord>();
            for (String line = reader.readLine(); line != null; line = reader.readLine()) {
                String[] parts = line.split("\\s+");
                CDXRecord record = new CDXRecord(parts);
                records.add(record);
            }
        } catch (IOException e) {
            throw new IOFailure("Unable to read results from file '" + f + "'", e);
        } finally {
            IOUtils.closeQuietly(reader);
            FileUtils.remove(f);
        }
        return records;
    }
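
    // Illustrative usage (a sketch; the job id and harvest prefix below are made up): a QA page
    // can combine the two lookups above, e.g.
    //
    //   List<String> files = Reporting.getFilesForJob(42L, "42-117");
    //   List<CDXRecord> cdxRecords = Reporting.getMetadataCDXRecordsForJob(42L);
    //
    // Both calls run a batch job against the replica configured in CommonSettings.USE_REPLICA_ID
    // via the viewer arc repository client, so they may take a while on large archives.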

    /**
     * Submit a batch job to extract the part of a crawl log that is associated with the given domain and job.
     *
     * @param domain The domain to get crawl.log lines for.
     * @param jobid The jobid to get the crawl.log lines for.
     * @return A file containing the crawl.log lines. This file is temporary, and should be deleted after use.
     * @throws ArgumentNotValid If jobid is 0 or negative, or if domain is null or the empty string.
     */
    public static File getCrawlLogForDomainInJob(String domain, long jobid) {
        ArgumentNotValid.checkPositive(jobid, "jobid");
        ArgumentNotValid.checkNotNullOrEmpty(domain, "String domain");
        FileBatchJob urlsForDomainBatchJob = new HarvestedUrlsForDomainBatchJob(domain);
        urlsForDomainBatchJob.processOnlyFilesMatching(getMetadataFilePatternForJobId(jobid));
        return getResultFile(urlsForDomainBatchJob);
    }

    /**
     * Helper method to get the result from a batch job.
     *
     * @param batchJob a given FileBatchJob
     * @return a file with the result, sorted on crawl-log timestamp.
     */
    private static File getResultFile(FileBatchJob batchJob) {
        File f;
        File fsorted;
        try {
            final String uuid = UUID.randomUUID().toString();
            f = File.createTempFile("temp", uuid + ".txt", FileUtils.getTempDir());
            f.deleteOnExit();
            fsorted = File.createTempFile("temp", uuid + "-sorted.txt", FileUtils.getTempDir());
            fsorted.deleteOnExit();
        } catch (IOException e) {
            throw new IOFailure("Unable to create temporary file", e);
        }
        BatchStatus status = ArcRepositoryClientFactory.getViewerInstance().batch(batchJob,
                Settings.get(CommonSettings.USE_REPLICA_ID));
        status.getResultFile().copyTo(f);
        FileUtils.sortCrawlLogOnTimestamp(f, fsorted);
        FileUtils.remove(f);
        return fsorted;
    }

    /**
     * Return any crawl-log lines for a given jobid matching the given regular expression.
     *
     * @param jobid The jobid.
     * @param regexp A regular expression.
     * @return a File with the matching lines. This file is temporary, and should be deleted after use.
     * @throws ArgumentNotValid If jobid is 0 or negative, or if regexp is null or the empty string.
     */
    public static File getCrawlLoglinesMatchingRegexp(long jobid, String regexp) {
        ArgumentNotValid.checkPositive(jobid, "jobid");
        ArgumentNotValid.checkNotNullOrEmpty(regexp, "String regexp");
        FileBatchJob crawlLogBatchJob = new CrawlLogLinesMatchingRegexp(regexp);
        crawlLogBatchJob.processOnlyFilesMatching(getMetadataFilePatternForJobId(jobid));
        return getResultFile(crawlLogBatchJob);
    }

    /**
     * Construct the correct metadata-file pattern for a given job ID.
     *
     * @param jobid a given harvest job ID
     * @return the metadata-file pattern for the given jobid
     */
    private static String getMetadataFilePatternForJobId(long jobid) {
        // Earlier, invalid patterns, kept for reference:
        // return ".*" + jobid + ".*" + metadatafile_suffix;
        // return jobid + metadatafile_suffix;
        return "(.*-)?" + jobid + "(-.*)?" + metadatafile_suffix;
    }
}
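
// Illustrative usage of the crawl-log extraction methods (a sketch; the domain, regex and job id
// below are made up). Both methods return temporary, timestamp-sorted files that the caller is
// expected to delete after use:
//
//   File domainLog = Reporting.getCrawlLogForDomainInJob("netarkivet.dk", 42L);
//   File matchingLines = Reporting.getCrawlLoglinesMatchingRegexp(42L, ".*404.*");
//   try {
//       // ... render the lines on a QA page ...
//   } finally {
//       FileUtils.remove(domainLog);
//       FileUtils.remove(matchingLines);
//   }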