/*
 * #%L
 * Netarchivesuite - harvester
 * %%
 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
 * the National Library of France and the Austrian National Library.
 * %%
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation, either version 2.1 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Lesser Public License for more details.
 *
 * You should have received a copy of the GNU General Lesser Public
 * License along with this program. If not, see
 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
 * #L%
 */
package dk.netarkivet.viewerproxy.webinterface;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.UUID;

import org.apache.commons.io.IOUtils;

import dk.netarkivet.common.CommonSettings;
import dk.netarkivet.common.distribute.arcrepository.ArcRepositoryClientFactory;
import dk.netarkivet.common.distribute.arcrepository.BatchStatus;
import dk.netarkivet.common.exceptions.ArgumentNotValid;
import dk.netarkivet.common.exceptions.IOFailure;
import dk.netarkivet.common.utils.FileUtils;
import dk.netarkivet.common.utils.Settings;
import dk.netarkivet.common.utils.batch.ArchiveBatchFilter;
import dk.netarkivet.common.utils.batch.FileBatchJob;
import dk.netarkivet.common.utils.batch.FileListJob;
import dk.netarkivet.common.utils.cdx.ArchiveExtractCDXJob;
import dk.netarkivet.common.utils.cdx.CDXRecord;

/**
 * Methods for generating the batch results needed by the QA pages.
 */
@SuppressWarnings({"serial"})
public class Reporting {
    /**
     * Utility class, do not instantiate.
     */
    private Reporting() {
    }

    /** Regexp matching the suffix of the data arc/warc files produced by Heritrix. */
    static final String archivefile_suffix = ".*\\.(w)?arc(\\.gz)?";

    /** Regexp matching the suffix of the metadata arc/warc files created by NetarchiveSuite. */
    static final String metadatafile_suffix = "-metadata-[0-9]+\\.(w)?arc(\\.gz)?";

    /**
     * Submit a batch job to list all files for a job, and report the result as a sorted list without duplicates.
     *
     * @param jobid The job to get files for.
     * @param harvestprefix The filename prefix of the (w)arc files produced by Heritrix for the job.
     * @return A sorted list of files.
     * @throws ArgumentNotValid If jobid is 0 or negative.
     * @throws IOFailure On trouble generating the file list.
     */
    public static List<String> getFilesForJob(int jobid, String harvestprefix) {
        ArgumentNotValid.checkPositive(jobid, "jobid");
        FileBatchJob fileListJob = new FileListJob();
        List<String> acceptedPatterns = new ArrayList<String>();
        acceptedPatterns.add(".*" + jobid + ".*" + metadatafile_suffix);
        acceptedPatterns.add(harvestprefix + archivefile_suffix);
        fileListJob.processOnlyFilesMatching(acceptedPatterns);

        File f;
        try {
            f = File.createTempFile(jobid + "-files", ".txt", FileUtils.getTempDir());
        } catch (IOException e) {
            throw new IOFailure("Could not create temporary file", e);
        }
        BatchStatus status = ArcRepositoryClientFactory.getViewerInstance().batch(fileListJob,
                Settings.get(CommonSettings.USE_REPLICA_ID));
        status.getResultFile().copyTo(f);
        List<String> lines = new ArrayList<String>(FileUtils.readListFromFile(f));
        FileUtils.remove(f);
        // Remove duplicate filenames from the batch output, then return the names in sorted order.
        Set<String> linesAsSet = new HashSet<String>();
        linesAsSet.addAll(lines);
        lines = new ArrayList<String>();
        lines.addAll(linesAsSet);
        Collections.sort(lines);
        return lines;
    }
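
    /*
     * Illustrative usage (a sketch, not part of the original class): a QA page that knows a job's
     * id and the filename prefix Heritrix used for that job's (w)arc files could list the job's
     * files like this. The job id 42 and the prefix "42-1" are made-up example values; the real
     * prefix comes from the harvest definition.
     *
     *     List<String> jobFiles = Reporting.getFilesForJob(42, "42-1");
     *     for (String filename : jobFiles) {
     *         // e.g. render one link per archive/metadata file on the QA page
     *     }
     */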

    /**
     * Submit a batch job to generate cdx for all metadata files for a job, and report the result in a list.
     *
     * @param jobid The job to get cdx for.
     * @return A list of cdx records.
     * @throws ArgumentNotValid If jobid is 0 or negative.
     * @throws IOFailure On trouble generating the cdx.
     */
    public static List<CDXRecord> getMetadataCDXRecordsForJob(long jobid) {
        ArgumentNotValid.checkPositive(jobid, "jobid");
        // Extract CDX entries from the job's metadata files, overriding the default filter
        // so that non-warcinfo records are excluded.
        FileBatchJob cdxJob = new ArchiveExtractCDXJob(false) {
            @Override
            public ArchiveBatchFilter getFilter() {
                return ArchiveBatchFilter.EXCLUDE_NON_WARCINFO_RECORDS;
            }
        };
        cdxJob.processOnlyFilesMatching(".*" + jobid + ".*" + metadatafile_suffix);

        File f;
        try {
            f = File.createTempFile(jobid + "-reports", ".cdx", FileUtils.getTempDir());
        } catch (IOException e) {
            throw new IOFailure("Could not create temporary file", e);
        }
        BatchStatus status = ArcRepositoryClientFactory.getViewerInstance().batch(cdxJob,
                Settings.get(CommonSettings.USE_REPLICA_ID));
        status.getResultFile().copyTo(f);
        List<CDXRecord> records;
        BufferedReader reader = null;
        try {
            reader = new BufferedReader(new FileReader(f));
            records = new ArrayList<CDXRecord>();
            // Each line of the batch output is a whitespace-separated CDX line; parse it into a record.
            for (String line = reader.readLine(); line != null; line = reader.readLine()) {
                String[] parts = line.split("\\s+");
                CDXRecord record = new CDXRecord(parts);
                records.add(record);
            }
        } catch (IOException e) {
            throw new IOFailure("Unable to read results from file '" + f + "'", e);
        } finally {
            IOUtils.closeQuietly(reader);
            FileUtils.remove(f);
        }
        return records;
    }
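
    /*
     * Illustrative usage (a sketch, not part of the original class): a QA page could fetch the CDX
     * records extracted from a job's metadata files and render one table row per record. The job
     * id 42 is a made-up example value; how each record is displayed is up to the caller.
     *
     *     List<CDXRecord> cdxRecords = Reporting.getMetadataCDXRecordsForJob(42);
     *     for (CDXRecord record : cdxRecords) {
     *         // e.g. show the harvested URL and its location in the archive file
     *     }
     */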

    /**
     * Submit a batch job to extract the part of a crawl log that is associated with the given domain and job.
     *
     * @param domain The domain to get crawl.log lines for.
     * @param jobid The jobid to get the crawl.log lines for.
     * @return A file containing the crawl.log lines. This file is temporary, and should be deleted after use.
     * @throws ArgumentNotValid If jobid is 0 or negative, or if domain is null or the empty string.
     */
    public static File getCrawlLogForDomainInJob(String domain, int jobid) {
        ArgumentNotValid.checkPositive(jobid, "jobid");
        ArgumentNotValid.checkNotNullOrEmpty(domain, "String domain");
        FileBatchJob urlsForDomainBatchJob = new HarvestedUrlsForDomainBatchJob(domain);
        urlsForDomainBatchJob.processOnlyFilesMatching(".*" + jobid + ".*" + metadatafile_suffix);
        return getResultFile(urlsForDomainBatchJob);
    }

    /**
     * Helper method to run a batch job and collect its result in a temporary file, sorted on crawl.log timestamp.
     *
     * @param batchJob a certain FileBatchJob
     * @return a temporary file with the sorted result; it is marked deleteOnExit, but should be deleted after use.
     */
    private static File getResultFile(FileBatchJob batchJob) {
        File f;
        File fsorted;
        try {
            final String uuid = UUID.randomUUID().toString();
            f = File.createTempFile("temp", uuid + ".txt", FileUtils.getTempDir());
            f.deleteOnExit();
            fsorted = File.createTempFile("temp", uuid + "-sorted.txt", FileUtils.getTempDir());
            fsorted.deleteOnExit();
        } catch (IOException e) {
            throw new IOFailure("Unable to create temporary file", e);
        }
        BatchStatus status = ArcRepositoryClientFactory.getViewerInstance().batch(batchJob,
                Settings.get(CommonSettings.USE_REPLICA_ID));
        status.getResultFile().copyTo(f);
        // Sort the raw batch output on the crawl.log timestamp, then discard the unsorted file.
        FileUtils.sortCrawlLogOnTimestamp(f, fsorted);
        FileUtils.remove(f);
        return fsorted;
    }

    /**
     * Return any crawl.log lines for a given jobid matching the given regular expression.
     *
     * @param jobid The jobid.
     * @param regexp A regular expression.
     * @return a File with the matching lines.
     * @throws ArgumentNotValid If jobid is 0 or negative, or if regexp is null or the empty string.
     */
    public static File getCrawlLoglinesMatchingRegexp(int jobid, String regexp) {
        ArgumentNotValid.checkPositive(jobid, "jobid");
        ArgumentNotValid.checkNotNullOrEmpty(regexp, "String regexp");
        FileBatchJob crawlLogBatchJob = new CrawlLogLinesMatchingRegexp(regexp);
        crawlLogBatchJob.processOnlyFilesMatching(".*" + jobid + ".*" + metadatafile_suffix);
        return getResultFile(crawlLogBatchJob);
    }

}