/*
 * #%L
 * Netarchivesuite - harvester
 * %%
 * Copyright (C) 2005 - 2018 The Royal Danish Library,
 * the National Library of France and the Austrian National Library.
 * %%
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation, either version 2.1 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this program. If not, see
 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
 * #L%
 */
package dk.netarkivet.viewerproxy.webinterface;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.UUID;

import org.apache.commons.io.IOUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import dk.netarkivet.common.CommonSettings;
import dk.netarkivet.common.distribute.arcrepository.ArcRepositoryClientFactory;
import dk.netarkivet.common.distribute.arcrepository.BatchStatus;
import dk.netarkivet.common.exceptions.ArgumentNotValid;
import dk.netarkivet.common.exceptions.IOFailure;
import dk.netarkivet.common.utils.FileUtils;
import dk.netarkivet.common.utils.Settings;
import dk.netarkivet.common.utils.batch.ArchiveBatchFilter;
import dk.netarkivet.common.utils.batch.FileBatchJob;
import dk.netarkivet.common.utils.batch.FileListJob;
import dk.netarkivet.common.utils.cdx.ArchiveExtractCDXJob;
import dk.netarkivet.common.utils.cdx.CDXRecord;

/**
 * Methods for generating the batch results needed by the QA pages.
 */
@SuppressWarnings({"serial"})
public class Reporting {

    private static final Logger log = LoggerFactory.getLogger(Reporting.class);

    /**
     * Utility class, do not initialise.
     */
    private Reporting() {
    }

    /** Regex suffix matching the data arc/warc files produced by Heritrix; appended to a harvest prefix. */
    static final String archivefile_suffix = ".*\\.(w)?arc(\\.gz)?";

    /** Regex suffix matching the arc/warc metadata files created by NetarchiveSuite. */
    static final String metadatafile_suffix = "-metadata-[0-9]+\\.(w)?arc(\\.gz)?";
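
    // Illustrative examples of what the two patterns match (the concrete file names below are
    // made up for illustration, assuming typical NetarchiveSuite/Heritrix naming): with jobid 42
    // and harvestprefix "42-117", the archive pattern matches e.g.
    // "42-117-20180101120000-00000.warc.gz", while the metadata pattern built by
    // getMetadataFilePatternForJobId below matches e.g. "42-metadata-1.warc.gz".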

    /**
     * Submit a batch job to list all files for a job, and report the result as a sorted list.
     *
     * @param jobid The job to get files for.
     * @param harvestprefix The harvest prefix for the files produced by Heritrix.
     * @return A sorted list of files.
     * @throws ArgumentNotValid If jobid is 0 or negative.
     * @throws IOFailure On trouble generating the file list.
     */
    public static List<String> getFilesForJob(long jobid, String harvestprefix) {
        ArgumentNotValid.checkPositive(jobid, "jobid");
        FileBatchJob fileListJob = new FileListJob();
        List<String> acceptedPatterns = new ArrayList<String>();
        final String metadataFilePatternForJobId = getMetadataFilePatternForJobId(jobid);
        log.debug("Finding files matching pattern {}.", metadataFilePatternForJobId);
        acceptedPatterns.add(metadataFilePatternForJobId);
        acceptedPatterns.add(harvestprefix + archivefile_suffix);
        fileListJob.processOnlyFilesMatching(acceptedPatterns);

        File f;
        try {
            f = File.createTempFile(jobid + "-files", ".txt", FileUtils.getTempDir());
        } catch (IOException e) {
            throw new IOFailure("Could not create temporary file", e);
        }
        BatchStatus status = ArcRepositoryClientFactory.getViewerInstance().batch(fileListJob,
                Settings.get(CommonSettings.USE_REPLICA_ID));
        status.getResultFile().copyTo(f);
        // Deduplicate the returned file names, then sort them.
        List<String> lines = new ArrayList<String>(new HashSet<String>(FileUtils.readListFromFile(f)));
        FileUtils.remove(f);
        Collections.sort(lines);
        return lines;
    }

    /**
     * Submit a batch job to generate cdx for all metadata files for a job, and report the result as a list.
     *
     * @param jobid The job to get cdx for.
     * @return A list of cdx records.
     * @throws ArgumentNotValid If jobid is 0 or negative.
     * @throws IOFailure On trouble generating the cdx.
     */
    public static List<CDXRecord> getMetadataCDXRecordsForJob(long jobid) {
        ArgumentNotValid.checkPositive(jobid, "jobid");
        FileBatchJob cdxJob = new ArchiveExtractCDXJob(false) {
            @Override
            public ArchiveBatchFilter getFilter() {
                return ArchiveBatchFilter.EXCLUDE_NON_WARCINFO_RECORDS;
            }
        };
        String metadataFileSearchPattern = getMetadataFilePatternForJobId(jobid);
        cdxJob.processOnlyFilesMatching(metadataFileSearchPattern);

        File f;
        try {
            f = File.createTempFile(jobid + "-reports", ".cdx", FileUtils.getTempDir());
        } catch (IOException e) {
            throw new IOFailure("Could not create temporary file", e);
        }
        BatchStatus status = ArcRepositoryClientFactory.getViewerInstance().batch(cdxJob,
                Settings.get(CommonSettings.USE_REPLICA_ID));
        status.getResultFile().copyTo(f);
        List<CDXRecord> records;
        BufferedReader reader = null;
        try {
            reader = new BufferedReader(new FileReader(f));
            records = new ArrayList<CDXRecord>();
            for (String line = reader.readLine(); line != null; line = reader.readLine()) {
                String[] parts = line.split("\\s+");
                CDXRecord record = new CDXRecord(parts);
                records.add(record);
            }
        } catch (IOException e) {
            throw new IOFailure("Unable to read results from file '" + f + "'", e);
        } finally {
            IOUtils.closeQuietly(reader);
            FileUtils.remove(f);
        }
        return records;
    }
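
    // Illustrative usage (a sketch; the job id and harvest prefix below are made up): a QA page
    // can combine the two lookups above, e.g.
    //
    //   List<String> files = Reporting.getFilesForJob(42L, "42-117");
    //   List<CDXRecord> cdxRecords = Reporting.getMetadataCDXRecordsForJob(42L);
    //
    // Both calls run a batch job against the replica configured in CommonSettings.USE_REPLICA_ID
    // via the viewer arc repository client, so they may take a while on large archives.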

    /**
     * Submit a batch job to extract the part of a crawl log that is associated with the given domain and job.
     *
     * @param domain The domain to get crawl.log lines for.
     * @param jobid The jobid to get the crawl.log lines for.
     * @return A file containing the crawl.log lines. This file is temporary, and should be deleted after use.
     * @throws ArgumentNotValid If jobid is 0 or negative, or if domain is null or the empty string.
     */
    public static File getCrawlLogForDomainInJob(String domain, long jobid) {
        ArgumentNotValid.checkPositive(jobid, "jobid");
        ArgumentNotValid.checkNotNullOrEmpty(domain, "String domain");
        FileBatchJob urlsForDomainBatchJob = new HarvestedUrlsForDomainBatchJob(domain);
        urlsForDomainBatchJob.processOnlyFilesMatching(getMetadataFilePatternForJobId(jobid));
        return getResultFile(urlsForDomainBatchJob);
    }

    /**
     * Helper method to get the result from a batch job.
     *
     * @param batchJob a given FileBatchJob
     * @return a file with the result, sorted on crawl-log timestamp.
     */
    private static File getResultFile(FileBatchJob batchJob) {
        File f;
        File fsorted;
        try {
            final String uuid = UUID.randomUUID().toString();
            f = File.createTempFile("temp", uuid + ".txt", FileUtils.getTempDir());
            f.deleteOnExit();
            fsorted = File.createTempFile("temp", uuid + "-sorted.txt", FileUtils.getTempDir());
            fsorted.deleteOnExit();
        } catch (IOException e) {
            throw new IOFailure("Unable to create temporary file", e);
        }
        BatchStatus status = ArcRepositoryClientFactory.getViewerInstance().batch(batchJob,
                Settings.get(CommonSettings.USE_REPLICA_ID));
        status.getResultFile().copyTo(f);
        FileUtils.sortCrawlLogOnTimestamp(f, fsorted);
        FileUtils.remove(f);
        return fsorted;
    }

    /**
     * Return any crawl-log lines for a given jobid matching the given regular expression.
     *
     * @param jobid The jobid.
     * @param regexp A regular expression.
     * @return a File with the matching lines. This file is temporary, and should be deleted after use.
     * @throws ArgumentNotValid If jobid is 0 or negative, or if regexp is null or the empty string.
     */
    public static File getCrawlLoglinesMatchingRegexp(long jobid, String regexp) {
        ArgumentNotValid.checkPositive(jobid, "jobid");
        ArgumentNotValid.checkNotNullOrEmpty(regexp, "String regexp");
        FileBatchJob crawlLogBatchJob = new CrawlLogLinesMatchingRegexp(regexp);
        crawlLogBatchJob.processOnlyFilesMatching(getMetadataFilePatternForJobId(jobid));
        return getResultFile(crawlLogBatchJob);
    }

    /**
     * Construct the correct metadata-file pattern for a given job ID.
     *
     * @param jobid a given harvest job ID
     * @return the metadata-file pattern for the given jobid
     */
    private static String getMetadataFilePatternForJobId(long jobid) {
        // Earlier, invalid patterns, kept for reference:
        // return ".*" + jobid + ".*" + metadatafile_suffix;
        // return jobid + metadatafile_suffix;
        return "(.*-)?" + jobid + "(-.*)?" + metadatafile_suffix;
    }
}
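
// Illustrative usage of the crawl-log extraction methods (a sketch; the domain, regex and job id
// below are made up). Both methods return temporary, timestamp-sorted files that the caller is
// expected to delete after use:
//
//   File domainLog = Reporting.getCrawlLogForDomainInJob("netarkivet.dk", 42L);
//   File matchingLines = Reporting.getCrawlLoglinesMatchingRegexp(42L, ".*404.*");
//   try {
//       // ... render the lines on a QA page ...
//   } finally {
//       FileUtils.remove(domainLog);
//       FileUtils.remove(matchingLines);
//   }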