/*
 * #%L
 * Netarchivesuite - harvester
 * %%
 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
 * the National Library of France and the Austrian National Library.
 * %%
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation, either version 2.1 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Lesser Public License for more details.
 *
 * You should have received a copy of the GNU General Lesser Public
 * License along with this program. If not, see
 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
 * #L%
 */
package dk.netarkivet.viewerproxy.webinterface;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.UUID;

import org.apache.commons.io.IOUtils;

import dk.netarkivet.common.CommonSettings;
import dk.netarkivet.common.distribute.arcrepository.ArcRepositoryClientFactory;
import dk.netarkivet.common.distribute.arcrepository.BatchStatus;
import dk.netarkivet.common.exceptions.ArgumentNotValid;
import dk.netarkivet.common.exceptions.IOFailure;
import dk.netarkivet.common.utils.FileUtils;
import dk.netarkivet.common.utils.Settings;
import dk.netarkivet.common.utils.batch.ArchiveBatchFilter;
import dk.netarkivet.common.utils.batch.FileBatchJob;
import dk.netarkivet.common.utils.batch.FileListJob;
import dk.netarkivet.common.utils.cdx.ArchiveExtractCDXJob;
import dk.netarkivet.common.utils.cdx.CDXRecord;

/**
 * Methods for generating the batch results needed by the QA pages.
 */
@SuppressWarnings({"serial"})
public class Reporting {
    /**
     * Utility class, do not instantiate.
     */
    private Reporting() {
    }

    /** Regexp matching the suffix of the data arc/warc files produced by Heritrix. */
    static final String archivefile_suffix = ".*\\.(w)?arc(\\.gz)?";

    /** Regexp matching the suffix of the metadata arc/warc files created by NetarchiveSuite. */
    static final String metadatafile_suffix = "-metadata-[0-9]+\\.(w)?arc(\\.gz)?";

    /**
     * Submit a batch job to list all files for a job, and report the result as a sorted list without duplicates.
     *
     * @param jobid The job to get files for.
     * @param harvestprefix The filename prefix of the (w)arc files produced by Heritrix for the job.
     * @return A sorted list of files.
     * @throws ArgumentNotValid If jobid is 0 or negative.
     * @throws IOFailure On trouble generating the file list.
     */
    public static List<String> getFilesForJob(int jobid, String harvestprefix) {
        ArgumentNotValid.checkPositive(jobid, "jobid");
        FileBatchJob fileListJob = new FileListJob();
        List<String> acceptedPatterns = new ArrayList<String>();
        acceptedPatterns.add(".*" + jobid + ".*" + metadatafile_suffix);
        acceptedPatterns.add(harvestprefix + archivefile_suffix);
        fileListJob.processOnlyFilesMatching(acceptedPatterns);

        File f;
        try {
            f = File.createTempFile(jobid + "-files", ".txt", FileUtils.getTempDir());
        } catch (IOException e) {
            throw new IOFailure("Could not create temporary file", e);
        }
        BatchStatus status = ArcRepositoryClientFactory.getViewerInstance().batch(fileListJob,
                Settings.get(CommonSettings.USE_REPLICA_ID));
        status.getResultFile().copyTo(f);
        List<String> lines = new ArrayList<String>(FileUtils.readListFromFile(f));
        FileUtils.remove(f);
        // Remove duplicate filenames from the batch output, then return the names in sorted order.
        Set<String> linesAsSet = new HashSet<String>();
        linesAsSet.addAll(lines);
        lines = new ArrayList<String>();
        lines.addAll(linesAsSet);
        Collections.sort(lines);
        return lines;
    }
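
    /*
     * Illustrative usage (a sketch, not part of the original class): a QA page that knows a job's
     * id and the filename prefix Heritrix used for that job's (w)arc files could list the job's
     * files like this. The job id 42 and the prefix "42-1" are made-up example values; the real
     * prefix comes from the harvest definition.
     *
     *     List<String> jobFiles = Reporting.getFilesForJob(42, "42-1");
     *     for (String filename : jobFiles) {
     *         // e.g. render one link per archive/metadata file on the QA page
     *     }
     */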

    /**
     * Submit a batch job to generate cdx for all metadata files for a job, and report the result in a list.
     *
     * @param jobid The job to get cdx for.
     * @return A list of cdx records.
     * @throws ArgumentNotValid If jobid is 0 or negative.
     * @throws IOFailure On trouble generating the cdx.
     */
    public static List<CDXRecord> getMetadataCDXRecordsForJob(long jobid) {
        ArgumentNotValid.checkPositive(jobid, "jobid");
        // Extract CDX entries from the job's metadata files, overriding the default filter
        // so that non-warcinfo records are excluded.
        FileBatchJob cdxJob = new ArchiveExtractCDXJob(false) {
            @Override
            public ArchiveBatchFilter getFilter() {
                return ArchiveBatchFilter.EXCLUDE_NON_WARCINFO_RECORDS;
            }
        };
        cdxJob.processOnlyFilesMatching(".*" + jobid + ".*" + metadatafile_suffix);

        File f;
        try {
            f = File.createTempFile(jobid + "-reports", ".cdx", FileUtils.getTempDir());
        } catch (IOException e) {
            throw new IOFailure("Could not create temporary file", e);
        }
        BatchStatus status = ArcRepositoryClientFactory.getViewerInstance().batch(cdxJob,
                Settings.get(CommonSettings.USE_REPLICA_ID));
        status.getResultFile().copyTo(f);
        List<CDXRecord> records;
        BufferedReader reader = null;
        try {
            reader = new BufferedReader(new FileReader(f));
            records = new ArrayList<CDXRecord>();
            // Each line of the batch output is a whitespace-separated CDX line; parse it into a record.
            for (String line = reader.readLine(); line != null; line = reader.readLine()) {
                String[] parts = line.split("\\s+");
                CDXRecord record = new CDXRecord(parts);
                records.add(record);
            }
        } catch (IOException e) {
            throw new IOFailure("Unable to read results from file '" + f + "'", e);
        } finally {
            IOUtils.closeQuietly(reader);
            FileUtils.remove(f);
        }
        return records;
    }
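
    /*
     * Illustrative usage (a sketch, not part of the original class): a QA page could fetch the CDX
     * records extracted from a job's metadata files and render one table row per record. The job
     * id 42 is a made-up example value; how each record is displayed is up to the caller.
     *
     *     List<CDXRecord> cdxRecords = Reporting.getMetadataCDXRecordsForJob(42);
     *     for (CDXRecord record : cdxRecords) {
     *         // e.g. show the harvested URL and its location in the archive file
     *     }
     */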

    /**
     * Submit a batch job to extract the part of a crawl log that is associated with the given domain and job.
     *
     * @param domain The domain to get crawl.log lines for.
     * @param jobid The jobid to get the crawl.log lines for.
     * @return A file containing the crawl.log lines. This file is temporary, and should be deleted after use.
     * @throws ArgumentNotValid If jobid is 0 or negative, or if domain is null or the empty string.
     */
    public static File getCrawlLogForDomainInJob(String domain, int jobid) {
        ArgumentNotValid.checkPositive(jobid, "jobid");
        ArgumentNotValid.checkNotNullOrEmpty(domain, "String domain");
        FileBatchJob urlsForDomainBatchJob = new HarvestedUrlsForDomainBatchJob(domain);
        urlsForDomainBatchJob.processOnlyFilesMatching(".*" + jobid + ".*" + metadatafile_suffix);
        return getResultFile(urlsForDomainBatchJob);
    }

    /**
     * Helper method to run a batch job and collect its result in a temporary file, sorted on crawl.log timestamp.
     *
     * @param batchJob a certain FileBatchJob
     * @return a temporary file with the sorted result; it is marked deleteOnExit, but should be deleted after use.
     */
    private static File getResultFile(FileBatchJob batchJob) {
        File f;
        File fsorted;
        try {
            final String uuid = UUID.randomUUID().toString();
            f = File.createTempFile("temp", uuid + ".txt", FileUtils.getTempDir());
            f.deleteOnExit();
            fsorted = File.createTempFile("temp", uuid + "-sorted.txt", FileUtils.getTempDir());
            fsorted.deleteOnExit();
        } catch (IOException e) {
            throw new IOFailure("Unable to create temporary file", e);
        }
        BatchStatus status = ArcRepositoryClientFactory.getViewerInstance().batch(batchJob,
                Settings.get(CommonSettings.USE_REPLICA_ID));
        status.getResultFile().copyTo(f);
        // Sort the raw batch output on the crawl.log timestamp, then discard the unsorted file.
        FileUtils.sortCrawlLogOnTimestamp(f, fsorted);
        FileUtils.remove(f);
        return fsorted;
    }

    /**
     * Return any crawl.log lines for a given jobid matching the given regular expression.
     *
     * @param jobid The jobid.
     * @param regexp A regular expression.
     * @return a File with the matching lines.
     * @throws ArgumentNotValid If jobid is 0 or negative, or if regexp is null or the empty string.
     */
    public static File getCrawlLoglinesMatchingRegexp(int jobid, String regexp) {
        ArgumentNotValid.checkPositive(jobid, "jobid");
        ArgumentNotValid.checkNotNullOrEmpty(regexp, "String regexp");
        FileBatchJob crawlLogBatchJob = new CrawlLogLinesMatchingRegexp(regexp);
        crawlLogBatchJob.processOnlyFilesMatching(".*" + jobid + ".*" + metadatafile_suffix);
        return getResultFile(crawlLogBatchJob);
    }

}