/*
 * #%L
 * Netarchivesuite - harvester
 * %%
 * Copyright (C) 2005 - 2018 The Royal Danish Library,
 * the National Library of France and the Austrian National Library.
 * %%
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation, either version 2.1 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Lesser Public License for more details.
 *
 * You should have received a copy of the GNU General Lesser Public
 * License along with this program.  If not, see
 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
 * #L%
 */

package dk.netarkivet.viewerproxy.webinterface;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.nio.charset.StandardCharsets;
import java.util.regex.Pattern;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import dk.netarkivet.common.CommonSettings;
import dk.netarkivet.common.Constants;
import dk.netarkivet.common.exceptions.ArgumentNotValid;
import dk.netarkivet.common.exceptions.IOFailure;
import dk.netarkivet.common.utils.Settings;
import dk.netarkivet.common.utils.archive.ArchiveBatchJob;
import dk.netarkivet.common.utils.archive.ArchiveRecordBase;
import dk.netarkivet.common.utils.batch.ArchiveBatchFilter;

/**
 * Batch job that extracts the lines of a crawl log that match a regular expression. The batch job should be
 * restricted to run on metadata files for a specific job only, using the {@link #processOnlyFilesMatching(String)}
 * construct.
 * <p>
 * A line is written to the result only if the <em>entire</em> line matches the regular expression (the semantics of
 * {@link String#matches(String)}). Matching lines are written as UTF-8, each terminated by a single {@code '\n'}.
 */
@SuppressWarnings({"serial"})
public class CrawlLogLinesMatchingRegexp extends ArchiveBatchJob {

    /** The logger. */
    private static final Logger log = LoggerFactory.getLogger(CrawlLogLinesMatchingRegexp.class);

    /** Metadata URL prefix identifying crawl-log records; built from the configured organization setting. */
    private static final String SETUP_URL_FORMAT = String.format("metadata://%s/crawl/logs/crawl.log",
            Settings.get(CommonSettings.ORGANIZATION));

    /**
     * The regular expression to match in the crawl.log line. Kept as a String (rather than a compiled pattern) so
     * the set of serialized fields of this job is unchanged.
     */
    private final String regexp;

    /**
     * Initialise the batch job.
     *
     * @param regexp The regexp to match in the crawl.log lines.
     * @throws ArgumentNotValid if regexp is null or empty.
     */
    public CrawlLogLinesMatchingRegexp(String regexp) {
        ArgumentNotValid.checkNotNullOrEmpty(regexp, "regexp");
        this.regexp = regexp;

        // One week in milliseconds.
        batchJobTimeout = 7 * Constants.ONE_DAY_IN_MILLIES;
    }

    /**
     * Does nothing, no initialisation is needed.
     *
     * @param os Not used.
     */
    @Override
    public void initialize(OutputStream os) {
    }

    /**
     * Returns a filter that only accepts crawl-log records.
     *
     * @return a filter named "OnlyCrawlLog" accepting records whose header URL starts with the crawl-log metadata
     * URL prefix; records without a URL are rejected.
     */
    @Override
    public ArchiveBatchFilter getFilter() {
        return new ArchiveBatchFilter("OnlyCrawlLog") {
            public boolean accept(ArchiveRecordBase record) {
                String url = record.getHeader().getUrl();
                return url != null && url.startsWith(SETUP_URL_FORMAT);
            }
        };
    }

    /**
     * Copies every line of the given crawl-log record that fully matches the regular expression to the result
     * stream.
     *
     * @param record The record to process.
     * @param os The output stream for the result.
     * @throws ArgumentNotValid on null parameters
     * @throws IOFailure on trouble processing the record.
     */
    @Override
    public void processRecord(ArchiveRecordBase record, OutputStream os) {
        ArgumentNotValid.checkNotNull(record, "ArchiveRecordBase record");
        ArgumentNotValid.checkNotNull(os, "OutputStream os");
        // Decode with the same charset the result is encoded in; relying on the platform default would corrupt
        // non-ASCII characters on hosts whose default charset is not UTF-8.
        BufferedReader arcreader = new BufferedReader(
                new InputStreamReader(record.getInputStream(), StandardCharsets.UTF_8));
        try {
            // Compile once per record instead of once per line (String.matches recompiles on every call).
            // Note: an invalid regexp is therefore reported even when the record contains no lines.
            Pattern linePattern = Pattern.compile(regexp);
            String line;
            while ((line = arcreader.readLine()) != null) {
                // matches() requires the whole line to match, exactly like String.matches(regexp).
                if (linePattern.matcher(line).matches()) {
                    os.write(line.getBytes(StandardCharsets.UTF_8));
                    os.write('\n');
                }
            }
        } catch (IOException e) {
            throw new IOFailure("Unable to process (w)arc record", e);
        } finally {
            // Closing is best effort: a failure here must not mask the outcome of the processing above.
            try {
                arcreader.close();
            } catch (IOException e) {
                log.warn("unable to close arcreader probably", e);
            }
        }
    }

    /**
     * Does nothing, no finishing is needed.
     *
     * @param os Not used.
     */
    @Override
    public void finish(OutputStream os) {
    }

    /**
     * Describes this job by class name, regular expression and filter.
     *
     * @return a human-readable description of the job and its arguments.
     */
    @Override
    public String toString() {
        return getClass().getName() + ", with arguments: Regexp = " + regexp + ", Filter = " + getFilter();
    }
}