001/* 002 * #%L 003 * Netarchivesuite - harvester 004 * %% 005 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library, 006 * the National Library of France and the Austrian National Library. 007 * %% 008 * This program is free software: you can redistribute it and/or modify 009 * it under the terms of the GNU Lesser General Public License as 010 * published by the Free Software Foundation, either version 2.1 of the 011 * License, or (at your option) any later version. 012 * 013 * This program is distributed in the hope that it will be useful, 014 * but WITHOUT ANY WARRANTY; without even the implied warranty of 015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 016 * GNU General Lesser Public License for more details. 017 * 018 * You should have received a copy of the GNU General Lesser Public 019 * License along with this program. If not, see 020 * <http://www.gnu.org/licenses/lgpl-2.1.html>. 021 * #L% 022 */ 023 024package dk.netarkivet.viewerproxy.webinterface; 025 026import java.io.BufferedReader; 027import java.io.IOException; 028import java.io.InputStreamReader; 029import java.io.OutputStream; 030 031import org.slf4j.Logger; 032import org.slf4j.LoggerFactory; 033 034import dk.netarkivet.common.CommonSettings; 035import dk.netarkivet.common.Constants; 036import dk.netarkivet.common.exceptions.ArgumentNotValid; 037import dk.netarkivet.common.exceptions.IOFailure; 038import dk.netarkivet.common.utils.Settings; 039import dk.netarkivet.common.utils.archive.ArchiveBatchJob; 040import dk.netarkivet.common.utils.archive.ArchiveRecordBase; 041import dk.netarkivet.common.utils.batch.ArchiveBatchFilter; 042 043/** 044 * Batchjob that extracts lines from a crawl log matching a regular expression The batch job should be restricted to run 045 * on metadata files for a specific job only, using the {@link #processOnlyFilesMatching(String)} construct. 046 */ 047@SuppressWarnings({"serial"}) 048public class CrawlLogLinesMatchingRegexp extends ArchiveBatchJob { 049 050 /** The logger. */ 051 //private final Log log = LogFactory.getLog(getClass().getName()); 052 private static final Logger log = LoggerFactory.getLogger(CrawlLogLinesMatchingRegexp.class); 053 054 /** Metadata URL for crawl logs. */ 055 private static final String SETUP_URL_FORMAT = String.format("metadata://%s/crawl/logs/crawl.log", 056 Settings.get(CommonSettings.ORGANIZATION)); 057 058 /** The regular expression to match in the crawl.log line. */ 059 private final String regexp; 060 061 /** 062 * Initialise the batch job. 063 * 064 * @param regexp The regexp to match in the crawl.log lines. 065 */ 066 public CrawlLogLinesMatchingRegexp(String regexp) { 067 ArgumentNotValid.checkNotNullOrEmpty(regexp, "regexp"); 068 this.regexp = regexp; 069 070 /** 071 * One week in milliseconds. 072 */ 073 batchJobTimeout = 7 * Constants.ONE_DAY_IN_MILLIES; 074 } 075 076 /** 077 * Does nothing, no initialisation is needed. 078 * 079 * @param os Not used. 080 */ 081 @Override 082 public void initialize(OutputStream os) { 083 } 084 085 @Override 086 public ArchiveBatchFilter getFilter() { 087 return new ArchiveBatchFilter("OnlyCrawlLog") { 088 public boolean accept(ArchiveRecordBase record) { 089 String URL = record.getHeader().getUrl(); 090 if (URL == null) { 091 return false; 092 } else { 093 return URL.startsWith(SETUP_URL_FORMAT); 094 } 095 } 096 }; 097 } 098 099 /** 100 * Process a record on crawl log concerning the given domain to result. 101 * 102 * @param record The record to process. 103 * @param os The output stream for the result. 104 * @throws ArgumentNotValid on null parameters 105 * @throws IOFailure on trouble processing the record. 106 */ 107 @Override 108 public void processRecord(ArchiveRecordBase record, OutputStream os) { 109 ArgumentNotValid.checkNotNull(record, "ArchiveRecordBase record"); 110 ArgumentNotValid.checkNotNull(os, "OutputStream os"); 111 BufferedReader arcreader = new BufferedReader(new InputStreamReader(record.getInputStream())); 112 try { 113 for (String line = arcreader.readLine(); line != null; line = arcreader.readLine()) { 114 if (line.matches(regexp)) { 115 os.write(line.getBytes("UTF-8")); 116 os.write('\n'); 117 } 118 119 } 120 } catch (IOException e) { 121 throw new IOFailure("Unable to process (w)arc record", e); 122 } finally { 123 try { 124 arcreader.close(); 125 } catch (IOException e) { 126 log.warn("unable to close arcreader probably", e); 127 } 128 } 129 } 130 131 /** 132 * Does nothing, no finishing is needed. 133 * 134 * @param os Not used. 135 */ 136 @Override 137 public void finish(OutputStream os) { 138 } 139 140 @Override 141 public String toString() { 142 return getClass().getName() + ", with arguments: Regexp = " + regexp + ", Filter = " + getFilter(); 143 } 144}