001/*
002 * #%L
003 * Netarchivesuite - harvester
004 * %%
005 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
006 *             the National Library of France and the Austrian National Library.
007 * %%
008 * This program is free software: you can redistribute it and/or modify
009 * it under the terms of the GNU Lesser General Public License as
010 * published by the Free Software Foundation, either version 2.1 of the
011 * License, or (at your option) any later version.
012 * 
013 * This program is distributed in the hope that it will be useful,
014 * but WITHOUT ANY WARRANTY; without even the implied warranty of
015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
016 * GNU General Lesser Public License for more details.
017 * 
018 * You should have received a copy of the GNU General Lesser Public
019 * License along with this program.  If not, see
020 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
021 * #L%
022 */
023
024package dk.netarkivet.viewerproxy.webinterface;
025
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.nio.charset.Charset;
import java.util.regex.Pattern;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import dk.netarkivet.common.CommonSettings;
import dk.netarkivet.common.Constants;
import dk.netarkivet.common.exceptions.ArgumentNotValid;
import dk.netarkivet.common.exceptions.IOFailure;
import dk.netarkivet.common.utils.Settings;
import dk.netarkivet.common.utils.archive.ArchiveBatchJob;
import dk.netarkivet.common.utils.archive.ArchiveRecordBase;
import dk.netarkivet.common.utils.batch.ArchiveBatchFilter;
042
043/**
044 * Batchjob that extracts lines from a crawl log matching a regular expression The batch job should be restricted to run
045 * on metadata files for a specific job only, using the {@link #processOnlyFilesMatching(String)} construct.
046 */
047@SuppressWarnings({"serial"})
048public class CrawlLogLinesMatchingRegexp extends ArchiveBatchJob {
049
050    /** The logger. */
051    //private final Log log = LogFactory.getLog(getClass().getName());
052    private static final Logger log = LoggerFactory.getLogger(CrawlLogLinesMatchingRegexp.class);
053
054    /** Metadata URL for crawl logs. */
055    private static final String SETUP_URL_FORMAT = String.format("metadata://%s/crawl/logs/crawl.log",
056            Settings.get(CommonSettings.ORGANIZATION));
057
058    /** The regular expression to match in the crawl.log line. */
059    private final String regexp;
060
061    /**
062     * Initialise the batch job.
063     *
064     * @param regexp The regexp to match in the crawl.log lines.
065     */
066    public CrawlLogLinesMatchingRegexp(String regexp) {
067        ArgumentNotValid.checkNotNullOrEmpty(regexp, "regexp");
068        this.regexp = regexp;
069
070        /**
071         * One week in milliseconds.
072         */
073        batchJobTimeout = 7 * Constants.ONE_DAY_IN_MILLIES;
074    }
075
076    /**
077     * Does nothing, no initialisation is needed.
078     *
079     * @param os Not used.
080     */
081    @Override
082    public void initialize(OutputStream os) {
083    }
084
085    @Override
086    public ArchiveBatchFilter getFilter() {
087        return new ArchiveBatchFilter("OnlyCrawlLog") {
088            public boolean accept(ArchiveRecordBase record) {
089                String URL = record.getHeader().getUrl();
090                if (URL == null) {
091                    return false;
092                } else {
093                    return URL.startsWith(SETUP_URL_FORMAT);
094                }
095            }
096        };
097    }
098
099    /**
100     * Process a record on crawl log concerning the given domain to result.
101     *
102     * @param record The record to process.
103     * @param os The output stream for the result.
104     * @throws ArgumentNotValid on null parameters
105     * @throws IOFailure on trouble processing the record.
106     */
107    @Override
108    public void processRecord(ArchiveRecordBase record, OutputStream os) {
109        ArgumentNotValid.checkNotNull(record, "ArchiveRecordBase record");
110        ArgumentNotValid.checkNotNull(os, "OutputStream os");
111        BufferedReader arcreader = new BufferedReader(new InputStreamReader(record.getInputStream()));
112        try {
113            for (String line = arcreader.readLine(); line != null; line = arcreader.readLine()) {
114                if (line.matches(regexp)) {
115                    os.write(line.getBytes("UTF-8"));
116                    os.write('\n');
117                }
118
119            }
120        } catch (IOException e) {
121            throw new IOFailure("Unable to process (w)arc record", e);
122        } finally {
123            try {
124                arcreader.close();
125            } catch (IOException e) {
126                log.warn("unable to close arcreader probably", e);
127            }
128        }
129    }
130
131    /**
132     * Does nothing, no finishing is needed.
133     *
134     * @param os Not used.
135     */
136    @Override
137    public void finish(OutputStream os) {
138    }
139
140    @Override
141    public String toString() {
142        return getClass().getName() + ", with arguments: Regexp = " + regexp + ", Filter = " + getFilter();
143    }
144}