/*
 * #%L
 * Netarchivesuite - harvester
 * %%
 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
 *             the National Library of France and the Austrian National Library.
 * %%
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation, either version 2.1 of the
 * License, or (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 * 
 * You should have received a copy of the GNU Lesser General Public
 * License along with this program.  If not, see
 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
 * #L%
 */

package dk.netarkivet.viewerproxy.webinterface;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStream;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import dk.netarkivet.common.CommonSettings;
import dk.netarkivet.common.Constants;
import dk.netarkivet.common.exceptions.ArgumentNotValid;
import dk.netarkivet.common.exceptions.IOFailure;
import dk.netarkivet.common.utils.DomainUtils;
import dk.netarkivet.common.utils.FixedUURI;
import dk.netarkivet.common.utils.Settings;
import dk.netarkivet.common.utils.archive.ArchiveBatchJob;
import dk.netarkivet.common.utils.archive.ArchiveRecordBase;
import dk.netarkivet.common.utils.batch.ArchiveBatchFilter;

/**
 * Batchjob that extracts lines referring to a specific domain from a crawl log. The batch job should be restricted to
 * run on metadata files for a specific job only, using the {@link #processOnlyFilesMatching(String)} construct.
 */
@SuppressWarnings({"serial"})
public class HarvestedUrlsForDomainBatchJob extends ArchiveBatchJob {

    /** The logger for this class. */
    private static final Logger log = LoggerFactory.getLogger(HarvestedUrlsForDomainBatchJob.class);

    /** Metadata URL for crawl logs. */
    private static final String SETUP_URL_FORMAT = String.format("metadata://%s/crawl/logs/crawl.log",
            Settings.get(CommonSettings.ORGANIZATION));
    /** The domain to extract crawl.log lines for. */
    final String domain;

    /**
     * Initialise the batch job.
     *
     * @param domain The domain to get crawl.log lines for.
     */
    public HarvestedUrlsForDomainBatchJob(String domain) {
        ArgumentNotValid.checkNotNullOrEmpty(domain, "domain");
        this.domain = domain;

        // Seven days (one week) in milliseconds.
        batchJobTimeout = 7 * Constants.ONE_DAY_IN_MILLIES;
    }
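
    // Typical usage (a sketch, not part of this class's API): restrict the job
    // to the metadata file(s) of a single harvest job before submitting it, via
    // the inherited processOnlyFilesMatching(String). The file-name pattern below
    // is an assumption about the local metadata naming convention:
    //
    //   HarvestedUrlsForDomainBatchJob job = new HarvestedUrlsForDomainBatchJob("example.dk");
    //   job.processOnlyFilesMatching("42-metadata-.*\\.(w)?arc(\\.gz)?");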

    /**
     * Does nothing, no initialisation is needed.
     *
     * @param os Not used.
     */
    @Override
    public void initialize(OutputStream os) {
    }

    @Override
    public ArchiveBatchFilter getFilter() {
        return new ArchiveBatchFilter("OnlyCrawlLog") {
            @Override
            public boolean accept(ArchiveRecordBase record) {
                if (record.bIsWarc) {
                    // In a WARC metadata file the warcinfo record has no URL,
                    // while all the other records do, so check for null first.
                    return (record.getHeader().getUrl() != null && record.getHeader().getUrl()
                            .startsWith(SETUP_URL_FORMAT));
                } else {
                    // All ARC records have a URL.
                    return record.getHeader().getUrl().startsWith(SETUP_URL_FORMAT);
                }
            }
        };
    }
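
    // For illustration: the prefix checked above expands to something like
    // "metadata://netarkivet.dk/crawl/logs/crawl.log" (the host part is whatever
    // CommonSettings.ORGANIZATION is configured to), so the filter accepts
    // exactly the crawl-log records of a metadata file.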

    /**
     * Process a crawl-log record, writing the lines that concern the given domain to the output stream.
     *
     * @param record The record to process.
     * @param os The output stream for the result.
     * @throws ArgumentNotValid on null parameters
     * @throws IOFailure on trouble processing the record.
     */
    @Override
    public void processRecord(ArchiveRecordBase record, OutputStream os) {
        ArgumentNotValid.checkNotNull(record, "ArchiveRecordBase record");
        ArgumentNotValid.checkNotNull(os, "OutputStream os");
        log.info("Looking for crawl-log lines for domain: {}", domain);

        BufferedReader arcreader = new BufferedReader(new InputStreamReader(record.getInputStream()));
        String line = null;
        try {
            for (line = arcreader.readLine(); line != null; line = arcreader.readLine()) {


                // Parse a single crawl-log line into whitespace-separated parts:
                // part 4 (index 3) is the URL of the harvested resource, and
                // part 6 (index 5) is the URL it was discovered from. Cf.
                // "http://crawler.archive.org/articles/user_manual/analysis.html#logs".
                String[] parts = line.split("\\s+");
                final int URL_PART_INDEX = 3;
                final int DISCOVERY_URL_PART_INDEX = 5;
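
                // For illustration, a Heritrix crawl-log line looks roughly like
                // this (field values here are made up):
                //   2014-02-17T10:20:30.042Z 200 5678 http://www.example.dk/page LLE http://www.example.dk/ text/html ...
                // so parts[URL_PART_INDEX] is the fetched URL and
                // parts[DISCOVERY_URL_PART_INDEX] is the "via" URL.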
                // The current crawl-log line is written to the output stream in two cases:
                // A. it has a URL component and that URL belongs to the domain in question, or
                // B. it has a discovery URL and that URL belongs to the domain in question.
                if (parts.length > URL_PART_INDEX && getDomainFromUrlPart(parts[URL_PART_INDEX]).equals(domain)) {
                    os.write(line.getBytes("UTF-8"));
                    os.write('\n');
                } else if (parts.length > DISCOVERY_URL_PART_INDEX && !parts[DISCOVERY_URL_PART_INDEX].equals("-")
                        && getDomainFromUrlPart(parts[DISCOVERY_URL_PART_INDEX]).equals(domain)) {
                    os.write(line.getBytes("UTF-8"));
                    os.write('\n');
                }

            }
        } catch (IOException e) {
            throw new IOFailure("Unable to process (w)arc record", e);
        } catch (Throwable e1) {
            // Log instead of aborting the whole batch job on a malformed crawl-log line;
            // the remainder of this record is skipped.
            log.warn("Error processing crawl-log line '{}'", line, e1);
        } finally {
            try {
                arcreader.close();
            } catch (IOException e) {
                log.warn("Unable to close arcreader properly", e);
            }
        }
    }

    /**
     * Return the domain from the given URL part, if feasible. Return the empty string otherwise.
     *
     * @param urlpart One of the URL parts of the crawl-log line.
     * @return the domain from the URL part, if feasible; the empty string otherwise
     */
    private String getDomainFromUrlPart(String urlpart) {
        String domain = null;
        try {
            domain = DomainUtils.domainNameFromHostname(new FixedUURI(urlpart, true).getReferencedHost());
        } catch (Exception e) {
            log.warn("Unable to extract a domain name from the url '{}' due to exception", urlpart, e);
        }
        if (domain == null) {
            domain = "";
        }
        return domain;
    }
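
    // For illustration (assuming "dk" is known as a top-level domain in the
    // DomainUtils configuration): getDomainFromUrlPart("http://www.sub.example.dk/index.html")
    // yields "example.dk", while a malformed URL part is logged and yields "".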

    /**
     * Does nothing, no finishing is needed.
     *
     * @param os Not used.
     */
    @Override
    public void finish(OutputStream os) {
    }

    /**
     * Human-readable representation of this instance.
     *
     * @return The class content.
     */
    @Override
    public String toString() {
        return getClass().getName() + ", with arguments: Domain = " + domain + ", Filter = " + getFilter();
    }
}