/*
 * #%L
 * Netarchivesuite - harvester
 * %%
 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
 * the National Library of France and the Austrian National Library.
 * %%
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation, either version 2.1 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Lesser Public License for more details.
 *
 * You should have received a copy of the GNU General Lesser Public
 * License along with this program. If not, see
 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
 * #L%
 */

package dk.netarkivet.viewerproxy.webinterface;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.nio.charset.StandardCharsets;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import dk.netarkivet.common.CommonSettings;
import dk.netarkivet.common.Constants;
import dk.netarkivet.common.exceptions.ArgumentNotValid;
import dk.netarkivet.common.exceptions.IOFailure;
import dk.netarkivet.common.utils.DomainUtils;
import dk.netarkivet.common.utils.FixedUURI;
import dk.netarkivet.common.utils.Settings;
import dk.netarkivet.common.utils.archive.ArchiveBatchJob;
import dk.netarkivet.common.utils.archive.ArchiveRecordBase;
import dk.netarkivet.common.utils.batch.ArchiveBatchFilter;

/**
 * Batchjob that extracts lines referring to a specific domain from a crawl log. The batch job should be restricted to
 * run on metadata files for a specific job only, using the {@link #processOnlyFilesMatching(String)} construct.
 */
@SuppressWarnings({"serial"})
public class HarvestedUrlsForDomainBatchJob extends ArchiveBatchJob {

    /** Logger for this class. */
    private static final Logger log = LoggerFactory.getLogger(HarvestedUrlsForDomainBatchJob.class);

    /** Metadata URL prefix identifying crawl-log records in the metadata (w)arc files. */
    private static final String SETUP_URL_FORMAT = String.format("metadata://%s/crawl/logs/crawl.log",
            Settings.get(CommonSettings.ORGANIZATION));

    /**
     * Index of the URL component in a whitespace-split crawl.log line (4th field).
     * Cf. "http://crawler.archive.org/articles/user_manual/analysis.html#logs"
     */
    private static final int URL_PART_INDEX = 3;

    /** Index of the discovery-URL component in a whitespace-split crawl.log line (6th field). */
    private static final int DISCOVERY_URL_PART_INDEX = 5;

    /** The domain to extract crawl.log lines for. */
    final String domain;

    /**
     * Initialise the batch job.
     *
     * @param domain The domain to get crawl.log lines for.
     * @throws ArgumentNotValid if domain is null or empty.
     */
    public HarvestedUrlsForDomainBatchJob(String domain) {
        ArgumentNotValid.checkNotNullOrEmpty(domain, "domain");
        this.domain = domain;

        // One week in milliseconds (the original comment claimed "two weeks",
        // but the value is 7 days).
        batchJobTimeout = 7 * Constants.ONE_DAY_IN_MILLIES;
    }

    /**
     * Does nothing, no initialisation is needed.
     *
     * @param os Not used.
     */
    @Override
    public void initialize(OutputStream os) {
    }

    /**
     * Returns a filter that accepts only crawl-log metadata records, i.e. records whose URL starts with
     * {@link #SETUP_URL_FORMAT}.
     *
     * @return the crawl-log-only filter.
     */
    @Override
    public ArchiveBatchFilter getFilter() {
        return new ArchiveBatchFilter("OnlyCrawlLog") {
            @Override
            public boolean accept(ArchiveRecordBase record) {
                // All ARC records have a URL, but in a WARC file the warcinfo
                // record has no URL, so guard against null there.
                final String url = record.getHeader().getUrl();
                if (record.bIsWarc) {
                    return url != null && url.startsWith(SETUP_URL_FORMAT);
                } else {
                    return url.startsWith(SETUP_URL_FORMAT);
                }
            }
        };
    }

    /**
     * Process a crawl-log record, writing to the output stream every line that concerns the given domain.
     *
     * A crawl.log line is written to the output stream in two cases:
     * A. It has a URL component (4th field) belonging to the domain in question, or
     * B. It has a discovery URL (6th field, not "-") belonging to the domain in question.
     *
     * @param record The record to process.
     * @param os The output stream for the result.
     * @throws ArgumentNotValid on null parameters
     * @throws IOFailure on trouble processing the record.
     */
    @Override
    public void processRecord(ArchiveRecordBase record, OutputStream os) {
        ArgumentNotValid.checkNotNull(record, "ArchiveRecordBase record");
        ArgumentNotValid.checkNotNull(os, "OutputStream os");
        log.info("looking for crawl-log lines for domain: {}", domain);

        String line = null;
        // Read as UTF-8 to match the UTF-8 encoding used when writing below
        // (the original used the platform default charset here).
        try (BufferedReader arcreader = new BufferedReader(
                new InputStreamReader(record.getInputStream(), StandardCharsets.UTF_8))) {
            for (line = arcreader.readLine(); line != null; line = arcreader.readLine()) {
                // Parse a single crawl-log line into whitespace-separated parts.
                String[] parts = line.split("\\s+");
                if (parts.length > URL_PART_INDEX
                        && getDomainFromUrlPart(parts[URL_PART_INDEX]).equals(domain)) {
                    os.write(line.getBytes(StandardCharsets.UTF_8));
                    os.write('\n');
                } else if (parts.length > DISCOVERY_URL_PART_INDEX
                        && !parts[DISCOVERY_URL_PART_INDEX].equals("-")
                        && getDomainFromUrlPart(parts[DISCOVERY_URL_PART_INDEX]).equals(domain)) {
                    os.write(line.getBytes(StandardCharsets.UTF_8));
                    os.write('\n');
                }
            }
        } catch (IOException e) {
            throw new IOFailure("Unable to process (w)arc record", e);
        } catch (Throwable e1) {
            // Best-effort: a malformed line must not abort the whole batch job.
            // Log instead of printStackTrace()/System.out as in the original.
            log.warn("Error processing crawl-log line '{}'", line, e1);
        }
    }

    /**
     * Return domain from urlpart, if feasibly. Return empty string otherwise.
     *
     * @param urlpart One of the URL part of the crawllog-line.
     * @return domain from urlpart, if feasibly. Return empty string otherwise
     */
    private String getDomainFromUrlPart(String urlpart) {
        String domain = null;
        try {
            domain = DomainUtils.domainNameFromHostname(new FixedUURI(urlpart, true).getReferencedHost());
        } catch (Exception e) {
            log.warn("Unable to extract a domain name from the url '{}' due to exception", urlpart, e);
        }
        return (domain == null) ? "" : domain;
    }

    /**
     * Does nothing, no finishing is needed.
     *
     * @param os Not used.
     */
    @Override
    public void finish(OutputStream os) {
    }

    /**
     * Humanly readable representation of this instance.
     *
     * @return The class content.
     */
    @Override
    public String toString() {
        return getClass().getName() + ", with arguments: Domain = " + domain + ", Filter = " + getFilter();
    }
}