001/* 002 * #%L 003 * Netarchivesuite - Heritrix 3 extensions 004 * %% 005 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library, 006 * the National Library of France and the Austrian National Library. 007 * %% 008 * This program is free software: you can redistribute it and/or modify 009 * it under the terms of the GNU Lesser General Public License as 010 * published by the Free Software Foundation, either version 2.1 of the 011 * License, or (at your option) any later version. 012 * 013 * This program is distributed in the hope that it will be useful, 014 * but WITHOUT ANY WARRANTY; without even the implied warranty of 015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 016 * GNU General Lesser Public License for more details. 017 * 018 * You should have received a copy of the GNU General Lesser Public 019 * License along with this program. If not, see 020 * <http://www.gnu.org/licenses/lgpl-2.1.html>. 021 * #L% 022 */ 023package dk.netarkivet.harvester.harvesting; 024 025import static org.archive.modules.fetcher.FetchStatusCodes.S_DNS_SUCCESS; 026import static org.archive.modules.fetcher.FetchStatusCodes.S_UNFETCHABLE_URI; 027 028import java.io.BufferedReader; 029import java.io.IOException; 030import java.io.Reader; 031import java.net.InetAddress; 032import java.net.UnknownHostException; 033import java.util.HashMap; 034import java.util.Map; 035import java.util.logging.Level; 036import java.util.logging.Logger; 037 038import org.apache.commons.httpclient.URIException; 039import org.apache.commons.io.IOUtils; 040import org.archive.io.ReadSource; 041import org.archive.modules.CrawlURI; 042import org.archive.modules.fetcher.FetchDNS; 043import org.archive.modules.net.CrawlHost; 044 045/** 046 * Extended FetchDNS processor which allows the override of hosts 047 * to be used before they are querying through a DNS server. 048 * 049 * @author nicl 050 */ 051public class NASFetchDNS extends FetchDNS { 052 053 /** Logger instance. */ 054 private static Logger logger = Logger.getLogger(FetchDNS.class.getName()); 055 056 /** 057 * Look for hosts in the hosts file/text value before doing a DNS lookup. 058 */ 059 protected boolean acceptDefinedHosts = true; 060 public boolean getAcceptDefinedHosts() { 061 return acceptDefinedHosts; 062 } 063 // @Required 064 public void setAcceptDefinedHosts(boolean acceptDefinedHosts) { 065 this.acceptDefinedHosts = acceptDefinedHosts; 066 } 067 068 /** 069 * Text from which to load hosts 070 */ 071 protected ReadSource hostsFile = null; 072 public ReadSource getHostsFile() { 073 return hostsFile; 074 } 075 // @Required 076 public void setHostsFile(ReadSource hostsFile) { 077 this.hostsFile = hostsFile; 078 } 079 080 /** 081 * Text from which to look for hosts. 082 */ 083 protected ReadSource hostsSource = null; 084 public ReadSource getHostsSource() { 085 return hostsSource; 086 } 087 // @Required 088 public void setHostsSource(ReadSource hostsSource) { 089 this.hostsSource = hostsSource; 090 } 091 092 ///private static final long DEFAULT_TTL_FOR_HOSTS_RESOLVES = Long.MAX_VALUE; // A very long time... 093 094 /** Has the hosts been loaded. */ 095 private boolean bInitialized = false; 096 097 /** Map of hosts that override the normal DNS lookup. */ 098 protected Map<String, String> hosts; 099 100 /* 101 * Check for the host in the hosts map before calling the extended method. 102 * @see org.archive.modules.fetcher.FetchDNS#innerProcess(org.archive.modules.CrawlURI) 103 */ 104 @Override 105 protected void innerProcess(CrawlURI curi) { 106 InetAddress address = null; 107 if (acceptDefinedHosts) { 108 String dnsName = null; 109 try { 110 dnsName = curi.getUURI().getReferencedHost(); 111 } catch (URIException e) { 112 logger.log(Level.SEVERE, "Failed parse of dns record " + curi, e); 113 } 114 if(dnsName == null) { 115 curi.setFetchStatus(S_UNFETCHABLE_URI); 116 return; 117 } 118 CrawlHost targetHost = getServerCache().getHostFor(dnsName); 119 if (isQuadAddress(curi, dnsName, targetHost)) { 120 // We're done processing. 121 return; 122 } 123 if (!bInitialized) { 124 reload(); 125 bInitialized = true; 126 } 127 if (hosts.size() > 0) { 128 // Do actual DNS lookup. 129 String ipAddress = hosts.get(dnsName); 130 if (ipAddress != null) { 131 curi.setFetchBeginTime(System.currentTimeMillis()); 132 try { 133 address = InetAddress.getByName(ipAddress); 134 } catch (UnknownHostException e1) { 135 address = null; 136 } 137 if (address != null) { 138 //targetHost.setIP(address, DEFAULT_TTL_FOR_HOSTS_RESOLVES); 139 //curi.setFetchStatus(S_GETBYNAME_SUCCESS); 140 targetHost.setIP(address, CrawlHost.IP_NEVER_EXPIRES); 141 curi.setFetchStatus(S_DNS_SUCCESS); 142 curi.setContentType("text/dns"); 143 if (logger.isLoggable(Level.FINE)) { 144 logger.fine("Found address for " + dnsName + " using hosts file."); 145 } 146 } else { 147 if (logger.isLoggable(Level.FINE)) { 148 logger.fine("Failed find of address for " + dnsName + " using hosts file."); 149 } 150 setUnresolvable(curi, targetHost); 151 } 152 curi.setFetchCompletedTime(System.currentTimeMillis()); 153 } 154 } 155 } 156 if (address == null) { 157 super.innerProcess(curi); 158 } 159 } 160 161 /** 162 * Clear loaded hosts of reload from hosts file and value text. 163 */ 164 protected void reload() { 165 hosts = new HashMap<String, String>(); 166 getHosts(getHostsFile()); 167 getHosts(getHostsSource()); 168 } 169 170 /** 171 * Run through the lines in a <code>ReadSource</code> and add all valid host lines encountered. 172 * @param hostsSource hosts file or value text 173 */ 174 protected void getHosts(ReadSource hostsSource) { 175 if (hostsSource != null) { 176 if (logger.isLoggable(Level.FINE)) { 177 logger.fine("reading surt prefixes from " + hostsSource); 178 } 179 Reader reader = hostsSource.obtainReader(); 180 BufferedReader bufferedReader = new BufferedReader(reader); 181 try { 182 String str; 183 while ((str = bufferedReader.readLine()) != null) { 184 str = str.trim(); 185 if (!str.startsWith("#")) { 186 int idx = str.indexOf('#'); 187 if (idx != -1) { 188 str = str.substring(0, idx).trim(); 189 } 190 String[] tokensArr = new String[3]; 191 int tokens = tokenize(str, tokensArr); 192 if (tokens >= 2) { 193 hosts.put(tokensArr[1], tokensArr[0]); 194 } 195 if (tokens >= 3) { 196 hosts.put(tokensArr[2], tokensArr[0]); 197 } 198 } 199 } 200 } catch (IOException e) { 201 logger.log(Level.SEVERE, "Exception parsing hosts", e); 202 } finally { 203 IOUtils.closeQuietly(bufferedReader); 204 IOUtils.closeQuietly(reader); 205 } 206 } 207 } 208 209 /** 210 * Split input string into tokens. Treats multiple whitespace as one. 211 * Only parse the number of tokens that are able to fit into the supplied token array. 212 * @param str split input string into tokens 213 * @param tokensArr supply a string array to be filled with tokens 214 * @return number of tokens inserted into the token array 215 */ 216 public static int tokenize(String str, String[] tokensArr) { 217 int tokens = 0; 218 int idx = 0; 219 int pIdx; 220 while (tokens < tokensArr.length && idx < str.length()) { 221 while (idx < str.length() && Character.isWhitespace(str.charAt(idx))) { 222 ++idx; 223 } 224 pIdx = idx; 225 while (idx < str.length() && !Character.isWhitespace(str.charAt(idx))) { 226 ++idx; 227 } 228 if (idx > pIdx) { 229 tokensArr[tokens++] = str.substring(pIdx, idx); 230 } 231 } 232 return tokens; 233 } 234 235}