001package dk.netarkivet.harvester.harvesting; 002 003import static org.archive.modules.fetcher.FetchStatusCodes.S_DNS_SUCCESS; 004import static org.archive.modules.fetcher.FetchStatusCodes.S_DOMAIN_UNRESOLVABLE; 005import static org.archive.modules.fetcher.FetchStatusCodes.S_GETBYNAME_SUCCESS; 006import static org.archive.modules.fetcher.FetchStatusCodes.S_UNFETCHABLE_URI; 007 008import java.io.ByteArrayInputStream; 009import java.io.ByteArrayOutputStream; 010import java.io.IOException; 011import java.io.InputStream; 012import java.net.InetAddress; 013import java.net.UnknownHostException; 014import java.security.MessageDigest; 015import java.util.logging.Level; 016import java.util.logging.Logger; 017import java.util.regex.Matcher; 018 019import org.apache.commons.httpclient.URIException; 020import org.archive.modules.CrawlURI; 021import org.archive.modules.Processor; 022import org.archive.modules.net.CrawlHost; 023import org.archive.modules.net.ServerCache; 024import org.archive.util.ArchiveUtils; 025import org.archive.util.InetAddressUtil; 026import org.archive.util.Recorder; 027import org.springframework.beans.factory.annotation.Autowired; 028import org.xbill.DNS.ARecord; 029import org.xbill.DNS.DClass; 030import org.xbill.DNS.Lookup; 031import org.xbill.DNS.Record; 032import org.xbill.DNS.ResolverConfig; 033import org.xbill.DNS.TextParseException; 034import org.xbill.DNS.Type; 035 036import dk.netarkivet.common.utils.DomainUtils; 037 038/** 039 * Processor to resolve 'dns:' URIs. 040 * 041 * Based on version of FetchDNS taken from https://github.com/internetarchive/heritrix3/commit/aee83dfe26ea5a36a4eb3092380e1b0d7b242aab 042 * 043 * Makes it possible to avoid lookup of bad hostnames, e.g. 'components' or 'www' without any valid domain-information 044 * 045 * @author multiple 046 * sample usage: 047 * bean id="fetchDns" class="dk.netarkivet.harvester.harvesting.ExtendedDNSFetcher"> 048 <property name="enabled" value="true" /> 049 <property name="acceptNonDnsResolves" value="false" /> 050 <property name="disableJavaDnsResolves" value="false" /> 051 <property name="digestContent" value="true" /> 052 <property name="digestAlgorithm" value="sha1" /> 053 <property name="prevalidateHostname" value="false" /> 054 </bean> 055 */ 056public class ExtendedDNSFetcher extends Processor { 057 058 @SuppressWarnings("unused") 059 private static final long serialVersionUID = 3L; 060 061 private static Logger logger = Logger.getLogger(ExtendedDNSFetcher.class.getName()); 062 063 // Defaults. 064 private short ClassType = DClass.IN; 065 private short TypeType = Type.A; 066 protected InetAddress serverInetAddr = null; 067 068 /** 069 * If a DNS lookup fails, whether or not to fall back to InetAddress 070 * resolution, which may use local 'hosts' files or other mechanisms. 071 * It is disabled by default. 072 */ 073 { 074 setAcceptNonDnsResolves(false); 075 } 076 public boolean getAcceptNonDnsResolves() { 077 return (Boolean) kp.get("acceptNonDnsResolves"); 078 } 079 public void setAcceptNonDnsResolves(boolean acceptNonDnsResolves) { 080 kp.put("acceptNonDnsResolves",acceptNonDnsResolves); 081 } 082 083 /** 084 * Optionally, only allow InetAddress resolution, precisely because it 085 * may use local 'hosts' files or other mechanisms. 086 * 087 * This should not generally be used in production as it will prevent 088 * DNS lookups from being recorded properly. 089 * It is disabled by default. 090 * 091 */ 092 { 093 setDisableJavaDnsResolves(false); 094 } 095 public boolean getDisableJavaDnsResolves() { 096 return (Boolean) kp.get("disableJavaDnsResolves"); 097 } 098 public void setDisableJavaDnsResolves(boolean disableJavaDnsResolves) { 099 kp.put("disableJavaDnsResolves",disableJavaDnsResolves); 100 } 101 102 /** 103 * Used to do DNS lookups. 104 */ 105 protected ServerCache serverCache; 106 public ServerCache getServerCache() { 107 return this.serverCache; 108 } 109 @Autowired 110 public void setServerCache(ServerCache serverCache) { 111 this.serverCache = serverCache; 112 } 113 114 /** 115 * Whether or not to perform an on-the-fly digest hash of retrieved 116 * content-bodies. It is enabled by default 117 */ 118 { 119 setDigestContent(true); 120 } 121 public boolean getDigestContent() { 122 return (Boolean) kp.get("digestContent"); 123 } 124 public void setDigestContent(boolean digest) { 125 kp.put("digestContent",digest); 126 } 127 128 /** 129 * Which algorithm (for example MD5 or SHA-1) to use to perform an 130 * on-the-fly digest hash of retrieved content-bodies. The default is 'sha1' 131 */ 132 protected String digestAlgorithm = "sha1"; 133 public String getDigestAlgorithm() { 134 return digestAlgorithm; 135 } 136 public void setDigestAlgorithm(String digestAlgorithm) { 137 this.digestAlgorithm = digestAlgorithm; 138 } 139 140 /** 141 * Whether or not to prevalidate dnsname as a valid host. It is disabled by default. 142 */ 143 { 144 setPrevalidateHostname(false); 145 } 146 public boolean getPrevalidateHostname() { 147 return (Boolean) kp.get("prevalidateHostname"); 148 } 149 150 public void setPrevalidateHostname(boolean prevalidateHostname) { 151 kp.put("prevalidateHostname",prevalidateHostname); 152 } 153 154 private static final long DEFAULT_TTL_FOR_NON_DNS_RESOLVES 155 = 6 * 60 * 60; // 6 hrs 156 157 public ExtendedDNSFetcher() { 158 } 159 160 protected boolean shouldProcess(CrawlURI curi) { 161 return curi.getUURI().getScheme().equals("dns"); 162 } 163 164 /** 165 * tests if dnsName is a valid hostname. 166 * @param dnsName a given hostname 167 * @return true, if it is considered a valid hostname from a NetarchiveSuite perspective. 168 */ 169 private boolean validHostName(String dnsName) { 170 String domainName = DomainUtils.domainNameFromHostname(dnsName); 171 if (domainName == null) { 172 return false; 173 } else { 174 return true; 175 } 176 } 177 178 protected void innerProcess(CrawlURI curi) { 179 Record[] rrecordSet = null; // Retrieved dns records 180 String dnsName = null; 181 try { 182 dnsName = curi.getUURI().getReferencedHost(); 183 } catch (URIException e) { 184 logger.log(Level.SEVERE, "Failed parse of dns record " + curi, e); 185 } 186 187 if(dnsName == null) { 188 curi.setFetchStatus(S_UNFETCHABLE_URI); 189 return; 190 } 191 192 CrawlHost targetHost = getServerCache().getHostFor(dnsName); 193 if (isQuadAddress(curi, dnsName, targetHost)) { 194 // We're done processing. 195 return; 196 } 197 198 if (getPrevalidateHostname()) { 199 if (!validHostName(dnsName)) { 200 targetHost.setIP(null, 0); 201 curi.setFetchStatus(S_DOMAIN_UNRESOLVABLE); // or S_UNFETCHABLE_URI 202 logger.info("URI '" + curi.getURI() + "' rejected, as hostname '" + dnsName + "' is considered invalid"); 203 return; 204 } 205 } 206 207 // Do actual DNS lookup. 208 curi.setFetchBeginTime(System.currentTimeMillis()); 209 210 // Try to get the records for this host (assume domain name) 211 // TODO: Bug #935119 concerns potential hang here 212 String lookupName = dnsName.endsWith(".") ? dnsName : dnsName + "."; 213 // If we have not disabled JavaDNS, use that: 214 if (!getDisableJavaDnsResolves()) { 215 try { 216 rrecordSet = (new Lookup(lookupName, TypeType, ClassType)).run(); 217 } catch (TextParseException e) { 218 rrecordSet = null; 219 } 220 } 221 curi.setContentType("text/dns"); 222 if (rrecordSet != null) { 223 if (logger.isLoggable(Level.FINE)) { 224 logger.fine("Found recordset for " + lookupName); 225 } 226 storeDNSRecord(curi, dnsName, targetHost, rrecordSet); 227 } else { 228 if (logger.isLoggable(Level.FINE)) { 229 logger.fine("Failed find of recordset for " + lookupName); 230 } 231 if (getAcceptNonDnsResolves()||getDisableJavaDnsResolves()||"localhost".equals(dnsName)) { 232 // Do lookup that bypasses javadns. 233 InetAddress address = null; 234 try { 235 address = InetAddress.getByName(dnsName); 236 } catch (UnknownHostException e1) { 237 address = null; 238 } 239 if (address != null) { 240 targetHost.setIP(address, DEFAULT_TTL_FOR_NON_DNS_RESOLVES); 241 curi.setFetchStatus(S_GETBYNAME_SUCCESS); 242 curi.setContentSize(0); 243 if (logger.isLoggable(Level.FINE)) { 244 logger.fine("Found address for " + dnsName + 245 " using native dns."); 246 } 247 } else { 248 if (logger.isLoggable(Level.FINE)) { 249 logger.fine("Failed find of address for " + dnsName + 250 " using native dns."); 251 } 252 setUnresolvable(curi, targetHost); 253 } 254 } else { 255 setUnresolvable(curi, targetHost); 256 } 257 } 258 curi.setFetchCompletedTime(System.currentTimeMillis()); 259 } 260 261 protected void storeDNSRecord(final CrawlURI curi, final String dnsName, 262 final CrawlHost targetHost, final Record[] rrecordSet) { 263 // Get TTL and IP info from the first A record (there may be 264 // multiple, e.g. www.washington.edu) then update the CrawlServer 265 ARecord arecord = getFirstARecord(rrecordSet); 266 if (arecord == null) { 267 throw new NullPointerException("Got null arecord for " + 268 dnsName); 269 } 270 targetHost.setIP(arecord.getAddress(), arecord.getTTL()); 271 try { 272 recordDNS(curi, rrecordSet); 273 curi.setFetchStatus(S_DNS_SUCCESS); 274 curi.setDNSServerIPLabel(ResolverConfig.getCurrentConfig().server()); 275 } catch (IOException e) { 276 logger.log(Level.SEVERE, "Failed store of DNS Record for " + 277 curi.toString(), e); 278 setUnresolvable(curi, targetHost); 279 } 280 } 281 282 protected boolean isQuadAddress(final CrawlURI curi, final String dnsName, 283 final CrawlHost targetHost) { 284 boolean result = false; 285 Matcher matcher = InetAddressUtil.IPV4_QUADS.matcher(dnsName); 286 // If it's an ip no need to do a lookup 287 if (matcher == null || !matcher.matches()) { 288 return result; 289 } 290 291 result = true; 292 // Ideally this branch would never be reached: no CrawlURI 293 // would be created for numerical IPs 294 if (logger.isLoggable(Level.WARNING)) { 295 logger.warning("Unnecessary DNS CrawlURI created: " + curi); 296 } 297 try { 298 targetHost.setIP(InetAddress.getByAddress(dnsName, new byte[] { 299 (byte) (new Integer(matcher.group(1)).intValue()), 300 (byte) (new Integer(matcher.group(2)).intValue()), 301 (byte) (new Integer(matcher.group(3)).intValue()), 302 (byte) (new Integer(matcher.group(4)).intValue()) }), 303 CrawlHost.IP_NEVER_EXPIRES); // Never expire numeric IPs 304 curi.setFetchStatus(S_DNS_SUCCESS); 305 } catch (UnknownHostException e) { 306 logger.log(Level.SEVERE, "Should never be " + e.getMessage(), e); 307 setUnresolvable(curi, targetHost); 308 } 309 return result; 310 } 311 312 protected void recordDNS(final CrawlURI curi, final Record[] rrecordSet) 313 throws IOException { 314 final byte[] dnsRecord = getDNSRecord(curi.getFetchBeginTime(), 315 rrecordSet); 316 317 Recorder rec = curi.getRecorder(); 318 // Shall we get a digest on the content downloaded? 319 boolean digestContent = getDigestContent(); 320 String algorithm = null; 321 if (digestContent) { 322 algorithm = getDigestAlgorithm(); 323 rec.getRecordedInput().setDigest(algorithm); 324 } else { 325 rec.getRecordedInput().setDigest((MessageDigest)null); 326 } 327 InputStream is = curi.getRecorder().inputWrap( 328 new ByteArrayInputStream(dnsRecord)); 329 330 if (digestContent) { 331 rec.getRecordedInput().startDigest(); 332 } 333 334 // Reading from the wrapped stream, behind the scenes, will write 335 // files into scratch space 336 try { 337 byte[] buf = new byte[256]; 338 while (is.read(buf) != -1) { 339 continue; 340 } 341 } finally { 342 is.close(); 343 rec.closeRecorders(); 344 } 345 curi.setContentSize(dnsRecord.length); 346 347 if (digestContent) { 348 curi.setContentDigest(algorithm, 349 rec.getRecordedInput().getDigestValue()); 350 } 351 } 352 353 protected byte [] getDNSRecord(final long fetchStart, 354 final Record[] rrecordSet) 355 throws IOException { 356 ByteArrayOutputStream baos = new ByteArrayOutputStream(); 357 // Start the record with a 14-digit date per RFC 2540 358 byte[] fetchDate = ArchiveUtils.get14DigitDate(fetchStart).getBytes(); 359 baos.write(fetchDate); 360 // Don't forget the newline 361 baos.write("\n".getBytes()); 362 if (rrecordSet != null) { 363 for (int i = 0; i < rrecordSet.length; i++) { 364 byte[] record = rrecordSet[i].toString().getBytes(); 365 baos.write(record); 366 // Add the newline between records back in 367 baos.write("\n".getBytes()); 368 } 369 } 370 return baos.toByteArray(); 371 } 372 373 protected void setUnresolvable(CrawlURI curi, CrawlHost host) { 374 host.setIP(null, 0); 375 curi.setFetchStatus(S_DOMAIN_UNRESOLVABLE); 376 } 377 378 protected ARecord getFirstARecord(Record[] rrecordSet) { 379 ARecord arecord = null; 380 if (rrecordSet == null || rrecordSet.length == 0) { 381 if (logger.isLoggable(Level.FINEST)) { 382 logger.finest("rrecordSet is null or zero length: " + 383 rrecordSet); 384 } 385 return arecord; 386 } 387 for (int i = 0; i < rrecordSet.length; i++) { 388 if (rrecordSet[i].getType() != Type.A) { 389 if (logger.isLoggable(Level.FINEST)) { 390 logger.finest("Record " + Integer.toString(i) + 391 " is not A type but " + rrecordSet[i].getType()); 392 } 393 continue; 394 } 395 arecord = (ARecord) rrecordSet[i]; 396 break; 397 } 398 return arecord; 399 } 400}