001package dk.netarkivet.harvester.harvesting;
002
003import static org.archive.modules.fetcher.FetchStatusCodes.S_DNS_SUCCESS;
004import static org.archive.modules.fetcher.FetchStatusCodes.S_DOMAIN_UNRESOLVABLE;
005import static org.archive.modules.fetcher.FetchStatusCodes.S_GETBYNAME_SUCCESS;
006import static org.archive.modules.fetcher.FetchStatusCodes.S_UNFETCHABLE_URI;
007
008import java.io.ByteArrayInputStream;
009import java.io.ByteArrayOutputStream;
010import java.io.IOException;
011import java.io.InputStream;
012import java.net.InetAddress;
013import java.net.UnknownHostException;
014import java.security.MessageDigest;
015import java.util.logging.Level;
016import java.util.logging.Logger;
017import java.util.regex.Matcher;
018
019import org.apache.commons.httpclient.URIException;
020import org.archive.modules.CrawlURI;
021import org.archive.modules.Processor;
022import org.archive.modules.net.CrawlHost;
023import org.archive.modules.net.ServerCache;
024import org.archive.util.ArchiveUtils;
025import org.archive.util.InetAddressUtil;
026import org.archive.util.Recorder;
027import org.springframework.beans.factory.annotation.Autowired;
028import org.xbill.DNS.ARecord;
029import org.xbill.DNS.DClass;
030import org.xbill.DNS.Lookup;
031import org.xbill.DNS.Record;
032import org.xbill.DNS.ResolverConfig;
033import org.xbill.DNS.TextParseException;
034import org.xbill.DNS.Type;
035
036import dk.netarkivet.common.utils.DomainUtils;
037
038/**
039 * Processor to resolve 'dns:' URIs.
040 * 
041 * Based on version of FetchDNS taken from https://github.com/internetarchive/heritrix3/commit/aee83dfe26ea5a36a4eb3092380e1b0d7b242aab
042 * 
043 * Makes it possible to avoid lookup of bad hostnames, e.g. 'components' or 'www' without any valid domain-information
044 *  
045 * @author multiple
046 * sample usage:
047 * bean id="fetchDns" class="dk.netarkivet.harvester.harvesting.ExtendedDNSFetcher">
048        <property name="enabled" value="true" />
049        <property name="acceptNonDnsResolves" value="false" />
050        <property name="disableJavaDnsResolves" value="false" />
051        <property name="digestContent" value="true" />
052        <property name="digestAlgorithm" value="sha1" />
053        <property name="prevalidateHostname" value="false" />
054    </bean>
055 */
056public class ExtendedDNSFetcher extends Processor {
057
058    @SuppressWarnings("unused")
059    private static final long serialVersionUID = 3L;
060
061    private static Logger logger = Logger.getLogger(ExtendedDNSFetcher.class.getName());
062
063    // Defaults.
064    private short ClassType = DClass.IN;
065    private short TypeType = Type.A;
066    protected InetAddress serverInetAddr = null;
067
068    /**
069     * If a DNS lookup fails, whether or not to fall back to InetAddress
070     * resolution, which may use local 'hosts' files or other mechanisms.
071     * It is disabled by default.
072     */
073    {
074        setAcceptNonDnsResolves(false);
075    }
076    public boolean getAcceptNonDnsResolves() {
077        return (Boolean) kp.get("acceptNonDnsResolves");
078    }
079    public void setAcceptNonDnsResolves(boolean acceptNonDnsResolves) {
080        kp.put("acceptNonDnsResolves",acceptNonDnsResolves);
081    }
082    
083    /**
084     * Optionally, only allow InetAddress resolution, precisely because it 
085     * may use local 'hosts' files or other mechanisms.
086     * 
087     * This should not generally be used in production as it will prevent 
088     * DNS lookups from being recorded properly.
089     * It is disabled by default.
090     * 
091     */
092    {
093        setDisableJavaDnsResolves(false);
094    }
095    public boolean getDisableJavaDnsResolves() {
096        return (Boolean) kp.get("disableJavaDnsResolves");
097    }
098    public void setDisableJavaDnsResolves(boolean disableJavaDnsResolves) {
099        kp.put("disableJavaDnsResolves",disableJavaDnsResolves);
100    }
101    
102    /**
103     * Used to do DNS lookups.
104     */
105    protected ServerCache serverCache;
106    public ServerCache getServerCache() {
107        return this.serverCache;
108    }
109    @Autowired
110    public void setServerCache(ServerCache serverCache) {
111        this.serverCache = serverCache;
112    }
113    
114    /**
115     * Whether or not to perform an on-the-fly digest hash of retrieved
116     * content-bodies. It is enabled by default
117     */
118    {
119        setDigestContent(true);
120    }
121    public boolean getDigestContent() {
122        return (Boolean) kp.get("digestContent");
123    }
124    public void setDigestContent(boolean digest) {
125        kp.put("digestContent",digest);
126    }
127
128    /**
129     * Which algorithm (for example MD5 or SHA-1) to use to perform an 
130     * on-the-fly digest hash of retrieved content-bodies. The default is 'sha1'
131     */
132    protected String digestAlgorithm = "sha1"; 
133    public String getDigestAlgorithm() {
134        return digestAlgorithm;
135    }
136    public void setDigestAlgorithm(String digestAlgorithm) {
137        this.digestAlgorithm = digestAlgorithm;
138    }
139    
140    /**
141     * Whether or not to prevalidate dnsname as a valid host. It is disabled by default.
142     */
143    {
144        setPrevalidateHostname(false);
145    }
146    public boolean getPrevalidateHostname() {
147        return (Boolean) kp.get("prevalidateHostname");
148    }
149    
150    public void setPrevalidateHostname(boolean prevalidateHostname) {
151        kp.put("prevalidateHostname",prevalidateHostname);
152    }
153
154    private static final long DEFAULT_TTL_FOR_NON_DNS_RESOLVES
155        = 6 * 60 * 60; // 6 hrs
156
157    public ExtendedDNSFetcher() {
158    }
159
160    protected boolean shouldProcess(CrawlURI curi) {
161        return curi.getUURI().getScheme().equals("dns");
162    }
163    
164    /**
165     * tests if dnsName is a valid hostname.
166     * @param dnsName a given hostname
167     * @return true, if it is considered a valid hostname from a NetarchiveSuite perspective.
168     */
169    private boolean validHostName(String dnsName) {
170        String domainName = DomainUtils.domainNameFromHostname(dnsName);
171        if (domainName == null) {
172            return false;
173        } else {
174            return true;
175        }
176    }
177    
178    protected void innerProcess(CrawlURI curi) {
179        Record[] rrecordSet = null; // Retrieved dns records
180        String dnsName = null;
181        try {
182            dnsName = curi.getUURI().getReferencedHost();
183        } catch (URIException e) {
184            logger.log(Level.SEVERE, "Failed parse of dns record " + curi, e);
185        }
186        
187        if(dnsName == null) {
188            curi.setFetchStatus(S_UNFETCHABLE_URI);
189            return;
190        }
191
192        CrawlHost targetHost = getServerCache().getHostFor(dnsName);
193        if (isQuadAddress(curi, dnsName, targetHost)) {
194            // We're done processing.
195            return;
196        }
197        
198        if (getPrevalidateHostname()) {
199            if (!validHostName(dnsName)) {
200                targetHost.setIP(null, 0);
201                curi.setFetchStatus(S_DOMAIN_UNRESOLVABLE); // or S_UNFETCHABLE_URI
202                logger.info("URI '" + curi.getURI() + "' rejected, as hostname '" + dnsName + "' is considered invalid");
203                return;
204            }
205        }      
206        
207        // Do actual DNS lookup.
208        curi.setFetchBeginTime(System.currentTimeMillis());
209
210        // Try to get the records for this host (assume domain name)
211        // TODO: Bug #935119 concerns potential hang here
212        String lookupName = dnsName.endsWith(".") ? dnsName : dnsName + ".";
213        // If we have not disabled JavaDNS, use that:
214        if (!getDisableJavaDnsResolves()) {
215            try {
216                rrecordSet = (new Lookup(lookupName, TypeType, ClassType)).run();
217            } catch (TextParseException e) {
218                rrecordSet = null;
219            }
220        }
221        curi.setContentType("text/dns");
222        if (rrecordSet != null) {
223            if (logger.isLoggable(Level.FINE)) {
224                logger.fine("Found recordset for " + lookupName);
225            }
226            storeDNSRecord(curi, dnsName, targetHost, rrecordSet);
227        } else {
228            if (logger.isLoggable(Level.FINE)) {
229                logger.fine("Failed find of recordset for " + lookupName);
230            }
231            if (getAcceptNonDnsResolves()||getDisableJavaDnsResolves()||"localhost".equals(dnsName)) {
232                // Do lookup that bypasses javadns.
233                InetAddress address = null;
234                try {
235                    address = InetAddress.getByName(dnsName);
236                } catch (UnknownHostException e1) {
237                    address = null;
238                }
239                if (address != null) {
240                    targetHost.setIP(address, DEFAULT_TTL_FOR_NON_DNS_RESOLVES);
241                    curi.setFetchStatus(S_GETBYNAME_SUCCESS);
242                    curi.setContentSize(0);
243                    if (logger.isLoggable(Level.FINE)) {
244                        logger.fine("Found address for " + dnsName +
245                            " using native dns.");
246                    }
247                } else {
248                    if (logger.isLoggable(Level.FINE)) {
249                        logger.fine("Failed find of address for " + dnsName +
250                            " using native dns.");
251                    }
252                    setUnresolvable(curi, targetHost);
253                }
254            } else {
255                setUnresolvable(curi, targetHost);
256            }
257        }
258        curi.setFetchCompletedTime(System.currentTimeMillis());
259    }
260    
261    protected void storeDNSRecord(final CrawlURI curi, final String dnsName,
262            final CrawlHost targetHost, final Record[] rrecordSet) {
263        // Get TTL and IP info from the first A record (there may be
264        // multiple, e.g. www.washington.edu) then update the CrawlServer
265        ARecord arecord = getFirstARecord(rrecordSet);
266        if (arecord == null) {
267            throw new NullPointerException("Got null arecord for " +
268                dnsName);
269        }
270        targetHost.setIP(arecord.getAddress(), arecord.getTTL());
271        try {
272            recordDNS(curi, rrecordSet);
273            curi.setFetchStatus(S_DNS_SUCCESS);
274            curi.setDNSServerIPLabel(ResolverConfig.getCurrentConfig().server());
275        } catch (IOException e) {
276            logger.log(Level.SEVERE, "Failed store of DNS Record for " +
277                curi.toString(), e);
278            setUnresolvable(curi, targetHost);
279        }
280    }
281    
282    protected boolean isQuadAddress(final CrawlURI curi, final String dnsName,
283            final CrawlHost targetHost) {
284        boolean result = false;
285        Matcher matcher = InetAddressUtil.IPV4_QUADS.matcher(dnsName);
286        // If it's an ip no need to do a lookup
287        if (matcher == null || !matcher.matches()) {
288            return result;
289        }
290        
291        result = true;
292        // Ideally this branch would never be reached: no CrawlURI
293        // would be created for numerical IPs
294        if (logger.isLoggable(Level.WARNING)) {
295            logger.warning("Unnecessary DNS CrawlURI created: " + curi);
296        }
297        try {
298            targetHost.setIP(InetAddress.getByAddress(dnsName, new byte[] {
299                    (byte) (new Integer(matcher.group(1)).intValue()),
300                    (byte) (new Integer(matcher.group(2)).intValue()),
301                    (byte) (new Integer(matcher.group(3)).intValue()),
302                    (byte) (new Integer(matcher.group(4)).intValue()) }),
303                    CrawlHost.IP_NEVER_EXPIRES); // Never expire numeric IPs
304            curi.setFetchStatus(S_DNS_SUCCESS);
305        } catch (UnknownHostException e) {
306            logger.log(Level.SEVERE, "Should never be " + e.getMessage(), e);
307            setUnresolvable(curi, targetHost);
308        }
309        return result;
310    }
311    
312    protected void recordDNS(final CrawlURI curi, final Record[] rrecordSet)
313            throws IOException {
314        final byte[] dnsRecord = getDNSRecord(curi.getFetchBeginTime(),
315                rrecordSet);
316
317        Recorder rec = curi.getRecorder();
318        // Shall we get a digest on the content downloaded?
319        boolean digestContent = getDigestContent();
320        String algorithm = null;
321        if (digestContent) {
322            algorithm = getDigestAlgorithm();
323            rec.getRecordedInput().setDigest(algorithm);
324        } else {
325            rec.getRecordedInput().setDigest((MessageDigest)null);
326        }
327        InputStream is = curi.getRecorder().inputWrap(
328                new ByteArrayInputStream(dnsRecord));
329
330        if (digestContent) {
331            rec.getRecordedInput().startDigest();
332        }
333
334        // Reading from the wrapped stream, behind the scenes, will write
335        // files into scratch space
336        try {
337            byte[] buf = new byte[256];
338            while (is.read(buf) != -1) {
339                continue;
340            }
341        } finally {
342            is.close();
343            rec.closeRecorders();
344        }
345        curi.setContentSize(dnsRecord.length);
346
347        if (digestContent) {
348            curi.setContentDigest(algorithm,
349                rec.getRecordedInput().getDigestValue());
350        }
351    }
352    
353    protected byte [] getDNSRecord(final long fetchStart,
354            final Record[] rrecordSet)
355    throws IOException {
356        ByteArrayOutputStream baos = new ByteArrayOutputStream();
357        // Start the record with a 14-digit date per RFC 2540
358        byte[] fetchDate = ArchiveUtils.get14DigitDate(fetchStart).getBytes();
359        baos.write(fetchDate);
360        // Don't forget the newline
361        baos.write("\n".getBytes());
362        if (rrecordSet != null) {
363            for (int i = 0; i < rrecordSet.length; i++) {
364                byte[] record = rrecordSet[i].toString().getBytes();
365                baos.write(record);
366                // Add the newline between records back in
367                baos.write("\n".getBytes());
368            }
369        }
370        return baos.toByteArray();
371    }
372    
373    protected void setUnresolvable(CrawlURI curi, CrawlHost host) {
374        host.setIP(null, 0);
375        curi.setFetchStatus(S_DOMAIN_UNRESOLVABLE); 
376    }
377    
378    protected ARecord getFirstARecord(Record[] rrecordSet) {
379        ARecord arecord = null;
380        if (rrecordSet == null || rrecordSet.length == 0) {
381            if (logger.isLoggable(Level.FINEST)) {
382                logger.finest("rrecordSet is null or zero length: " +
383                    rrecordSet);
384            }
385            return arecord;
386        }
387        for (int i = 0; i < rrecordSet.length; i++) {
388            if (rrecordSet[i].getType() != Type.A) {
389                if (logger.isLoggable(Level.FINEST)) {
390                    logger.finest("Record " + Integer.toString(i) +
391                        " is not A type but " + rrecordSet[i].getType());
392                }
393                continue;
394            }
395            arecord = (ARecord) rrecordSet[i];
396            break;
397        }
398        return arecord;
399    }
400}