001/*
002 * #%L
003 * Netarchivesuite - Heritrix 3 extensions
004 * %%
005 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
006 *             the National Library of France and the Austrian National Library.
007 * %%
008 * This program is free software: you can redistribute it and/or modify
009 * it under the terms of the GNU Lesser General Public License as
010 * published by the Free Software Foundation, either version 2.1 of the
011 * License, or (at your option) any later version.
012 * 
013 * This program is distributed in the hope that it will be useful,
014 * but WITHOUT ANY WARRANTY; without even the implied warranty of
015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
016 * GNU General Lesser Public License for more details.
017 * 
018 * You should have received a copy of the GNU General Lesser Public
019 * License along with this program.  If not, see
020 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
021 * #L%
022 */
023package dk.netarkivet.harvester.harvesting;
024
025import static org.archive.modules.fetcher.FetchStatusCodes.S_DNS_SUCCESS;
026import static org.archive.modules.fetcher.FetchStatusCodes.S_UNFETCHABLE_URI;
027
028import java.io.BufferedReader;
029import java.io.IOException;
030import java.io.Reader;
031import java.net.InetAddress;
032import java.net.UnknownHostException;
033import java.util.HashMap;
034import java.util.Map;
035import java.util.logging.Level;
036import java.util.logging.Logger;
037
038import org.apache.commons.httpclient.URIException;
039import org.apache.commons.io.IOUtils;
040import org.archive.io.ReadSource;
041import org.archive.modules.CrawlURI;
042import org.archive.modules.fetcher.FetchDNS;
043import org.archive.modules.net.CrawlHost;
044
045/**
046 * Extended FetchDNS processor which allows the override of hosts
047 * to be used before they are querying through a DNS server.
048 *
049 * @author nicl
050 */
051public class NASFetchDNS extends FetchDNS {
052
053    /** Logger instance. */
054    private static Logger logger = Logger.getLogger(FetchDNS.class.getName());
055
056    /**
057     * Look for hosts in the hosts file/text value before doing a DNS lookup.
058     */
059    protected boolean acceptDefinedHosts = true; 
060    public boolean getAcceptDefinedHosts() {
061        return acceptDefinedHosts;
062    }
063    // @Required
064    public void setAcceptDefinedHosts(boolean acceptDefinedHosts) {
065        this.acceptDefinedHosts = acceptDefinedHosts;
066    }
067
068    /**
069     * Text from which to load hosts
070     */
071    protected ReadSource hostsFile = null;
072    public ReadSource getHostsFile() {
073        return hostsFile;
074    }
075    // @Required
076    public void setHostsFile(ReadSource hostsFile) {
077        this.hostsFile = hostsFile;
078    }
079
080    /**
081     * Text from which to look for hosts.
082     */
083    protected ReadSource hostsSource = null;
084    public ReadSource getHostsSource() {
085        return hostsSource;
086    }
087    // @Required
088    public void setHostsSource(ReadSource hostsSource) {
089        this.hostsSource = hostsSource;
090    }
091
092    ///private static final long DEFAULT_TTL_FOR_HOSTS_RESOLVES = Long.MAX_VALUE;        // A very long time...
093
094    /** Has the hosts been loaded. */
095    private boolean bInitialized = false;
096
097    /** Map of hosts that override the normal DNS lookup. */
098    protected Map<String, String> hosts;
099
100    /*
101     * Check for the host in the hosts map before calling the extended method.
102     * @see org.archive.modules.fetcher.FetchDNS#innerProcess(org.archive.modules.CrawlURI)
103     */
104    @Override
105    protected void innerProcess(CrawlURI curi) {
106        InetAddress address = null;
107        if (acceptDefinedHosts) {
108            String dnsName = null;
109            try {
110                dnsName = curi.getUURI().getReferencedHost();
111            } catch (URIException e) {
112                logger.log(Level.SEVERE, "Failed parse of dns record " + curi, e);
113            }
114            if(dnsName == null) {
115                curi.setFetchStatus(S_UNFETCHABLE_URI);
116                return;
117            }
118            CrawlHost targetHost = getServerCache().getHostFor(dnsName);
119            if (isQuadAddress(curi, dnsName, targetHost)) {
120                // We're done processing.
121                return;
122            }
123            if (!bInitialized) {
124                reload();
125                bInitialized = true;
126            }
127            if (hosts.size() > 0) {
128                // Do actual DNS lookup.
129                String ipAddress = hosts.get(dnsName);
130                if (ipAddress != null) {
131                    curi.setFetchBeginTime(System.currentTimeMillis());
132                    try {
133                        address = InetAddress.getByName(ipAddress);
134                    } catch (UnknownHostException e1) {
135                        address = null;
136                    }
137                    if (address != null) {
138                        //targetHost.setIP(address, DEFAULT_TTL_FOR_HOSTS_RESOLVES);
139                        //curi.setFetchStatus(S_GETBYNAME_SUCCESS);
140                        targetHost.setIP(address, CrawlHost.IP_NEVER_EXPIRES);
141                        curi.setFetchStatus(S_DNS_SUCCESS);
142                        curi.setContentType("text/dns");
143                        if (logger.isLoggable(Level.FINE)) {
144                            logger.fine("Found address for " + dnsName + " using hosts file.");
145                        }
146                    } else {
147                        if (logger.isLoggable(Level.FINE)) {
148                            logger.fine("Failed find of address for " + dnsName + " using hosts file.");
149                        }
150                        setUnresolvable(curi, targetHost);
151                    }
152                    curi.setFetchCompletedTime(System.currentTimeMillis());
153                }
154            }
155        }
156        if (address == null) {
157            super.innerProcess(curi);
158        }
159    }
160
161    /**
162     * Clear loaded hosts of reload from hosts file and value text.
163     */
164    protected void reload() {
165        hosts = new HashMap<String, String>();
166        getHosts(getHostsFile());
167        getHosts(getHostsSource());
168    }
169
170    /**
171     * Run through the lines in a <code>ReadSource</code> and add all valid host lines encountered.
172     * @param hostsSource hosts file or value text
173     */
174    protected void getHosts(ReadSource hostsSource) {
175        if (hostsSource != null) {
176            if (logger.isLoggable(Level.FINE)) {
177                logger.fine("reading surt prefixes from " + hostsSource);
178            }
179            Reader reader = hostsSource.obtainReader();
180            BufferedReader bufferedReader = new BufferedReader(reader);
181            try {
182                String str;
183                while ((str = bufferedReader.readLine()) != null) {
184                    str = str.trim();
185                    if (!str.startsWith("#")) {
186                        int idx = str.indexOf('#');
187                        if (idx != -1) {
188                            str = str.substring(0, idx).trim();
189                        }
190                        String[] tokensArr = new String[3];
191                        int tokens = tokenize(str, tokensArr);
192                        if (tokens >= 2) {
193                            hosts.put(tokensArr[1], tokensArr[0]);
194                        }
195                        if (tokens >= 3) {
196                            hosts.put(tokensArr[2], tokensArr[0]);
197                        }
198                    }
199                }
200            } catch (IOException e) {
201                logger.log(Level.SEVERE, "Exception parsing hosts", e);
202            } finally {
203                IOUtils.closeQuietly(bufferedReader);
204                IOUtils.closeQuietly(reader);
205            }
206        }
207    }
208
209    /**
210     * Split input string into tokens. Treats multiple whitespace as one.
211     * Only parse the number of tokens that are able to fit into the supplied token array.
212     * @param str split input string into tokens
213     * @param tokensArr supply a string array to be filled with tokens
214     * @return number of tokens inserted into the token array
215     */
216    public static int tokenize(String str, String[] tokensArr) {
217        int tokens = 0;
218        int idx = 0;
219        int pIdx;
220        while (tokens < tokensArr.length && idx < str.length()) {
221            while (idx < str.length() && Character.isWhitespace(str.charAt(idx))) {
222                ++idx;
223            }
224            pIdx = idx;
225            while (idx < str.length() && !Character.isWhitespace(str.charAt(idx))) {
226                ++idx;
227            }
228            if (idx > pIdx) {
229                tokensArr[tokens++] = str.substring(pIdx, idx);
230            }
231        }
232        return tokens;
233    }
234
235}