Source code

001/* File:        $Id: SeedUriDomainnameQueueAssignmentPolicy.java 2688 2013-05-05 18:58:18Z svc $
002 * Revision:    $Revision: 2688 $
003 * Author:      $Author: svc $
004 * Date:        $Date: 2013-05-05 20:58:18 +0200 (Sun, 05 May 2013) $
005 *
006 * The Netarchive Suite - Software to harvest and preserve websites
007 * Copyright 2004-2018 The Royal Danish Library,
008 * the National Library of France and the Austrian
009 * National Library.
010 *
011 * This library is free software; you can redistribute it and/or
012 * modify it under the terms of the GNU Lesser General Public
013 * License as published by the Free Software Foundation; either
014 * version 2.1 of the License, or (at your option) any later version.
015 *
016 * This library is distributed in the hope that it will be useful,
017 * but WITHOUT ANY WARRANTY; without even the implied warranty of
018 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
019 * Lesser General Public License for more details.
020 *
021 * You should have received a copy of the GNU Lesser General Public
022 * License along with this library; if not, write to the Free Software
023 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
024 */
025package dk.netarkivet.harvester.harvesting;
026
027import org.apache.commons.logging.Log;
028import org.apache.commons.logging.LogFactory;
029import org.archive.crawler.frontier.HostnameQueueAssignmentPolicy;
030import org.archive.modules.CrawlURI;
031import org.archive.net.UURIFactory;
032
033import dk.netarkivet.common.utils.DomainUtils;
034
035/**
036 * This is a modified version of the {@link DomainnameQueueAssignmentPolicy}
037 * where domainname returned is the domainname of the candidateURI
038 * except where the the SeedURI belongs to a different domain. 
039 *
040 * Using the domain as the queue-name.
041 * The domain is defined as the last two names in the entire hostname or
042 * the entirety of an IP address.
043 * x.y.z -> y.z
044 * y.z -> y.z
045 * nn.nn.nn.nn -> nn.nn.nn.nn
046 * 
047 */
048public class SeedUriDomainnameQueueAssignmentPolicy extends HostnameQueueAssignmentPolicy {
049    
050    /** A key used for the cases when we can't figure out the URI.
051     *  This is taken from parent, where it has private access.  Parent returns
052     *  this on things like about:blank.
053     */
054    static final String DEFAULT_CLASS_KEY = "default...";
055
056    private Log log = LogFactory.getLog(getClass());
057
058
059    /**
060     * The logic is as follows:
061     * We get try to get the queue-name as the domain-name of the seed.
062     * If that fails, or if the uri is a dns entry, we use the "old" logic which is
063     * to take the key from the superclass (in the form host#port or just host) and extract
064     * a domain-name from that. If all that fails, we fall back to a default value,
065     *
066     * In practice this means that dns-lookups for non-seed uris each get their own
067     * queue, which is then never used again. This seems like a good idea because the
068     * frontier needs to be able to prioritise dns lookups.
069     *
070     * @param cauri The crawl URI from which to find the key.
071     * @return the key value
072     */
073    public String getClassKey(CrawlURI cauri) {
074        log.debug("Finding classKey for cauri: " + cauri);
075        String key = null;
076        if (!isDns(cauri)) {
077            key = getKeyFromSeed(cauri);
078        }
079        if (key == null) {
080            key = getKeyFromUriHostname(cauri);
081        }
082        if (key != null) {
083            return key;
084        } else {
085            return DEFAULT_CLASS_KEY;
086        }
087    }
088
089    private boolean isDns(CrawlURI cauri) {
090        return cauri != null && cauri.getCanonicalString().startsWith("dns");
091    }
092
093    /**
094     * Returns the domain name extracted from the URI being crawled itself, without reference to its seed.
095     * @param cauri the uri being crawled.
096     * @return the domain name, if it can be determined. Otherwise null.
097     */
098    private String getKeyFromUriHostname(CrawlURI cauri) {
099        String key = null;
100        try {
101            key = super.getClassKey(cauri);
102        }  catch (NullPointerException e) {
103            log.debug("Heritrix broke getting class key candidate for " + cauri);
104        }
105        if (key != null) {
106            String[] hostnameandportnr = key.split("#");
107            if (hostnameandportnr.length == 1 || hostnameandportnr.length == 2) {
108                key = DomainUtils.domainNameFromHostname(hostnameandportnr[0]);
109            } else {
110                log.debug("Illegal class key candidate from superclass: '" + key + "' for '" + cauri + "'");
111                key = null;
112            }
113        }
114        return key;
115    }
116
117    /**
118     * The bean property &lt;property name="sourceTagSeeds" value="true" /&gt; on the TextSeedModule bean in the
119     * heritrix crawler beans, should ensure that the seed is made available in every CrawlURI reached from that seed.
120     * @param cauri the CrawlURI
121     * @return the domain of the seed, if it can be determined. Otherwise null.
122     */
123    private String getKeyFromSeed(CrawlURI cauri) {
124        String key = null;
125        try {
126            key = DomainUtils.domainNameFromHostname(UURIFactory.getInstance(cauri.getSourceTag()).getHost());
127        } catch (Exception e) {
128            e.printStackTrace();
129        }
130        return key;
131    }
132
133}