001/* File:        $Id: DomainnameQueueAssignmentPolicy.java 2687 2013-05-03 16:38:47Z svc $
002 * Revision:    $Revision: 2687 $
003 * Author:      $Author: svc $
004 * Date:        $Date: 2013-05-03 18:38:47 +0200 (Fri, 03 May 2013) $
005 *
006 * The Netarchive Suite - Software to harvest and preserve websites
007 * Copyright 2004-2018 The Royal Danish Library,
008 * the National Library of France and the Austrian
009 * National Library.
010 *
011 * This library is free software; you can redistribute it and/or
012 * modify it under the terms of the GNU Lesser General Public
013 * License as published by the Free Software Foundation; either
014 * version 2.1 of the License, or (at your option) any later version.
015 *
016 * This library is distributed in the hope that it will be useful,
017 * but WITHOUT ANY WARRANTY; without even the implied warranty of
018 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
019 * Lesser General Public License for more details.
020 *
021 * You should have received a copy of the GNU Lesser General Public
022 * License along with this library; if not, write to the Free Software
023 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
024 */
025package dk.netarkivet.harvester.harvesting;
026
027import org.apache.commons.logging.Log;
028import org.apache.commons.logging.LogFactory;
029import org.archive.crawler.frontier.HostnameQueueAssignmentPolicy;
030import org.archive.net.UURI;
031
032import dk.netarkivet.common.utils.DomainUtils;
033
034/**
035 * Using the domain as the queue-name.
036 * The domain is defined as the last two names in the entire hostname or
037 * the entirety of an IP address.
038 * x.y.z -> y.z
039 * y.z -> y.z
040 * nn.nn.nn.nn -> nn.nn.nn.nn
041 *  
042 */
043public class DomainnameQueueAssignmentPolicy extends HostnameQueueAssignmentPolicy {
044
045        /** A key used for the cases when we can't figure out the URI.
046     *  This is taken from parent, where it has private access.  Parent returns
047     *  this on things like about:blank.
048     */
049    static final String DEFAULT_CLASS_KEY = "default...";
050
051    private Log log = LogFactory.getLog(getClass());
052
053    /**
054     * Return a key for queue names based on domain names (last two parts of
055     * host name) or IP address.  They key may include a #<portnr> at the end.
056     *
057     * @param basis A potential URI.
058     * @return a class key (really an arbitrary string), one of <domainOrIP>,
059     * <domainOrIP>#<port>, or "default...".
060     * @see HostnameQueueAssignmentPolicy#getClassKey(org.archive.modules.CrawlURI)
061     */
062    @Override
063    protected String getCoreKey(UURI basis) {
064        String candidate; 
065        try {
066            candidate = super.getCoreKey(basis);
067        } catch (NullPointerException e) {
068            log.debug("Heritrix broke getting class key candidate for " + basis);
069            candidate = DEFAULT_CLASS_KEY;
070        }
071        if (candidate == null) { //FIXME the candidate should not be null with dns: schema
072                // is this a dns url?
073                if (basis.getScheme().equalsIgnoreCase("dns")) {
074                        log.warn("The url is a dns-url '" + basis + "'. Returning: " +  DEFAULT_CLASS_KEY);
075                } else {
076                        log.warn("The url is not a dns-url '" + basis + "'. Returning: " +  DEFAULT_CLASS_KEY);
077                }
078                return DEFAULT_CLASS_KEY;
079        }
080        
081        String[] hostnameandportnr = candidate.split("#");
082        
083        if (hostnameandportnr.length == 0 || hostnameandportnr.length > 2) {
084            return candidate;
085        }
086        String domainName = DomainUtils.domainNameFromHostname(hostnameandportnr[0]);
087        if (domainName == null) { // Not valid according to our rules
088            log.debug("Illegal class key candidate '" + candidate + "' for '" + basis + "'");
089            return candidate;
090        }
091        return domainName;
092    }
093 
094}