001/*
002 * #%L
003 * Netarchivesuite - harvester
004 * %%
005 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
006 *             the National Library of France and the Austrian National Library.
007 * %%
008 * This program is free software: you can redistribute it and/or modify
009 * it under the terms of the GNU Lesser General Public License as
010 * published by the Free Software Foundation, either version 2.1 of the
011 * License, or (at your option) any later version.
012 * 
013 * This program is distributed in the hope that it will be useful,
014 * but WITHOUT ANY WARRANTY; without even the implied warranty of
015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
016 * GNU General Lesser Public License for more details.
017 * 
018 * You should have received a copy of the GNU General Lesser Public
019 * License along with this program.  If not, see
020 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
021 * #L%
022 */
023package dk.netarkivet.harvester.harvesting;
024
025import org.archive.crawler.datamodel.CandidateURI;
026import org.archive.crawler.framework.CrawlController;
027import org.archive.crawler.frontier.HostnameQueueAssignmentPolicy;
028import org.apache.commons.logging.Log;
029import org.apache.commons.logging.LogFactory;
030
031import dk.netarkivet.common.utils.DomainUtils;
032
033/**
034 * Using the domain as the queue-name. The domain is defined as the last two names in the entire hostname or the
035 * entirety of an IP address. x.y.z -> y.z y.z -> y.z nn.nn.nn.nn -> nn.nn.nn.nn
036 */
037public class DomainnameQueueAssignmentPolicy extends HostnameQueueAssignmentPolicy {
038
039    private static final Log log = LogFactory.getLog(DomainnameQueueAssignmentPolicy.class);
040
041    /**
042     * A key used for the cases when we can't figure out the URI. This is taken from parent, where it has private
043     * access. Parent returns this on things like about:blank.
044     */
045    static final String DEFAULT_CLASS_KEY = "default...";
046
047    /**
048     * Return a key for queue names based on domain names (last two parts of host name) or IP address. They key may
049     * include a #<portnr> at the end.
050     *
051     * @param controller The controller the crawl is running on.
052     * @param cauri A potential URI.
053     * @return a class key (really an arbitrary string), one of <domainOrIP>, <domainOrIP>#<port>, or "default...".
054     * @see HostnameQueueAssignmentPolicy#getClassKey(org.archive.crawler.framework.CrawlController,
055     * org.archive.crawler.datamodel.CandidateURI)
056     */
057    public String getClassKey(CrawlController controller, CandidateURI cauri) {
058        String candidate;
059        try {
060            // Since getClassKey has no contract, we must encapsulate it from
061            // errors.
062            candidate = super.getClassKey(controller, cauri);
063        } catch (NullPointerException e) {
064            log.debug("Heritrix broke getting class key candidate for " + cauri);
065            candidate = DEFAULT_CLASS_KEY;
066        }
067        String[] hostnameandportnr = candidate.split("#");
068        if (hostnameandportnr.length == 0 || hostnameandportnr.length > 2) {
069            return candidate;
070        }
071        String domainName = DomainUtils.domainNameFromHostname(hostnameandportnr[0]);
072        if (domainName == null) { // Not valid according to our rules
073            log.debug("Illegal class key candidate '" + candidate + "' for '" + cauri + "'" );
074            return candidate;
075        }
076        return domainName;
077    }
078
079}