001/* 002 * #%L 003 * Netarchivesuite - harvester 004 * %% 005 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library, 006 * the National Library of France and the Austrian National Library. 007 * %% 008 * This program is free software: you can redistribute it and/or modify 009 * it under the terms of the GNU Lesser General Public License as 010 * published by the Free Software Foundation, either version 2.1 of the 011 * License, or (at your option) any later version. 012 * 013 * This program is distributed in the hope that it will be useful, 014 * but WITHOUT ANY WARRANTY; without even the implied warranty of 015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 016 * GNU General Lesser Public License for more details. 017 * 018 * You should have received a copy of the GNU General Lesser Public 019 * License along with this program. If not, see 020 * <http://www.gnu.org/licenses/lgpl-2.1.html>. 021 * #L% 022 */ 023package dk.netarkivet.harvester.harvesting; 024 025import org.archive.crawler.datamodel.CandidateURI; 026import org.archive.crawler.framework.CrawlController; 027import org.archive.crawler.frontier.HostnameQueueAssignmentPolicy; 028import org.apache.commons.logging.Log; 029import org.apache.commons.logging.LogFactory; 030 031import dk.netarkivet.common.utils.DomainUtils; 032 033/** 034 * Using the domain as the queue-name. The domain is defined as the last two names in the entire hostname or the 035 * entirety of an IP address. x.y.z -> y.z y.z -> y.z nn.nn.nn.nn -> nn.nn.nn.nn 036 */ 037public class DomainnameQueueAssignmentPolicy extends HostnameQueueAssignmentPolicy { 038 039 private static final Log log = LogFactory.getLog(DomainnameQueueAssignmentPolicy.class); 040 041 /** 042 * A key used for the cases when we can't figure out the URI. This is taken from parent, where it has private 043 * access. Parent returns this on things like about:blank. 044 */ 045 static final String DEFAULT_CLASS_KEY = "default..."; 046 047 /** 048 * Return a key for queue names based on domain names (last two parts of host name) or IP address. They key may 049 * include a #<portnr> at the end. 050 * 051 * @param controller The controller the crawl is running on. 052 * @param cauri A potential URI. 053 * @return a class key (really an arbitrary string), one of <domainOrIP>, <domainOrIP>#<port>, or "default...". 054 * @see HostnameQueueAssignmentPolicy#getClassKey(org.archive.crawler.framework.CrawlController, 055 * org.archive.crawler.datamodel.CandidateURI) 056 */ 057 public String getClassKey(CrawlController controller, CandidateURI cauri) { 058 String candidate; 059 try { 060 // Since getClassKey has no contract, we must encapsulate it from 061 // errors. 062 candidate = super.getClassKey(controller, cauri); 063 } catch (NullPointerException e) { 064 log.debug("Heritrix broke getting class key candidate for " + cauri); 065 candidate = DEFAULT_CLASS_KEY; 066 } 067 String[] hostnameandportnr = candidate.split("#"); 068 if (hostnameandportnr.length == 0 || hostnameandportnr.length > 2) { 069 return candidate; 070 } 071 String domainName = DomainUtils.domainNameFromHostname(hostnameandportnr[0]); 072 if (domainName == null) { // Not valid according to our rules 073 log.debug("Illegal class key candidate '" + candidate + "' for '" + cauri + "'" ); 074 return candidate; 075 } 076 return domainName; 077 } 078 079}