001/* File: $Id: DomainnameQueueAssignmentPolicy.java 2687 2013-05-03 16:38:47Z svc $ 002 * Revision: $Revision: 2687 $ 003 * Author: $Author: svc $ 004 * Date: $Date: 2013-05-03 18:38:47 +0200 (Fri, 03 May 2013) $ 005 * 006 * The Netarchive Suite - Software to harvest and preserve websites 007 * Copyright 2004-2018 The Royal Danish Library, 008 * the National Library of France and the Austrian 009 * National Library. 010 * 011 * This library is free software; you can redistribute it and/or 012 * modify it under the terms of the GNU Lesser General Public 013 * License as published by the Free Software Foundation; either 014 * version 2.1 of the License, or (at your option) any later version. 015 * 016 * This library is distributed in the hope that it will be useful, 017 * but WITHOUT ANY WARRANTY; without even the implied warranty of 018 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 019 * Lesser General Public License for more details. 020 * 021 * You should have received a copy of the GNU Lesser General Public 022 * License along with this library; if not, write to the Free Software 023 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 024 */ 025package dk.netarkivet.harvester.harvesting; 026 027import org.apache.commons.logging.Log; 028import org.apache.commons.logging.LogFactory; 029import org.archive.crawler.frontier.HostnameQueueAssignmentPolicy; 030import org.archive.net.UURI; 031 032import dk.netarkivet.common.utils.DomainUtils; 033 034/** 035 * Using the domain as the queue-name. 036 * The domain is defined as the last two names in the entire hostname or 037 * the entirety of an IP address. 038 * x.y.z -> y.z 039 * y.z -> y.z 040 * nn.nn.nn.nn -> nn.nn.nn.nn 041 * 042 */ 043public class DomainnameQueueAssignmentPolicy extends HostnameQueueAssignmentPolicy { 044 045 /** A key used for the cases when we can't figure out the URI. 046 * This is taken from parent, where it has private access. Parent returns 047 * this on things like about:blank. 048 */ 049 static final String DEFAULT_CLASS_KEY = "default..."; 050 051 private Log log = LogFactory.getLog(getClass()); 052 053 /** 054 * Return a key for queue names based on domain names (last two parts of 055 * host name) or IP address. They key may include a #<portnr> at the end. 056 * 057 * @param basis A potential URI. 058 * @return a class key (really an arbitrary string), one of <domainOrIP>, 059 * <domainOrIP>#<port>, or "default...". 060 * @see HostnameQueueAssignmentPolicy#getClassKey(org.archive.modules.CrawlURI) 061 */ 062 @Override 063 protected String getCoreKey(UURI basis) { 064 String candidate; 065 try { 066 candidate = super.getCoreKey(basis); 067 } catch (NullPointerException e) { 068 log.debug("Heritrix broke getting class key candidate for " + basis); 069 candidate = DEFAULT_CLASS_KEY; 070 } 071 if (candidate == null) { //FIXME the candidate should not be null with dns: schema 072 // is this a dns url? 073 if (basis.getScheme().equalsIgnoreCase("dns")) { 074 log.warn("The url is a dns-url '" + basis + "'. Returning: " + DEFAULT_CLASS_KEY); 075 } else { 076 log.warn("The url is not a dns-url '" + basis + "'. Returning: " + DEFAULT_CLASS_KEY); 077 } 078 return DEFAULT_CLASS_KEY; 079 } 080 081 String[] hostnameandportnr = candidate.split("#"); 082 083 if (hostnameandportnr.length == 0 || hostnameandportnr.length > 2) { 084 return candidate; 085 } 086 String domainName = DomainUtils.domainNameFromHostname(hostnameandportnr[0]); 087 if (domainName == null) { // Not valid according to our rules 088 log.debug("Illegal class key candidate '" + candidate + "' for '" + basis + "'"); 089 return candidate; 090 } 091 return domainName; 092 } 093 094}