001/* File: $Id: SeedUriDomainnameQueueAssignmentPolicy.java 2688 2013-05-05 18:58:18Z svc $ 002 * Revision: $Revision: 2688 $ 003 * Author: $Author: svc $ 004 * Date: $Date: 2013-05-05 20:58:18 +0200 (Sun, 05 May 2013) $ 005 * 006 * The Netarchive Suite - Software to harvest and preserve websites 007 * Copyright 2004-2018 The Royal Danish Library, 008 * the National Library of France and the Austrian 009 * National Library. 010 * 011 * This library is free software; you can redistribute it and/or 012 * modify it under the terms of the GNU Lesser General Public 013 * License as published by the Free Software Foundation; either 014 * version 2.1 of the License, or (at your option) any later version. 015 * 016 * This library is distributed in the hope that it will be useful, 017 * but WITHOUT ANY WARRANTY; without even the implied warranty of 018 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 019 * Lesser General Public License for more details. 020 * 021 * You should have received a copy of the GNU Lesser General Public 022 * License along with this library; if not, write to the Free Software 023 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 024 */ 025package dk.netarkivet.harvester.harvesting; 026 027import org.apache.commons.logging.Log; 028import org.apache.commons.logging.LogFactory; 029import org.archive.crawler.frontier.HostnameQueueAssignmentPolicy; 030import org.archive.modules.CrawlURI; 031import org.archive.net.UURIFactory; 032 033import dk.netarkivet.common.utils.DomainUtils; 034 035/** 036 * This is a modified version of the {@link DomainnameQueueAssignmentPolicy} 037 * where domainname returned is the domainname of the candidateURI 038 * except where the the SeedURI belongs to a different domain. 039 * 040 * Using the domain as the queue-name. 041 * The domain is defined as the last two names in the entire hostname or 042 * the entirety of an IP address. 043 * x.y.z -> y.z 044 * y.z -> y.z 045 * nn.nn.nn.nn -> nn.nn.nn.nn 046 * 047 */ 048public class SeedUriDomainnameQueueAssignmentPolicy extends HostnameQueueAssignmentPolicy { 049 050 /** A key used for the cases when we can't figure out the URI. 051 * This is taken from parent, where it has private access. Parent returns 052 * this on things like about:blank. 053 */ 054 static final String DEFAULT_CLASS_KEY = "default..."; 055 056 private Log log = LogFactory.getLog(getClass()); 057 058 059 /** 060 * The logic is as follows: 061 * We get try to get the queue-name as the domain-name of the seed. 062 * If that fails, or if the uri is a dns entry, we use the "old" logic which is 063 * to take the key from the superclass (in the form host#port or just host) and extract 064 * a domain-name from that. If all that fails, we fall back to a default value, 065 * 066 * In practice this means that dns-lookups for non-seed uris each get their own 067 * queue, which is then never used again. This seems like a good idea because the 068 * frontier needs to be able to prioritise dns lookups. 069 * 070 * @param cauri The crawl URI from which to find the key. 071 * @return the key value 072 */ 073 public String getClassKey(CrawlURI cauri) { 074 log.debug("Finding classKey for cauri: " + cauri); 075 String key = null; 076 if (!isDns(cauri)) { 077 key = getKeyFromSeed(cauri); 078 } 079 if (key == null) { 080 key = getKeyFromUriHostname(cauri); 081 } 082 if (key != null) { 083 return key; 084 } else { 085 return DEFAULT_CLASS_KEY; 086 } 087 } 088 089 private boolean isDns(CrawlURI cauri) { 090 return cauri != null && cauri.getCanonicalString().startsWith("dns"); 091 } 092 093 /** 094 * Returns the domain name extracted from the URI being crawled itself, without reference to its seed. 095 * @param cauri the uri being crawled. 096 * @return the domain name, if it can be determined. Otherwise null. 097 */ 098 private String getKeyFromUriHostname(CrawlURI cauri) { 099 String key = null; 100 try { 101 key = super.getClassKey(cauri); 102 } catch (NullPointerException e) { 103 log.debug("Heritrix broke getting class key candidate for " + cauri); 104 } 105 if (key != null) { 106 String[] hostnameandportnr = key.split("#"); 107 if (hostnameandportnr.length == 1 || hostnameandportnr.length == 2) { 108 key = DomainUtils.domainNameFromHostname(hostnameandportnr[0]); 109 } else { 110 log.debug("Illegal class key candidate from superclass: '" + key + "' for '" + cauri + "'"); 111 key = null; 112 } 113 } 114 return key; 115 } 116 117 /** 118 * The bean property <property name="sourceTagSeeds" value="true" /> on the TextSeedModule bean in the 119 * heritrix crawler beans, should ensure that the seed is made available in every CrawlURI reached from that seed. 120 * @param cauri the CrawlURI 121 * @return the domain of the seed, if it can be determined. Otherwise null. 122 */ 123 private String getKeyFromSeed(CrawlURI cauri) { 124 String key = null; 125 try { 126 key = DomainUtils.domainNameFromHostname(UURIFactory.getInstance(cauri.getSourceTag()).getHost()); 127 } catch (Exception e) { 128 e.printStackTrace(); 129 } 130 return key; 131 } 132 133}