001/* 002 * #%L 003 * Netarchivesuite - harvester 004 * %% 005 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library, 006 * the National Library of France and the Austrian National Library. 007 * %% 008 * This program is free software: you can redistribute it and/or modify 009 * it under the terms of the GNU Lesser General Public License as 010 * published by the Free Software Foundation, either version 2.1 of the 011 * License, or (at your option) any later version. 012 * 013 * This program is distributed in the hope that it will be useful, 014 * but WITHOUT ANY WARRANTY; without even the implied warranty of 015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 016 * GNU General Lesser Public License for more details. 017 * 018 * You should have received a copy of the GNU General Lesser Public 019 * License along with this program. If not, see 020 * <http://www.gnu.org/licenses/lgpl-2.1.html>. 021 * #L% 022 */ 023package dk.netarkivet.harvester.harvesting; 024 025import java.util.NoSuchElementException; 026 027import org.apache.commons.httpclient.URIException; 028import org.apache.commons.logging.Log; 029import org.apache.commons.logging.LogFactory; 030import org.archive.crawler.datamodel.CandidateURI; 031import org.archive.crawler.datamodel.CoreAttributeConstants; 032import org.archive.crawler.framework.CrawlController; 033import org.archive.crawler.frontier.HostnameQueueAssignmentPolicy; 034import org.archive.net.UURIFactory; 035 036import dk.netarkivet.common.utils.DomainUtils; 037 038/** 039 * This is a modified version of the {@link DomainnameQueueAssignmentPolicy} where domainname returned is the domainname 040 * of the candidateURI except where the domainname of the SeedURI is a different one. 041 * <p> 042 * <p> 043 * Using the domain as the queue-name. The domain is defined as the last two names in the entire hostname or the 044 * entirety of an IP address. x.y.z -> y.z y.z -> y.z nn.nn.nn.nn -> nn.nn.nn.nn 045 */ 046public class SeedUriDomainnameQueueAssignmentPolicy extends HostnameQueueAssignmentPolicy { 047 048 private static final Log log = LogFactory.getLog(SeedUriDomainnameQueueAssignmentPolicy.class); 049 050 /** 051 * A key used for the cases when we can't figure out the URI. This is taken from parent, where it has private 052 * access. Parent returns this on things like about:blank. 053 */ 054 static final String DEFAULT_CLASS_KEY = "default..."; 055 056 /** 057 * Return a key for queue names based on domain names (last two parts of host name) or IP address. They key may 058 * include a #<portnr> at the end. 059 * 060 * @param controller The controller the crawl is running on. 061 * @param cauri A potential URI. 062 * @return a class key (really an arbitrary string), one of <domainOrIP>, <domainOrIP>#<port>, or "default...". 063 * @see HostnameQueueAssignmentPolicy#getClassKey(org.archive.crawler.framework.CrawlController, 064 * org.archive.crawler.datamodel.CandidateURI) 065 */ 066 public String getClassKey(CrawlController controller, CandidateURI cauri) { 067 String candidate; 068 069 boolean ignoreSourceSeed = cauri != null && cauri.getCandidateURIString().startsWith("dns"); 070 try { 071 // Since getClassKey has no contract, we must encapsulate it from 072 // errors. 073 candidate = super.getClassKey(controller, cauri); 074 } catch (NullPointerException e) { 075 log.debug("Heritrix broke getting class key candidate for " + cauri); 076 candidate = DEFAULT_CLASS_KEY; 077 } 078 079 String sourceSeedCandidate = null; 080 if (!ignoreSourceSeed) { 081 sourceSeedCandidate = getCandidateFromSource(cauri); 082 } 083 084 if (sourceSeedCandidate != null) { 085 return sourceSeedCandidate; 086 } else { 087 // If sourceSeedCandidates are disabled, use the old method: 088 089 String[] hostnameandportnr = candidate.split("#"); 090 if (hostnameandportnr.length == 0 || hostnameandportnr.length > 2) { 091 return candidate; 092 } 093 094 String domainName = DomainUtils.domainNameFromHostname(hostnameandportnr[0]); 095 if (domainName == null) { // Not valid according to our rules 096 log.debug("Illegal class key candidate '" + candidate + "' for '" + cauri + "'"); 097 return candidate; 098 } 099 return domainName; 100 } 101 } 102 103 /** 104 * Find a candidate from the source. 105 * 106 * @param cauri A potential URI 107 * @return a candidate from the source or null if none found 108 */ 109 private String getCandidateFromSource(CandidateURI cauri) { 110 String sourceCandidate = null; 111 try { 112 sourceCandidate = cauri.getString(CoreAttributeConstants.A_SOURCE_TAG); 113 } catch (NoSuchElementException e) { 114 log.warn("source-tag-seeds not set in Heritrix template!"); 115 return null; 116 } 117 118 String hostname = null; 119 try { 120 hostname = UURIFactory.getInstance(sourceCandidate).getHost(); 121 } catch (URIException e) { 122 log.warn("Hostname could not be extracted from sourceCandidate: " + sourceCandidate); 123 return null; 124 } 125 return DomainUtils.domainNameFromHostname(hostname); 126 } 127 128}