001/* 002 * #%L 003 * Netarchivesuite - harvester 004 * %% 005 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library, 006 * the National Library of France and the Austrian National Library. 007 * %% 008 * This program is free software: you can redistribute it and/or modify 009 * it under the terms of the GNU Lesser General Public License as 010 * published by the Free Software Foundation, either version 2.1 of the 011 * License, or (at your option) any later version. 012 * 013 * This program is distributed in the hope that it will be useful, 014 * but WITHOUT ANY WARRANTY; without even the implied warranty of 015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 016 * GNU General Lesser Public License for more details. 017 * 018 * You should have received a copy of the GNU General Lesser Public 019 * License along with this program. If not, see 020 * <http://www.gnu.org/licenses/lgpl-2.1.html>. 021 * #L% 022 */ 023package dk.netarkivet.harvester.harvesting; 024 025import java.util.regex.Matcher; 026import java.util.regex.Pattern; 027 028import org.apache.commons.httpclient.URIException; 029import org.archive.crawler.datamodel.CandidateURI; 030import org.archive.crawler.deciderules.SurtPrefixedDecideRule; 031import org.archive.net.UURIFactory; 032import org.archive.util.ArchiveUtils; 033import org.archive.util.SurtPrefixSet; 034 035/** 036 * Class that re-creates the SurtPrefixSet to include only domain names according to the domain definition of 037 * NetarchiveSuite. The NetarchiveSuite can't use the org.archive.crawler.deciderules.OnDomainsDecideRule because it 038 * uses a different domain definition. 039 */ 040@SuppressWarnings({"serial"}) 041public class OnNSDomainsDecideRule extends SurtPrefixedDecideRule { 042 043 /** 044 * This is what SurtPrefixSet.prefixFromPlain returns for a non valid URI. 045 */ 046 public static final String NON_VALID_DOMAIN = "http://(http,)"; 047 048 /** Pattern that matches the first part of SURT - until ?? */ 049 public static final Pattern SURT_FIRSTPART_PATTERN = Pattern.compile("http\\://\\([^\\)]*"); 050 051 /** 052 * Constructor for the class OnNSDomainsDecideRule. 053 * 054 * @param s The name of this DecideRule 055 */ 056 public OnNSDomainsDecideRule(String s) { 057 super(s); 058 setDescription("OnNSDomainsDecideRule. Makes the configured decision " 059 + "for any URI which is inside one of the domains in the " 060 + "configured set of domains - according to the domain " + "definition of the NetarchiveSuite system. " 061 + "Giving that e.g. sports.tv2.dk will resolve to tv2.dk" 062 + " but www.bbc.co.uk will resolve to bbc.co.uk"); 063 } 064 065 /** 066 * We override the default readPrefixes, because we want to make our prefixes. 067 */ 068 protected void readPrefixes() { 069 buildSurtPrefixSet(); 070 myBuildSurtPrefixSet(); 071 dumpSurtPrefixSet(); 072 } 073 074 /** 075 * Method that rebuilds the SurtPrefixSet to include only topmost domains - according to the domain definition in 076 * NetarchiveSuite. This is only done once, during the startup phase? 077 */ 078 protected void myBuildSurtPrefixSet() { 079 // make copy of original SurtPrefixSet to loop 080 SurtPrefixSet newSurtPrefixes = (SurtPrefixSet) surtPrefixes.clone(); 081 // pattern that matches first part of SURT 082 083 // loop all original SURTs 084 for (String s : newSurtPrefixes) { 085 Matcher m = SURT_FIRSTPART_PATTERN.matcher(s); 086 if (m.find()) { 087 // cut off http:// (https:// are converted by heritrix classes) 088 String hostpart = m.group().substring(8); 089 // split in hostname/domainname/TLD parts 090 String[] parts = hostpart.split(","); 091 StringBuilder domnameBuilder = new StringBuilder(); 092 // loop through parts in reverse order - add '.' 093 // (not after last part) 094 for (int j = parts.length - 1; j >= 0; j--) { 095 domnameBuilder.append(parts[j]); 096 if (j != 0) { 097 domnameBuilder.append("."); 098 } 099 } 100 // add the new domain name to surtPrefixes 101 // since this is always shorter SURTs than the originals 102 // they will automatically 103 // override longer ones (built in SURTs logic) 104 surtPrefixes.add(prefixFrom(domnameBuilder.toString())); 105 } 106 } 107 } 108 109 /** 110 * Generate the SURT prefix that matches the domain definition of NetarchiveSuite. 111 * 112 * @param uri URL to convert to SURT 113 * @return String with SURT that matches the domain definition of NetarchiveSuite 114 */ 115 protected String prefixFrom(String uri) { 116 uri = ArchiveUtils.addImpliedHttpIfNecessary(uri); 117 return SurtPrefixSet.prefixFromPlain(convertToDomain(uri)); 118 } 119 120 /** 121 * Convert a URI to its domain. 122 * 123 * @param uri URL to convert to Top most domain-name according to NetarchiveSuite definition 124 * @return Domain name 125 */ 126 public static String convertToDomain(String uri) { 127 if (uri == null || uri.isEmpty()) { 128 throw new IllegalArgumentException("The value of the variable 'String uri' must not be an empty string."); 129 } 130 DomainnameQueueAssignmentPolicy policy = new DomainnameQueueAssignmentPolicy(); 131 String u = uri; 132 try { 133 u = UURIFactory.getInstance(uri).toString(); 134 } catch (URIException e) { 135 e.printStackTrace(); 136 // allow to continue with original string uri 137 } 138 try { 139 return policy.getClassKey(null, CandidateURI.fromString(u.toString())); 140 } catch (URIException e) { 141 // illegal URI - return a SURT that will not match any real URIs 142 return NON_VALID_DOMAIN; 143 } 144 } 145 146}