/* $Id: OnNSDomainsDecideRule.java 2687 2013-05-03 16:38:47Z svc $
 * $Revision: 2687 $
 * $Date: 2013-05-03 18:38:47 +0200 (Fri, 03 May 2013) $
 * $Author: svc $
 *
 * The Netarchive Suite - Software to harvest and preserve websites
 * Copyright 2004-2018 The Royal Danish Library,
 * the National Library of France and the Austrian
 * National Library.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 */
package dk.netarkivet.harvester.harvesting;

import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.httpclient.URIException;
import org.archive.modules.CrawlURI;
import org.archive.modules.deciderules.surt.SurtPrefixedDecideRule;
import org.archive.net.UURI;
import org.archive.net.UURIFactory;
import org.archive.util.ArchiveUtils;
import org.archive.util.SurtPrefixSet;

import dk.netarkivet.common.exceptions.ArgumentNotValid;

/**
 * Class that re-creates the SurtPrefixSet to include only domain names
 * according to the domain definition of NetarchiveSuite.
 * The NetarchiveSuite can't use the
 * org.archive.crawler.deciderules.OnDomainsDecideRule because
 * it uses a different domain definition.
 */
public class OnNSDomainsDecideRule extends SurtPrefixedDecideRule {

    /**
     * This is what SurtPrefixSet.prefixFromPlain returns for
     * a non valid URI. Also returned by {@link #convertToDomain(String)}
     * when no domain can be derived from its argument.
     */
    public static final String NON_VALID_DOMAIN = "http://(http,)";

    /**
     * Pattern that matches the first part of a SURT: the literal
     * {@code http://(} followed by every character up to, but not
     * including, the first {@code ')'}.
     */
    public static final Pattern SURT_FIRSTPART_PATTERN
        = Pattern.compile("http\\://\\([^\\)]*");

    /** Number of leading characters ({@code http://(}) to strip from a match of the pattern. */
    private static final int SURT_HOST_OFFSET = "http://(".length();

    /** Logger for diagnostics; replaces printing stack traces to stderr. */
    private static final Logger LOG =
        Logger.getLogger(OnNSDomainsDecideRule.class.getName());

    /**
     * Constructor for the class OnNSDomainsDecideRule.
     * Makes the configured decision
     * for any URI which is inside one of the domains in the
     * configured set of domains - according to the domain
     * definition of the NetarchiveSuite system.
     * Giving that e.g. sports.tv2.dk will resolve to tv2.dk
     * but www.bbc.co.uk will resolve to bbc.co.uk.
     */
    public OnNSDomainsDecideRule() {
        super();
    }

    /**
     * Builds the prefix set in three steps: the inherited
     * {@code buildSurtPrefixSet()}, then {@link #myBuildSurtPrefixSet()}
     * to add the NetarchiveSuite domain prefixes, and finally the
     * inherited {@code dumpSurtPrefixSet()}.
     * Intended to replace the default readPrefixes, because we want to
     * make our own prefixes.
     */
    protected void readPrefixes() {
        buildSurtPrefixSet();
        myBuildSurtPrefixSet();
        dumpSurtPrefixSet();
    }

    /**
     * Method that extends the SurtPrefixSet with the topmost
     * domains - according to the domain definition in NetarchiveSuite -
     * of every SURT already present in the set.
     * Presumably only run once, during the startup phase.
     */
    protected void myBuildSurtPrefixSet() {
        // Iterate over a copy, because new prefixes are added to
        // surtPrefixes inside the loop.
        SurtPrefixSet originalPrefixes = (SurtPrefixSet) surtPrefixes.clone();

        for (String surt : originalPrefixes) {
            Matcher firstPart = SURT_FIRSTPART_PATTERN.matcher(surt);
            if (!firstPart.find()) {
                continue;
            }
            // Cut off "http://(" (https:// are converted by heritrix classes),
            // leaving the comma-separated, reversed host labels.
            String hostpart = firstPart.group().substring(SURT_HOST_OFFSET);

            // Add the new domain name to surtPrefixes.
            // Per the original author: since these are always shorter SURTs
            // than the originals, they will automatically override the
            // longer ones (built in SURTs logic).
            surtPrefixes.add(prefixFrom(reverseSurtHost(hostpart)));
        }
    }

    /**
     * Turns the comma-separated host part of a SURT (labels in reverse
     * order, e.g. {@code dk,tv2,sports,}) into a dotted name with the
     * labels in the opposite order (e.g. {@code sports.tv2.dk}).
     *
     * @param hostpart the text found between {@code http://(} and {@code )}
     * @return the labels joined by '.', last label first
     */
    private static String reverseSurtHost(String hostpart) {
        // String.split drops trailing empty strings, so a trailing ','
        // does not produce an empty label.
        String[] labels = hostpart.split(",");
        StringBuilder name = new StringBuilder();
        for (int i = labels.length - 1; i >= 0; i--) {
            name.append(labels[i]);
            if (i != 0) {
                name.append(".");
            }
        }
        return name.toString();
    }

    /**
     * Generate the SURT prefix that matches the domain definition
     * of NetarchiveSuite.
     *
     * @param uri URL to convert to SURT
     * @return String with SURT that matches the domain definition
     * of NetarchiveSuite
     */
    protected String prefixFrom(String uri) {
        uri = ArchiveUtils.addImpliedHttpIfNecessary(uri);
        //TODO is this correct now ?
        return SurtPrefixSet.prefixFromPlainForceHttp(convertToDomain(uri));
    }

    /**
     * Convert a URI to its domain.
     *
     * @param uri URL to convert to Top most domain-name according to
     * NetarchiveSuite definition
     * @return the class key computed by
     * {@code DomainnameQueueAssignmentPolicy} for the URI, or
     * {@link #NON_VALID_DOMAIN} if the URI cannot be parsed or the
     * policy fails with an exception
     * @throws ArgumentNotValid presumably thrown by the argument check
     * if uri is null or empty
     */
    public static String convertToDomain(String uri) {
        ArgumentNotValid.checkNotNullOrEmpty(uri, "String uri");
        DomainnameQueueAssignmentPolicy policy
            = new DomainnameQueueAssignmentPolicy();

        final UURI uuri;
        try {
            uuri = UURIFactory.getInstance(uri);
        } catch (URIException e) {
            // getClassKey needs a CrawlURI, so there is no way to continue
            // with the raw string. Answer with the non-matching marker right
            // away instead of handing a null UURI to the policy.
            LOG.log(Level.WARNING, "Unable to parse URI '" + uri
                    + "'; treating it as a non-valid domain", e);
            return NON_VALID_DOMAIN;
        }

        try {
            return policy.getClassKey(new CrawlURI(uuri));
        } catch (Exception e) {
            // illegal URI - return a SURT that will not match any real URIs.
            // Only Exception is caught, so that JVM Errors are not
            // disguised as an invalid domain.
            LOG.log(Level.FINE, "No class key could be computed for URI '"
                    + uri + "'", e);
            return NON_VALID_DOMAIN;
        }
    }
}