/*
 * #%L
 * Netarchivesuite - Heritrix 3 extensions
 * %%
 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
 *             the National Library of France and the Austrian National Library.
 * %%
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation, either version 2.1 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Lesser Public License for more details.
 *
 * You should have received a copy of the GNU General Lesser Public
 * License along with this program.  If not, see
 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
 * #L%
 */
package dk.netarkivet.harvester.harvesting;

import java.util.logging.Logger;

import org.archive.modules.CrawlURI;
import org.archive.modules.deciderules.surt.SurtPrefixedDecideRule;

/**
 * Extended <code>SurtPrefixedDecideRule</code> class.
 * Enables SURT seeds that allow sub-domains if the original seed URI has no path at all.
 * Can also add/remove {@code www}/{@code www[<x>]} from the SURT and/or also add the original SURT seed.
 *
 * @author nicl
 */
public class NASSurtPrefixedDecideRule extends SurtPrefixedDecideRule {

    /**
     * Serialization version identifier.
     */
    private static final long serialVersionUID = 3334790462876505839L;

    /** Logger instance. */
    private static final Logger logger = Logger.getLogger(NASSurtPrefixedDecideRule.class.getName());

    /** Separator between the scheme and the rest of a URI/SURT. */
    private static final String SCHEME_SEPARATOR = "://";

    /**
     * Enable/Disable the removing of a preceding {@code www[<x>]} in SURT host if present.
     */
    protected boolean removeW3xSubDomain = true;

    public boolean getRemoveW3xSubDomain() {
        return removeW3xSubDomain;
    }

    public void setRemoveW3xSubDomain(boolean removeW3xSubDomain) {
        this.removeW3xSubDomain = removeW3xSubDomain;
    }

    /**
     * Enable/Disable the adding of the original SURT before removing the preceding {@code www[<x>]}.
     */
    protected boolean addBeforeRemovingW3xSubDomain = true;

    public boolean getAddBeforeRemovingW3xSubDomain() {
        return addBeforeRemovingW3xSubDomain;
    }

    public void setAddBeforeRemovingW3xSubDomain(boolean addBeforeRemovingW3xSubDomain) {
        this.addBeforeRemovingW3xSubDomain = addBeforeRemovingW3xSubDomain;
    }

    /**
     * Enable/Disable the adding of a preceding www in SURT host if none is present.
     */
    protected boolean addW3SubDomain = true;

    public boolean getAddW3SubDomain() {
        return addW3SubDomain;
    }

    public void setAddW3SubDomain(boolean addW3SubDomain) {
        this.addW3SubDomain = addW3SubDomain;
    }

    /**
     * Enable/Disable the adding of the original SURT before adding a preceding www.
     */
    protected boolean addBeforeAddingW3SubDomain = true;

    public boolean getAddBeforeAddingW3SubDomain() {
        return addBeforeAddingW3SubDomain;
    }

    public void setAddBeforeAddingW3SubDomain(boolean addBeforeAddingW3SubDomain) {
        this.addBeforeAddingW3SubDomain = addBeforeAddingW3SubDomain;
    }

    /**
     * Enable/Disable the removing of ')/' in the SURT if the original URI does not have a path at all.
     */
    protected boolean allowSubDomainsRewrite = true;

    public boolean getAllowSubDomainsRewrite() {
        return allowSubDomainsRewrite;
    }

    public void setAllowSubDomainsRewrite(boolean allowSubDomainsRewrite) {
        this.allowSubDomainsRewrite = allowSubDomainsRewrite;
    }

    /**
     * Registers the SURT prefix(es) derived from a newly added seed, but only when
     * seeds are configured to be used as SURT prefixes.
     *
     * @param curi the seed that was added
     */
    @Override
    public void addedSeed(final CrawlURI curi) {
        if (getSeedsAsSurtPrefixes()) {
            addedSeedImpl(curi);
        }
    }

    /**
     * <code>addedSeed</code> implementation method to facilitate unit testing.
     * <p>
     * Converts the seed URI to a SURT prefix, applies the configured www / sub-domain
     * rewrites and adds the resulting prefix(es) to the prefix set. The source tag of
     * the <code>CrawlURI</code> is used as the original seed URI when deciding whether
     * the seed had a path.
     *
     * @param curi <code>CrawlURI</code> object to convert
     * @return URI converted to SURT string (the last prefix added), or <code>null</code>
     *         if no SURT prefix could be derived, in which case nothing is added
     */
    protected String addedSeedImpl(final CrawlURI curi) {
        final String originalUri = curi.getSourceTag();
        if (originalUri == null && allowSubDomainsRewrite) {
            logger.warning("originalUri not available");
        }
        final String defaultSurt = prefixFrom(curi.getURI());
        if (defaultSurt == null) {
            // Never hand a null prefix to the prefix set; report it and keep the null return value.
            logger.warning("no SURT prefix could be derived from seed, nothing added");
            return null;
        }
        final String surt = rewriteSurt(defaultSurt, originalUri);
        surtPrefixes.add(surt);
        return surt;
    }

    /**
     * Applies the configured rewrites to a SURT of the shape
     * <code>scheme://(host[:port])path</code>. A SURT that does not have that shape
     * is returned unchanged.
     * <p>
     * When one of the "add before" options is enabled, the SURT as it looked before the
     * www rewrite is added to the prefix set as a side effect.
     *
     * @param defaultSurt non-null SURT as produced by <code>prefixFrom</code>
     * @param originalUri original seed URI, may be <code>null</code>
     * @return the rewritten SURT, not yet added to the prefix set
     */
    private String rewriteSurt(final String defaultSurt, final String originalUri) {
        final int schemeEnd = defaultSurt.indexOf(SCHEME_SEPARATOR);
        if (schemeEnd == -1) {
            return defaultSurt;
        }
        final int hostOpen = schemeEnd + SCHEME_SEPARATOR.length();
        final int hostClose = defaultSurt.indexOf(')', hostOpen);
        // hostClose != -1 guarantees that hostOpen is a valid index for charAt.
        if (hostClose == -1 || defaultSurt.charAt(hostOpen) != '(') {
            return defaultSurt;
        }
        final String scheme = defaultSurt.substring(0, schemeEnd);
        final String path = defaultSurt.substring(hostClose + 1);
        String surtHost = defaultSurt.substring(hostOpen + 1, hostClose);

        // Split off everything from the last ':' (kept including the colon) as the port part.
        String port = "";
        final int portStart = surtHost.lastIndexOf(':');
        if (portStart != -1) {
            port = surtHost.substring(portStart);
            surtHost = surtHost.substring(0, portStart);
        }

        // Look for www[<x>] in host name: the host must end with ',' and the last
        // label is the text between the two last commas.
        String surt = defaultSurt;
        final int lastComma = surtHost.lastIndexOf(',');
        final boolean endsWithComma = lastComma > 0 && lastComma == surtHost.length() - 1;
        final int previousComma = endsWithComma ? surtHost.lastIndexOf(',', lastComma - 1) : -1;
        if (previousComma == -1) {
            logger.warning("very strange surt host");
        } else if (isW3xLabel(surtHost.substring(previousComma + 1, lastComma))) {
            if (removeW3xSubDomain) {
                if (addBeforeRemovingW3xSubDomain) {
                    surtPrefixes.add(subDomainsRewrite(path, originalUri, scheme, surtHost, port, surt));
                }
                // Drop the trailing "www[<x>]," label.
                surtHost = surtHost.substring(0, previousComma + 1);
                surt = scheme + "://(" + surtHost + port + ")" + path;
            }
        } else if (addW3SubDomain) {
            if (addBeforeAddingW3SubDomain) {
                surtPrefixes.add(subDomainsRewrite(path, originalUri, scheme, surtHost, port, surt));
            }
            surtHost = surtHost + "www,";
            surt = scheme + "://(" + surtHost + port + ")" + path;
        }
        return subDomainsRewrite(path, originalUri, scheme, surtHost, port, surt);
    }

    /**
     * Tells whether a host label is "www" optionally followed by a number.
     * The suffix is checked with {@link Integer#parseInt(String)}, so a suffix with a
     * leading sign (e.g. "www-1") is accepted as well.
     *
     * @param label a single host label
     * @return <code>true</code> if the label is of the form {@code www[<x>]}
     */
    private static boolean isW3xLabel(final String label) {
        if (!label.startsWith("www")) {
            return false;
        }
        if (label.length() == 3) {
            return true;
        }
        try {
            Integer.parseInt(label.substring(3));
            return true;
        } catch (NumberFormatException ignored) {
            // Suffix is not a number, so this is an ordinary label that merely starts with "www".
            return false;
        }
    }

    /**
     * Method to rewrite the SURT to allow sub-domains if the original URI does not have a path at all.
     * The rewrite only happens when sub-domain rewriting is enabled, the SURT path is exactly "/"
     * and the original URI contains no '/' after its "://"; the rewritten SURT then ends right
     * after the host and port, i.e. without the closing ")/".
     *
     * @param path SURT path string
     * @param originalUri original URI, may be <code>null</code>
     * @param scheme SURT scheme string
     * @param surtHost SURT host as comma separated list of names
     * @param port SURT port part including the leading ':', or the empty string if there is none
     * @param surt URI converted to SURT by the default Heritrix means
     * @return original or rewritten SURT, depending on the SURT and the original URI
     */
    protected String subDomainsRewrite(String path, String originalUri, String scheme, String surtHost, String port, String surt) {
        if (!allowSubDomainsRewrite || !"/".equals(path) || originalUri == null) {
            return surt;
        }
        final int schemeEnd = originalUri.indexOf(SCHEME_SEPARATOR);
        if (schemeEnd == -1) {
            return surt;
        }
        final int authorityStart = schemeEnd + SCHEME_SEPARATOR.length();
        if (originalUri.indexOf('/', authorityStart) != -1) {
            // The seed was given with an explicit path (at least "/"); keep the exact-host SURT.
            return surt;
        }
        return scheme + "://(" + surtHost + port;
    }

}