001package dk.netarkivet.common.utils; 002 003import java.io.BufferedReader; 004import java.io.File; 005import java.io.FileInputStream; 006import java.io.FileNotFoundException; 007import java.io.IOException; 008import java.io.InputStream; 009import java.io.InputStreamReader; 010import java.util.ArrayList; 011import java.util.List; 012import java.util.regex.Pattern; 013 014import org.apache.commons.io.IOUtils; 015import org.slf4j.Logger; 016import org.slf4j.LoggerFactory; 017 018import dk.netarkivet.common.CommonSettings; 019import dk.netarkivet.common.Constants; 020import dk.netarkivet.common.exceptions.UnknownID; 021import static dk.netarkivet.common.utils.DomainUtils.DOMAINNAME_CHAR_REGEX_STRING; 022 023/** 024 * Encapsulate the reading of Top level domains from settings and the embedded public_suffix.dat file. 025 * 026 */ 027public class TLD { 028 029 /** The class logger. */ 030 private static final Logger log = LoggerFactory.getLogger(TLD.class); 031 private static TLD tld; 032 033 public final static String PUBLIC_SUFFIX_LIST_EMBEDDED_PATH = "dk/netarkivet/common/utils/public_suffix_list.dat"; 034 public final static String PUBLIC_SUFFIX_LIST_EXTERNAL_FILE_PATH = "conf/public_suffix_list.dat"; 035 036 /** 037 * A regular expression matching hostnames, and remembering the hostname in group 1 and the domain in group 2. 038 */ 039 private final Pattern HOSTNAME_REGEX; 040 041 /** A string for a regexp recognising a TLD */ 042 private final String TLD_REGEX_STRING; 043 044 /** 045 * Regexp for matching a valid domain, that is a single domain-name part followed by a TLD from settings, or an IP 046 * address. 047 */ 048 private final Pattern VALID_DOMAIN_MATCHER; 049 050 /** 051 * GetInstance method for the TLD. Ensures singleton usage of the TLD class. 052 * @return the current instance of the TLD class. 053 */ 054 public static synchronized TLD getInstance() { 055 if (tld == null) { 056 tld = new TLD(); 057 } 058 return tld; 059 } 060 061 /** 062 * Reset TLD instance. primarily used for testing. 063 */ 064 public static void reset() { 065 tld = null; 066 } 067 /** 068 * List of quoted TLD read from both settings and public suffix file. 069 */ 070 private final List<String> tldListQuoted; 071 072 /** 073 * List of TLD read from both settings and public suffix file. 074 */ 075 private final List<String> tldList; 076 077 /** 078 * Private constructor of the TLD class. This constructor reads the TLDs from both settings and public suffix file. 079 * both quoted and unquoted. Sets the TLD_REGEX_STRING,HOSTNAME_REGEX, and VALID_DOMAIN_MATCHER. 080 */ 081 private TLD() { 082 tldListQuoted = new ArrayList<String>(); 083 tldList = new ArrayList<String>(); 084 readTldsFromPublicSuffixFile(tldList, tldListQuoted); 085 readTldsFromSettings(tldList, tldListQuoted); 086 087 TLD_REGEX_STRING = "\\.(" + StringUtils.conjoin("|", tldListQuoted) + ")"; 088 HOSTNAME_REGEX = Pattern.compile("^(|.*?\\.)(" + DOMAINNAME_CHAR_REGEX_STRING + "+" 089 + TLD_REGEX_STRING + ")"); 090 VALID_DOMAIN_MATCHER = Pattern.compile("^(" + Constants.IP_REGEX_STRING + "|" 091 + DOMAINNAME_CHAR_REGEX_STRING + "+" + TLD_REGEX_STRING + ")$"); 092 } 093 094 /** 095 * Helper method for reading TLDs from settings. Will read all settings, validate them as legal TLDs and warn and 096 * ignore them if any are invalid. Settings may be with or without prefix "." 097 * @param tldList the list to add all the tlds found in the settings 098 * @param quotedTldList the list to add all the tlds found in the settings - as a pattern 099 */ 100 protected static void readTldsFromSettings(List<String> tldList, List<String> quotedTldList) { 101 int count=0; 102 try { 103 String[] settingsTlds = Settings.getAll(CommonSettings.TLDS); 104 for (String tld : settingsTlds) { 105 if (tld.startsWith(".")) { 106 tld = tld.substring(1); 107 } 108 if (!tld.matches(DOMAINNAME_CHAR_REGEX_STRING + "(" + DOMAINNAME_CHAR_REGEX_STRING + "|\\.)*")) { 109 log.warn("Invalid tld '{}', ignoring", tld); 110 continue; 111 } 112 tldList.add(tld); 113 quotedTldList.add(Pattern.quote(tld)); 114 count++; 115 } 116 log.info("Read {} TLDs from settings", count); 117 } catch (UnknownID e) { 118 log.debug("No tlds found in settingsfiles " + StringUtils.conjoin(",", Settings.getSettingsFiles())); 119 } 120 } 121 122 /** 123 * Helper method for reading TLDs from the embedded public suffix file. Will read all entries, validate them as legal TLDs and warn and 124 * ignore them if any are invalid. 125 * Now silently ignores starred tld's in public suffix file (e.g "*.kw") and exclusion rules (e.g. !metro.tokyo.jp) 126 * @param tldList the list to add all the tlds found in the public suffix file 127 * @param quotedTldList the list to add all the tlds found in the public suffix file - as a pattern 128 */ 129 protected static void readTldsFromPublicSuffixFile(List<String> tldList, List<String> quotedTldList) { 130 InputStream stream = getPublicSuffixListDataStream(); 131 boolean silentlyIgnoringStarTldsInPublicSuffixFile = Settings.getBoolean(CommonSettings.TLD_SILENTLY_IGNORE_STARRED_TLDS); 132 int count=0; 133 if (stream != null) { 134 BufferedReader br = null; 135 try { 136 br = new BufferedReader(new InputStreamReader(stream)); 137 String line; 138 while ((line = br.readLine()) != null) { 139 String tld = line.trim(); 140 if (tld.isEmpty() || tld.startsWith("//")) { 141 continue; 142 } else if (silentlyIgnoringStarTldsInPublicSuffixFile && (tld.startsWith("*.") || tld.startsWith("!"))) { 143 continue; 144 } else { 145 if (!tld.matches(DOMAINNAME_CHAR_REGEX_STRING + "(" + DOMAINNAME_CHAR_REGEX_STRING + "|\\.)*")) { 146 log.warn("Invalid tld '{}', ignoring", tld); 147 continue; 148 } 149 tldList.add(tld); 150 quotedTldList.add(Pattern.quote(tld)); 151 } 152 } 153 log.info("Read {} TLDs from public suffix file", count); 154 } catch(IOException e) { 155 e.printStackTrace(); 156 } finally { 157 IOUtils.closeQuietly(br); 158 } 159 } else { 160 log.warn("Unable to retrieve public suffix_list failed. No tlds added!"); 161 } 162 } 163 164 165 private static InputStream getPublicSuffixListDataStream() { 166 InputStream stream = null; 167 File alternateExternalFile = new File(PUBLIC_SUFFIX_LIST_EXTERNAL_FILE_PATH); 168 if (alternateExternalFile.isFile()) { 169 try { 170 stream = new FileInputStream(alternateExternalFile); 171 } catch (FileNotFoundException e) { 172 // Will never happen! 173 e.printStackTrace(); 174 } 175 log.info("Reading public suffixes list from external file '{}'", alternateExternalFile.getAbsolutePath()); 176 } else { // Read embedded copy 177 log.info("Did not found external public suffix list at '{}'! Reading instead the public suffixes list from embedded file '{}' in common-core.jar-VERSION.jar.", 178 alternateExternalFile.getAbsolutePath(), PUBLIC_SUFFIX_LIST_EMBEDDED_PATH); 179 stream = Thread.currentThread().getContextClassLoader().getResourceAsStream(PUBLIC_SUFFIX_LIST_EMBEDDED_PATH); 180 181 } 182 183 return stream; 184 } 185 186 /** 187 * @return the VALID_DOMAIN_MATCHER pattern. 188 */ 189 public Pattern getValidDomainMatcher() { 190 return VALID_DOMAIN_MATCHER; 191 } 192 193 /** 194 * 195 * @return the HOSTNAME_REGEX pattern. 196 */ 197 public Pattern getHostnamePattern() { 198 return HOSTNAME_REGEX; 199 } 200 201 /** 202 * GetAllTlds method. 203 * @param quoted do you want the quoted, or unquoted list. 204 * @return the quoted list (if quoted=true), else the unquoted list. 205 */ 206 public List<String> getAllTlds(boolean quoted) { 207 if (quoted) { 208 return tldListQuoted; 209 } else { 210 return tldList; 211 } 212 } 213}