package dk.netarkivet.common.utils;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;

import org.apache.commons.io.IOUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import dk.netarkivet.common.CommonSettings;
import dk.netarkivet.common.Constants;
import dk.netarkivet.common.exceptions.UnknownID;
import static dk.netarkivet.common.utils.DomainUtils.DOMAINNAME_CHAR_REGEX_STRING;

/**
 * Encapsulates the reading of top level domains (TLDs) from settings and from the public suffix list, which is
 * taken from an external file if present and otherwise from the copy embedded on the classpath.
 * <p>
 * The class is a lazily created singleton; use {@link #getInstance()} to obtain it.
 */
public class TLD {

    /** The class logger. */
    private static final Logger log = LoggerFactory.getLogger(TLD.class);

    /** The singleton instance. Only read and written while holding the class lock. */
    private static TLD tld;

    /** Classpath location of the embedded public suffix list. */
    public final static String PUBLIC_SUFFIX_LIST_EMBEDDED_PATH = "dk/netarkivet/common/utils/public_suffix_list.dat";

    /** Path of an external public suffix list which, if it exists, is read instead of the embedded one. */
    public final static String PUBLIC_SUFFIX_LIST_EXTERNAL_FILE_PATH = "conf/public_suffix_list.dat";

    /** Name of the charset used to decode the public suffix list. */
    private static final String PUBLIC_SUFFIX_LIST_CHARSET = "UTF-8";

    /**
     * Matches a syntactically legal TLD: one domain-name character followed by any number of domain-name
     * characters or dots. Compiled once, since it is applied to every entry of the public suffix list.
     */
    private static final Pattern VALID_TLD_PATTERN = Pattern.compile(
            DOMAINNAME_CHAR_REGEX_STRING + "(" + DOMAINNAME_CHAR_REGEX_STRING + "|\\.)*");

    /**
     * A regular expression matching hostnames, remembering the (possibly empty) leading host part in group 1 and
     * the domain in group 2.
     */
    private final Pattern HOSTNAME_REGEX;

    /** A string for a regexp recognising a TLD. */
    private final String TLD_REGEX_STRING;

    /**
     * Regexp for matching a valid domain, that is a single domain-name part followed by a known TLD, or an IP
     * address.
     */
    private final Pattern VALID_DOMAIN_MATCHER;

    /** List of quoted (see {@link Pattern#quote(String)}) TLDs read from both public suffix file and settings. */
    private final List<String> tldListQuoted;

    /** List of TLDs read from both public suffix file and settings. */
    private final List<String> tldList;

    /**
     * GetInstance method for the TLD. Ensures singleton usage of the TLD class.
     *
     * @return the current instance of the TLD class.
     */
    public static synchronized TLD getInstance() {
        if (tld == null) {
            tld = new TLD();
        }
        return tld;
    }

    /**
     * Reset the TLD instance, so that the next call to {@link #getInstance()} builds a new one. Primarily used
     * for testing.
     */
    public static synchronized void reset() {
        // Synchronized on the same lock as getInstance() so that the reset is visible to other threads.
        tld = null;
    }

    /**
     * Private constructor of the TLD class. Reads the TLDs from the public suffix file followed by those from
     * settings, and derives the quoted list, TLD_REGEX_STRING, HOSTNAME_REGEX and VALID_DOMAIN_MATCHER from them.
     */
    private TLD() {
        tldList = readTldsFromPublicSuffixFile(false);
        tldList.addAll(readTldsFromSettings(false));

        // Quote the entries already read instead of parsing file and settings a second time. The result has
        // the same contents and order, but every source is read (and every invalid entry reported) only once.
        tldListQuoted = new ArrayList<String>(tldList.size());
        for (String unquoted : tldList) {
            tldListQuoted.add(Pattern.quote(unquoted));
        }

        TLD_REGEX_STRING = "\\.(" + StringUtils.conjoin("|", tldListQuoted) + ")";
        HOSTNAME_REGEX = Pattern.compile("^(|.*?\\.)(" + DOMAINNAME_CHAR_REGEX_STRING + "+"
                + TLD_REGEX_STRING + ")");
        VALID_DOMAIN_MATCHER = Pattern.compile("^(" + Constants.IP_REGEX_STRING + "|"
                + DOMAINNAME_CHAR_REGEX_STRING + "+" + TLD_REGEX_STRING + ")$");
    }

    /**
     * Helper method for reading TLDs from settings. Will read all settings, validate them as legal TLDs and warn
     * and ignore them if any are invalid. Settings may be with or without prefix "."
     *
     * @param asPattern if true, return a list of quoted Strings using Pattern.quote
     * @return a List of TLDs as Strings; empty if the setting is not defined
     */
    protected static List<String> readTldsFromSettings(boolean asPattern) {
        List<String> tlds = new ArrayList<String>();
        try {
            String[] settingsTlds = Settings.getAll(CommonSettings.TLDS);
            for (String configured : settingsTlds) {
                String candidate = configured.startsWith(".") ? configured.substring(1) : configured;
                addIfValidTld(tlds, candidate, asPattern);
            }
        } catch (UnknownID e) {
            log.debug("No tlds found in settingsfiles " + StringUtils.conjoin(",", Settings.getSettingsFiles()));
        }
        return tlds;
    }

    /**
     * Helper method for reading TLDs from the public suffix file. Will read all entries, skipping blank lines and
     * lines starting with "//", validate them as legal TLDs and warn and ignore them if any are invalid. The file
     * is decoded as UTF-8.
     *
     * @param asPattern if true, return a list of quoted Strings using Pattern.quote
     * @return a List of TLDs as Strings; empty if no public suffix list could be opened. If reading fails
     * part-way, the entries read so far are returned.
     */
    protected static List<String> readTldsFromPublicSuffixFile(boolean asPattern) {
        List<String> tlds = new ArrayList<String>();
        InputStream stream = getPublicSuffixListDataStream();
        if (stream == null) {
            log.warn("Unable to retrieve public suffix_list failed. Returned empty list!.");
            return tlds;
        }
        BufferedReader reader = null;
        try {
            // Decode explicitly rather than with the platform default charset, so that non-ASCII suffixes
            // are read identically on every host.
            reader = new BufferedReader(new InputStreamReader(stream, PUBLIC_SUFFIX_LIST_CHARSET));
            String line;
            while ((line = reader.readLine()) != null) {
                String entry = line.trim();
                if (entry.isEmpty() || entry.startsWith("//")) {
                    continue;
                }
                addIfValidTld(tlds, entry, asPattern);
            }
        } catch (IOException e) {
            log.warn("Error while reading the public suffix list; keeping the {} entries read so far",
                    tlds.size(), e);
        } finally {
            IOUtils.closeQuietly(reader);
            // Also covers the case where the reader could not be constructed.
            IOUtils.closeQuietly(stream);
        }
        return tlds;
    }

    /**
     * Validates a single TLD candidate and appends it to the given list, or logs a warning and skips it if it is
     * not a legal TLD.
     *
     * @param tlds the list to append to
     * @param candidate the TLD to validate, without leading "."
     * @param asPattern if true, the candidate is added quoted using Pattern.quote
     */
    private static void addIfValidTld(List<String> tlds, String candidate, boolean asPattern) {
        if (!VALID_TLD_PATTERN.matcher(candidate).matches()) {
            log.warn("Invalid tld '{}', ignoring", candidate);
            return;
        }
        tlds.add(asPattern ? Pattern.quote(candidate) : candidate);
    }

    /**
     * Opens the public suffix list. The external file {@link #PUBLIC_SUFFIX_LIST_EXTERNAL_FILE_PATH} is used if
     * it exists and can be opened; otherwise the copy embedded at {@link #PUBLIC_SUFFIX_LIST_EMBEDDED_PATH} is
     * looked up on the classpath.
     *
     * @return an open stream which the caller must close, or null if the embedded resource was not found
     */
    private static InputStream getPublicSuffixListDataStream() {
        File alternateExternalFile = new File(PUBLIC_SUFFIX_LIST_EXTERNAL_FILE_PATH);
        if (alternateExternalFile.isFile()) {
            try {
                InputStream external = new FileInputStream(alternateExternalFile);
                log.info("Reading public suffixes list from external file '{}'", alternateExternalFile.getAbsolutePath());
                return external;
            } catch (FileNotFoundException e) {
                // Thrown also when the file exists but cannot be opened, e.g. due to missing read permission.
                log.warn("Unable to open external public suffix list '{}'; reading the embedded copy instead",
                        alternateExternalFile.getAbsolutePath(), e);
            }
        } else {
            log.info("Did not found external public suffix list at '{}'! Reading instead the public suffixes list from embedded file '{}' in common-core.jar-VERSION.jar.",
                    alternateExternalFile.getAbsolutePath(), PUBLIC_SUFFIX_LIST_EMBEDDED_PATH);
        }

        // Read embedded copy. The context class loader may be null, in which case this class's loader is used.
        ClassLoader loader = Thread.currentThread().getContextClassLoader();
        if (loader == null) {
            loader = TLD.class.getClassLoader();
        }
        return loader.getResourceAsStream(PUBLIC_SUFFIX_LIST_EMBEDDED_PATH);
    }

    /**
     * @return the VALID_DOMAIN_MATCHER pattern.
     */
    public Pattern getValidDomainMatcher() {
        return VALID_DOMAIN_MATCHER;
    }

    /**
     * @return the HOSTNAME_REGEX pattern.
     */
    public Pattern getHostnamePattern() {
        return HOSTNAME_REGEX;
    }

    /**
     * GetAllTlds method. Note that the list held by this instance is returned directly, not a copy.
     *
     * @param quoted do you want the quoted, or unquoted list.
     * @return the quoted list (if quoted=true), else the unquoted list.
     */
    public List<String> getAllTlds(boolean quoted) {
        return quoted ? tldListQuoted : tldList;
    }
}