001package dk.netarkivet.common.utils;
002
003import java.io.BufferedReader;
004import java.io.File;
005import java.io.FileInputStream;
006import java.io.FileNotFoundException;
007import java.io.IOException;
008import java.io.InputStream;
009import java.io.InputStreamReader;
010import java.util.ArrayList;
011import java.util.List;
012import java.util.regex.Pattern;
013
014import org.apache.commons.io.IOUtils;
015import org.slf4j.Logger;
016import org.slf4j.LoggerFactory;
017
018import dk.netarkivet.common.CommonSettings;
019import dk.netarkivet.common.Constants;
020import dk.netarkivet.common.exceptions.UnknownID;
021import static dk.netarkivet.common.utils.DomainUtils.DOMAINNAME_CHAR_REGEX_STRING;
022
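/**
 * Encapsulates the reading of top-level domains (TLDs) from settings and from the public suffix list file
 * (public_suffix_list.dat), which is read from an external file if present and otherwise from the embedded copy.
 */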
027public class TLD {
028
029        /** The class logger. */
030    private static final Logger log = LoggerFactory.getLogger(TLD.class);
031        private static TLD tld;
032        
033        public final static String PUBLIC_SUFFIX_LIST_EMBEDDED_PATH = "dk/netarkivet/common/utils/public_suffix_list.dat";
034        public final static String PUBLIC_SUFFIX_LIST_EXTERNAL_FILE_PATH = "conf/public_suffix_list.dat";
035        
036        /**
037     * A regular expression matching hostnames, and remembering the hostname in group 1 and the domain in group 2.
038     */
039    private final Pattern HOSTNAME_REGEX; 
040    
041    /** A string for a regexp recognising a TLD  */
042    private final String TLD_REGEX_STRING; 
043        
044    /**
045     * Regexp for matching a valid domain, that is a single domain-name part followed by a TLD from settings, or an IP
046     * address.
047     */
048    private final Pattern VALID_DOMAIN_MATCHER;
049
050    /**
051     * GetInstance method for the TLD. Ensures singleton usage of the TLD class.
052     * @return the current instance of the TLD class.
053     */
054        public static synchronized TLD getInstance() {
055                if (tld == null) {
056                        tld = new TLD();
057                }
058                return tld;
059        }
060        
061        /**
062         * Reset TLD instance. primarily used for testing.
063         */
064        public static void reset() {
065                tld = null;
066        }
067        /**
068         * List of quoted TLD read from both settings and public suffix file.
069         */
070        private final List<String> tldListQuoted;
071        
072        /**
073         * List of TLD read from both settings and public suffix file.
074         */
075        private final List<String> tldList;
076        
077        /**
078         * Private constructor of the TLD class. This constructor reads the TLDs from both settings and public suffix file.
079         * both quoted and unquoted. Sets the TLD_REGEX_STRING,HOSTNAME_REGEX, and  VALID_DOMAIN_MATCHER.
080         */
081        private TLD() { 
082                tldListQuoted = new ArrayList<String>();
083                tldList = new ArrayList<String>();
084                readTldsFromPublicSuffixFile(tldList, tldListQuoted);
085                readTldsFromSettings(tldList, tldListQuoted);
086
087                TLD_REGEX_STRING = "\\.(" + StringUtils.conjoin("|", tldListQuoted) + ")";
088                HOSTNAME_REGEX = Pattern.compile("^(|.*?\\.)(" + DOMAINNAME_CHAR_REGEX_STRING + "+"
089                    + TLD_REGEX_STRING + ")");
090                VALID_DOMAIN_MATCHER = Pattern.compile("^(" + Constants.IP_REGEX_STRING + "|"
091                        + DOMAINNAME_CHAR_REGEX_STRING + "+" + TLD_REGEX_STRING + ")$");
092        }
093        
094        /**
095     * Helper method for reading TLDs from settings. Will read all settings, validate them as legal TLDs and warn and
096     * ignore them if any are invalid. Settings may be with or without prefix "."
097     * @param tldList the list to add all the tlds found in the settings
098     * @param quotedTldList the list to add all the tlds found in the settings - as a pattern  
099     */
100    protected static void readTldsFromSettings(List<String> tldList, List<String> quotedTldList) {
101        int count=0;
102        try {
103                String[] settingsTlds = Settings.getAll(CommonSettings.TLDS);
104                for (String tld : settingsTlds) {
105                if (tld.startsWith(".")) {
106                    tld = tld.substring(1);
107                }
108                if (!tld.matches(DOMAINNAME_CHAR_REGEX_STRING + "(" + DOMAINNAME_CHAR_REGEX_STRING + "|\\.)*")) {
109                    log.warn("Invalid tld '{}', ignoring", tld);
110                    continue;
111                }
112                tldList.add(tld);
113                quotedTldList.add(Pattern.quote(tld));
114                count++;
115            }
116                log.info("Read {} TLDs from settings", count);
117        } catch (UnknownID e) {
118                log.debug("No tlds found in settingsfiles " + StringUtils.conjoin(",", Settings.getSettingsFiles()));
119        } 
120    }
121        
122    /**
123     * Helper method for reading TLDs from the embedded public suffix file. Will read all entries, validate them as legal TLDs and warn and
124     * ignore them if any are invalid.
125     * Now silently ignores starred tld's in public suffix file (e.g "*.kw") and exclusion rules (e.g. !metro.tokyo.jp)
126         * @param tldList the list to add all the tlds found in the public suffix file 
127     * @param quotedTldList the list to add all the tlds found in the public suffix file - as a pattern  
128     */
129    protected static void readTldsFromPublicSuffixFile(List<String> tldList, List<String> quotedTldList) {
130        InputStream stream = getPublicSuffixListDataStream();
131        boolean silentlyIgnoringStarTldsInPublicSuffixFile = Settings.getBoolean(CommonSettings.TLD_SILENTLY_IGNORE_STARRED_TLDS);
132        int count=0;
133        if (stream != null) {
134                BufferedReader br = null;
135                try {
136                        br = new BufferedReader(new InputStreamReader(stream));
137                        String line;
138                        while ((line = br.readLine()) != null) {
139                                String tld = line.trim();
140                                if (tld.isEmpty() || tld.startsWith("//")) {
141                                        continue;
142                                } else if (silentlyIgnoringStarTldsInPublicSuffixFile && (tld.startsWith("*.") || tld.startsWith("!"))) {
143                                        continue;
144                                } else {
145                            if (!tld.matches(DOMAINNAME_CHAR_REGEX_STRING + "(" + DOMAINNAME_CHAR_REGEX_STRING + "|\\.)*")) {
146                                log.warn("Invalid tld '{}', ignoring", tld);
147                                continue; 
148                            }
149                            tldList.add(tld);
150                        quotedTldList.add(Pattern.quote(tld));
151                                }
152                        }
153                        log.info("Read {} TLDs from public suffix file", count);
154                } catch(IOException e) {
155                        e.printStackTrace();
156                } finally {
157                        IOUtils.closeQuietly(br);
158                }
159        } else {
160                log.warn("Unable to retrieve public suffix_list failed. No tlds added!");
161        }        
162    }
163
164    
165    private static InputStream getPublicSuffixListDataStream() {
166        InputStream stream = null;
167        File alternateExternalFile = new File(PUBLIC_SUFFIX_LIST_EXTERNAL_FILE_PATH);
168        if (alternateExternalFile.isFile()) {
169                try {
170                        stream = new FileInputStream(alternateExternalFile);
171                } catch (FileNotFoundException e) {
172                        // Will never happen!
173                        e.printStackTrace();
174                }
175                log.info("Reading public suffixes list from external file '{}'", alternateExternalFile.getAbsolutePath());
176        } else { // Read embedded copy
177                log.info("Did not found external public suffix list at '{}'! Reading instead the public suffixes list from embedded file '{}' in common-core.jar-VERSION.jar.", 
178                                alternateExternalFile.getAbsolutePath(), PUBLIC_SUFFIX_LIST_EMBEDDED_PATH); 
179                stream = Thread.currentThread().getContextClassLoader().getResourceAsStream(PUBLIC_SUFFIX_LIST_EMBEDDED_PATH);
180                
181        }
182
183        return stream;
184    }
185
186        /**
187     * @return the VALID_DOMAIN_MATCHER pattern.
188     */
189        public Pattern getValidDomainMatcher() {
190                return VALID_DOMAIN_MATCHER;
191        }
192
193        /**
194         * 
195         * @return the HOSTNAME_REGEX pattern.
196         */
197        public Pattern getHostnamePattern() {
198                return HOSTNAME_REGEX;
199        }
200        
201        /**
202         * GetAllTlds method.
203         * @param quoted do you want the quoted, or unquoted list.
204         * @return the quoted list (if quoted=true), else the unquoted list.
205         */
206        public List<String> getAllTlds(boolean quoted) {
207                if (quoted) {
208                        return tldListQuoted; 
209                } else {
210                        return tldList;
211                }
212        }
213}