001package dk.netarkivet.common.utils;
002
003import java.io.BufferedReader;
004import java.io.File;
005import java.io.FileInputStream;
006import java.io.FileNotFoundException;
007import java.io.IOException;
008import java.io.InputStream;
009import java.io.InputStreamReader;
010import java.util.ArrayList;
011import java.util.List;
012import java.util.regex.Pattern;
013
014import org.apache.commons.io.IOUtils;
015import org.slf4j.Logger;
016import org.slf4j.LoggerFactory;
017
018import dk.netarkivet.common.CommonSettings;
019import dk.netarkivet.common.Constants;
020import dk.netarkivet.common.exceptions.UnknownID;
021import static dk.netarkivet.common.utils.DomainUtils.DOMAINNAME_CHAR_REGEX_STRING;
022
023/**
024 * Encapsulate the reading of Top level domains from settings and the embedded public_suffix.dat file.
025 *
026 */
027public class TLD {
028
029        /** The class logger. */
030    private static final Logger log = LoggerFactory.getLogger(TLD.class);
031        private static TLD tld;
032        
033        public final static String PUBLIC_SUFFIX_LIST_EMBEDDED_PATH = "dk/netarkivet/common/utils/public_suffix_list.dat";
034        public final static String PUBLIC_SUFFIX_LIST_EXTERNAL_FILE_PATH = "conf/public_suffix_list.dat";
035        
036        /**
037     * A regular expression matching hostnames, and remembering the hostname in group 1 and the domain in group 2.
038     */
039    private final Pattern HOSTNAME_REGEX; 
040    
041    /** A string for a regexp recognising a TLD  */
042    private final String TLD_REGEX_STRING; 
043        
044    /**
045     * Regexp for matching a valid domain, that is a single domain-name part followed by a TLD from settings, or an IP
046     * address.
047     */
048    private final Pattern VALID_DOMAIN_MATCHER;
049
050    /**
051     * GetInstance method for the TLD. Ensures singleton usage of the TLD class.
052     * @return the current instance of the TLD class.
053     */
054        public static synchronized TLD getInstance() {
055                if (tld == null) {
056                        tld = new TLD();
057                }
058                return tld;
059        }
060        
061        /**
062         * Reset TLD instance. primarily used for testing.
063         */
064        public static void reset() {
065                tld = null;
066        }
067        /**
068         * List of quoted TLD read from both settings and public suffix file.
069         */
070        private final List<String> tldListQuoted;
071        
072        /**
073         * List of TLD read from both settings and public suffix file.
074         */
075        private final List<String> tldList;
076        
077        /**
078         * Private constructor of the TLD class. This constructor reads the TLDs from both settings and public suffix file.
079         * both quoted and unquoted. Sets the TLD_REGEX_STRING,HOSTNAME_REGEX, and  VALID_DOMAIN_MATCHER.
080         */
081        private TLD() {
082                tldListQuoted = readTldsFromPublicSuffixFile(true);
083                tldListQuoted.addAll(readTldsFromSettings(true));
084                
085                tldList = readTldsFromPublicSuffixFile(false);
086                tldList.addAll(readTldsFromSettings(false));
087
088                TLD_REGEX_STRING = "\\.(" + StringUtils.conjoin("|", tldListQuoted) + ")";
089                HOSTNAME_REGEX = Pattern.compile("^(|.*?\\.)(" + DOMAINNAME_CHAR_REGEX_STRING + "+"
090                    + TLD_REGEX_STRING + ")");
091                VALID_DOMAIN_MATCHER = Pattern.compile("^(" + Constants.IP_REGEX_STRING + "|"
092                        + DOMAINNAME_CHAR_REGEX_STRING + "+" + TLD_REGEX_STRING + ")$");
093        }
094        
095        /**
096     * Helper method for reading TLDs from settings. Will read all settings, validate them as legal TLDs and warn and
097     * ignore them if any are invalid. Settings may be with or without prefix "."
098     *
099     * @return a List of TLDs as Strings
100     */
101    protected static List<String> readTldsFromSettings(boolean asPattern) {
102        List<String> tlds = new ArrayList<String>();
103        try {
104                String[] settingsTlds = Settings.getAll(CommonSettings.TLDS);
105                for (String tld : settingsTlds) {
106                if (tld.startsWith(".")) {
107                    tld = tld.substring(1);
108                }
109                if (!tld.matches(DOMAINNAME_CHAR_REGEX_STRING + "(" + DOMAINNAME_CHAR_REGEX_STRING + "|\\.)*")) {
110                    log.warn("Invalid tld '{}', ignoring", tld);
111                    continue;
112                }
113                if (asPattern) {
114                        tlds.add(Pattern.quote(tld));
115                } else {
116                        tlds.add(tld);
117                }
118            }
119        } catch (UnknownID e) {
120                log.debug("No tlds found in settingsfiles " + StringUtils.conjoin(",", Settings.getSettingsFiles()));
121        } 
122        return tlds;
123    }
124        
125    /**
126     * Helper method for reading TLDs from the embedded public suffix file. Will read all entries, validate them as legal TLDs and warn and
127     * ignore them if any are invalid.
128     * @param asPattern if true, return a list of quoted Strings using Pattern.quote
129     * @return a List of TLDs as Strings
130     */
131    protected static List<String> readTldsFromPublicSuffixFile(boolean asPattern) {
132        List<String> tlds = new ArrayList<String>();
133        InputStream stream = getPublicSuffixListDataStream();
134        if (stream != null) {
135                BufferedReader br = null;
136                try {
137                        br = new BufferedReader(new InputStreamReader(stream));
138                        String line;
139                        while ((line = br.readLine()) != null) {
140                                String tld = line.trim();
141                                if (tld.isEmpty() || tld.startsWith("//")) {
142                                        continue;
143                                } else {
144                            if (!tld.matches(DOMAINNAME_CHAR_REGEX_STRING + "(" + DOMAINNAME_CHAR_REGEX_STRING + "|\\.)*")) {
145                                log.warn("Invalid tld '{}', ignoring", tld);
146                                continue;
147                            }
148                            if (asPattern) {
149                                tlds.add(Pattern.quote(tld));
150                            } else {
151                                tlds.add(tld);
152                            }
153                                }
154                        }
155                } catch(IOException e) {
156                        e.printStackTrace();
157                } finally {
158                        IOUtils.closeQuietly(br);
159                }
160        } else {
161                log.warn("Unable to retrieve public suffix_list failed. Returned empty list!.");
162        }        
163        return tlds;
164    }
165
166    
167    private static InputStream getPublicSuffixListDataStream() {
168        InputStream stream = null;
169        File alternateExternalFile = new File(PUBLIC_SUFFIX_LIST_EXTERNAL_FILE_PATH);
170        if (alternateExternalFile.isFile()) {
171                try {
172                        stream = new FileInputStream(alternateExternalFile);
173                } catch (FileNotFoundException e) {
174                        // Will never happen!
175                        e.printStackTrace();
176                }
177                log.info("Reading public suffixes list from external file '{}'", alternateExternalFile.getAbsolutePath());
178        } else { // Read embedded copy
179                log.info("Did not found external public suffix list at '{}'! Reading instead the public suffixes list from embedded file '{}' in common-core.jar-VERSION.jar.", 
180                                alternateExternalFile.getAbsolutePath(), PUBLIC_SUFFIX_LIST_EMBEDDED_PATH); 
181                stream = Thread.currentThread().getContextClassLoader().getResourceAsStream(PUBLIC_SUFFIX_LIST_EMBEDDED_PATH);
182                
183        }
184
185        return stream;
186    }
187
188        /**
189     * @return the VALID_DOMAIN_MATCHER pattern.
190     */
191        public Pattern getValidDomainMatcher() {
192                return VALID_DOMAIN_MATCHER;
193        }
194
195        /**
196         * 
197         * @return the HOSTNAME_REGEX pattern.
198         */
199        public Pattern getHostnamePattern() {
200                return HOSTNAME_REGEX;
201        }
202        
203        /**
204         * GetAllTlds method.
205         * @param quoted do you want the quoted, or unquoted list.
206         * @return the quoted list (if quoted=true), else the unquoted list.
207         */
208        public List<String> getAllTlds(boolean quoted) {
209                if (quoted) {
210                        return tldListQuoted; 
211                } else {
212                        return tldList;
213                }
214        }
215}