001/*
002 * #%L
003 * Netarchivesuite - harvester
004 * %%
005 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
006 *             the National Library of France and the Austrian National Library.
007 * %%
008 * This program is free software: you can redistribute it and/or modify
009 * it under the terms of the GNU Lesser General Public License as
010 * published by the Free Software Foundation, either version 2.1 of the
011 * License, or (at your option) any later version.
012 * 
013 * This program is distributed in the hope that it will be useful,
014 * but WITHOUT ANY WARRANTY; without even the implied warranty of
015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
016 * GNU General Lesser Public License for more details.
017 * 
018 * You should have received a copy of the GNU General Lesser Public
019 * License along with this program.  If not, see
020 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
021 * #L%
022 */
023package dk.netarkivet.harvester.harvesting;
024
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.httpclient.URIException;
import org.archive.crawler.datamodel.CandidateURI;
import org.archive.crawler.deciderules.SurtPrefixedDecideRule;
import org.archive.net.UURIFactory;
import org.archive.util.ArchiveUtils;
import org.archive.util.SurtPrefixSet;
034
035/**
036 * Class that re-creates the SurtPrefixSet to include only domain names according to the domain definition of
037 * NetarchiveSuite. The NetarchiveSuite can't use the org.archive.crawler.deciderules.OnDomainsDecideRule because it
038 * uses a different domain definition.
039 */
040@SuppressWarnings({"serial"})
041public class OnNSDomainsDecideRule extends SurtPrefixedDecideRule {
042
043    /**
044     * This is what SurtPrefixSet.prefixFromPlain returns for a non valid URI.
045     */
046    public static final String NON_VALID_DOMAIN = "http://(http,)";
047
048    /** Pattern that matches the first part of SURT - until ?? */
049    public static final Pattern SURT_FIRSTPART_PATTERN = Pattern.compile("http\\://\\([^\\)]*");
050
051    /**
052     * Constructor for the class OnNSDomainsDecideRule.
053     *
054     * @param s The name of this DecideRule
055     */
056    public OnNSDomainsDecideRule(String s) {
057        super(s);
058        setDescription("OnNSDomainsDecideRule. Makes the configured decision "
059                + "for any URI which is inside one of the domains in the "
060                + "configured set of domains - according to the domain " + "definition of the NetarchiveSuite system. "
061                + "Giving that e.g. sports.tv2.dk will resolve to tv2.dk"
062                + " but www.bbc.co.uk will resolve to bbc.co.uk");
063    }
064
065    /**
066     * We override the default readPrefixes, because we want to make our prefixes.
067     */
068    protected void readPrefixes() {
069        buildSurtPrefixSet();
070        myBuildSurtPrefixSet();
071        dumpSurtPrefixSet();
072    }
073
074    /**
075     * Method that rebuilds the SurtPrefixSet to include only topmost domains - according to the domain definition in
076     * NetarchiveSuite. This is only done once, during the startup phase?
077     */
078    protected void myBuildSurtPrefixSet() {
079        // make copy of original SurtPrefixSet to loop
080        SurtPrefixSet newSurtPrefixes = (SurtPrefixSet) surtPrefixes.clone();
081        // pattern that matches first part of SURT
082
083        // loop all original SURTs
084        for (String s : newSurtPrefixes) {
085            Matcher m = SURT_FIRSTPART_PATTERN.matcher(s);
086            if (m.find()) {
087                // cut off http:// (https:// are converted by heritrix classes)
088                String hostpart = m.group().substring(8);
089                // split in hostname/domainname/TLD parts
090                String[] parts = hostpart.split(",");
091                StringBuilder domnameBuilder = new StringBuilder();
092                // loop through parts in reverse order - add '.'
093                // (not after last part)
094                for (int j = parts.length - 1; j >= 0; j--) {
095                    domnameBuilder.append(parts[j]);
096                    if (j != 0) {
097                        domnameBuilder.append(".");
098                    }
099                }
100                // add the new domain name to surtPrefixes
101                // since this is always shorter SURTs than the originals
102                // they will automatically
103                // override longer ones (built in SURTs logic)
104                surtPrefixes.add(prefixFrom(domnameBuilder.toString()));
105            }
106        }
107    }
108
109    /**
110     * Generate the SURT prefix that matches the domain definition of NetarchiveSuite.
111     *
112     * @param uri URL to convert to SURT
113     * @return String with SURT that matches the domain definition of NetarchiveSuite
114     */
115    protected String prefixFrom(String uri) {
116        uri = ArchiveUtils.addImpliedHttpIfNecessary(uri);
117        return SurtPrefixSet.prefixFromPlain(convertToDomain(uri));
118    }
119
120    /**
121     * Convert a URI to its domain.
122     *
123     * @param uri URL to convert to Top most domain-name according to NetarchiveSuite definition
124     * @return Domain name
125     */
126    public static String convertToDomain(String uri) {
127        if (uri == null || uri.isEmpty()) {
128            throw new IllegalArgumentException("The value of the variable 'String uri' must not be an empty string.");
129        }
130        DomainnameQueueAssignmentPolicy policy = new DomainnameQueueAssignmentPolicy();
131        String u = uri;
132        try {
133            u = UURIFactory.getInstance(uri).toString();
134        } catch (URIException e) {
135            e.printStackTrace();
136            // allow to continue with original string uri
137        }
138        try {
139            return policy.getClassKey(null, CandidateURI.fromString(u.toString()));
140        } catch (URIException e) {
141            // illegal URI - return a SURT that will not match any real URIs
142            return NON_VALID_DOMAIN;
143        }
144    }
145
146}