001/*$Id: OnNSDomainsDecideRule.java 2687 2013-05-03 16:38:47Z svc $
002* $Revision: 2687 $
003* $Date: 2013-05-03 18:38:47 +0200 (Fri, 03 May 2013) $
004* $Author: svc $
005*
006* The Netarchive Suite - Software to harvest and preserve websites
007* Copyright 2004-2018 The Royal Danish Library,
008 * the National Library of France and the Austrian
009 * National Library.
010*
011* This library is free software; you can redistribute it and/or
012* modify it under the terms of the GNU Lesser General Public
013* License as published by the Free Software Foundation; either
014* version 2.1 of the License, or (at your option) any later version.
015*
016* This library is distributed in the hope that it will be useful,
017* but WITHOUT ANY WARRANTY; without even the implied warranty of
018* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
019* Lesser General Public License for more details.
020*
021* You should have received a copy of the GNU Lesser General Public
022* License along with this library; if not, write to the Free Software
023* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
024*/
025package dk.netarkivet.harvester.harvesting;
026
027import java.util.regex.Matcher;
028import java.util.regex.Pattern;
029
030import org.apache.commons.httpclient.URIException;
031import org.archive.modules.CrawlURI;
032import org.archive.modules.deciderules.surt.SurtPrefixedDecideRule;
033import org.archive.net.UURI;
034import org.archive.net.UURIFactory;
035import org.archive.util.ArchiveUtils;
036import org.archive.util.SurtPrefixSet;
037
038import dk.netarkivet.common.exceptions.ArgumentNotValid;
039
040/**
041 * Class that re-creates the SurtPrefixSet to include only domain names
042 * according to the domain definition of NetarchiveSuite.
043 * The NetarchiveSuite can't use the
044 * org.archive.crawler.deciderules.OnDomainsDecideRule because
045 * it uses a different domain definition.  
046 */
047public class OnNSDomainsDecideRule extends SurtPrefixedDecideRule {
048
049    /** This is what SurtPrefixSet.prefixFromPlain returns for
050     *  a non valid URI. */
051    public static final String NON_VALID_DOMAIN = "http://(http,)";
052    
053    /** Pattern that matches the first part of SURT - until ?? */
054    public static final Pattern SURT_FIRSTPART_PATTERN
055        = Pattern.compile("http\\://\\([^\\)]*");
056    /** 
057     * Constructor for the class OnNSDomainsDecideRule.
058     * Makes the configured decision 
059     * for any URI which is inside one of the domains in the 
060     * configured set of domains - according to the domain 
061     * definition of the NetarchiveSuite system.
062     * Giving that e.g. sports.tv2.dk will resolve to tv2.dk
063     * but www.bbc.co.uk will resolve to bbc.co.uk"
064     *  
065     */
066    public OnNSDomainsDecideRule(){
067        super();
068    }
069    
070    /**
071     * We override the default readPrefixes, because we want to
072     * make our prefixes.
073     */
074    protected void readPrefixes() {
075        buildSurtPrefixSet();
076        myBuildSurtPrefixSet();
077        dumpSurtPrefixSet();
078    }
079
080    /**
081     * Method that rebuilds the SurtPrefixSet to include only
082     * topmost domains - according to the domain definition
083     * in NetarchiveSuite.
084     * This is only done once, during the startup phase?
085     */
086    protected void myBuildSurtPrefixSet() {
087        //make copy of original SurtPrefixSet to loop
088        SurtPrefixSet newSurtPrefixes = (SurtPrefixSet) surtPrefixes.clone();
089        //pattern that matches first part of SURT
090        
091        //loop all original SURTs
092        for (String s : newSurtPrefixes) {
093            Matcher m = SURT_FIRSTPART_PATTERN.matcher(s);
094            if (m.find()) {
095                //cut off http:// (https:// are converted by heritrix classes)
096                String hostpart = m.group().substring(8);
097                //split in hostname/domainname/TLD parts
098                String[] parts = hostpart.split(",");
099                StringBuilder domnameBuilder = new StringBuilder();
100                //loop through parts in reverse order - add '.'
101                //(not after last part)
102                for (int j = parts.length - 1; j >= 0; j--) {
103                    domnameBuilder.append(parts[j]);
104                    if (j != 0) {
105                        domnameBuilder.append(".");
106                    }
107                }
108                //add the new domain name to surtPrefixes
109                //since this is always shorter SURTs than the originals
110                //they will automatically
111                //override longer ones (built in SURTs logic)
112                surtPrefixes.add(prefixFrom(domnameBuilder.toString()));
113            }
114        }
115    }
116
117    /**
118     * Generate the SURT prefix that matches the domain definition
119     * of NetarchiveSuite.
120     * @param uri URL to convert to SURT
121     * @return String with SURT that matches the domain definition
122     * of NetarchiveSuite
123     */
124    protected String prefixFrom(String uri) {
125        uri = ArchiveUtils.addImpliedHttpIfNecessary(uri);
126        //TODO is this correct now ? 
127        return SurtPrefixSet.prefixFromPlainForceHttp(convertToDomain(uri));
128    }
129
130    /**
131     * Convert a URI to its domain.
132     * @param uri URL to convert to Top most domain-name according to
133     * NetarchiveSuite definition
134     * @return Domain name
135     */
136    public static String convertToDomain(String uri) {
137        ArgumentNotValid.checkNotNullOrEmpty(uri, "String uri");
138        DomainnameQueueAssignmentPolicy policy
139                = new DomainnameQueueAssignmentPolicy();
140
141        UURI uuri = null;
142        try {
143            uuri = UURIFactory.getInstance(uri);
144        } catch (URIException e) {
145            e.printStackTrace();
146         // allow to continue with original string uri  
147         // FIXME/TODO 
148         // AD "allow to continue with original string uri"
149         // We cannot do that any more, as the argument to getClassKey is now CrawlURI, except for
150         // the string
151                         
152        }
153        try {          
154            return policy.getClassKey(new CrawlURI(uuri));
155        } catch (Throwable e) {
156            // illegal URI - return a SURT that will not match any real URIs
157            return NON_VALID_DOMAIN;
158        }
159    }
160}