001/*
002 * #%L
003 * Netarchivesuite - Heritrix 3 extensions
004 * %%
005 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
006 *             the National Library of France and the Austrian National Library.
007 * %%
008 * This program is free software: you can redistribute it and/or modify
009 * it under the terms of the GNU Lesser General Public License as
010 * published by the Free Software Foundation, either version 2.1 of the
011 * License, or (at your option) any later version.
012 * 
013 * This program is distributed in the hope that it will be useful,
014 * but WITHOUT ANY WARRANTY; without even the implied warranty of
015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
016 * GNU General Lesser Public License for more details.
017 * 
018 * You should have received a copy of the GNU General Lesser Public
019 * License along with this program.  If not, see
020 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
021 * #L%
022 */
023package dk.netarkivet.harvester.harvesting;
024
025import java.util.logging.Logger;
026
027import org.archive.modules.CrawlURI;
028import org.archive.modules.deciderules.surt.SurtPrefixedDecideRule;
029
030/**
031 * Extended <code>SurtPrefixedDecideRule</code> class.
032 * Enable SURT seeds that allow sub-domains if the original seed URI has no path at all.
033 * Can also add/remove www/www[<x>] from the SURT and/or also add the orginal SURT seed.
034 *
035 * @author nicl
036 */
037public class NASSurtPrefixedDecideRule extends SurtPrefixedDecideRule {
038
039    /**
040     * UUID.
041     */
042    private static final long serialVersionUID = 3334790462876505839L;
043
044    /** Logger instance. */
045    private static final Logger logger = Logger.getLogger(NASSurtPrefixedDecideRule.class.getName());
046
047    /**
048     * Enable/Disable the removing of a preceding www[<x>] in SURT host if present.
049     */
050    protected boolean removeW3xSubDomain = true;
051    public boolean getRemoveW3xSubDomain() {
052        return removeW3xSubDomain;
053    }
054    public void setRemoveW3xSubDomain(boolean removeW3xSubDomain) {
055        this.removeW3xSubDomain = removeW3xSubDomain;
056    }
057
058    /**
059     * Enable/Disable the adding of the original SURT before removing the preceding www[<x>]. 
060     */
061    protected boolean addBeforeRemovingW3xSubDomain = true;
062    public boolean getAddBeforeRemovingW3xSubDomain() {
063        return addBeforeRemovingW3xSubDomain;
064    }
065    public void setAddBeforeRemovingW3xSubDomain(boolean addBeforeRemovingW3xSubDomain) {
066        this.addBeforeRemovingW3xSubDomain = addBeforeRemovingW3xSubDomain;
067    }
068
069    /**
070     * Enable/Disable the adding of a preceding www in SURT host if none is present.
071     */
072    protected boolean addW3SubDomain = true;
073    public boolean getAddW3SubDomain() {
074        return addW3SubDomain;
075    }
076    public void setAddW3SubDomain(boolean addW3SubDomain) {
077        this.addW3SubDomain = addW3SubDomain;
078    }
079
080    /**
081     * Enable/Disable the adding of the original SURT before adding a preceding www.
082     */
083    protected boolean addBeforeAddingW3SubDomain = true;
084    public boolean getAddBeforeAddingW3SubDomain() {
085        return addBeforeAddingW3SubDomain;
086    }
087    public void setAddBeforeAddingW3SubDomain(boolean addBeforeAddingW3SubDomain) {
088        this.addBeforeAddingW3SubDomain = addBeforeAddingW3SubDomain;
089    }
090
091    /**
092     * Enable/Disable the removing of ')/' in the SURT if the original URI does not have a path at all.
093     */
094    protected boolean allowSubDomainsRewrite = true;
095    public boolean getAllowSubDomainsRewrite() {
096        return allowSubDomainsRewrite;
097    }
098    public void setAllowSubDomainsRewrite(boolean allowSubDomainsRewrite) {
099        this.allowSubDomainsRewrite = allowSubDomainsRewrite;
100    }
101
102    @Override
103    public void addedSeed(final CrawlURI curi) {
104        if(getSeedsAsSurtPrefixes()) {
105            addedSeedImpl(curi);
106        }
107    }
108
109    /**
110     * <code>addedSeed</code implementation method to facilitate unit testing. 
111     * @param curi <code>CrawlURI</code> object to convert
112     * @return URI converted to SURT string
113     */
114    protected String addedSeedImpl(final CrawlURI curi) {
115        String originalUri = curi.getSourceTag();
116        if (originalUri == null && allowSubDomainsRewrite) {
117            logger.warning("originalUri not available");
118        }
119        String surt = prefixFrom(curi.getURI());
120        int idx;
121        int idx2;
122        String scheme;
123        String surtHost;
124        String port;
125        String path;
126        String part;
127        boolean bRemoveW3x;
128        if (surt != null) {
129            idx = surt.indexOf("://");
130            if (idx != -1) {
131                scheme = surt.substring(0, idx);
132                idx += "://".length();
133                idx2 = surt.indexOf(')', idx);
134                if (idx2 != -1 && surt.charAt(idx++) == '(') {
135                    surtHost = surt.substring(idx, idx2);
136                    path = surt.substring(idx2 + 1);
137                    idx = surtHost.lastIndexOf(':');
138                    if (idx != -1) {
139                        port = surtHost.substring(idx);
140                        surtHost = surtHost.substring(0, idx);
141                    } else {
142                        port = "";
143                    }
144                    // Look for www[<x>] in host name.
145                    idx = surtHost.lastIndexOf(',');
146                    if (idx != -1) {
147                        idx2 = idx;
148                        if (idx == surtHost.length() - 1 && idx > 0) {
149                            idx = surtHost.lastIndexOf(',', idx - 1);
150                        }
151                    }
152                    if (idx != -1 && idx < idx2) {
153                        part = surtHost.substring(idx + 1, idx2);
154                        if (part.startsWith("www")) {
155                            bRemoveW3x = true;
156                            if (part.length() > 3) {
157                                try {
158                                    Integer.parseInt(part.substring(3));
159                                } catch (NumberFormatException e) {
160                                    bRemoveW3x = false;
161                                }
162                            }
163                        } else {
164                            bRemoveW3x = false;
165                        }
166                        if (bRemoveW3x) {
167                            if (removeW3xSubDomain) {
168                                if (addBeforeRemovingW3xSubDomain) {
169                                    surt = subDomainsRewrite(path, originalUri, scheme, surtHost, port, surt);
170                                    surtPrefixes.add(surt);
171                                }
172                                surtHost = surtHost.substring(0, idx + 1);
173                                surt = scheme + "://(" + surtHost + port + ")" + path;
174                            }
175                        } else {
176                            if (addW3SubDomain) {
177                                if (addBeforeAddingW3SubDomain) {
178                                    surt = subDomainsRewrite(path, originalUri, scheme, surtHost, port, surt);
179                                    surtPrefixes.add(surt);
180                                }
181                                surtHost = surtHost + "www,";
182                                surt = scheme + "://(" + surtHost + port + ")" + path;
183                            }
184                        }
185                    } else {
186                        logger.warning("very strange surt host");
187                    }
188                    surt = subDomainsRewrite(path, originalUri, scheme, surtHost, port, surt);
189                }
190            }
191        }
192        surtPrefixes.add(surt);
193        return surt;
194    }
195
196    /**
197     * Method to rewrite the SURT to allow sub-domains if the original URI does not have a path at all.
198     * @param path SURT path string
199     * @param originalUri original URI
200     * @param scheme SURT scheme string
201     * @param surtHost SURT host as comma separated list of names
202     * @param surt URI converted to SURT by the default Heritrix means
203     * @return original or rewritten SURT, depending on the SURT and the original URI
204     */
205    protected String subDomainsRewrite(String path, String originalUri, String scheme, String surtHost, String port, String surt) {
206        int idx;
207        if (allowSubDomainsRewrite) {
208            if ("/".compareTo(path) == 0) {
209                if (originalUri != null) {
210                    idx = originalUri.indexOf("://");
211                    if (idx != -1) {
212                        idx += "://".length();
213                        idx = originalUri.indexOf('/', idx);
214                        if (idx == -1) {
215                            surt = scheme + "://(" + surtHost + port; 
216                        }
217                    }
218                }
219            }
220        }
221        return surt;
222    }
223
224}