001/*
002 * #%L
003 * Netarchivesuite - harvester
004 * %%
005 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
006 *             the National Library of France and the Austrian National Library.
007 * %%
008 * This program is free software: you can redistribute it and/or modify
009 * it under the terms of the GNU Lesser General Public License as
010 * published by the Free Software Foundation, either version 2.1 of the
011 * License, or (at your option) any later version.
012 * 
013 * This program is distributed in the hope that it will be useful,
014 * but WITHOUT ANY WARRANTY; without even the implied warranty of
015 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
016 * GNU General Lesser Public License for more details.
017 * 
018 * You should have received a copy of the GNU General Lesser Public
019 * License along with this program.  If not, see
020 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
021 * #L%
022 */
023package dk.netarkivet.harvester.harvesting;
024
025import java.util.NoSuchElementException;
026
027import org.apache.commons.httpclient.URIException;
028import org.apache.commons.logging.Log;
029import org.apache.commons.logging.LogFactory;
030import org.archive.crawler.datamodel.CandidateURI;
031import org.archive.crawler.datamodel.CoreAttributeConstants;
032import org.archive.crawler.framework.CrawlController;
033import org.archive.crawler.frontier.HostnameQueueAssignmentPolicy;
034import org.archive.net.UURIFactory;
035
036import dk.netarkivet.common.utils.DomainUtils;
037
038/**
039 * This is a modified version of the {@link DomainnameQueueAssignmentPolicy} where domainname returned is the domainname
040 * of the candidateURI except where the domainname of the SeedURI is a different one.
041 * <p>
042 * <p>
043 * Using the domain as the queue-name. The domain is defined as the last two names in the entire hostname or the
044 * entirety of an IP address. x.y.z -> y.z y.z -> y.z nn.nn.nn.nn -> nn.nn.nn.nn
045 */
046public class SeedUriDomainnameQueueAssignmentPolicy extends HostnameQueueAssignmentPolicy {
047
048    private static final Log log = LogFactory.getLog(SeedUriDomainnameQueueAssignmentPolicy.class);
049
050    /**
051     * A key used for the cases when we can't figure out the URI. This is taken from parent, where it has private
052     * access. Parent returns this on things like about:blank.
053     */
054    static final String DEFAULT_CLASS_KEY = "default...";
055
056    /**
057     * Return a key for queue names based on domain names (last two parts of host name) or IP address. They key may
058     * include a #<portnr> at the end.
059     *
060     * @param controller The controller the crawl is running on.
061     * @param cauri A potential URI.
062     * @return a class key (really an arbitrary string), one of <domainOrIP>, <domainOrIP>#<port>, or "default...".
063     * @see HostnameQueueAssignmentPolicy#getClassKey(org.archive.crawler.framework.CrawlController,
064     * org.archive.crawler.datamodel.CandidateURI)
065     */
066    public String getClassKey(CrawlController controller, CandidateURI cauri) {
067        String candidate;
068
069        boolean ignoreSourceSeed = cauri != null && cauri.getCandidateURIString().startsWith("dns");
070        try {
071            // Since getClassKey has no contract, we must encapsulate it from
072            // errors.
073            candidate = super.getClassKey(controller, cauri);
074        } catch (NullPointerException e) {
075            log.debug("Heritrix broke getting class key candidate for " + cauri);
076            candidate = DEFAULT_CLASS_KEY;
077        }
078
079        String sourceSeedCandidate = null;
080        if (!ignoreSourceSeed) {
081            sourceSeedCandidate = getCandidateFromSource(cauri);
082        }
083
084        if (sourceSeedCandidate != null) {
085            return sourceSeedCandidate;
086        } else {
087            // If sourceSeedCandidates are disabled, use the old method:
088
089            String[] hostnameandportnr = candidate.split("#");
090            if (hostnameandportnr.length == 0 || hostnameandportnr.length > 2) {
091                return candidate;
092            }
093
094            String domainName = DomainUtils.domainNameFromHostname(hostnameandportnr[0]);
095            if (domainName == null) { // Not valid according to our rules
096                log.debug("Illegal class key candidate '" + candidate + "' for '" + cauri + "'");
097                return candidate;
098            }
099            return domainName;
100        }
101    }
102
103    /**
104     * Find a candidate from the source.
105     *
106     * @param cauri A potential URI
107     * @return a candidate from the source or null if none found
108     */
109    private String getCandidateFromSource(CandidateURI cauri) {
110        String sourceCandidate = null;
111        try {
112            sourceCandidate = cauri.getString(CoreAttributeConstants.A_SOURCE_TAG);
113        } catch (NoSuchElementException e) {
114            log.warn("source-tag-seeds not set in Heritrix template!");
115            return null;
116        }
117
118        String hostname = null;
119        try {
120            hostname = UURIFactory.getInstance(sourceCandidate).getHost();
121        } catch (URIException e) {
122            log.warn("Hostname could not be extracted from sourceCandidate: " + sourceCandidate);
123            return null;
124        }
125        return DomainUtils.domainNameFromHostname(hostname);
126    }
127
128}