/*
 * #%L
 * Netarchivesuite - harvester
 * %%
 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
 * the National Library of France and the Austrian National Library.
 * %%
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation, either version 2.1 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Lesser Public License for more details.
 *
 * You should have received a copy of the GNU General Lesser Public
 * License along with this program. If not, see
 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
 * #L%
 */
package dk.netarkivet.harvester.harvesting.extractor;

import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.regex.Matcher;

import org.apache.commons.httpclient.URIException;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.extractor.Extractor;
import org.archive.crawler.extractor.Link;
import org.archive.io.ReplayCharSequence;
import org.archive.net.UURI;
import org.archive.util.TextUtils;

/**
 * This is a link extractor for use with Heritrix. It will find the resumptionToken in an OAI-PMH
 * ListRecords response (note: the crawl-uri filter below checks for {@code verb=ListRecords}) and
 * construct the link for the next page of the results. This extractor will not extract any other links,
 * so if there are additional urls in the OAI metadata then an additional extractor should be used for
 * these. Typically this means that the extractor chain in the order template will end:
 * <pre>{@code
 * <newObject name="ExtractorOAI" class="dk.netarkivet.harvester.harvesting.extractor.ExtractorOAI">
 *   <boolean name="enabled">true</boolean>
 *   <newObject name="ExtractorOAI#decide-rules" class="org.archive.crawler.deciderules.DecideRuleSequence">
 *     <map name="rules"/>
 *   </newObject>
 * </newObject>
 * <newObject name="ExtractorXML" class="org.archive.crawler.extractor.ExtractorXML">
 *   <boolean name="enabled">true</boolean>
 *   <newObject name="ExtractorXML#decide-rules" class="org.archive.crawler.deciderules.DecideRuleSequence">
 *     <map name="rules"/>
 *   </newObject>
 * </newObject>
 * }</pre>
 */
@SuppressWarnings({"serial"})
public class ExtractorOAI extends Extractor {

    /**
     * Regular expression matching the simple resumptionToken like this:
     * {@code <resumptionToken>oai_dc/421315/56151148/100/0/292/x/x/x</resumptionToken>}.
     * The token text is captured in group 1.
     */
    public static final String SIMPLE_RESUMPTION_TOKEN_MATCH = "(?i)<resumptionToken>\\s*(.*)\\s*</resumptionToken>";

    /**
     * Regular expression matching the extended resumptionToken with attributes like this:
     * {@code <resumptionToken cursor="0" completeListSize="421315">oai_dc/...</resumptionToken>}.
     * This is seen in OAI targets used by PURE. The token text is captured in group 1.
     * NOTE(review): only the exact attribute order cursor/completeListSize is recognized; other
     * attribute combinations (e.g. expirationDate) will not match.
     */
    public static final String EXTENDED_RESUMPTION_TOKEN_MATCH = "(?i)<resumptionToken\\s*cursor=\"[0-9]+\"\\s*completeListSize=\"[0-9]+\">\\s*(.*)\\s*</resumptionToken>";

    /** The class logger. */
    final Log log = LogFactory.getLog(getClass());

    /** The number of crawl-uris handled by this extractor. */
    private long numberOfCURIsHandled = 0;

    /** The number of links extracted by this extractor. */
    private long numberOfLinksExtracted = 0;

    /**
     * Constructor for this extractor.
     *
     * @param name the name of this extractor
     */
    public ExtractorOAI(String name) {
        super(name, "Extractor which finds the resumptionToken in an OAI "
                + "listMetadata query and adds the next page of results " + "to the crawl");
    }

    /**
     * Perform the link extraction on the current crawl uri. This method does not set
     * linkExtractorFinished() on the current crawlURI, so subsequent extractors in the chain can find
     * more links.
     *
     * @param curi the CrawlURI from which to extract the link.
     */
    @Override
    protected void extract(CrawlURI curi) {
        if (!isHttpTransactionContentToProcess(curi)) {
            return;
        }
        String mimeType = curi.getContentType();
        if (mimeType == null) {
            return;
        }
        // Only XML-ish responses can carry a resumptionToken; skip everything else.
        String lowerCaseUri = curi.toString().toLowerCase();
        if (!mimeType.toLowerCase().contains("xml") && !lowerCaseUri.endsWith(".rss")
                && !lowerCaseUri.endsWith(".xml")) {
            return;
        }
        try {
            String query = curi.getUURI().getQuery();
            if (query == null || !query.contains("verb=ListRecords")) { // Not an OAI-PMH document
                return;
            }
        } catch (URIException e) {
            log.error("Cannot get query part from '" + curi + "'", e);
            // Without the query we cannot tell whether this is an OAI-PMH response; bail out
            // instead of falling through and processing an unknown document.
            return;
        }
        this.numberOfCURIsHandled++;
        ReplayCharSequence cs = null;
        try {
            cs = curi.getHttpRecorder().getReplayCharSequence();
        } catch (IOException e) {
            log.error("Failed getting ReplayCharSequence: " + e.getMessage(), e);
        }
        if (cs == null) {
            log.error("Failed getting ReplayCharSequence: " + curi.toString());
            return;
        }
        try {
            boolean foundResumptionToken = processXml(curi, cs);
            if (foundResumptionToken) {
                numberOfLinksExtracted += 1;
            }
        } finally {
            // cs is guaranteed non-null here (checked above), so close unconditionally.
            try {
                cs.close();
            } catch (IOException ioe) {
                log.warn(TextUtils.exceptionToString("Failed close of ReplayCharSequence.", ioe));
            }
        }
    }

    /**
     * Searches for resumption token and adds link if it is found. Returns true iff a link is added.
     *
     * @param curi the CrawlURI.
     * @param cs the character sequence in which to search.
     * @return true iff a resumptionToken is found and a link added.
     */
    public boolean processXml(CrawlURI curi, CharSequence cs) {
        Matcher m = TextUtils.getMatcher(SIMPLE_RESUMPTION_TOKEN_MATCH, cs);
        Matcher mPure = TextUtils.getMatcher(EXTENDED_RESUMPTION_TOKEN_MATCH, cs);
        boolean matchesPure = mPure.find();
        boolean matches = m.find();
        String token = null;
        // Prefer the simple form; fall back to the attributed (PURE) form.
        if (matches) {
            token = m.group(1);
        } else if (matchesPure) {
            token = mPure.group(1);
        }
        if (token != null) {
            UURI oldUri = curi.getUURI();
            try {
                // Keep scheme/authority/path/fragment of the current request; replace only the query
                // with a ListRecords continuation carrying the resumptionToken.
                final String newQueryPart = "verb=ListRecords&resumptionToken=" + token;
                URI newUri = new URI(oldUri.getScheme(), oldUri.getAuthority(), oldUri.getPath(), newQueryPart,
                        oldUri.getFragment());
                curi.createAndAddLink(newUri.toString(), Link.NAVLINK_MISC, Link.NAVLINK_HOP);
            } catch (URISyntaxException e) {
                log.error("Failed to construct next-page URI from '" + oldUri + "'", e);
            } catch (URIException e) {
                log.error("Failed to add next-page link for '" + oldUri + "'", e);
            }
        }
        TextUtils.recycleMatcher(m);
        TextUtils.recycleMatcher(mPure);
        return matches || matchesPure;
    }

    /**
     * Return a report from this processor.
     *
     * @return the report.
     */
    @Override
    public String report() {
        StringBuilder ret = new StringBuilder();
        ret.append("Processor: dk.netarkivet.harvester.harvesting.extractor.ExtractorOAI\n");
        ret.append(" Function: Link extraction as part of OAI harvesting\n");
        ret.append(" CrawlURIs handled: " + this.numberOfCURIsHandled + "\n");
        ret.append(" Links extracted: " + this.numberOfLinksExtracted + "\n\n");
        return ret.toString();
    }

}