Source code

001/* File:        $Id: ExtractorOAI.java 2687 2013-05-03 16:38:47Z svc $
002 * Revision:    $Revision: 2687 $
003 * Author:      $Author: svc $
004 * Date:        $Date: 2013-05-03 18:38:47 +0200 (Fri, 03 May 2013) $
005 *
006 * Copyright 2004-2018 The Royal Danish Library,
007 * the National Library of France and the Austrian
008 * National Library.
009 *
010 * This program is free software; you can redistribute it and/or modify
011 * it under the terms of the GNU General Public License as published by
012 * the Free Software Foundation; either version 2 of the License, or
013 * (at your option) any later version.
014 *
015 * This program is distributed in the hope that it will be useful,
016 * but WITHOUT ANY WARRANTY; without even the implied warranty of
017 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
018 * GNU General Public License for more details.
019 *
020 * You should have received a copy of the GNU General Public License
021 * along with this program; if not, write to the Free Software
022 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
023 */
024package dk.netarkivet.harvester.harvesting.extractor;
025
026import java.io.IOException;
027import java.net.URI;
028import java.net.URISyntaxException;
029import java.util.regex.Matcher;
030
031import org.apache.commons.httpclient.URIException;
032import org.apache.commons.logging.Log;
033import org.apache.commons.logging.LogFactory;
034import org.archive.io.ReplayCharSequence;
035import org.archive.modules.CrawlURI;
036import org.archive.modules.extractor.ContentExtractor;
037import org.archive.modules.extractor.Hop;
038import org.archive.modules.extractor.LinkContext;
039import org.archive.net.UURI;
040import org.archive.util.TextUtils;
041
042/**
043 * This is a link extractor for use with Heritrix. It will find the
044 * resumptionToken in an OAI-PMH listMetadata query and construct the link for
045 * the next page of the results. This extractor will not extract any other links
046 * so if there are additional urls in the OAI metadata then an additional
047 * extractor should be used for these. Typically this means that the extractor
048 * chain in the order template will end: 
049 * <newObject name="ExtractorOAI"
050 *      class="dk.netarkivet.harvester.harvesting.extractor.ExtractorOAI"> 
051 *      <boolean name="enabled">true</boolean> 
052 *      <newObject name="ExtractorOAI#decide-rules"
053 *       class="org.archive.crawler.deciderules.DecideRuleSequence"> 
054 *            <map name="rules"/> 
055 *       </newObject> 
056 *</newObject> 
057 * <newObject name="ExtractorXML"
058 *      class="org.archive.crawler.extractor.ExtractorXML"> 
059 *      <boolean name="enabled">true</boolean> 
060 *      <newObject name="ExtractorXML#decide-rules"
061 *              class="org.archive.crawler.deciderules.DecideRuleSequence"> 
062 *              <map name="rules"/> 
063 *      </newObject> 
064 * </newObject>
065 */
066public class ExtractorOAI extends ContentExtractor {
067
068    /**
069     * Regular expression matching the simple resumptionToken like this.
070     * <resumptionToken>oai_dc/421315/56151148/100/0/292/x/x/x</resumptionToken>
071     */
072    public static final String SIMPLE_RESUMPTION_TOKEN_MATCH = "(?i)<resumptionToken>\\s*(.*)\\s*</resumptionToken>";
073
074    /**
075     * Regular expression matching the extended resumptionToken with attributes like this. <resumptionToken cursor="0"
076     * completeListSize="421315">oai_dc/421315/56151148/100/0/292/x/x/x</resumptionToken> This is seen in OAI targets
077     * used by PURE.
078     */
079    public static final String EXTENDED_RESUMPTION_TOKEN_MATCH = "(?i)<resumptionToken\\s*cursor=\"[0-9]+\"\\s*completeListSize=\"[0-9]+\">\\s*(.*)\\s*</resumptionToken>";
080       
081     /** The class logger. */
082    final Log log = LogFactory.getLog(getClass());
083
084    /**
085     * The number of crawl-uris handled by this extractor.
086     */
087    private long numberOfCURIsHandled = 0;
088
089    /**
090     * The number of links extracted by this extractor.
091     */
092    private long numberOfLinksExtracted = 0;
093
094    /**
095     * Constructor for this extractor.
096     */
097    public ExtractorOAI() {
098        super();
099    }
100
101    /**
102     * Perform the link extraction on the current crawl uri. This method
103     * does not set linkExtractorFinished() on the current crawlURI, so
104     * subsequent extractors in the chain can find more links.
105     * @param curi the CrawlUI from which to extract the link.
106     */
107    @Override
108        protected boolean innerExtract(CrawlURI curi) {
109        try {
110            String query = curi.getUURI().getQuery();
111            if (query == null) { // Test for null query - strange that we need to do that
112                return false;
113            }
114            if (!query.contains("verb=ListRecords")) { //Not an OAI-PMH document
115                return false;
116            }
117        } catch (URIException e) {
118            log.error("Cannot get query part from '" + curi + "'", e);
119        }
120        this.numberOfCURIsHandled++;
121        ReplayCharSequence cs = null;
122        try {
123                cs = curi.getRecorder().getContentReplayCharSequence();            
124        } catch (IOException e) {
125            log.error("Failed getting ReplayCharSequence: " + e.getMessage());
126        }
127        if (cs == null) {
128            log.error("Failed getting ReplayCharSequence: "
129                    + curi.toString());
130            return false;
131        }
132        try {
133            boolean foundResumptionToken = processXml(curi, cs);
134            if (foundResumptionToken) {
135                numberOfLinksExtracted += 1;
136            }
137        } finally {
138            if (cs != null) {
139                try {
140                    cs.close();
141                } catch (IOException ioe) {
142                    log.warn(TextUtils.exceptionToString(
143                            "Failed close of ReplayCharSequence.", ioe));
144                }
145            }
146        }
147        return false;
148    }
149
150    /**
151     * Searches for resumption token and adds link if it is found. Returns true
152     * iff a link is added.
153     * @param curi the CrawlURI.
154     * @param cs the character sequence in which to search.
155     * @return true iff a resumptionToken is found and a link added.
156     */
157    public boolean processXml(CrawlURI curi, CharSequence cs) {
158        Matcher m = TextUtils.getMatcher(SIMPLE_RESUMPTION_TOKEN_MATCH, cs);
159        Matcher mPure = TextUtils.getMatcher(EXTENDED_RESUMPTION_TOKEN_MATCH, cs);
160        boolean matchesPure = mPure.find();
161        boolean matches = m.find();
162        String token = null;
163        if (matches) {
164            token = m.group(1);
165        } else if (matchesPure) {
166            token = mPure.group(1);
167        }
168        
169        if (token != null) {
170            UURI oldUri = curi.getUURI();
171            try {
172                final String newQueryPart = "verb=ListRecords&resumptionToken="
173                                            + token;
174                URI newUri = new URI(oldUri.getScheme(), oldUri.getAuthority(),
175                                     oldUri.getPath(),
176                                     newQueryPart, oldUri.getFragment());
177                
178                log.info("Found resumption link: " + newUri);
179                add(curi, 10000, newUri.toString(), LinkContext.NAVLINK_MISC, Hop.NAVLINK);
180            } catch (URISyntaxException e) {
181                log.error(e);
182            } catch (URIException e) {
183                log.error(e);
184            }
185        } else {
186                log.info("No resumption tokens found for url " + curi.getCanonicalString());
187        }
188        TextUtils.recycleMatcher(m);
189        return matches;
190    }
191
192    /**
193     * Return a report from this processor.
194     * @return the report.
195     */
196    @Override
197    public String report() {
198        StringBuffer ret = new StringBuffer();
199        ret.append("Processor: dk.netarkivet.harvester.harvesting.extractor.ExtractorOAI\n");
200        ret.append("  Function:          Link extraction on OAI XML documents\n");
201        ret.append("  CrawlURIs handled: " + this.numberOfCURIsHandled + "\n");
202        ret.append("  Links extracted:   " + this.numberOfLinksExtracted + "\n\n");
203        return ret.toString();
204    }
205
206    @Override
207    protected boolean shouldExtract(CrawlURI curi) {
208        //curi.isHttpTransaction();
209        String mimeType = curi.getContentType();
210        if (mimeType == null) {
211            return false;
212        }
213        if ((mimeType.toLowerCase().indexOf("xml") < 0)
214            && (!curi.toString().toLowerCase().endsWith(".rss"))
215            && (!curi.toString().toLowerCase().endsWith(".xml"))) {
216            return false;
217        }
218        return true;
219
220    }    
221}