001/* File: $Id: ExtractorOAI.java 2687 2013-05-03 16:38:47Z svc $ 002 * Revision: $Revision: 2687 $ 003 * Author: $Author: svc $ 004 * Date: $Date: 2013-05-03 18:38:47 +0200 (Fri, 03 May 2013) $ 005 * 006 * Copyright 2004-2018 The Royal Danish Library, 007 * the National Library of France and the Austrian 008 * National Library. 009 * 010 * This program is free software; you can redistribute it and/or modify 011 * it under the terms of the GNU General Public License as published by 012 * the Free Software Foundation; either version 2 of the License, or 013 * (at your option) any later version. 014 * 015 * This program is distributed in the hope that it will be useful, 016 * but WITHOUT ANY WARRANTY; without even the implied warranty of 017 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 018 * GNU General Public License for more details. 019 * 020 * You should have received a copy of the GNU General Public License 021 * along with this program; if not, write to the Free Software 022 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 023 */ 024package dk.netarkivet.harvester.harvesting.extractor; 025 026import java.io.IOException; 027import java.net.URI; 028import java.net.URISyntaxException; 029import java.util.regex.Matcher; 030 031import org.apache.commons.httpclient.URIException; 032import org.apache.commons.logging.Log; 033import org.apache.commons.logging.LogFactory; 034import org.archive.io.ReplayCharSequence; 035import org.archive.modules.CrawlURI; 036import org.archive.modules.extractor.ContentExtractor; 037import org.archive.modules.extractor.Hop; 038import org.archive.modules.extractor.LinkContext; 039import org.archive.net.UURI; 040import org.archive.util.TextUtils; 041 042/** 043 * This is a link extractor for use with Heritrix. It will find the 044 * resumptionToken in an OAI-PMH listMetadata query and construct the link for 045 * the next page of the results. This extractor will not extract any other links 046 * so if there are additional urls in the OAI metadata then an additional 047 * extractor should be used for these. Typically this means that the extractor 048 * chain in the order template will end: 049 * <newObject name="ExtractorOAI" 050 * class="dk.netarkivet.harvester.harvesting.extractor.ExtractorOAI"> 051 * <boolean name="enabled">true</boolean> 052 * <newObject name="ExtractorOAI#decide-rules" 053 * class="org.archive.crawler.deciderules.DecideRuleSequence"> 054 * <map name="rules"/> 055 * </newObject> 056 *</newObject> 057 * <newObject name="ExtractorXML" 058 * class="org.archive.crawler.extractor.ExtractorXML"> 059 * <boolean name="enabled">true</boolean> 060 * <newObject name="ExtractorXML#decide-rules" 061 * class="org.archive.crawler.deciderules.DecideRuleSequence"> 062 * <map name="rules"/> 063 * </newObject> 064 * </newObject> 065 */ 066public class ExtractorOAI extends ContentExtractor { 067 068 /** 069 * Regular expression matching the simple resumptionToken like this. 070 * <resumptionToken>oai_dc/421315/56151148/100/0/292/x/x/x</resumptionToken> 071 */ 072 public static final String SIMPLE_RESUMPTION_TOKEN_MATCH = "(?i)<resumptionToken>\\s*(.*)\\s*</resumptionToken>"; 073 074 /** 075 * Regular expression matching the extended resumptionToken with attributes like this. <resumptionToken cursor="0" 076 * completeListSize="421315">oai_dc/421315/56151148/100/0/292/x/x/x</resumptionToken> This is seen in OAI targets 077 * used by PURE. 078 */ 079 public static final String EXTENDED_RESUMPTION_TOKEN_MATCH = "(?i)<resumptionToken\\s*cursor=\"[0-9]+\"\\s*completeListSize=\"[0-9]+\">\\s*(.*)\\s*</resumptionToken>"; 080 081 /** The class logger. */ 082 final Log log = LogFactory.getLog(getClass()); 083 084 /** 085 * The number of crawl-uris handled by this extractor. 086 */ 087 private long numberOfCURIsHandled = 0; 088 089 /** 090 * The number of links extracted by this extractor. 091 */ 092 private long numberOfLinksExtracted = 0; 093 094 /** 095 * Constructor for this extractor. 096 */ 097 public ExtractorOAI() { 098 super(); 099 } 100 101 /** 102 * Perform the link extraction on the current crawl uri. This method 103 * does not set linkExtractorFinished() on the current crawlURI, so 104 * subsequent extractors in the chain can find more links. 105 * @param curi the CrawlUI from which to extract the link. 106 */ 107 @Override 108 protected boolean innerExtract(CrawlURI curi) { 109 try { 110 String query = curi.getUURI().getQuery(); 111 if (query == null) { // Test for null query - strange that we need to do that 112 return false; 113 } 114 if (!query.contains("verb=ListRecords")) { //Not an OAI-PMH document 115 return false; 116 } 117 } catch (URIException e) { 118 log.error("Cannot get query part from '" + curi + "'", e); 119 } 120 this.numberOfCURIsHandled++; 121 ReplayCharSequence cs = null; 122 try { 123 cs = curi.getRecorder().getContentReplayCharSequence(); 124 } catch (IOException e) { 125 log.error("Failed getting ReplayCharSequence: " + e.getMessage()); 126 } 127 if (cs == null) { 128 log.error("Failed getting ReplayCharSequence: " 129 + curi.toString()); 130 return false; 131 } 132 try { 133 boolean foundResumptionToken = processXml(curi, cs); 134 if (foundResumptionToken) { 135 numberOfLinksExtracted += 1; 136 } 137 } finally { 138 if (cs != null) { 139 try { 140 cs.close(); 141 } catch (IOException ioe) { 142 log.warn(TextUtils.exceptionToString( 143 "Failed close of ReplayCharSequence.", ioe)); 144 } 145 } 146 } 147 return false; 148 } 149 150 /** 151 * Searches for resumption token and adds link if it is found. Returns true 152 * iff a link is added. 153 * @param curi the CrawlURI. 154 * @param cs the character sequence in which to search. 155 * @return true iff a resumptionToken is found and a link added. 156 */ 157 public boolean processXml(CrawlURI curi, CharSequence cs) { 158 Matcher m = TextUtils.getMatcher(SIMPLE_RESUMPTION_TOKEN_MATCH, cs); 159 Matcher mPure = TextUtils.getMatcher(EXTENDED_RESUMPTION_TOKEN_MATCH, cs); 160 boolean matchesPure = mPure.find(); 161 boolean matches = m.find(); 162 String token = null; 163 if (matches) { 164 token = m.group(1); 165 } else if (matchesPure) { 166 token = mPure.group(1); 167 } 168 169 if (token != null) { 170 UURI oldUri = curi.getUURI(); 171 try { 172 final String newQueryPart = "verb=ListRecords&resumptionToken=" 173 + token; 174 URI newUri = new URI(oldUri.getScheme(), oldUri.getAuthority(), 175 oldUri.getPath(), 176 newQueryPart, oldUri.getFragment()); 177 178 log.info("Found resumption link: " + newUri); 179 add(curi, 10000, newUri.toString(), LinkContext.NAVLINK_MISC, Hop.NAVLINK); 180 } catch (URISyntaxException e) { 181 log.error(e); 182 } catch (URIException e) { 183 log.error(e); 184 } 185 } else { 186 log.info("No resumption tokens found for url " + curi.getCanonicalString()); 187 } 188 TextUtils.recycleMatcher(m); 189 return matches; 190 } 191 192 /** 193 * Return a report from this processor. 194 * @return the report. 195 */ 196 @Override 197 public String report() { 198 StringBuffer ret = new StringBuffer(); 199 ret.append("Processor: dk.netarkivet.harvester.harvesting.extractor.ExtractorOAI\n"); 200 ret.append(" Function: Link extraction on OAI XML documents\n"); 201 ret.append(" CrawlURIs handled: " + this.numberOfCURIsHandled + "\n"); 202 ret.append(" Links extracted: " + this.numberOfLinksExtracted + "\n\n"); 203 return ret.toString(); 204 } 205 206 @Override 207 protected boolean shouldExtract(CrawlURI curi) { 208 //curi.isHttpTransaction(); 209 String mimeType = curi.getContentType(); 210 if (mimeType == null) { 211 return false; 212 } 213 if ((mimeType.toLowerCase().indexOf("xml") < 0) 214 && (!curi.toString().toLowerCase().endsWith(".rss")) 215 && (!curi.toString().toLowerCase().endsWith(".xml"))) { 216 return false; 217 } 218 return true; 219 220 } 221}