/*
 * #%L
 * Netarchivesuite - harvester
 * %%
 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
 * the National Library of France and the Austrian National Library.
 * %%
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation, either version 2.1 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Lesser Public License for more details.
 *
 * You should have received a copy of the GNU General Lesser Public
 * License along with this program. If not, see
 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
 * #L%
 */
package dk.netarkivet.harvester.harvesting.extractor;

import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.regex.Matcher;

import org.apache.commons.httpclient.URIException;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.extractor.Extractor;
import org.archive.crawler.extractor.Link;
import org.archive.io.ReplayCharSequence;
import org.archive.net.UURI;
import org.archive.util.TextUtils;

/**
 * This is a link extractor for use with Heritrix. It will find the resumptionToken in an OAI-PMH
 * ListRecords response (note: the crawl-uri filter below checks for {@code verb=ListRecords}) and
 * construct the link for the next page of the results. This extractor will not extract any other links,
 * so if there are additional urls in the OAI metadata then an additional extractor should be used for
 * these. Typically this means that the extractor chain in the order template will end:
 * <pre>{@code
 * <newObject name="ExtractorOAI" class="dk.netarkivet.harvester.harvesting.extractor.ExtractorOAI">
 *   <boolean name="enabled">true</boolean>
 *   <newObject name="ExtractorOAI#decide-rules" class="org.archive.crawler.deciderules.DecideRuleSequence">
 *     <map name="rules"/>
 *   </newObject>
 * </newObject>
 * <newObject name="ExtractorXML" class="org.archive.crawler.extractor.ExtractorXML">
 *   <boolean name="enabled">true</boolean>
 *   <newObject name="ExtractorXML#decide-rules" class="org.archive.crawler.deciderules.DecideRuleSequence">
 *     <map name="rules"/>
 *   </newObject>
 * </newObject>
 * }</pre>
 */
@SuppressWarnings({"serial"})
public class ExtractorOAI extends Extractor {

    /**
     * Regular expression matching the simple resumptionToken like this:
     * {@code <resumptionToken>oai_dc/421315/56151148/100/0/292/x/x/x</resumptionToken>}.
     * The token text is captured in group 1.
     */
    public static final String SIMPLE_RESUMPTION_TOKEN_MATCH = "(?i)<resumptionToken>\\s*(.*)\\s*</resumptionToken>";

    /**
     * Regular expression matching the extended resumptionToken with attributes like this:
     * {@code <resumptionToken cursor="0" completeListSize="421315">oai_dc/...</resumptionToken>}.
     * This is seen in OAI targets used by PURE. The token text is captured in group 1.
     * NOTE(review): only the exact attribute order cursor/completeListSize is recognized; other
     * attribute combinations (e.g. expirationDate) will not match.
     */
    public static final String EXTENDED_RESUMPTION_TOKEN_MATCH = "(?i)<resumptionToken\\s*cursor=\"[0-9]+\"\\s*completeListSize=\"[0-9]+\">\\s*(.*)\\s*</resumptionToken>";

    /** The class logger. */
    final Log log = LogFactory.getLog(getClass());

    /** The number of crawl-uris handled by this extractor. */
    private long numberOfCURIsHandled = 0;

    /** The number of links extracted by this extractor. */
    private long numberOfLinksExtracted = 0;

    /**
     * Constructor for this extractor.
     *
     * @param name the name of this extractor
     */
    public ExtractorOAI(String name) {
        super(name, "Extractor which finds the resumptionToken in an OAI "
                + "listMetadata query and adds the next page of results " + "to the crawl");
    }

    /**
     * Perform the link extraction on the current crawl uri. This method does not set
     * linkExtractorFinished() on the current crawlURI, so subsequent extractors in the chain can find
     * more links.
     *
     * @param curi the CrawlURI from which to extract the link.
     */
    @Override
    protected void extract(CrawlURI curi) {
        if (!isHttpTransactionContentToProcess(curi)) {
            return;
        }
        String mimeType = curi.getContentType();
        if (mimeType == null) {
            return;
        }
        // Only XML-ish responses can carry a resumptionToken; skip everything else.
        String lowerCaseUri = curi.toString().toLowerCase();
        if (!mimeType.toLowerCase().contains("xml") && !lowerCaseUri.endsWith(".rss")
                && !lowerCaseUri.endsWith(".xml")) {
            return;
        }
        try {
            String query = curi.getUURI().getQuery();
            if (query == null || !query.contains("verb=ListRecords")) { // Not an OAI-PMH document
                return;
            }
        } catch (URIException e) {
            log.error("Cannot get query part from '" + curi + "'", e);
            // Without the query we cannot tell whether this is an OAI-PMH response; bail out
            // instead of falling through and processing an unknown document.
            return;
        }
        this.numberOfCURIsHandled++;
        ReplayCharSequence cs = null;
        try {
            cs = curi.getHttpRecorder().getReplayCharSequence();
        } catch (IOException e) {
            log.error("Failed getting ReplayCharSequence: " + e.getMessage(), e);
        }
        if (cs == null) {
            log.error("Failed getting ReplayCharSequence: " + curi.toString());
            return;
        }
        try {
            boolean foundResumptionToken = processXml(curi, cs);
            if (foundResumptionToken) {
                numberOfLinksExtracted += 1;
            }
        } finally {
            // cs is guaranteed non-null here (checked above), so close unconditionally.
            try {
                cs.close();
            } catch (IOException ioe) {
                log.warn(TextUtils.exceptionToString("Failed close of ReplayCharSequence.", ioe));
            }
        }
    }

    /**
     * Searches for resumption token and adds link if it is found. Returns true iff a link is added.
     *
     * @param curi the CrawlURI.
     * @param cs the character sequence in which to search.
     * @return true iff a resumptionToken is found and a link added.
     */
    public boolean processXml(CrawlURI curi, CharSequence cs) {
        Matcher m = TextUtils.getMatcher(SIMPLE_RESUMPTION_TOKEN_MATCH, cs);
        Matcher mPure = TextUtils.getMatcher(EXTENDED_RESUMPTION_TOKEN_MATCH, cs);
        boolean matchesPure = mPure.find();
        boolean matches = m.find();
        String token = null;
        // Prefer the simple form; fall back to the attributed (PURE) form.
        if (matches) {
            token = m.group(1);
        } else if (matchesPure) {
            token = mPure.group(1);
        }
        if (token != null) {
            UURI oldUri = curi.getUURI();
            try {
                // Keep scheme/authority/path/fragment of the current request; replace only the query
                // with a ListRecords continuation carrying the resumptionToken.
                final String newQueryPart = "verb=ListRecords&resumptionToken=" + token;
                URI newUri = new URI(oldUri.getScheme(), oldUri.getAuthority(), oldUri.getPath(), newQueryPart,
                        oldUri.getFragment());
                curi.createAndAddLink(newUri.toString(), Link.NAVLINK_MISC, Link.NAVLINK_HOP);
            } catch (URISyntaxException e) {
                log.error("Failed to construct next-page URI from '" + oldUri + "'", e);
            } catch (URIException e) {
                log.error("Failed to add next-page link for '" + oldUri + "'", e);
            }
        }
        TextUtils.recycleMatcher(m);
        TextUtils.recycleMatcher(mPure);
        return matches || matchesPure;
    }

    /**
     * Return a report from this processor.
     *
     * @return the report.
     */
    @Override
    public String report() {
        StringBuilder ret = new StringBuilder();
        ret.append("Processor: dk.netarkivet.harvester.harvesting.extractor.ExtractorOAI\n");
        ret.append(" Function: Link extraction as part of OAI harvesting\n");
        ret.append(" CrawlURIs handled: " + this.numberOfCURIsHandled + "\n");
        ret.append(" Links extracted: " + this.numberOfLinksExtracted + "\n\n");
        return ret.toString();
    }

}