001/* 002 * This file is part of the Heritrix web crawler (crawler.archive.org). 003 * 004 * Licensed to the Internet Archive (IA) by one or more individual 005 * contributors. 006 * 007 * The IA licenses this file to You under the Apache License, Version 2.0 008 * (the "License"); you may not use this file except in compliance with 009 * the License. You may obtain a copy of the License at 010 * 011 * http://www.apache.org/licenses/LICENSE-2.0 012 * 013 * Unless required by applicable law or agreed to in writing, software 014 * distributed under the License is distributed on an "AS IS" BASIS, 015 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 016 * See the License for the specific language governing permissions and 017 * limitations under the License. 018 */ 019package dk.netarkivet.harvester.harvesting.extractor; 020 021import static org.archive.modules.extractor.Hop.SPECULATIVE; 022import static org.archive.modules.extractor.LinkContext.JS_MISC; 023 024import java.io.IOException; 025import java.util.ArrayList; 026import java.util.List; 027import java.util.logging.Level; 028import java.util.logging.Logger; 029import java.util.regex.Matcher; 030import java.util.regex.Pattern; 031 032import org.apache.commons.httpclient.URIException; 033import org.apache.commons.lang.StringEscapeUtils; 034import org.apache.commons.lang.exception.NestableRuntimeException; 035import org.archive.io.ReplayCharSequence; 036import org.archive.modules.CrawlURI; 037import org.archive.modules.extractor.Extractor; 038import org.archive.modules.extractor.LinkContext; 039import org.archive.net.UURI; 040import org.archive.util.DevUtils; 041import org.archive.util.TextUtils; 042import org.archive.util.UriUtils; 043 044/** 045 * Processes Javascript files for strings that are likely to be 046 * crawlable URIs. 047 * 048 * NOTE: This processor may open a ReplayCharSequence from the 049 * CrawlURI's Recorder, without closing that ReplayCharSequence, to allow 050 * reuse by later processors in sequence. In the usual (Heritrix) case, a 051 * call after all processing to the Recorder's endReplays() method ensures 052 * timely close of any reused ReplayCharSequences. Reuse of this processor 053 * elsewhere should ensure a similar cleanup call to Recorder.endReplays() 054 * occurs. 055 * 056 * TODO: Replace with a system for actually executing Javascript in a 057 * browser-workalike DOM, such as via HtmlUnit or remote-controlled 058 * browser engines. 059 * 060 * @contributor gojomo 061 * 062 * 063 * This class is a modification by Kristinn Sigurdsson of the org.archive.modules.extractor.ExtractorJS 064 * class that enables you to reject some false positives found by the original ExtractorJS. 065 */ 066public class IcelandicExtractorJS extends org.archive.modules.extractor.ExtractorJS { 067 068 long foundFalsePositives = 0; 069 070 private static Logger LOGGER = 071 Logger.getLogger("org.archive.crawler.extractor.ExtractorJS"); 072 073 /** 074 * The list of regular expressions to evalute potential relative url against, rejecting any that match 075 */ 076 { 077 setRejectRelativeMatchingRegexList(new ArrayList<Pattern>()); 078 } 079 @SuppressWarnings("unchecked") 080 public List<Pattern> getRejectRelativeMatchingRegexList() { 081 return (List<Pattern>) kp.get("rejectRelativeMatchingRegexList"); 082 } 083 public void setRejectRelativeMatchingRegexList(List<Pattern> patterns) { 084 kp.put("rejectRelativeMatchingRegexList", patterns); 085 } 086 087 // finds whitespace-free strings in Javascript 088 // (areas between paired ' or " characters, possibly backslash-quoted 089 // on the ends, but not in the middle) 090 static final String JAVASCRIPT_STRING_EXTRACTOR = 091 "(\\\\{0,8}+(?:\"|\'))(\\S{0,"+UURI.MAX_URL_LENGTH+"}?)(?:\\1)"; 092 // GROUPS: 093 // (G1) ' or " with optional leading backslashes 094 // (G2) whitespace-free string delimited on boths ends by G1 095 096 // determines whether a string is likely URI 097 // (no whitespace or '<' '>', has an internal dot or some slash, 098 // begins and ends with either '/' or a word-char) 099 static final String STRING_URI_DETECTOR = 100 "(?:\\w|[\\.]{0,2}/)[\\S&&[^<>]]*(?:\\.|/)[\\S&&[^<>]]*(?:\\w|/)"; 101 102 protected long numberOfCURIsHandled = 0; 103 104 // URIs known to produce false-positives with the current JS extractor. 105 // e.g. currently (2.0.3) the JS extractor produces 13 false-positive 106 // URIs from http://www.google-analytics.com/urchin.js and only 2 107 // good URIs, which are merely one pixel images. 108 // TODO: remove this blacklist when JS extractor is improved 109 protected final static String[] EXTRACTOR_URI_EXCEPTIONS = { 110 "http://www.google-analytics.com/urchin.js" 111 }; 112 113 /** 114 * Constructor. 115 */ 116 public IcelandicExtractorJS() { 117 } 118 119 protected boolean shouldExtract(CrawlURI uri) { 120 121 // special-cases, for when we know our current JS extractor does poorly. 122 // TODO: remove this test when JS extractor is improved 123 for (String s: EXTRACTOR_URI_EXCEPTIONS) { 124 if (uri.toString().equals(s)) 125 return false; 126 } 127 128 String contentType = uri.getContentType(); 129 if ((contentType == null)) { 130 return false; 131 } 132 133 // If the content-type indicates js, we should process it. 134 if (contentType.indexOf("javascript") >= 0) { 135 return true; 136 } 137 if (contentType.indexOf("jscript") >= 0) { 138 return true; 139 } 140 if (contentType.indexOf("ecmascript") >= 0) { 141 return true; 142 } 143 144 // If the filename indicates js, we should process it. 145 if (uri.toString().toLowerCase().endsWith(".js")) { 146 return true; 147 } 148 149 // If the viaContext indicates a script, we should process it. 150 LinkContext context = uri.getViaContext(); 151 if (context == null) { 152 return false; 153 } 154 String s = context.toString().toLowerCase(); 155 return s.startsWith("script"); 156 } 157 158 @Override 159 protected boolean innerExtract(CrawlURI curi) { 160 this.numberOfCURIsHandled++; 161 ReplayCharSequence cs = null; 162 try { 163 cs = curi.getRecorder().getContentReplayCharSequence(); 164 try { 165 numberOfLinksExtracted.getAndAdd(considerStrings(this, curi, cs, true)); 166 } catch (StackOverflowError e) { 167 DevUtils.warnHandle(e, "ExtractorJS StackOverflowError"); 168 } 169 // Set flag to indicate that link extraction is completed. 170 return true; 171 } catch (IOException e) { 172 curi.getNonFatalFailures().add(e); 173 } 174 return false; 175 } 176 177 public long considerStrings(Extractor ext, 178 CrawlURI curi, CharSequence cs, boolean handlingJSFile) { 179 long foundLinks = 0; 180 Matcher strings = 181 TextUtils.getMatcher(JAVASCRIPT_STRING_EXTRACTOR, cs); 182 int startIndex = 0; 183 while (strings.find(startIndex)) { 184 CharSequence subsequence = 185 cs.subSequence(strings.start(2), strings.end(2)); 186 Matcher uri = 187 TextUtils.getMatcher(STRING_URI_DETECTOR, subsequence); 188 if(uri.matches()) { 189 String string = uri.group(); 190 boolean falsePositive = false; 191 try { 192 string = StringEscapeUtils.unescapeJavaScript(string); 193 } catch (NestableRuntimeException e) { 194 LOGGER.log(Level.WARNING, "problem unescaping some javascript", e); 195 } 196 string = UriUtils.speculativeFixup(string, curi.getUURI()); 197 198 // Filter out some bad false positives (should really fix regexp for URI detection) 199 if (string.contains("/.") || string.contains("@") || string.length() > 150) { 200 // While legal in URIs, these are rare and usually an indication of a false positive 201 // in the speculative extraction. 202 falsePositive = true; 203 } 204 205 if (!falsePositive) { 206 falsePositive = shouldIgnorePossibleRelativeLink(string); 207 } 208 209 if (falsePositive) { 210 foundFalsePositives++; 211 } else { 212 foundLinks++; 213 try { 214 int max = ext.getExtractorParameters().getMaxOutlinks(); 215 if (handlingJSFile) { 216 addRelativeToVia(curi, max, string, JS_MISC, SPECULATIVE); 217 } else { 218 addRelativeToBase(curi, max, string, JS_MISC, SPECULATIVE); 219 } 220 } catch (URIException e) { 221 ext.logUriError(e, curi.getUURI(), string); 222 } 223 } 224 } else { 225 foundLinks += considerStrings(ext, curi, subsequence, handlingJSFile); 226 } 227 228 // reconsider the last closing quote as possible opening quote 229 startIndex = strings.end(2); 230 } 231 TextUtils.recycleMatcher(strings); 232 return foundLinks; 233 } 234 235 private boolean shouldIgnorePossibleRelativeLink(String str) { 236 if (str.matches("^[a-zA-Z]://.*$")) { 237 // Absolute path. Assume it is ok. 238 return false; 239 } 240 241 List<Pattern> regexes = getRejectRelativeMatchingRegexList(); 242 if(regexes.size()==0){ 243 return false; 244 } 245 246 for (Pattern p: regexes) { 247 boolean matches = p.matcher(str).matches(); 248 if(matches){ 249 return true; 250 } 251 } 252 return false; 253 } 254 255 @Override 256 public String report() { 257 StringBuffer report = new StringBuffer(); 258 report.append(super.report()); 259 report.append(" False positives eliminated: " + foundFalsePositives + "\n"); 260 return report.toString(); 261 } 262}