Source code

001/*
002 *  This file is part of the Heritrix web crawler (crawler.archive.org).
003 *
004 *  Licensed to the Internet Archive (IA) by one or more individual 
005 *  contributors. 
006 *
007 *  The IA licenses this file to You under the Apache License, Version 2.0
008 *  (the "License"); you may not use this file except in compliance with
009 *  the License.  You may obtain a copy of the License at
010 *
011 *      http://www.apache.org/licenses/LICENSE-2.0
012 *
013 *  Unless required by applicable law or agreed to in writing, software
014 *  distributed under the License is distributed on an "AS IS" BASIS,
015 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
016 *  See the License for the specific language governing permissions and
017 *  limitations under the License.
018 */
019package dk.netarkivet.harvester.harvesting.extractor;
020
021import static org.archive.modules.extractor.Hop.SPECULATIVE;
022import static org.archive.modules.extractor.LinkContext.JS_MISC;
023
024import java.io.IOException;
025import java.util.ArrayList;
026import java.util.List;
027import java.util.logging.Level;
028import java.util.logging.Logger;
029import java.util.regex.Matcher;
030import java.util.regex.Pattern;
031
032import org.apache.commons.httpclient.URIException;
033import org.apache.commons.lang.StringEscapeUtils;
034import org.apache.commons.lang.exception.NestableRuntimeException;
035import org.archive.io.ReplayCharSequence;
036import org.archive.modules.CrawlURI;
037import org.archive.modules.extractor.Extractor;
038import org.archive.modules.extractor.LinkContext;
039import org.archive.net.UURI;
040import org.archive.util.DevUtils;
041import org.archive.util.TextUtils;
042import org.archive.util.UriUtils;
043
044/**
045 * Processes Javascript files for strings that are likely to be
046 * crawlable URIs.
047 *
048 * NOTE: This processor may open a ReplayCharSequence from the 
049 * CrawlURI's Recorder, without closing that ReplayCharSequence, to allow
050 * reuse by later processors in sequence. In the usual (Heritrix) case, a 
051 * call after all processing to the Recorder's endReplays() method ensures
052 * timely close of any reused ReplayCharSequences. Reuse of this processor
053 * elsewhere should ensure a similar cleanup call to Recorder.endReplays()
054 * occurs. 
055 * 
056 * TODO: Replace with a system for actually executing Javascript in a 
057 * browser-workalike DOM, such as via HtmlUnit or remote-controlled 
058 * browser engines. 
059 * 
060 * @contributor gojomo
061 * 
062 * 
063 * This class is a modification by Kristinn Sigurdsson of the org.archive.modules.extractor.ExtractorJS 
064 * class that enables you to reject some false positives found by the original ExtractorJS.
065 */
066public class IcelandicExtractorJS extends org.archive.modules.extractor.ExtractorJS {
067
068    long foundFalsePositives = 0;
069
070    private static Logger LOGGER =
071        Logger.getLogger("org.archive.crawler.extractor.ExtractorJS");
072
073    /**
074     * The list of regular expressions to evalute potential relative url against, rejecting any that match
075     */
076    {
077        setRejectRelativeMatchingRegexList(new ArrayList<Pattern>());
078    }
079    @SuppressWarnings("unchecked")
080    public List<Pattern> getRejectRelativeMatchingRegexList() {
081        return (List<Pattern>) kp.get("rejectRelativeMatchingRegexList");
082    }
083    public void setRejectRelativeMatchingRegexList(List<Pattern> patterns) {
084        kp.put("rejectRelativeMatchingRegexList", patterns);
085    }
086    
087    // finds whitespace-free strings in Javascript
088    // (areas between paired ' or " characters, possibly backslash-quoted
089    // on the ends, but not in the middle)
090    static final String JAVASCRIPT_STRING_EXTRACTOR =
091        "(\\\\{0,8}+(?:\"|\'))(\\S{0,"+UURI.MAX_URL_LENGTH+"}?)(?:\\1)";
092    // GROUPS:
093    // (G1) ' or " with optional leading backslashes
094    // (G2) whitespace-free string delimited on boths ends by G1
095
096    // determines whether a string is likely URI
097    // (no whitespace or '<' '>',  has an internal dot or some slash,
098    // begins and ends with either '/' or a word-char)
099    static final String STRING_URI_DETECTOR =
100        "(?:\\w|[\\.]{0,2}/)[\\S&&[^<>]]*(?:\\.|/)[\\S&&[^<>]]*(?:\\w|/)";
101    
102    protected long numberOfCURIsHandled = 0;
103
104    // URIs known to produce false-positives with the current JS extractor.
105    // e.g. currently (2.0.3) the JS extractor produces 13 false-positive 
106    // URIs from http://www.google-analytics.com/urchin.js and only 2 
107    // good URIs, which are merely one pixel images.
108    // TODO: remove this blacklist when JS extractor is improved 
109    protected final static String[] EXTRACTOR_URI_EXCEPTIONS = {
110        "http://www.google-analytics.com/urchin.js"
111        };
112    
113    /**
114     * Constructor.
115     */
116    public IcelandicExtractorJS() {
117    }
118
119    protected boolean shouldExtract(CrawlURI uri) {
120        
121        // special-cases, for when we know our current JS extractor does poorly.
122        // TODO: remove this test when JS extractor is improved 
123        for (String s: EXTRACTOR_URI_EXCEPTIONS) {
124            if (uri.toString().equals(s))
125                return false;
126        }
127        
128        String contentType = uri.getContentType();
129        if ((contentType == null)) {
130            return false;
131        }
132
133        // If the content-type indicates js, we should process it.
134        if (contentType.indexOf("javascript") >= 0) {
135            return true;
136        }
137        if (contentType.indexOf("jscript") >= 0) {
138            return true;
139        }
140        if (contentType.indexOf("ecmascript") >= 0) {
141            return true;
142        }
143        
144        // If the filename indicates js, we should process it.
145        if (uri.toString().toLowerCase().endsWith(".js")) {
146            return true;
147        }
148        
149        // If the viaContext indicates a script, we should process it.
150        LinkContext context = uri.getViaContext();
151        if (context == null) {
152            return false;
153        }
154        String s = context.toString().toLowerCase();
155        return s.startsWith("script");
156    }
157    
158    @Override
159    protected boolean innerExtract(CrawlURI curi) {
160        this.numberOfCURIsHandled++;
161        ReplayCharSequence cs = null;
162        try {
163            cs = curi.getRecorder().getContentReplayCharSequence();
164            try {
165                numberOfLinksExtracted.getAndAdd(considerStrings(this, curi, cs, true));
166            } catch (StackOverflowError e) {
167                DevUtils.warnHandle(e, "ExtractorJS StackOverflowError");
168            }
169            // Set flag to indicate that link extraction is completed.
170            return true;
171        } catch (IOException e) {
172            curi.getNonFatalFailures().add(e);
173        }
174        return false;
175    }
176
177    public long considerStrings(Extractor ext, 
178            CrawlURI curi, CharSequence cs, boolean handlingJSFile) {
179        long foundLinks = 0;
180        Matcher strings =
181            TextUtils.getMatcher(JAVASCRIPT_STRING_EXTRACTOR, cs);
182        int startIndex = 0;
183        while (strings.find(startIndex)) {
184            CharSequence subsequence =
185                cs.subSequence(strings.start(2), strings.end(2));
186            Matcher uri =
187                TextUtils.getMatcher(STRING_URI_DETECTOR, subsequence);
188            if(uri.matches()) {
189                String string = uri.group();
190                boolean falsePositive = false;
191                try {
192                    string = StringEscapeUtils.unescapeJavaScript(string);
193                } catch (NestableRuntimeException e) {
194                    LOGGER.log(Level.WARNING, "problem unescaping some javascript", e);
195                }
196                string = UriUtils.speculativeFixup(string, curi.getUURI());
197                
198                // Filter out some bad false positives (should really fix regexp for URI detection) 
199                if (string.contains("/.") || string.contains("@") || string.length() > 150) {
200                        // While legal in URIs, these are rare and usually an indication of a false positive
201                        // in the speculative extraction.
202                        falsePositive = true;
203                }
204                
205                if (!falsePositive) {
206                    falsePositive = shouldIgnorePossibleRelativeLink(string);
207                }
208                       
209                if (falsePositive) {
210                        foundFalsePositives++;
211                } else {
212                        foundLinks++;
213                        try {
214                            int max = ext.getExtractorParameters().getMaxOutlinks();
215                            if (handlingJSFile) {
216                                addRelativeToVia(curi, max, string, JS_MISC, SPECULATIVE);
217                            } else {
218                                addRelativeToBase(curi, max, string, JS_MISC, SPECULATIVE);
219                            }
220                        } catch (URIException e) {
221                            ext.logUriError(e, curi.getUURI(), string);
222                        }
223                }
224            } else {
225               foundLinks += considerStrings(ext, curi, subsequence, handlingJSFile);
226            }
227            
228            // reconsider the last closing quote as possible opening quote
229            startIndex = strings.end(2);
230        }
231        TextUtils.recycleMatcher(strings);
232        return foundLinks;
233    }
234    
235    private boolean shouldIgnorePossibleRelativeLink(String str) {
236        if (str.matches("^[a-zA-Z]://.*$")) {
237            // Absolute path. Assume it is ok.
238            return false;
239        }
240        
241        List<Pattern> regexes = getRejectRelativeMatchingRegexList();
242        if(regexes.size()==0){
243            return false;
244        }
245
246        for (Pattern p: regexes) {
247            boolean matches = p.matcher(str).matches();
248            if(matches){
249                return true;
250            } 
251        }
252        return false;
253    }
254    
255        @Override
256        public String report() {
257        StringBuffer report = new StringBuffer();
258        report.append(super.report());
259        report.append("  False positives eliminated: " + foundFalsePositives + "\n"); 
260                return report.toString();
261        }
262}