/* Copyright (C) 2003 Internet Archive.
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 *
 * Created on Nov 17, 2003
 *
 */
package dk.netarkivet.harvester.harvesting.extractor;

import java.io.IOException;
import java.util.logging.Logger;
import java.util.regex.Matcher;

import org.apache.commons.httpclient.URIException;
import org.apache.commons.lang.StringEscapeUtils;
import org.archive.crawler.datamodel.CoreAttributeConstants;
import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.extractor.Extractor;
import org.archive.crawler.extractor.Link;
import org.archive.crawler.framework.CrawlController;
import org.archive.io.ReplayCharSequence;
import org.archive.net.UURI;
import org.archive.util.DevUtils;
import org.archive.util.TextUtils;
import org.archive.util.UriUtils;

/**
 * Processes JavaScript files for strings that are likely to be crawlable URIs.
 * <p>
 * Contributors: gojomo, szznax, svc.
 */
public class ExtractorJS extends Extractor implements CoreAttributeConstants {

    private static final long serialVersionUID = -2231962381454717720L;

    private static final Logger LOGGER = Logger.getLogger(ExtractorJS.class.getName());

    // Finds whitespace-free strings in JavaScript
    // (areas between paired ' or " characters, possibly backslash-quoted
    // on the ends, but not in the middle).
    static final String JAVASCRIPT_STRING_EXTRACTOR = "(\\\\{0,8}+(?:\"|\'))(\\S{0," + UURI.MAX_URL_LENGTH
            + "}?)(?:\\1)";
    // GROUPS:
    // (G1) ' or " with optional leading backslashes
    // (G2) whitespace-free string delimited on both ends by G1
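    // Illustrative example (not from the original source): for the JavaScript
    //   var img = "http://www.example.com/pixel.gif";
    // group G2 captures http://www.example.com/pixel.gif, which
    // considerStrings() then evaluates as a candidate URI.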

    protected long numberOfCURIsHandled = 0;
    protected static long numberOfLinksExtracted = 0;
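    // Note: numberOfLinksExtracted is static, so this count is shared across
    // all ExtractorJS instances in the JVM rather than tracked per instance.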

    // URIs known to produce false-positives with the current JS extractor.
    // e.g. currently (2.0.3) the JS extractor produces 13 false-positive
    // URIs from http://www.google-analytics.com/urchin.js and only 2
    // good URIs, which are merely one pixel images.
    // TODO: remove this blacklist when JS extractor is improved
    protected final static String[] EXTRACTOR_URI_EXCEPTIONS = {"http://www.google-analytics.com/urchin.js"};

    /**
     * @param name the name of this processor
     */
    public ExtractorJS(String name) {
        super(name, "JavaScript extractor. Link extraction on JavaScript files (.js).");
    }

    /*
     * (non-Javadoc)
     *
     * @see org.archive.crawler.framework.Processor#process(org.archive.crawler.datamodel.CrawlURI)
     */
    public void extract(CrawlURI curi) {
        // Special cases for URIs where we know the current JS extractor does poorly.
        // TODO: remove this test when JS extractor is improved
        for (String s : EXTRACTOR_URI_EXCEPTIONS) {
            if (curi.toString().equals(s)) {
                return;
            }
        }

        if (!isHttpTransactionContentToProcess(curi)) {
            return;
        }
        String contentType = curi.getContentType();
        if (contentType == null) {
            return;
        }
        // Return unless the content type looks like JavaScript, the URI ends
        // with '.js', or the viaContext begins with 'script'.
        if ((contentType.indexOf("javascript") < 0)
                && (contentType.indexOf("jscript") < 0)
                && (contentType.indexOf("ecmascript") < 0)
                && (!curi.toString().toLowerCase().endsWith(".js"))
                && (curi.getViaContext() == null || !curi.getViaContext().toString().toLowerCase().startsWith("script"))) {
            return;
        }

        this.numberOfCURIsHandled++;

        ReplayCharSequence cs = null;
        try {
            cs = curi.getHttpRecorder().getReplayCharSequence();
        } catch (IOException e) {
            curi.addLocalizedError(this.getName(), e, "Failed get of replay char sequence.");
        }
        if (cs == null) {
            LOGGER.warning("Failed getting ReplayCharSequence: " + curi.toString());
            return;
        }

        try {
            try {
                numberOfLinksExtracted += considerStrings(curi, cs, getController(), true);
            } catch (StackOverflowError e) {
                DevUtils.warnHandle(e, "ExtractorJS StackOverflowError");
            }
            // Set flag to indicate that link extraction is completed.
            curi.linkExtractorFinished();
        } finally {
            // Done w/ the ReplayCharSequence. Close it.
            if (cs != null) {
                try {
                    cs.close();
                } catch (IOException ioe) {
                    LOGGER.warning(TextUtils.exceptionToString("Failed close of ReplayCharSequence.", ioe));
                }
            }
        }
    }

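    /**
     * Scans a character sequence for quoted, whitespace-free strings and adds
     * those that look like URIs to the CrawlURI as speculative out-links.
     *
     * @param curi the CrawlURI whose content is being examined
     * @param cs the character sequence to scan
     * @param controller the crawl controller used for error logging; may be
     *        null (e.g. when run by the extractor tool)
     * @param handlingJSFile true if the content is itself a JavaScript file,
     *        in which case links are created relative to the via URI rather
     *        than the base URI
     * @return the number of speculative links found
     */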
    public static long considerStrings(CrawlURI curi, CharSequence cs, CrawlController controller,
            boolean handlingJSFile) {
        long foundLinks = 0;
        Matcher strings = TextUtils.getMatcher(JAVASCRIPT_STRING_EXTRACTOR, cs);
        while (strings.find()) {
            CharSequence subsequence = cs.subSequence(strings.start(2), strings.end(2));

            if (UriUtils.isLikelyUriJavascriptContextLegacy(subsequence)) {
                String string = subsequence.toString();
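                // Unescape JavaScript escape sequences, then apply
                // UriUtils.speculativeFixup() to heuristically repair the
                // candidate relative to the current URI before adding it.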
                string = StringEscapeUtils.unescapeJavaScript(string);
                string = UriUtils.speculativeFixup(string, curi.getUURI());
                foundLinks++;
                try {
                    if (handlingJSFile) {
                        curi.createAndAddLinkRelativeToVia(string, Link.JS_MISC, Link.SPECULATIVE_HOP);
                    } else {
                        curi.createAndAddLinkRelativeToBase(string, Link.JS_MISC, Link.SPECULATIVE_HOP);
                    }
                } catch (URIException e) {
                    // There may not be a controller (e.g. if we're being run
                    // by the extractor tool).
                    if (controller != null) {
                        controller.logUriError(e, curi.getUURI(), string);
                    } else {
                        LOGGER.info(curi + ", " + string + ": " + e.getMessage());
                    }
                }
            } else {
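                // Not itself a likely URI: recurse in case the string contains
                // further quoted strings (e.g. JavaScript that embeds HTML).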
                foundLinks += considerStrings(curi, subsequence, controller, handlingJSFile);
            }
        }
        TextUtils.recycleMatcher(strings);
        return foundLinks;
    }

    /*
     * (non-Javadoc)
     *
     * @see org.archive.crawler.framework.Processor#report()
     */
    public String report() {
        StringBuffer ret = new StringBuffer();
        ret.append("Processor: dk.netarkivet.harvester.harvesting.extractor.ExtractorJS\n");
        ret.append("  Function:          Link extraction on JavaScript code\n");
        ret.append("  CrawlURIs handled: " + numberOfCURIsHandled + "\n");
        ret.append("  Links extracted:   " + numberOfLinksExtracted + "\n\n");

        return ret.toString();
    }
}