/*
 * #%L
 * Netarchivesuite - harvester
 * %%
 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
 *             the National Library of France and the Austrian National Library.
 * %%
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation, either version 2.1 of the
 * License, or (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 * 
 * You should have received a copy of the GNU Lesser General Public
 * License along with this program.  If not, see
 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
 * #L%
 */
package dk.netarkivet.harvester.tools;

import java.net.URLEncoder;
import java.util.List;

import javax.management.AttributeNotFoundException;
import javax.management.MBeanException;
import javax.management.ReflectionException;

import org.apache.commons.httpclient.URIException;
import org.archive.crawler.datamodel.CandidateURI;
import org.archive.crawler.deciderules.DecidingScope;
import org.archive.crawler.framework.CrawlController;
import org.archive.crawler.settings.SimpleType;
import org.archive.crawler.settings.StringList;
import org.archive.net.UURIFactory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import twitter4j.GeoLocation;
import twitter4j.MediaEntity;
import twitter4j.Query;
import twitter4j.QueryResult;
import twitter4j.Tweet;
import twitter4j.Twitter;
import twitter4j.TwitterException;
import twitter4j.TwitterFactory;
import twitter4j.URLEntity;

/**
 * Heritrix CrawlScope that uses the Twitter Search API (https://dev.twitter.com/docs/api/1/get/search) to add seeds to
 * a crawl. The following Twitter search parameters are supported: "keywords", a list equivalent to Twitter's "query"
 * text; "geo_locations", as defined in the Twitter API; and "language", equivalent to Twitter's "lang" parameter.
 * These may all be omitted. In practice only "keywords" works well with the current version of Twitter.
 * <p>
 * In addition, the number of results to be considered is determined by the parameters "pages" and
 * "twitter_results_per_page".
 */
@SuppressWarnings({"deprecation", "serial"})
public class TwitterDecidingScope extends DecidingScope {
    private static final Logger log = LoggerFactory.getLogger(TwitterDecidingScope.class);

    /*
     * Bean properties specifying the search parameters for Twitter.
     */

    /**
     * Attribute/value pair. The list of keywords to search for.
     */
    public static final String ATTR_KEYWORDS = "keywords";
    private StringList keywords;

    /**
     * Attribute/value pair. The number of pages of results to process.
     */
    public static final String ATTR_PAGES = "pages";
    private int pages = 1;

    /**
     * Attribute/value pair. The number of results per twitter page.
     */
    public static final String ATTR_RESULTS_PER_PAGE = "twitter_results_per_page";
    private int resultsPerPage = 100;

    /**
     * Attribute/value pair. A list of geo_locations to include in the search. These have the form lat,long,radius,units
     * e.g. 100.1,10.5,25.0,km
     */
    public static final String ATTR_GEOLOCATIONS = "geo_locations";
    private StringList geoLocations;

    /**
     * Attribute/value pair. If set, the language to which results are restricted. Unfortunately the twitter language
     * identification heuristics are so poor that this option is unusable. (Broken. See
     * http://code.google.com/p/twitter-api/issues/detail?id=1942 )
     */
    public static final String ATTR_LANG = "language";
    private String language = "all";

    /**
     * Attribute/value pair specifying whether embedded links should be queued.
     */
    public static final String ATTR_QUEUE_LINKS = "queue_links";
    private boolean queueLinks = true;

    /**
     * Attribute/value pair specifying whether the status of discovered users should be harvested.
     */
    public static final String ATTR_QUEUE_USER_STATUS = "queue_user_status";
    private boolean queueUserStatus = true;

    /**
     * Attribute/value pair specifying whether one should additionally queue all links embedded in a user's status.
     */
    public static final String ATTR_QUEUE_USER_STATUS_LINKS = "queue_user_status_links";
    private boolean queueUserStatusLinks = true;

    /**
     * Attribute/value pair specifying whether an html search for the given keyword(s) should also be queued.
     */
    public static final String ATTR_QUEUE_KEYWORD_LINKS = "queue_keyword_links";
    private boolean queueKeywordLinks = true;

    private Twitter twitter;
    private int tweetCount = 0;
    private int linkCount = 0;

    /**
     * This routine makes any necessary Twitter API calls and queues the content discovered.
     *
     * @param controller The controller for this crawl.
     */
    @Override
    public void initialize(CrawlController controller) {
        super.initialize(controller);
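        // Obtain a twitter4j client with its default configuration (twitter4j will pick up an optional
        // twitter4j.properties from the classpath if one is present).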
        twitter = (new TwitterFactory()).getInstance();
        keywords = null;
        try {
            keywords = (StringList) super.getAttribute(ATTR_KEYWORDS);
            pages = ((Integer) super.getAttribute(ATTR_PAGES)).intValue();
            geoLocations = (StringList) super.getAttribute(ATTR_GEOLOCATIONS);
            language = (String) super.getAttribute(ATTR_LANG);
            if (language == null) {
                language = "all";
            }
            resultsPerPage = (Integer) super.getAttribute(ATTR_RESULTS_PER_PAGE);
            queueLinks = (Boolean) super.getAttribute(ATTR_QUEUE_LINKS);
            queueUserStatus = (Boolean) super.getAttribute(ATTR_QUEUE_USER_STATUS);
            queueUserStatusLinks = (Boolean) super.getAttribute(ATTR_QUEUE_USER_STATUS_LINKS);
            queueKeywordLinks = (Boolean) super.getAttribute(ATTR_QUEUE_KEYWORD_LINKS);
        } catch (AttributeNotFoundException e1) {
            e1.printStackTrace();
            throw new RuntimeException(e1);
        } catch (MBeanException e1) {
            e1.printStackTrace();
            throw new RuntimeException(e1);
        } catch (ReflectionException e1) {
            e1.printStackTrace();
            throw new RuntimeException(e1);
        }
        // If keywords or geoLocations is missing, add a list with a single empty string so that the main loop is
        // executed at least once.
        if (keywords == null || keywords.isEmpty()) {
            keywords = new StringList("keywords", "empty keyword list", new String[] {""});
        }
        if (geoLocations == null || geoLocations.isEmpty()) {
            geoLocations = new StringList("geolocations", "empty geolocation list", new String[] {""});
        }
        for (Object keyword : keywords) {
            log.info("Twitter Scope keyword: {}", keyword);
        }
        log.info("Twitter Scope will queue {} page(s) of results.", pages);
        // Nested loop over keywords, geo_locations and pages.
        for (Object keyword : keywords) {
            for (Object geoLocation : geoLocations) {
                String keywordString = (String) keyword;
                String urlQuery = (String) keyword;
                Query query = new Query();
                query.setRpp(resultsPerPage);
                if (language != null && !language.equals("")) {
                    query.setLang(language);
                    urlQuery += " lang:" + language;
                    keywordString += " lang:" + language;
                }
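                // Queue the human-readable Twitter search page for this keyword as a seed if configured to do so.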
                urlQuery = "http://twitter.com/search/" + URLEncoder.encode(urlQuery);
                if (queueKeywordLinks) {
                    addSeedIfLegal(urlQuery);
                }
                for (int page = 1; page <= pages; page++) {
                    query.setPage(page);
                    if (!keyword.equals("")) {
                        query.setQuery(keywordString);
                    }
                    if (!geoLocation.equals("")) {
                        String[] locationArray = ((String) geoLocation).split(",");
                        try {
                            GeoLocation location = new GeoLocation(Double.parseDouble(locationArray[0]),
                                    Double.parseDouble(locationArray[1]));
                            query.setGeoCode(location, Double.parseDouble(locationArray[2]), locationArray[3]);
                        } catch (NumberFormatException e) {
                            log.error("Cannot parse geo_location '" + geoLocation + "'", e);
                        }
                    }
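                    // Run the search for this page and queue a permalink seed for each tweet returned.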
                    try {
                        final QueryResult result = twitter.search(query);
                        List<Tweet> tweets = result.getTweets();
                        for (Tweet tweet : tweets) {
                            long id = tweet.getId();
                            String fromUser = tweet.getFromUser();
                            String tweetUrl = "http://www.twitter.com/" + fromUser + "/status/" + id;
                            addSeedIfLegal(tweetUrl);
                            tweetCount++;
                            if (queueLinks) {
                                extractEmbeddedLinks(tweet);
                            }
                            if (queueUserStatus) {
                                String statusUrl = "http://twitter.com/" + fromUser + "/";
                                addSeedIfLegal(statusUrl);
                                linkCount++;
                                if (queueUserStatusLinks) {
                                    queueUserStatusLinks(fromUser);
                                }
                            }
                        }
                    } catch (TwitterException e1) {
                        log.error(e1.getMessage());
                    }
                }
            }
        }
        log.info("{} added {} tweets and {} other links.", TwitterDecidingScope.class, tweetCount, linkCount);
    }

    /**
     * Adds links to embedded urls and media in a tweet.
     *
     * @param tweet The tweet from which links are to be extracted.
     */
    private void extractEmbeddedLinks(Tweet tweet) {
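        // Queue both the shortened and the expanded form of every URL entity embedded in the tweet.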
        final URLEntity[] urlEntities = tweet.getURLEntities();
        if (urlEntities != null) {
            for (URLEntity urlEntity : urlEntities) {
                addSeedIfLegal(urlEntity.getURL().toString());
                addSeedIfLegal(urlEntity.getExpandedURL().toString());
                linkCount++;
            }
        }
        // Queue any attached media (e.g. photos) as well.
        final MediaEntity[] mediaEntities = tweet.getMediaEntities();
        if (mediaEntities != null) {
            for (MediaEntity mediaEntity : mediaEntities) {
                final String mediaUrl = mediaEntity.getMediaURL().toString();
                addSeedIfLegal(mediaUrl);
                linkCount++;
            }
        }
    }

    /**
     * Searches for a given user's recent tweets and queues any embedded material found.
     *
     * @param user The twitter username (without the @ prefix).
     */
    private void queueUserStatusLinks(String user) {
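        // Search for recent tweets mentioning the user and extract links and media from each matching tweet.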
        Query query = new Query();
        query.setQuery("@" + user);
        query.setRpp(20);
        if (!language.equals("")) {
            query.setLang(language);
        }
        try {
            List<Tweet> results = twitter.search(query).getTweets();
            if (results != null && !results.isEmpty()) {
                log.info("Extracting embedded links for user {}", user);
                for (Tweet result : results) {
                    if (language.equals("") || language.equals(result.getIsoLanguageCode())) {
                        extractEmbeddedLinks(result);
                    }
                }
            }
        } catch (TwitterException e) {
            log.error(e.getMessage(), e);
        }
    }

    /**
     * Adds a url as a seed if possible. Otherwise just logs the error and returns.
     *
     * @param url The url to be added.
     */
    private void addSeedIfLegal(String url) {
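        // UURIFactory validates the URL; a malformed URL throws a URIException and the seed is simply skipped.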
        try {
            CandidateURI curi = CandidateURI.createSeedCandidateURI(UURIFactory.getInstance(url));
            log.info("Adding seed: '{}'", curi);
            addSeed(curi);
        } catch (URIException e1) {
            log.error(e1.getMessage(), e1);
        }
    }

    /**
     * Constructor for this scope. Sets up all known attributes.
     *
     * @param name the name of this scope.
     */
    public TwitterDecidingScope(String name) {
        super(name);
        addElementToDefinition(new StringList(ATTR_KEYWORDS, "Keywords to search for"));
        addElementToDefinition(new SimpleType(ATTR_PAGES, "Number of pages of twitter results to use.", Integer.valueOf(1)));
        addElementToDefinition(new StringList(ATTR_GEOLOCATIONS, "Geolocations to search for, comma separated as "
                + "lat,long,radius,units e.g. 56.0,10.1,200.0,km"));
        addElementToDefinition(new SimpleType(ATTR_LANG, "Exclusive language for search", ""));
        addElementToDefinition(new SimpleType(ATTR_RESULTS_PER_PAGE,
                "Number of results per twitter search page (max 100)", Integer.valueOf(100)));
        addElementToDefinition(new SimpleType(ATTR_QUEUE_KEYWORD_LINKS,
                "Whether to queue an html search result for the specified keywords", Boolean.TRUE));
        addElementToDefinition(new SimpleType(ATTR_QUEUE_LINKS, "Whether to queue links discovered in search results",
                Boolean.TRUE));
        addElementToDefinition(new SimpleType(ATTR_QUEUE_USER_STATUS,
                "Whether to queue an html status listing for discovered users.", Boolean.TRUE));
        addElementToDefinition(new SimpleType(ATTR_QUEUE_USER_STATUS_LINKS,
                "Whether to search for and queue links embedded in the status of discovered users.", Boolean.TRUE));
    }

    /**
     * Adds a candidate uri as a seed for the crawl.
     *
     * @param curi The crawl uri to be added.
     * @return whether the uri was added as a seed.
     */
    @Override
    public boolean addSeed(CandidateURI curi) {
        return super.addSeed(curi);
    }
}