/*
 * #%L
 * Netarchivesuite - harvester
 * %%
 * Copyright (C) 2005 - 2014 The Royal Danish Library, the Danish State and University Library,
 * the National Library of France and the Austrian National Library.
 * %%
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation, either version 2.1 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Lesser Public License for more details.
 *
 * You should have received a copy of the GNU General Lesser Public
 * License along with this program. If not, see
 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
 * #L%
 */
package dk.netarkivet.harvester.tools;

import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;
import java.util.List;

import javax.management.JMException;

import org.apache.commons.httpclient.URIException;
import org.archive.crawler.datamodel.CandidateURI;
import org.archive.crawler.deciderules.DecidingScope;
import org.archive.crawler.framework.CrawlController;
import org.archive.crawler.settings.SimpleType;
import org.archive.crawler.settings.StringList;
import org.archive.net.UURIFactory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import twitter4j.GeoLocation;
import twitter4j.MediaEntity;
import twitter4j.Query;
import twitter4j.QueryResult;
import twitter4j.Tweet;
import twitter4j.Twitter;
import twitter4j.TwitterException;
import twitter4j.TwitterFactory;
import twitter4j.URLEntity;

/**
 * Heritrix CrawlScope that uses the Twitter Search API (https://dev.twitter.com/docs/api/1/get/search) to add seeds to
 * a crawl. The following parameters to twitter search are supported: keywords: a list equivalent to twitter's "query"
 * text. geo_locations: as defined in the twitter api. language: equivalent to twitter's "lang" parameter. These may be
 * omitted. In practice only "keywords" works well in the current version of twitter.
 * <p>
 * <p>
 * In addition, the number of results to be considered is determined by the parameters "pages" and
 * "twitter_results_per_page".
 */
@SuppressWarnings({"deprecation", "serial"})
public class TwitterDecidingScope extends DecidingScope {
    private static final Logger log = LoggerFactory.getLogger(TwitterDecidingScope.class);

    /*
     * Bean properties which specify the search parameters for Twitter.
     */

    /**
     * Attribute/value pair. The list of keywords to search for.
     */
    public static final String ATTR_KEYWORDS = "keywords";
    private StringList keywords;

    /**
     * Attribute/value pair. The number of pages of results to process.
     */
    public static final String ATTR_PAGES = "pages";
    private int pages = 1;

    /**
     * Attribute/value pair. The number of results per twitter page.
     */
    public static final String ATTR_RESULTS_PER_PAGE = "twitter_results_per_page";
    private int resultsPerPage = 100;

    /**
     * Attribute/value pair. A list of geo_locations to include in the search. These have the form lat,long,radius,units
     * e.g. 100.1,10.5,25.0,km
     */
    public static final String ATTR_GEOLOCATIONS = "geo_locations";
    private StringList geoLocations;

    /**
     * Attribute/value pair. If set, the language to which results are restricted. Unfortunately the twitter language
     * identification heuristics are so poor that this option is unusable. (Broken. See
     * http://code.google.com/p/twitter-api/issues/detail?id=1942 )
     */
    public static final String ATTR_LANG = "language";
    private String language = "all";

    /**
     * Attribute/value pair specifying whether embedded links should be queued.
     */
    public static final String ATTR_QUEUE_LINKS = "queue_links";
    private boolean queueLinks = true;

    /**
     * Attribute/value pair specifying whether the status of discovered users should be harvested.
     */
    public static final String ATTR_QUEUE_USER_STATUS = "queue_user_status";
    private boolean queueUserStatus = true;

    /**
     * Attribute/value pair specifying whether one should additionally queue all links embedded in a users status.
     */
    public static final String ATTR_QUEUE_USER_STATUS_LINKS = "queue_user_status_links";
    private boolean queueUserStatusLinks = true;

    /**
     * Attribute/value pair specifying whether an html search for the given keyword(s) should also be queued.
     */
    public static final String ATTR_QUEUE_KEYWORD_LINKS = "queue_keyword_links";
    private boolean queueKeywordLinks = true;

    /** Twitter API client used for all searches. */
    private Twitter twitter;
    /** Number of tweet seeds queued so far (reported at the end of initialize()). */
    private int tweetCount = 0;
    /** Number of non-tweet link seeds queued so far. */
    private int linkCount = 0;

    /**
     * This routine makes any necessary Twitter API calls and queues the content discovered.
     *
     * @param controller The controller for this crawl.
     */
    @Override
    public void initialize(CrawlController controller) {
        super.initialize(controller);
        twitter = (new TwitterFactory()).getInstance();
        keywords = null;
        try {
            keywords = (StringList) super.getAttribute(ATTR_KEYWORDS);
            pages = ((Integer) super.getAttribute(ATTR_PAGES)).intValue();
            geoLocations = (StringList) super.getAttribute(ATTR_GEOLOCATIONS);
            language = (String) super.getAttribute(ATTR_LANG);
            if (language == null) {
                language = "all";
            }
            resultsPerPage = (Integer) super.getAttribute(ATTR_RESULTS_PER_PAGE);
            queueLinks = (Boolean) super.getAttribute(ATTR_QUEUE_LINKS);
            queueUserStatus = (Boolean) super.getAttribute(ATTR_QUEUE_USER_STATUS);
            queueUserStatusLinks = (Boolean) super.getAttribute(ATTR_QUEUE_USER_STATUS_LINKS);
            queueKeywordLinks = (Boolean) super.getAttribute(ATTR_QUEUE_KEYWORD_LINKS);
        } catch (JMException e1) {
            // Covers AttributeNotFoundException, MBeanException and ReflectionException.
            // A scope without its configuration attributes cannot do anything useful, so abort.
            log.error("Could not read scope attributes", e1);
            throw new RuntimeException(e1);
        }
        // If keywords or geoLocations is missing, add a list with a single empty string so that the main loop is
        // executed at least once. NOTE: this guard must run before the lists are iterated; the previous version
        // iterated a possibly-null keywords list first, risking a NullPointerException.
        if (keywords == null || keywords.isEmpty()) {
            keywords = new StringList("keywords", "empty keyword list", new String[] {""});
        }
        if (geoLocations == null || geoLocations.isEmpty()) {
            geoLocations = new StringList("geolocations", "empty geolocation list", new String[] {""});
        }
        for (Object keyword : keywords) {
            log.info("Twitter Scope keyword: {}", keyword);
        }
        log.info("Twitter Scope will queue {} page(s) of results.", pages);
        // Nested loop over keywords, geo_locations and pages.
        for (Object keyword : keywords) {
            String keywordString = (String) keyword;
            // Build the language-restricted query strings once per keyword. (Previously the " lang:xx" suffix was
            // appended inside the geolocation loop, so multiple geolocations corrupted the query with repeated
            // "lang:" clauses.)
            String urlQuery = keywordString;
            if (language != null && !language.equals("")) {
                urlQuery += " lang:" + language;
                keywordString += " lang:" + language;
            }
            urlQuery = "http://twitter.com/search/" + encodeUtf8(urlQuery);
            for (Object geoLocation : geoLocations) {
                Query query = new Query();
                query.setRpp(resultsPerPage);
                if (language != null && !language.equals("")) {
                    query.setLang(language);
                }
                if (queueKeywordLinks) {
                    addSeedIfLegal(urlQuery);
                }
                for (int page = 1; page <= pages; page++) {
                    query.setPage(page);
                    if (!keyword.equals("")) {
                        query.setQuery(keywordString);
                    }
                    if (!geoLocation.equals("")) {
                        // Expected form: lat,long,radius,units e.g. 56.0,10.1,200.0,km
                        String[] locationArray = ((String) geoLocation).split(",");
                        if (locationArray.length >= 4) {
                            try {
                                GeoLocation location = new GeoLocation(Double.parseDouble(locationArray[0]),
                                        Double.parseDouble(locationArray[1]));
                                query.setGeoCode(location, Double.parseDouble(locationArray[2]), locationArray[3]);
                            } catch (NumberFormatException e) {
                                log.warn("Ignoring malformed geolocation '{}'", geoLocation, e);
                            }
                        } else {
                            log.warn("Ignoring geolocation '{}': expected lat,long,radius,units", geoLocation);
                        }
                    }
                    try {
                        final QueryResult result = twitter.search(query);
                        List<Tweet> tweets = result.getTweets();
                        for (Tweet tweet : tweets) {
                            long id = tweet.getId();
                            String fromUser = tweet.getFromUser();
                            String tweetUrl = "http://www.twitter.com/" + fromUser + "/status/" + id;
                            addSeedIfLegal(tweetUrl);
                            tweetCount++;
                            if (queueLinks) {
                                extractEmbeddedLinks(tweet);
                            }
                            if (queueUserStatus) {
                                String statusUrl = "http://twitter.com/" + fromUser + "/";
                                addSeedIfLegal(statusUrl);
                                linkCount++;
                                if (queueUserStatusLinks) {
                                    queueUserStatusLinks(fromUser);
                                }
                            }
                        }
                    } catch (TwitterException e1) {
                        // A failed search for one page should not abort the whole scope; log and continue.
                        log.error("Twitter search failed: {}", e1.getMessage(), e1);
                    }
                }
            }
        }
        log.info("{} added {} tweets and {} other links.", TwitterDecidingScope.class, tweetCount, linkCount);
    }

    /**
     * URL-encodes a query string using UTF-8, as recommended by the URLEncoder documentation. (The one-argument
     * URLEncoder.encode is deprecated because it uses the platform default charset.)
     *
     * @param s The string to encode.
     * @return The www-form-urlencoded form of the string.
     */
    private static String encodeUtf8(String s) {
        try {
            return URLEncoder.encode(s, "UTF-8");
        } catch (UnsupportedEncodingException e) {
            // UTF-8 support is mandated by the Java platform specification, so this cannot happen.
            throw new RuntimeException("UTF-8 not supported", e);
        }
    }

    /**
     * Adds links to embedded url's and media in a tweet.
     *
     * @param tweet The tweet from which links are to be extracted.
     */
    private void extractEmbeddedLinks(Tweet tweet) {
        final URLEntity[] urlEntities = tweet.getURLEntities();
        if (urlEntities != null) {
            for (URLEntity urlEntity : urlEntities) {
                // Queue both the shortened (t.co) form and the expanded target url.
                addSeedIfLegal(urlEntity.getURL().toString());
                addSeedIfLegal(urlEntity.getExpandedURL().toString());
                linkCount++;
            }
        }
        final MediaEntity[] mediaEntities = tweet.getMediaEntities();
        if (mediaEntities != null) {
            for (MediaEntity mediaEntity : mediaEntities) {
                final String mediaUrl = mediaEntity.getMediaURL().toString();
                addSeedIfLegal(mediaUrl);
                linkCount++;
            }
        }
    }

    /**
     * Searches for a given users recent tweets and queues any embedded material found.
     *
     * @param user The twitter username (without the @ prefix).
     */
    private void queueUserStatusLinks(String user) {
        Query query = new Query();
        query.setQuery("@" + user);
        query.setRpp(20);
        if (!language.equals("")) {
            query.setLang(language);
        }
        try {
            List<Tweet> results = twitter.search(query).getTweets();
            if (results == null || results.isEmpty()) {
                // Nothing to extract; also avoids iterating a null list.
                return;
            }
            log.info("Extracting embedded links for user {}", user);
            for (Tweet result : results) {
                // Null-safe language filter: getIsoLanguageCode() may return null, so compare from the
                // known-non-null side. An empty language setting means "all languages".
                if (language.equals("") || language.equals(result.getIsoLanguageCode())) {
                    extractEmbeddedLinks(result);
                }
            }
        } catch (TwitterException e) {
            log.error("Could not retrieve status for user {}", user, e);
        }
    }

    /**
     * Adds a url as a seed if possible. Otherwise just logs an error description and returns.
     *
     * @param tweetUrl The url to be added.
     */
    private void addSeedIfLegal(String tweetUrl) {
        try {
            CandidateURI curi = CandidateURI.createSeedCandidateURI(UURIFactory.getInstance(tweetUrl));
            log.info("Adding seed: '{}'", curi);
            addSeed(curi);
        } catch (URIException e1) {
            log.error("Could not add seed '{}'", tweetUrl, e1);
        }
    }

    /**
     * Constructor for the scope. Sets up all known attributes.
     *
     * @param name the name of this scope.
     */
    public TwitterDecidingScope(String name) {
        super(name);
        addElementToDefinition(new StringList(ATTR_KEYWORDS, "Keywords to search for"));
        addElementToDefinition(new SimpleType(ATTR_PAGES, "Number of pages of twitter results to use.",
                Integer.valueOf(1)));
        addElementToDefinition(new StringList(ATTR_GEOLOCATIONS, "Geolocations to search for, comma separated as "
                + "lat,long,radius,units e.g. 56.0,10.1,200.0,km"));
        addElementToDefinition(new SimpleType(ATTR_LANG, "Exclusive language for search", ""));
        addElementToDefinition(new SimpleType(ATTR_RESULTS_PER_PAGE,
                "Number of results per twitter search page (max 100)", Integer.valueOf(100)));
        addElementToDefinition(new SimpleType(ATTR_QUEUE_KEYWORD_LINKS,
                "Whether to queue an html search result for the specified keywords", Boolean.TRUE));
        addElementToDefinition(new SimpleType(ATTR_QUEUE_LINKS, "Whether to queue links discovered in search results",
                Boolean.TRUE));
        addElementToDefinition(new SimpleType(ATTR_QUEUE_USER_STATUS,
                "Whether to queue an html status listing for discovered users.", Boolean.TRUE));
        addElementToDefinition(new SimpleType(ATTR_QUEUE_USER_STATUS_LINKS,
                "Whether to search for and queue links embedded in the status of discovered users.", Boolean.TRUE));
    }

    /**
     * Adds a candidate uri as a seed for the crawl.
     *
     * @param curi The crawl uri to be added.
     * @return whether the uri was added as a seed.
     */
    @Override
    public boolean addSeed(CandidateURI curi) {
        return super.addSeed(curi);
    }
}