rm(list=ls(all=TRUE))
setwd("C:/Users/luigi/Dropbox/TOPIC MODEL")
getwd()

library(readtext)
library(quanteda)
library(ggplot2)
library(ggmap)
library(httpuv)
library(dplyr)
library(maps)
library(leaflet)
library(stringr)
library(rtweet)
packageVersion("rtweet")

#########################################################################################
# STREAMING data from Twitter
#########################################################################################

# With stream_tweets (the so-called Streaming API) you connect to the stream of tweets as they are being published.
# Through it you capture the tweets that contain one or more of the keywords given in the track argument.
# Here you need the 1.0.2 version of rtweet (but with an exception, as discussed above)

# Do not run (in the lab):
# install.packages("rtweet", repos = "http://cran.us.r-project.org")
# library(rtweet)
# packageVersion("rtweet")
# auth_setup_default()

# You can filter by keyword: provide a comma-separated character string with the desired phrase(s) and keyword(s).
# In this case it returns ALL the tweets, UNLESS the number of tweets is higher than 1% of *all* tweets
# in a given moment. Note that it will also return tweets that do not directly mention the word "biden" but,
# for example, quote a tweet that uses the word "biden" or share an article whose title contains the word "biden".
# Note moreover that the latest version of rtweet does not provide you with the entire text of the tweet
# dt <- stream_tweets("biden", timeout = 10)
# print(dt$text[1:10])

# You can also track users (tweets, retweets, mentions) by providing a comma-separated list of user IDs or screen names (up to 5,000 user IDs)
# dt <- stream_tweets("CNN", timeout = 10)
# print(dt$text[1:10])

# Finally, we can also stream geographically. In this latter case, we can keep using the 0.7.0 version of rtweet

#########################################################################################
# STREAMING data with latitude/longitude
#########################################################################################

# Use four latitude/longitude bounding-box points to stream by geo location.
# What Twitter allows is searching by geocode; to achieve this, Twitter uses the info stored under bbox_coords.
# The geocode must be provided via a vector of length 4, e.g., c(-125, 26, -65, 49).
# How to recover that? Either use your Google Maps API... or check here:
# http://boundingbox.klokantech.com/ and select "csv"

# api <- "YOUR GOOGLE MAP API"
# in my case I have saved it in a txt file
myText <- readtext("Google API.txt", encoding = "UTF-8")
api <- myText$text

usa <- stream_tweets(
  c(-124.84897, 24.39631, -66.88544, 49.38436),
  timeout = 10
)
print(usa$text[1:5])
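# Streaming results change every time you run the query, so it can be handy to save the
# downloaded tweets to disk and reload them later (the same logic behind the "italy.rds"
# file used further below). A minimal sketch, assuming you want to keep the object in R's
# native .rds format; the file name "usa.rds" is just an example:
saveRDS(usa, "usa.rds")
# usa <- readRDS("usa.rds")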
# how many tweets have we downloaded?
length(usa$text)

## create lat/lng variables using all available tweet and profile geo-location data
rtll <- lat_lng(usa)

## plot state boundaries
par(mar = c(0, 0, 0, 0))
map("state", lwd = .25)
## plot lat and lng points onto the state map
with(rtll, points(lng, lat, pch = 20, cex = 5, col = rgb(0, .3, .7, .75)))

# An alternative plot using ggplot
map.data <- map_data("state")
str(map.data)
points <- data.frame(x = rtll$lng, y = rtll$lat)
ggplot(map.data) +
  geom_map(aes(map_id = region), map = map.data,
           fill = "white", color = "grey20", size = 0.25) +
  expand_limits(x = map.data$long, y = map.data$lat) +
  theme(axis.line = element_blank(), axis.text = element_blank(),
        axis.ticks = element_blank(), axis.title = element_blank(),
        plot.margin = unit(0 * c(-1.5, -1.5, -1.5, -1.5), "lines")) +
  geom_point(data = points, aes(x = x, y = y),
             size = 3, alpha = 1/3, color = "darkblue")

# An alternative plot via leaflet
m2 <- leaflet(rtll)
m2 <- addTiles(m2)  # Add default OpenStreetMap map tiles
m2 <- addMarkers(m2, lng = rtll$lng, lat = rtll$lat, popup = rtll$text)
m2

# Let's change the colour of the tweets in the map according to some exogenous info included in the dataset,
# such as, for example, the number of followers (metadata NOT available in the 1.0.2 version of rtweet)
summary(rtll$followers_count)

# Let's create 3 groups out of the number of followers
getColor <- function(rtll) {
  sapply(rtll$followers_count, function(followers_count) {
    if (followers_count < quantile(rtll$followers_count, 0.5)) {
      "green"
    } else if (followers_count >= quantile(rtll$followers_count, 0.5) &
               followers_count <= quantile(rtll$followers_count, 0.75)) {
      "orange"
    } else {
      "red"
    }
  })
}

icons <- awesomeIcons(
  icon = 'ios-close',
  iconColor = 'black',
  library = 'ion',
  markerColor = getColor(rtll)
)

m <- leaflet(rtll)
m <- addTiles(m)
m <- addAwesomeMarkers(m, lng = rtll$lng, lat = rtll$lat, icon = icons,
                       popup = rtll$text, label = ~as.character(followers_count))
m

########################################################
# Now, I'd like to stream tweets from Italy
########################################################

lookup_coords("Italy", apikey = api)

## To do that, we can either use the coordinates you get via lookup_coords:
# italy <- stream_tweets(
#   c(6.62672, 35.48970, 18.79760, 47.09200),
#   timeout = 30
# )
# or, alternatively:
# italy <- stream_tweets(lookup_coords("Italy", apikey = api), timeout = 30)
# print(italy$text[1:10])

# Let's open a data frame of tweets from a similar geographical query to understand a bit more about our results
italy <- readRDS("italy.rds")

# how many tweets? 177
nrow(italy)

# Let's search for this Twitter user called Curini:
table(italy$screen_name == "Curini")
curini <- italy[which(italy$screen_name == "Curini"), ]
print(curini$text[1:2])

# Let's see some info about my location
print(curini$location)
# this is defined by the user (and the user can write basically anything they want: real or unreal...)

# this info is related to the POI (point of interest) selected by the user
print(curini$place_name)
print(curini$place_full_name)
print(curini$place_type)

# this info is related to the country of the author (linked to the POI as identified by the user)
print(curini$country)
print(curini$country_code)

# these are the coordinates of the POI
print(curini$bbox_coords)  # a polygon!

# Let's check the entire dataset
print(italy$location)
print(italy$place_name)  # I have info for all the 177 tweets
print(italy$country_code)
table(italy$country)
print(italy$bbox_coords)
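# A quick way to summarise how complete the different geo fields are across the 177 tweets.
# This is just a sketch, assuming the rtweet 0.7.0 column layout used above, where
# geo_coords is a list-column holding a latitude/longitude pair (possibly NA) per tweet:
sum(!is.na(italy$place_name))                              # tweets with POI-based place info
sum(!is.na(italy$country_code))                            # tweets with a country code
sum(sapply(italy$geo_coords, function(x) !all(is.na(x))))  # tweets with exact coordinates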
# In the dataset you also have some info for geo_coords (latitude and longitude),
# for example for tweets 139 and 152
print(italy$geo_coords)
print(italy$geo_coords[139])
print(italy$geo_coords[152])

# Let's see their source: Instagram, as expected!
print(italy$source[139])
print(italy$source[152])

# Let's draw some Italian maps!

## create lat/lng variables using all available tweet and profile geo-location data
rtll_it <- lat_lng(italy)

## plot country boundaries
par(mar = c(0, 0, 0, 0))
map("world", "Italy", lwd = .25)
## plot lat and lng points onto the map
with(rtll_it, points(lng, lat, pch = 20, cex = 5, col = rgb(0, .3, .7, .75)))

# Plot the results with ggplot
map.data <- map_data("world", "Italy")
str(map.data)
points <- data.frame(x = rtll_it$lng, y = rtll_it$lat)
ggplot(map.data) +
  geom_map(aes(map_id = region), map = map.data,
           fill = "white", color = "grey20", size = 0.25) +
  expand_limits(x = map.data$long, y = map.data$lat) +
  theme(axis.line = element_blank(), axis.text = element_blank(),
        axis.ticks = element_blank(), axis.title = element_blank(),
        plot.margin = unit(0 * c(-1.5, -1.5, -1.5, -1.5), "lines")) +
  geom_point(data = points, aes(x = x, y = y),
             size = 3, alpha = 1/3, color = "darkblue")

# An alternative plot via leaflet
m2 <- leaflet(rtll_it)
m2 <- addTiles(m2)  # Add default OpenStreetMap map tiles
m2 <- addMarkers(m2, lng = rtll_it$lng, lat = rtll_it$lat, popup = rtll_it$text)
m2

########################################################
# Note that you can also run search_tweets with a geocode:
# search for 100 tweets sent from ITALY discussing Meloni
########################################################

rt <- search_tweets("meloni", n = 100, geocode = lookup_coords("Italy", apikey = api))
nrow(rt)
print(rt$text[1])

# In this case, however, rtweet will not just look for data with bbox_coords; it does the following:
# FIRST, it checks whether the tweet is geocoded.
# If not, rtweet checks whether a place can be extrapolated from the user's profile information (with all the risks already discussed).
# Having said that, this is the ONLY WAY to recover tweets with a geocode PLUS a query.
# Indeed, at the moment you cannot implement via the Streaming API both a search-like query AND a location via geo coordinates.
print(rt$bbox_coords)
table(rt$place_name)
table(rt$country)
print(rt$location)
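# As with the streamed data, we can geolocate and map these search results as well.
# A minimal sketch reusing the same workflow as above (lat_lng + leaflet); note that tweets
# matched only via the user's profile location may carry no usable coordinates, so rows
# with missing lat/lng are dropped before plotting:
rtll_rt <- lat_lng(rt)
rtll_rt <- rtll_rt[!is.na(rtll_rt$lng) & !is.na(rtll_rt$lat), ]
m3 <- leaflet(rtll_rt)
m3 <- addTiles(m3)  # Add default OpenStreetMap map tiles
m3 <- addMarkers(m3, lng = rtll_rt$lng, lat = rtll_rt$lat, popup = rtll_rt$text)
m3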