# ============================================================================
# Collecting geo-located tweets with the Twitter Streaming API (rtweet)
# ============================================================================

# NOTE(review): rm(list = ls()) / setwd() in scripts are generally
# discouraged (they wipe and relocate the caller's session); kept here
# because this is a self-contained teaching script.
rm(list = ls(all = TRUE))
setwd("C:/Users/luigi/Dropbox/TOPIC MODEL")
getwd()

library(rtweet)
library(readtext)
library(quanteda)
library(ggplot2)
library(ggmap)
library(httpuv)
library(dplyr)
library(maps)
library(leaflet)
library(stringr)
library(htmlwidgets)
library(knitr)

# ---------------------------------------------------------------------------
# STREAMING data
# ---------------------------------------------------------------------------
# stream_tweets() connects to the live "stream" of tweets as they are being
# published, capturing those that contain one or more of the keywords given
# in the track argument. There are four possible methods.
#
# NOTE: the CRAN version of the rtweet package can hit a data-parsing issue
# with the first three methods. A workaround is the development version:
#   devtools::install_github("mkearney/rtweet")
# Since today we only use the Streaming API to get tweet geo-coordinates
# (which the CRAN version handles fine), we stick with the CRAN release.
#
# (1) The default, q = "", returns a small random sample of all publicly
#     available Twitter statuses (~1% of all tweets posted at that moment):
# dt <- stream_tweets("", timeout = 10)
# print(dt$text[1:10])
#
# (2) Filter by keyword: pass a comma-separated string of phrases/keywords.
#     This returns ALL matching tweets, unless they exceed 1% of the full
#     stream at that moment:
# dt2 <- stream_tweets("trump", timeout = 10)
# print(dt2$text[1:10])
#
# (3) Track users via a comma-separated list of user IDs or screen names
#     (up to 5000 user_ids):
# dt <- stream_tweets("CNN", timeout = 10)
# print(dt$text[1:10])
#
# (4) Stream by geo location using four lat/lng bounding-box points
#     (continued in the next section).
# ----------------------------------------------------------------------------
# (4) Streaming by geo location
# ----------------------------------------------------------------------------
# Twitter allows searches by geocode, matched against the bbox_coords info.
# The geocode must be supplied as a vector of length 4, e.g.
# c(-125, 26, -65, 49). To recover it, either use your Google Maps API key
# or select "csv" at http://boundingbox.klokantech.com/
#
# api <- "YOUR GOOGLE MAP API"
# In my case the key is stored in a local text file:
myText <- readtext("Google API.txt", encoding = "UTF-8")
api <- myText$text

lookup_coords("london", apikey = api)
lookup_coords("brazil", apikey = api)

# Streaming tweets from the US needs NO Google Maps key
# (that shortcut exists just for the US!)
lookup_coords("USA")

# Use the coordinates you get via lookup_coords
usa <- stream_tweets(
  c(-124.84897, 24.39631, -66.88544, 49.38436),
  timeout = 10
)
print(usa$text[1:10])

# Or, equivalently:
usa <- stream_tweets(lookup_coords("USA", apikey = api), timeout = 10)
print(usa$text[1:10])

# Create lat/lng variables using all available tweet and profile geo data
rtll <- lat_lng(usa)

# Plot state boundaries, then overlay the tweet coordinates
par(mar = c(0, 0, 0, 0))
map("state", lwd = .25)
with(rtll, points(lng, lat, pch = 20, cex = 5, col = rgb(0, .3, .7, .75)))

# An alternative plot using ggplot
map.data <- map_data("state")
str(map.data)
points <- data.frame(x = rtll$lng, y = rtll$lat)
ggplot(map.data) +
  geom_map(aes(map_id = region), map = map.data,
           fill = "white", color = "grey20", size = 0.25) +
  expand_limits(x = map.data$long, y = map.data$lat) +
  theme(axis.line = element_blank(),
        axis.text = element_blank(),
        axis.ticks = element_blank(),
        axis.title = element_blank(),
        plot.margin = unit(0 * c(-1.5, -1.5, -1.5, -1.5), "lines")) +
  geom_point(data = points, aes(x = x, y = y),
             size = 3, alpha = 1/3, color = "darkblue")

# But we can do even more...much more!
# ----------------------------------------------------------------------------
# Exploring the geo information in a stored Italian stream
# ----------------------------------------------------------------------------
# The dataframe below holds the results of this geographical query on Italy:
# italy <- stream_tweets(lookup_coords("Italy", apikey = api), timeout = 360)
italy <- readRDS("italy.rds")

# How many tweets? 177
nrow(italy)

# Let's search for this Twitter user called Curini:
italy$screen_name == "Curini"
curini <- italy[which(italy$screen_name == "Curini"), ]
print(curini$text[1:2])

# What am I using to tweet?
print(curini$source)

# Some info about my location. The free-text "location" field is whatever
# the user wrote when creating the account: it can be anything, or empty.
print(curini$location)
print(curini$place_name)
print(curini$place_full_name)
print(curini$place_type)
print(curini$country)
print(curini$country_code)
print(curini$bbox_coords)  # a polygon!
print(curini$geo_coords)   # not available!

# Let's check the entire dataset
print(italy$location)
print(italy$place_name)  # info available for all 177 tweets

# Sort places by count and plot the top 4 locations.
# (named place_counts so we do not shadow dplyr::count)
place_counts <- count(italy, place_name, sort = TRUE)
str(place_counts)
# Drop missing place names explicitly; removing row 1 (as in "[-c(1), ]")
# only works when the NA group happens to be the most frequent one.
place_counts <- filter(place_counts, !is.na(place_name))
str(place_counts)
place_counts <- mutate(place_counts, place_name = reorder(place_name, n))
# slice_max() supersedes top_n(); with_ties = TRUE (the default) matches
# top_n()'s tie behavior.
place_counts <- slice_max(place_counts, order_by = n, n = 4)
ggplot(place_counts, aes(x = place_name, y = n)) +
  geom_col() +
  coord_flip() +
  # labs() refers to the AESTHETICS: x is the place, y is the count.
  # coord_flip() only changes where they are displayed, so the labels
  # must not be swapped.
  labs(x = "Place", y = "Count",
       title = "Where Twitter users are from - unique place_name ")

print(italy$place_full_name)
print(italy$place_type)
print(italy$country)
print(italy$country_code)
table(italy$country_code)

# Some geo_coords (latitude/longitude) are available, e.g. tweets 139 and 152
print(italy$geo_coords)
print(italy$geo_coords[139])
print(italy$geo_coords[152])
# Let's see their source: Instagram as expected!
print(italy$source[139])
print(italy$source[152])

print(italy$bbox_coords)  # info about all 177 tweets

# ----------------------------------------------------------------------------
# search_tweets() with a geocode
# ----------------------------------------------------------------------------
# Search for 100 tweets sent from ITALY discussing about greenpass
rt <- search_tweets(
  "greenpass", n = 200,
  geocode = lookup_coords("Italy", apikey = api)
)
nrow(rt)

# Here rtweet does NOT just look for data with bbox_coords. It first checks
# whether the tweet is geocoded and, if not, whether a place can be
# extrapolated from the user's profile information. Searching the Barcelona
# area, for instance, returns many tweets that are not geocoded at all,
# simply because the users have "Barcelona" in their profile.
print(rt$location)
print(rt$place_name)
print(rt$bbox_coords[184])
print(rt$country)

# Create lat/lng variables using all available tweet and profile geo data
rtll <- lat_lng(italy)

# Plot them within the Italian map
par(mar = c(0, 0, 0, 0))
map("world", "Italy", lwd = .25)
with(rtll, points(lng, lat, pch = 20, cex = 5, col = rgb(0, .3, .7, .75)))

# An alternative plot using ggplot
map.data <- map_data("world", "Italy")
str(map.data)
points <- data.frame(x = rtll$lng, y = rtll$lat)
ggplot(map.data) +
  geom_map(aes(map_id = region), map = map.data,
           fill = "white", color = "grey20", size = 0.25) +
  expand_limits(x = map.data$long, y = map.data$lat) +
  theme(axis.line = element_blank(),
        axis.text = element_blank(),
        axis.ticks = element_blank(),
        axis.title = element_blank(),
        plot.margin = unit(0 * c(-1.5, -1.5, -1.5, -1.5), "lines")) +
  geom_point(data = points, aes(x = x, y = y),
             size = 3, alpha = 1/3, color = "darkblue")

# A further alternative plot via the leaflet package. Before that, let's
# change the color of the points according to some exogenous info included
# in the database.
# For example, let's create 3 groups out of the sentiment of the texts
# (we could have done the same with emotions, or any other variable of
# interest)
colnames(rtll)
library(syuzhet)
rtll$sentiment <- get_sentiment(rtll$text, method = "nrc", language = "italian")
summary(rtll$sentiment)

# Collapse the raw score into -1 / 0 / 1...
points$sentimentOK <- ifelse(rtll$sentiment < 0, -1,
                             ifelse(rtll$sentiment > 0, 1, 0))
table(points$sentimentOK)
# ...and label the groups. Declaring levels AND labels explicitly keeps the
# mapping correct even when one of the three groups is empty in the data
# (the as.factor() + levels()<- approach errors or mislabels in that case).
points$sentimentOK <- factor(points$sentimentOK,
                             levels = c(-1, 0, 1),
                             labels = c("Negative", "Neutral", "Positive"))
levels(points$sentimentOK)
table(points$sentimentOK)

# That would be our usual ggplot graph
ggplot(map.data) +
  geom_map(aes(map_id = region), map = map.data,
           fill = "white", color = "grey20", size = 0.25) +
  expand_limits(x = map.data$long, y = map.data$lat) +
  theme(axis.line = element_blank(),
        axis.text = element_blank(),
        axis.ticks = element_blank(),
        axis.title = element_blank(),
        plot.margin = unit(0 * c(-1.5, -1.5, -1.5, -1.5), "lines")) +
  geom_point(data = points, aes(x = x, y = y, colour = sentimentOK),
             size = 3) +
  scale_colour_manual(values = c("red", "blue", "green"))

# Now the same thing with leaflet.
# Map each sentiment score to a marker color: red = negative,
# green = positive, blue = neutral. (Vectorized ifelse() replaces the
# original element-wise sapply() loop; same output.)
getColor <- function(rtll) {
  ifelse(rtll$sentiment < 0, "red",
         ifelse(rtll$sentiment > 0, "green", "blue"))
}

icons <- awesomeIcons(
  icon = 'ios-close',
  iconColor = 'black',
  library = 'ion',
  markerColor = getColor(rtll)
)
m <- leaflet(rtll)
m <- addTiles(m)
m <- addAwesomeMarkers(m, lng = rtll$lng, lat = rtll$lat,
                       icon = icons, popup = rtll$text)
m
# To save this graph on your pc:
# saveWidget(m, "map.html", selfcontained = TRUE, background = "white")

# Let's change the shape of the icons
icons <- awesomeIcons(
  icon = 'glass',
  iconColor = 'white',
  markerColor = getColor(rtll)
)
m <- leaflet(rtll)
m <- addTiles(m)
m <- addAwesomeMarkers(m, lng = rtll$lng, lat = rtll$lat,
                       icon = icons, popup = rtll$text)
m

# A lot of the markers are clumped together rather closely. We can cluster
# them by specifying clusterOptions (zoom out and zoom in the graph!).
# The number inside each circle is the total number of observations in that
# area; areas with more observations get yellow circles, areas with fewer
# get green circles.
m <- addAwesomeMarkers(m, lng = rtll$lng, lat = rtll$lat,
                       icon = icons, popup = rtll$text,
                       clusterOptions = markerClusterOptions())
m

# You can also plot the tweets w/o any sentiment
m2 <- leaflet(rtll)
m2 <- addTiles(m2)
m2 <- addMarkers(m2, lng = rtll$lng, lat = rtll$lat, popup = rtll$text)
m2

# If you have problems (I have them since 3 days...google issues?) use this
# workaround: always plot blue, irrespective of the sentiment value.
# (rep() replaces the original sapply() that returned "blue" in every
# branch; same output.)
getColor <- function(rtll) {
  rep("blue", length(rtll$sentiment))
}

icons <- awesomeIcons(
  icon = 'ios-close',
  iconColor = 'white',
  library = 'ion',
  markerColor = getColor(rtll)
)
m2 <- leaflet(rtll)
m2 <- addTiles(m2)
m2 <- addAwesomeMarkers(m2, lng = rtll$lng, lat = rtll$lat,
                        icon = icons, popup = rtll$text)
m2