rm(list=ls(all=TRUE))
setwd("C:/Users/luigi/Dropbox/TOPIC MODEL")
getwd()

library(rtweet)
library(readtext)
library(quanteda)
library(ggplot2)
library(ggmap)
library(httpuv)
library(dplyr)
library(maps)
library(leaflet)
library(stringr)

#########################################################################################
# STREAMING data
#########################################################################################

# With stream_tweets() (the so-called Streaming API) you connect to the "stream" of tweets
# as they are being published, and capture the tweets that contain one or more of the
# keywords given in the track argument.
# There are four possible methods.

# (1) The default, q = "", returns a small random sample of all publicly available Twitter
# statuses (a 1% random sample of all the tweets posted in that specific moment)
dt <- stream_tweets("", timeout = 10)
print(dt$text[1:10])

# (2) To filter by keyword, provide a comma-separated character string with the desired
# phrase(s) and keyword(s). In this case it returns ALL the matching tweets, UNLESS their
# number is higher than 1% of *all* tweets posted in that moment
dt <- stream_tweets("trump", timeout = 10)
print(dt$text[1:10])

# (3) Track users by providing a comma-separated list of user IDs or screen names (up to 5000 user_ids)
# dt <- stream_tweets("CNN", timeout = 10)
# print(dt$text[1:10])

# (4) Use four latitude/longitude bounding-box points to stream by geo-location.
# Twitter allows searches by geocode, and to do this it uses the info stored under bbox_coords.
# The geocode must be provided as a vector of length 4, e.g., c(-125, 26, -65, 49).
# How do you recover these coordinates? Either with your Google Maps API key...
# or check here: http://boundingbox.klokantech.com/ and select "csv"

api <- "YOUR GOOGLE MAP API"
# in my case I have saved it in a txt file
myText <- readtext("Google API.txt", encoding = "UTF-8")
api <- myText$text

lookup_coords("london", apikey = api)
lookup_coords("brazil", apikey = api)

# I'd like to stream tweets from the US
# (you do not need a Google Maps key to get these coordinates - but only for the US!)
lookup_coords("USA")
usa <- stream_tweets(
  c(-124.84897, 24.39631, -66.88544, 49.38436),
  timeout = 30
)

# I'd like to stream tweets from Italy
lookup_coords("Italy", apikey = api)

## Use the coordinates you get via lookup_coords
# italy <- stream_tweets(
#   c(6.62672, 35.48970, 18.79760, 47.09200),
#   timeout = 30
# )
# Or faster:
# italy <- stream_tweets(lookup_coords("Italy", apikey = api), timeout = 360)
# print(italy$text[1:10])

# Let's open a data frame of tweets collected with a similar geographical query
italy <- readRDS("italy.rds")
str(italy)

# how many tweets? 177
nrow(italy)
length(italy$text)
print(italy$text[13])
print(italy$text[14])

# let's search for this Twitter user called Curini:
italy$screen_name == "Curini"
curini <- italy[which(italy$screen_name == "Curini"), ]
print(curini$text[1:2])

# What am I using to tweet?
print(curini$source)

# let's see some info about my location
print(curini$location)
# This location refers to what you decided to write when you created your account:
# basically you can write whatever you want (or leave the space empty)
print(curini$place_name)
print(curini$place_full_name)
print(curini$place_type)
print(curini$country)
print(curini$country_code)
print(curini$geo_coords)
print(curini$bbox_coords) # a polygon!
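
# A minimal sketch (not part of the original workflow) of how a file like "italy.rds" can
# be produced and saved for later sessions. It assumes an authenticated rtweet session and
# the Google Maps API key loaded above; the object name, timeout and file name are only
# illustrative.
italy_live <- stream_tweets(lookup_coords("Italy", apikey = api), timeout = 360)
saveRDS(italy_live, "italy.rds")    # store the collected tweets on disk
italy_live <- readRDS("italy.rds")  # reload them later without streaming again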
# Let's check the entire dataset
print(italy$location)
print(italy$place_name) # I have info for all the 177 tweets

# Let's sort the locations by count and plot just the top 4. To do this you use top_n().
# Note that location is a user-level attribute (the free-text field of the user profile),
# so in effect you are grouping your data by user.
count <- count(italy, location, sort = TRUE)
str(count)
# let's drop the NA
count <- count[-c(1), ]
str(count)
count <- mutate(count, location = reorder(location, n))
count <- top_n(count, 4)
ggplot(count, aes(x = location, y = n)) +
  geom_col() +
  coord_flip() +
  labs(x = "Location", y = "Count",
       title = "Where Twitter users are from - unique locations")

print(italy$place_full_name)
print(italy$place_type)
print(italy$country)
print(italy$country_code)
table(italy$country_code)

# you have some info for geo_coords (latitude and longitude), for example for tweets 139 and 152
print(italy$geo_coords)
print(italy$geo_coords[139])
print(italy$geo_coords[152])
# let's see their source: Instagram, as expected!
print(italy$source[139])
print(italy$source[152])
print(italy$bbox_coords) # info about all 177 tweets

### note that you can also run a search_tweets() with a geocode
## search for 100 tweets sent from ITALY discussing the greenpass
rt <- search_tweets("greenpass", n = 100, geocode = lookup_coords("Italy", apikey = api))
nrow(rt)
print(rt$text[13])
# In this case, however, rtweet will not just look for data with bbox_coords; it will
# first check whether the tweet is geocoded and, if not, check whether a place can be
# extrapolated from the user's profile information. So you could, for example, search for
# tweets from the Barcelona area and Twitter will deliver a lot of tweets that aren't
# geocoded, simply because the users have "Barcelona" in their profile.
print(rt$bbox_coords)
print(rt$location)
print(rt$place_name)
print(rt$bbox_coords[76])
print(rt$country)

## create lat/lng variables using all available tweet and profile geo-location data
rtll <- lat_lng(italy)

## plot country boundaries
par(mar = c(0, 0, 0, 0))
map("world", "Italy", lwd = .25)
## plot lat and lng points onto the map
with(rtll, points(lng, lat, pch = 20, cex = 5, col = rgb(0, .3, .7, .75)))

# plot the results with ggplot2
map.data <- map_data("world", "Italy")
str(map.data)
points <- data.frame(x = rtll$lng, y = rtll$lat)
ggplot(map.data) +
  geom_map(aes(map_id = region), map = map.data,
           fill = "white", color = "grey20", size = 0.25) +
  expand_limits(x = map.data$long, y = map.data$lat) +
  theme(axis.line = element_blank(), axis.text = element_blank(),
        axis.ticks = element_blank(), axis.title = element_blank(),
        plot.margin = unit(0 * c(-1.5, -1.5, -1.5, -1.5), "lines")) +
  geom_point(data = points, aes(x = x, y = y),
             size = 3, alpha = 1/3, color = "darkblue")

# Let's create 3 groups out of the number of followers of each user in the database and plot them:
# value=1 for users with a number of followers lower than the median
# value=2 for users with a number of followers higher than (or equal to) the median and lower than the 3rd quartile
# value=3 for users with a number of followers higher than (or equal to) the 3rd quartile
summary(rtll$followers_count)
points$followers[rtll$followers_count <  quantile(rtll$followers_count, 0.5)] <- 1
points$followers[rtll$followers_count >= quantile(rtll$followers_count, 0.5) &
                 rtll$followers_count <  quantile(rtll$followers_count, 0.75)] <- 2
points$followers[rtll$followers_count >= quantile(rtll$followers_count, 0.75)] <- 3
points

ggplot(map.data) +
  geom_map(aes(map_id = region), map = map.data,
           fill = "white", color = "grey20", size = 0.25) +
  expand_limits(x = map.data$long, y = map.data$lat) +
  theme(axis.line = element_blank(), axis.text = element_blank(),
        axis.ticks = element_blank(), axis.title = element_blank(),
        plot.margin = unit(0 * c(-1.5, -1.5, -1.5, -1.5), "lines")) +
  geom_point(data = points, aes(x = x, y = y, colour = factor(followers)), size = 3)
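
# A more compact way to build the same three follower groups, sketched here as an
# alternative (not in the original script): cut() on the median and 3rd-quartile
# breakpoints. The helper column name "followers2" is only illustrative; note that
# cut() will complain if the median and the 3rd quartile coincide (duplicated breaks).
breaks <- quantile(rtll$followers_count, probs = c(0, 0.5, 0.75, 1), na.rm = TRUE)
points$followers2 <- cut(rtll$followers_count, breaks = breaks, labels = c(1, 2, 3),
                         include.lowest = TRUE, right = FALSE)
table(points$followers, points$followers2)  # the two codings should agree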
# alternative plot via leaflet
m2 <- leaflet(rtll)
m2 <- addTiles(m2) # Add default OpenStreetMap map tiles
m2 <- addMarkers(m2, lng = rtll$lng, lat = rtll$lat, popup = rtll$text)
m2

# A lot of the markers are clumped together rather closely.
# We can cluster them by specifying clusterOptions as follows (zoom the map out and in!).
# The number inside each circle represents the total number of observations in that area.
# Areas with more observations are marked by yellow circles and areas with fewer by green circles
m <- addMarkers(m2, lng = rtll$lng, lat = rtll$lat, popup = rtll$text,
                clusterOptions = markerClusterOptions())
m

# Let's change the colour of the tweets in the map according to some exogenous
# info included in the dataset, such as the number of followers
summary(rtll$followers_count)

# let's create 3 groups out of the number of followers:
# getColor() maps each user's followers_count to a marker colour:
# "green" below the median, "orange" between the median and the 3rd quartile, "red" above it
getColor <- function(rtll) {
  sapply(rtll$followers_count, function(followers_count) {
    if (followers_count < quantile(rtll$followers_count, 0.5)) {
      "green"
    } else if (followers_count >= quantile(rtll$followers_count, 0.5) &&
               followers_count <= quantile(rtll$followers_count, 0.75)) {
      "orange"
    } else {
      "red"
    }
  })
}

icons <- awesomeIcons(
  icon = 'ios-close',
  iconColor = 'black',
  library = 'ion',
  markerColor = getColor(rtll)
)
m <- leaflet(rtll)
m <- addTiles(m)
m <- addAwesomeMarkers(m, lng = rtll$lng, lat = rtll$lat, icon = icons,
                       popup = rtll$text, label = ~as.character(followers_count))
m

icons <- awesomeIcons(
  icon = 'glass',
  iconColor = 'white',
  markerColor = getColor(rtll)
)
m <- leaflet(rtll)
m <- addTiles(m)
m <- addAwesomeMarkers(m, lng = rtll$lng, lat = rtll$lat, icon = icons,
                       popup = rtll$text, label = ~as.character(followers_count))
m

# for more info: https://rstudio.github.io/leaflet/markers.html
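
# A further sketch, not part of the original script: the same follower groups shown with
# addCircleMarkers() plus a legend, so the colour coding is readable directly on the map.
# It assumes rtll and getColor() as defined above; colours and labels mirror getColor().
m3 <- leaflet(rtll)
m3 <- addTiles(m3)
m3 <- addCircleMarkers(m3, lng = rtll$lng, lat = rtll$lat,
                       color = getColor(rtll), radius = 6, stroke = FALSE,
                       fillOpacity = 0.7, popup = rtll$text,
                       label = ~as.character(followers_count))
m3 <- addLegend(m3, position = "bottomright",
                colors = c("green", "orange", "red"),
                labels = c("below median", "median to 3rd quartile", "above 3rd quartile"),
                title = "Followers")
m3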