rm(list=ls(all=TRUE))
getwd()
setwd("C:/Users/luigi/Dropbox/TOPIC MODEL")
getwd()

library(rtweet)
library(readtext)
library(quanteda)
library(ggplot2)
library(ggmap)
library(httpuv)
library(dplyr)
library(maps)
library(leaflet)
library(stringr)

# The "search_tweets" command implements the REST API search:
# it returns Twitter statuses matching a user-provided search query.
# It ONLY RETURNS DATA FROM THE PAST 6-9 DAYS
# (exception: user timelines, as we will see - the 3,200 most recent tweets are available).

# Search for up to 100 (non-retweeted) tweets written in English containing the #rstats hashtag.
rt <- search_tweets("#rstats", n = 100, lang = "en", include_rts = FALSE)
length(rt$text)

# days covered by our analysis
since <- rt$created_at[nrow(rt)]
latest <- rt$created_at[1]
cat("Twitter data", "\n", paste("From:", since), "\n", paste("  To:", latest))

# print tweet text
print(rt$text[1:5])

# lots of info about each single tweet
colnames(rt)

# What are the most popular hashtags at the moment? We'll use regular expressions to extract hashtags.
ht <- str_extract_all(rt$text, '#[A-Za-z0-9_]+')
ht <- unlist(ht)
head(sort(table(ht), decreasing = TRUE))

# And who are the most frequently mentioned users?
handles <- str_extract_all(rt$text, '@[0-9_A-Za-z]+')
handles_vector <- unlist(handles)
head(sort(table(handles_vector), decreasing = TRUE), n = 10)

# How many tweets mention "data"?
length(grep("data", rt$text, ignore.case = TRUE))

# The query must be a character string of at most 500 characters.
# Spaces behave like the boolean "AND" operator. To search for tweets containing at least one of
# multiple possible terms, separate the search terms with "OR" (in caps).
# For example, the query q = "data science" looks for tweets containing both "data" and "science",
# located anywhere in the tweet and in any order.
# With "OR" between the terms, q = "data OR science", Twitter returns any tweet that contains
# either "data" or "science".
# It is also possible to search for exact phrases using double quotes. To do this, either wrap the
# double-quoted phrase in single quotes, e.g., q = '"data science"', or escape each internal double
# quote with a backslash, e.g., q = "\"data science\"".
# example:
# rt <- search_tweets('"data science"', n = 100, lang = "en", include_rts = FALSE)

# you can then save your results as a csv file
# write_as_csv(rt, "twitter.csv", prepend_ids = TRUE, na = "", fileEncoding = "UTF-8")

## plot time series of tweets
ts_plot(rt, "1 hours") +
  theme_minimal() +
  theme(plot.title = element_text(face = "bold")) +
  labs(
    x = NULL, y = NULL,
    title = "Frequency of #rstats Twitter statuses from past 6-9 days",
    subtitle = "Twitter status (tweet) counts aggregated using one-hour intervals",
    caption = "Source: Data collected from Twitter's REST API via rtweet"
  )

## plot the time series of tweet frequency at different resolutions
ts_plot(rt, by = "mins")
ts_plot(rt, by = "days")

# Next, let's figure out who is tweeting about R using the #rstats hashtag.
# You can access data on the users discussing #rstats via users_data()
users_data(rt)

# view the column with screen names
head(rt$screen_name)
# get a list of unique usernames
unique(rt$screen_name)
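# A quick complementary sketch (not in the original script): the same rt object can also tell us
# how many of our tweets each user contributed, which anticipates the search_tweets() vs.
# search_users() comparison below.
head(sort(table(rt$screen_name), decreasing = TRUE), n = 10)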
# You can similarly use the search_users() function to see which users are tweeting with a
# particular hashtag. This function returns just a data.frame of the users and information about
# their accounts.

# what users are tweeting with #rstats (max = 100)
users <- search_users("#rstats", n = 100)
# once again, you can then save your results as a csv file
# write_as_csv(users, "users.csv", prepend_ids = TRUE, na = "", fileEncoding = "UTF-8")

# What's the difference with search_tweets()? With search_tweets() you retrieve a given number of
# tweets, with search_users() you retrieve a given number of UNIQUE users. If a user tweets a lot
# about #rstats, he/she counts as one author when using search_users(), but his/her tweets appear
# several times in the data.frame you get from search_tweets(). Indeed, compare the two results:
length(unique(users$user_id))
length(unique(rt$user_id))

# Let's learn a bit more about these people tweeting about R. First, where are they from?

# how many languages are represented (und = undetermined)
length(unique(users$lang))
count(users, lang, sort = TRUE)

# how many locations are represented
length(unique(users$location))
count(users, location, sort = TRUE)

# Let's sort by count and plot just the top 4 locations, using top_n().
# Note that we first drop the users with an empty location field.
count <- count(users, location, sort = TRUE)
str(count)
count <- count[-which(count$location == ""), ]
str(count)
count <- mutate(count, location = reorder(location, n))
count <- top_n(count, 4)
ggplot(count, aes(x = location, y = n)) +
  geom_col() +
  coord_flip() +
  labs(x = "Location", y = "Count",
       title = "Where Twitter users are from - unique locations")

# Twitter rate limits cap the number of search results returned to 18,000 every 15 minutes.
# To request more than that, simply set retryonratelimit = TRUE and rtweet will wait out the
# rate-limit resets for you.
## search for 20,000 tweets containing the word "data" (do not run it!)
## rt <- search_tweets("data", n = 20000, retryonratelimit = TRUE)

## search for tweets containing "#rstats", including retweets
rtR <- search_tweets("#rstats", n = 100)

## plot multiple time series - retweets vs. non-retweets
ts_plot(group_by(rtR, is_retweet), "mins")

# What is the most retweeted tweet?
x <- rtR[which.max(rtR$retweet_count), ]
print(x$retweet_count)  # how many retweets?
print(x$text)
print(x$screen_name)

## Get friends
# Retrieve a list of all the accounts a user follows.
## get user IDs of accounts followed by CNN
cnn_fds <- get_friends("cnn")
str(cnn_fds)
length(cnn_fds$user_id)
## look up data on those accounts
cnn_fds_data <- lookup_users(cnn_fds$user_id)
head(cnn_fds_data$name)

## Get followers
# Retrieve a list of the accounts following a user.
## get user IDs of accounts following CNN (just the first 100 in this example)
cnn_flw <- get_followers("cnn", n = 100)
## look up data on those accounts
cnn_flw_data <- lookup_users(cnn_flw$user_id)
head(cnn_flw_data$name)
# Or if you really want ALL of their followers (do not run in the lab!):
# cnn_flw <- get_followers("cnn", retryonratelimit = TRUE)

## Get favorites
# Get the 10 most recently favorited statuses by Joe Biden.
fav <- get_favorites("JoeBiden", n = 10)
print(fav$text[1:10])
print(fav$lang[1:10])

## Get trends
# Discover what's currently trending in San Francisco.
sf <- get_trends("san francisco")
sf$trend

## Get timelines
# Get the most recent 100 tweets from some important US political figures.
tmls <- get_timeline(c("BernieSanders", "JoeBiden"), n = 100)
table(tmls$name)
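# As noted at the top of the script, user timelines reach further back than search: up to the
# 3,200 most recent tweets per account are available. A hedged sketch (do not run in the lab -
# it is slow and rate-limited):
# tmls_full <- get_timeline("BernieSanders", n = 3200)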
# group by screen name and plot each time series (same two ts_plot() variants as before)
ts_plot(group_by(tmls, screen_name), "days")

#########################
## Passing your rtweet results to Quanteda
#########################

rt <- search_tweets("#rstats", n = 100, include_rts = TRUE, lang = "en")
print(rt$lang[1:20])
colnames(rt)

# I want to convert the POSIXct time format to a date (here you could also change the time zone,
# e.g., by selecting tz = "Hongkong"); here I choose Greenwich Mean Time (GMT)
str(rt$created_at)
rt$date <- as.Date(rt$created_at, "GMT")
str(rt$date)

# If you want to clean the Twitter data, you can use some of the commands below
print(rt$text[1:30])
clean_tweet <- rt
url_regex <- "http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"
clean_tweet$text <- str_remove_all(clean_tweet$text, url_regex)                # remove urls
clean_tweet$text <- gsub("&amp;", "", clean_tweet$text)                        # remove html entity
clean_tweet$text <- gsub("(RT|via)((?:\\b\\W*@\\w+)+)", "", clean_tweet$text)  # remove RT/via
clean_tweet$text <- gsub("@\\w+", "", clean_tweet$text)                        # remove mentions
clean_tweet$text <- str_replace_all(clean_tweet$text, "#[a-zA-Z]*", "")        # remove hashtags
clean_tweet$text <- gsub("[^[:alnum:]///' ]", " ", clean_tweet$text)           # keep only alphanumeric characters
print(clean_tweet$text[1:30])

# alternatively, you can simply delete the # and @ symbols
sample <- rt
sample$text <- gsub("#", "", sample$text)
sample$text <- gsub("@", "", sample$text)
print(sample$text[1:30])

myCorpusTwitter <- corpus(rt)
texts(myCorpusTwitter)[1:2]
# number of documents
ndoc(myCorpusTwitter)
# inspect the document-level variables
head(docvars(myCorpusTwitter))

myDfm <- dfm(myCorpusTwitter,
             remove = stopwords("english"), remove_punct = TRUE, remove_numbers = TRUE,
             tolower = TRUE, stem = TRUE, remove_url = TRUE, remove_symbols = TRUE,
             split_hyphens = TRUE)
topfeatures(myDfm, 20)  # 20 top words
# Let's look at the document-feature matrix for the first four documents and first 10 features
myDfm[1:4, 1:10]
topfeatures(myDfm)

#########################
## An example of a possible analysis
#########################

rt <- search_tweets("liberal OR conservative", n = 1000, include_rts = FALSE, lang = "en")
print(rt$text[1:10])
myCorpusTwitter <- corpus(rt)
myDfmTwitter <- dfm(myCorpusTwitter,
                    remove = stopwords("english"), remove_punct = TRUE, remove_numbers = TRUE,
                    tolower = TRUE, stem = FALSE, remove_url = TRUE)
topfeatures(myDfmTwitter)

# let's keep just the features with at least 2 characters
myDfmTwitter <- dfm_remove(myDfmTwitter, min_nchar = 2)

dict <- dictionary(list(lib = c("liberal*"), cons = c("conservativ*")))
dict

# let's see how many times the words liberal* and conservativ* appear in each tweet
dfm_lookup(myDfmTwitter, dict)[, 1]
dfm_lookup(myDfmTwitter, dict)[, 2]

# let's save this info and merge the two columns
# (a more compact alternative is sketched at the end of this section)
liberal <- dfm_lookup(myDfmTwitter, dict)[, 1]
conservative <- dfm_lookup(myDfmTwitter, dict)[, 2]
df_tot <- merge(liberal, conservative, by = "doc_id")
str(df_tot)

# let's label a tweet as "conservative" if it uses the word conservative more often than liberal
df_tot <- mutate(df_tot, conservative = if_else(cons > lib, 1, 0))
str(df_tot)

# let's add this info back to our dfm
docvars(myDfmTwitter)$conservative <- df_tot$conservative

# let's see the difference in language, using the "conservative" tweets as the target
tstat_key <- textstat_keyness(myDfmTwitter, target = myDfmTwitter$conservative == 1)
textplot_keyness(tstat_key)
head(tstat_key, 10)
tail(tstat_key, 10)
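# Side note (not in the original script): the two dfm_lookup() calls plus merge() above can be
# collapsed into a single convert() call; a minimal sketch using the same objects:
df_tot2 <- convert(dfm_lookup(myDfmTwitter, dict), to = "data.frame")
df_tot2 <- mutate(df_tot2, conservative = if_else(cons > lib, 1, 0))
str(df_tot2)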
#########################
## Let's play with some dictionaries!
#########################

# let's download 2,000 tweets written in English discussing Salvini
rt <- search_tweets("Salvini", n = 2000, include_rts = FALSE, lang = "en")
print(rt$lang[1:20])
length(rt$lang)

# I want to convert the POSIXct time format to a date (here you could also change the time zone,
# e.g., by selecting tz = "Hongkong"); here I choose Greenwich Mean Time (GMT)
str(rt$created_at)
rt$date <- as.Date(rt$created_at, "GMT")
str(rt$date)
table(rt$date)

myCorpusTwitter <- corpus(rt)
texts(myCorpusTwitter)[1:2]
# number of documents
ndoc(myCorpusTwitter)
# inspect the document-level variables
head(docvars(myCorpusTwitter))

###########################################################################
# let's suppose we want to see the trend of the overall daily sentiment about Salvini across all the tweets
###########################################################################

library(plyr)
library(gridExtra)
library(syuzhet)
library(wordcloud)
library(tm)
library(PerformanceAnalytics)

sentiment <- dfm(myCorpusTwitter,
                 remove = stopwords("english"), remove_punct = TRUE, remove_numbers = TRUE,
                 tolower = TRUE, stem = TRUE, remove_url = TRUE, remove_symbols = TRUE,
                 split_hyphens = TRUE,
                 dictionary = data_dictionary_LSD2015[1:2])
head(sentiment, 10)

Dictionary <- convert(sentiment, to = "data.frame")
str(Dictionary)
Dictionary$Sentiment <- Dictionary$posit - Dictionary$negat
str(Dictionary)
summary(Dictionary$Sentiment)

# Let's suppose we want to plot the sentiment against the volume of tweets
rt$Sentiment <- Dictionary$Sentiment  # add the sentiment values back to the data frame you got via Twitter
colnames(rt)

# get daily summaries of the results (average sentiment and number of tweets)
daily <- ddply(rt, ~ date, summarize,
               num_tweets = length(Sentiment), ave_sentiment = mean(Sentiment))
str(daily)

# correlation between the volume of discussion and the sentiment
cor(daily$ave_sentiment, daily$num_tweets)

# plot the daily sentiment vs. volume
sentiment <- ggplot(daily, aes(x = date, y = ave_sentiment)) +
  geom_line(linetype = "dashed", colour = "red") +
  ggtitle("Salvini Sentiment") + xlab("Day") + ylab("Sentiment") +
  theme_gray(base_size = 12)
volume <- ggplot(daily, aes(x = date, y = num_tweets)) +
  geom_line() +
  ggtitle("Salvini #") + xlab("Day") + ylab("Volume") +
  theme_gray(base_size = 12)
grid.arrange(sentiment, volume, ncol = 1)
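# Side note (not in the original script): the plyr::ddply() summary above can also be written with
# dplyr, which is already loaded. Because plyr is loaded after dplyr and masks summarise(), the
# dplyr:: prefix is spelled out explicitly in this sketch.
daily2 <- rt %>%
  group_by(date) %>%
  dplyr::summarise(num_tweets = n(), ave_sentiment = mean(Sentiment))
str(daily2)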
# Let's apply other dictionaries!
colnames(rt)
syuzhet_vector <- get_sentiment(rt$text, method = "syuzhet")
nrc_vector <- get_sentiment(rt$text, method = "nrc")
rt$sentiment_syuzhet <- syuzhet_vector
rt$sentiment_nrc <- nrc_vector
colnames(rt)

# correlation between the three sentiment measures
attach(rt)
set_sentiment <- cbind(Sentiment, sentiment_nrc, sentiment_syuzhet)
chart.Correlation(set_sentiment)

# Another general dictionary, with more categories than simply positive-negative
nrc_data <- get_nrc_sentiment(rt$text, language = "english")
str(nrc_data)

# I want to read the tweets that include more than 2 anger words
rt$text[nrc_data$anger > 2]

barplot(
  sort(colSums(prop.table(nrc_data[, 1:8]))),
  horiz = TRUE,
  cex.names = 0.7,
  las = 1,
  main = "Emotions in tweets discussing Salvini",
  xlab = "Percentage"
)

# let's plot a word cloud of the emotions
all = c(
  paste(rt$text[nrc_data$anger > 0], collapse = " "),
  paste(rt$text[nrc_data$anticipation > 0], collapse = " "),
  paste(rt$text[nrc_data$disgust > 0], collapse = " "),
  paste(rt$text[nrc_data$fear > 0], collapse = " "),
  paste(rt$text[nrc_data$joy > 0], collapse = " "),
  paste(rt$text[nrc_data$sadness > 0], collapse = " "),
  paste(rt$text[nrc_data$surprise > 0], collapse = " "),
  paste(rt$text[nrc_data$trust > 0], collapse = " ")
)
str(all)

# clean the text
# function to make the text suitable for analysis
clean.text = function(x) {
  # to lower case
  x = tolower(x)
  # remove rt
  x = gsub("rt", "", x)
  # remove mentions
  x = gsub("@\\w+", "", x)
  # remove punctuation
  x = gsub("[[:punct:]]", "", x)
  # remove numbers
  x = gsub("[[:digit:]]", "", x)
  # remove http links
  x = gsub("http\\w+", "", x)
  # remove tabs
  x = gsub("[ |\t]{2,}", "", x)
  # remove blank spaces at the beginning
  x = gsub("^ ", "", x)
  # remove blank spaces at the end
  x = gsub(" $", "", x)
  return(x)
}

all = clean.text(all)
# remove stop-words
all = removeWords(all, c(stopwords("english")))
# create corpus
corpus = Corpus(VectorSource(all))
# create term-document matrix
tdm = TermDocumentMatrix(corpus)
# convert to matrix
tdm = as.matrix(tdm)
# add column names
colnames(tdm) = c('anger', 'anticipation', 'disgust', 'fear', 'joy', 'sadness', 'surprise', 'trust')

# Plot comparison word cloud
layout(matrix(c(1, 2), nrow = 2), heights = c(1, 4))
par(mar = rep(0, 4))
plot.new()
text(x = 0.5, y = 0.5, 'Emotion Comparison Word Cloud for tweets about Salvini')
comparison.cloud(tdm, random.order = FALSE,
                 colors = c("#00B2FF", "red", "#FF0099", "#6600CC", "green", "orange", "blue", "brown"),
                 title.size = 1.5, max.words = 250)

#########################################################################################
# geographical analysis: you need your own Google Maps API key (unless you are analyzing data from the US only)!
#########################################################################################

library(ggmap)
library(maps)
library(leaflet)

api <- "YOUR GOOGLE MAP API"  # your Google Maps API key
lookup_coords("usa")
lookup_coords("london", apikey = api)
lookup_coords("brazil", apikey = api)

# Twitter allows searches by geocode: to serve them, Twitter first checks whether the tweet itself
# is geocoded and, if not, whether a place can be extrapolated from the user's profile information.
# So you could, for example, search for tweets from the Barcelona area and Twitter will deliver many
# tweets that aren't geocoded because the users simply have "Barcelona" in their profile.

## search for 2,000 tweets sent from the US discussing dinner or food
## in this case you do NOT need a Google Maps key for the geo info - but only because it is the US!
rt <- search_tweets("dinner OR food", n = 2000, geocode = lookup_coords("usa"))

## create lat/lng variables using all available tweet and profile geo-location data
rtll <- lat_lng(rt)

## plot state boundaries
par(mar = c(0, 0, 0, 0))
map("state", lwd = .25)
## plot lat and lng points onto the state map
with(rtll, points(lng, lat, pch = 20, cex = 5, col = rgb(0, .3, .7, .75)))

# alternative plot via leaflet
m2 <- leaflet(rtll)
m2 <- addTiles(m2)  # Add default OpenStreetMap map tiles
m2 <- addMarkers(m2, lng = rtll$lng, lat = rtll$lat, popup = rtll$text)
m2

## if you want to geocode your analysis for some other country, you have to add your API key:
rt <- search_tweets("covid", n = 2000, geocode = lookup_coords("Italy", apikey = api))

## create lat/lng variables using all available tweet and profile geo-location data
rtll <- lat_lng(rt)

## plot country boundaries
par(mar = c(0, 0, 0, 0))
map("world", "Italy", lwd = .25)
## plot lat and lng points onto the map
with(rtll, points(lng, lat, pch = 20, cex = 5, col = rgb(0, .3, .7, .75)))

# alternative plot via leaflet
m2 <- leaflet(rtll)
m2 <- addTiles(m2)  # Add default OpenStreetMap map tiles
m2 <- addMarkers(m2, lng = rtll$lng, lat = rtll$lat, popup = rtll$text)
m2

#########################################################################################
# STREAMING data
#########################################################################################

# With stream_tweets() (the so-called Streaming API) you connect to the "stream" of tweets as they
# are being published, and capture the tweets that contain one or more of the keywords given in the
# track argument.
# There are four possible methods (methods (1) and (3) are sketched, commented out, at the end of this block):
# (1) The default, q = "", returns a small random sample of all publicly available Twitter statuses
#     (a 1% random sample of all the tweets posted at that specific moment).
# (2) To filter by keyword, provide a comma-separated character string with the desired phrase(s)
#     and keyword(s): in this case it returns ALL the matching tweets, UNLESS their number exceeds
#     1% of *all* tweets posted in that moment.
# (3) Track users by providing a comma-separated list of user IDs or screen names.
# (4) Use four latitude/longitude bounding-box points to stream by geo-location. These must be
#     provided as a vector of length 4, e.g., c(-125, 26, -65, 49). How to get them? Either from the
#     Google API, or just check http://boundingbox.klokantech.com/ and select "csv".

dt <- stream_tweets("trump", timeout = 10)
print(dt$text[1:10])

# I'd like to stream tweets from the US
lookup_coords("USA")
usa <- stream_tweets(c(-124.84897, 24.39631, -66.88544, 49.38436), timeout = 10)
print(usa$text[1:10])

# I'd like to stream tweets from Italy
lookup_coords("Italy", apikey = api)
## or use the coordinates you get via lookup_coords
italy2 <- stream_tweets(c(6.62672, 35.48970, 18.79760, 47.09200), timeout = 30)
print(italy2$text[1:10])
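# A hedged sketch (not in the original script) of the two streaming methods not demonstrated above
# (do not run in the lab):
# (1) small random sample of all public statuses
# rs <- stream_tweets(q = "", timeout = 10)
# (3) track specific accounts by user ID (the IDs below are placeholders, not real accounts)
# flw <- stream_tweets(q = "123456789,987654321", timeout = 10)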
# Alternatively, we can use lookup_coords() as we did earlier!
# italy <- stream_tweets(lookup_coords("Italy", apikey = api), timeout = 30)
# print(italy$text[1:10])

## create lat/lng variables using all available tweet and profile geo-location data
rtll_it <- lat_lng(italy2)

## plot country boundaries
par(mar = c(0, 0, 0, 0))
map("world", "Italy", lwd = .25)
## plot lat and lng points onto the map
with(rtll_it, points(lng, lat, pch = 20, cex = 5, col = rgb(0, .3, .7, .75)))

# Alternative possible plot
map.data <- map_data("world", "Italy")
str(map.data)
points <- data.frame(x = rtll_it$lng, y = rtll_it$lat)
ggplot(map.data) +
  geom_map(aes(map_id = region), map = map.data,
           fill = "white", color = "grey20", size = 0.25) +
  expand_limits(x = map.data$long, y = map.data$lat) +
  theme(axis.line = element_blank(), axis.text = element_blank(),
        axis.ticks = element_blank(), axis.title = element_blank(),
        plot.margin = unit(0 * c(-1.5, -1.5, -1.5, -1.5), "lines")) +
  geom_point(data = points, aes(x = x, y = y), size = 3, alpha = 1/3, color = "darkblue")

# let's create 3 groups out of the number of followers of each user in the dataset and plot them:
# value = 1 for users with a number of followers lower than the median value
# value = 2 for users with a number of followers higher than (or equal to) the median value and lower than the 3rd quartile
# value = 3 for users with a number of followers higher than (or equal to) the 3rd quartile
summary(rtll_it$followers_count)
points$followers[rtll_it$followers_count < quantile(rtll_it$followers_count, 0.5)] <- 1
points$followers[rtll_it$followers_count >= quantile(rtll_it$followers_count, 0.5) &
                   rtll_it$followers_count < quantile(rtll_it$followers_count, 0.75)] <- 2
points$followers[rtll_it$followers_count >= quantile(rtll_it$followers_count, 0.75)] <- 3
points

ggplot(map.data) +
  geom_map(aes(map_id = region), map = map.data,
           fill = "white", color = "grey20", size = 0.25) +
  expand_limits(x = map.data$long, y = map.data$lat) +
  theme(axis.line = element_blank(), axis.text = element_blank(),
        axis.ticks = element_blank(), axis.title = element_blank(),
        plot.margin = unit(0 * c(-1.5, -1.5, -1.5, -1.5), "lines")) +
  geom_point(data = points, aes(x = x, y = y, colour = factor(followers)), size = 3)

# alternative plot via leaflet
m2 <- leaflet(rtll_it)
m2 <- addTiles(m2)  # Add default OpenStreetMap map tiles
m2 <- addMarkers(m2, lng = rtll_it$lng, lat = rtll_it$lat, popup = rtll_it$text)
m2

# A lot of the markers are clumped together rather closely. We can cluster them by specifying
# clusterOptions as follows (zoom the map out and in!).
# The number inside each circle represents the total number of observations in that area:
# areas with more observations are marked by yellow circles, areas with fewer by green circles.
m <- addMarkers(m2, lng = rtll_it$lng, lat = rtll_it$lat, popup = rtll_it$text,
                clusterOptions = markerClusterOptions())
m

# Let's change the colour of the tweets on the map according to some exogenous
# info included in the dataset, such as the number of followers, for example.
summary(rtll_it$followers_count)

# let's create 3 colour groups out of the number of followers
getColor <- function(rtll_it) {
  sapply(rtll_it$followers_count, function(followers_count) {
    if (followers_count < quantile(rtll_it$followers_count, 0.5)) {
      "green"
    } else if (followers_count >= quantile(rtll_it$followers_count, 0.5) &&
               followers_count <= quantile(rtll_it$followers_count, 0.75)) {
      "orange"
    } else {
      "red"
    }
  })
}

icons <- awesomeIcons(
  icon = 'ios-close',
  iconColor = 'black',
  library = 'ion',
  markerColor = getColor(rtll_it)
)
m <- leaflet(rtll_it)
m <- addTiles(m)
m <- addAwesomeMarkers(m, lng = rtll_it$lng, lat = rtll_it$lat, icon = icons,
                       popup = rtll_it$text, label = ~as.character(followers_count))
m

icons <- awesomeIcons(
  icon = 'glass',
  iconColor = 'white',
  markerColor = getColor(rtll_it)
)
m <- leaflet(rtll_it)
m <- addTiles(m)
m <- addAwesomeMarkers(m, lng = rtll_it$lng, lat = rtll_it$lat, icon = icons,
                       popup = rtll_it$text, label = ~as.character(followers_count))
m

# for more info: https://rstudio.github.io/leaflet/markers.html

# getting tweets from London
rtl <- stream_tweets(lookup_coords("london", apikey = api), timeout = 20)
length(rtl$text)
str(rtl)
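# A hedged sketch (not in the original script): for longer streams it is usually safer to write the
# raw stream to disk and parse it afterwards, here assuming stream_tweets()'s file_name/parse
# arguments and rtweet's parse_stream() (do not run in the lab):
# stream_tweets(lookup_coords("london", apikey = api), timeout = 60,
#               file_name = "london_stream.json", parse = FALSE)
# rtl2 <- parse_stream("london_stream.json")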