rm(list=ls(all=TRUE))
setwd("C:/Users/luigi/Dropbox/TOPIC MODEL")
getwd()

# devtools::install_version("rtweet", version = "0.7.0", repos = "http://cran.us.r-project.org")
library(rtweet)
packageVersion("rtweet")
library(httpuv)
library(readtext)
library(quanteda)
library(ggplot2)
library(dplyr)
library(stringr)
library(quanteda.textstats)
library(quanteda.textplots)

###############################
# REST API
###############################

# "search_tweets" command: it implements the REST API search.
# It returns Twitter statuses matching a user-provided search query.
# It ONLY RETURNS DATA FROM THE PAST 6-9 DAYS
# (exception: user timelines, as we will see - the 3,200 most recent tweets are available).

# Search for up to 200 (non-retweeted) tweets written in English containing the #rstats hashtag.
rt <- search_tweets("#rstats", n = 200, lang = "en", include_rts = FALSE)

# print the first 5 tweets
print(rt$text[1:5])

rt <- search_tweets("#rstats", n = 200, lang = "en", include_rts = FALSE)

# days covered by our query
since <- rt$created_at[nrow(rt)]
latest <- rt$created_at[1]
cat("Twitter data", "\n", paste("From:", since), "\n", paste("  To:", latest))

# When Twitter provides a tweet, the 'created_at' field gives a timestamp for when the tweet was authored.
# This timestamp is useful, but remember: it is in Greenwich Mean Time!
# Unless the tweet happens to come from that timezone, its time needs to be adjusted to account for this discrepancy.
# Not an easy effort sometimes!

# By default, search_tweets downloads the most recent tweets first. You can also specify that you want to retrieve
# tweets not only starting from the most recent ones by typing type = "mixed", to ensure you get tweets that spread
# over a couple of days. A third alternative is to specify type = "popular".
rt <- search_tweets("#rstats", n = 1000, lang = "en", include_rts = FALSE, type = "mixed")

# lots of document-level variables attached to each tweet
colnames(rt)
glimpse(rt)

# What are the most popular hashtags at the moment? We can use regular expressions to extract hashtags
ht <- str_extract_all(rt$text, '#[A-Za-z0-9_]+')
ht <- unlist(ht)
head(sort(table(ht), decreasing = TRUE))

# And who are the most frequently mentioned users?
handles <- str_extract_all(rt$text, '@[0-9_A-Za-z]+')
handles_vector <- unlist(handles)
head(sort(table(handles_vector), decreasing = TRUE), n = 10)

# The query to be searched must be a character string not exceeding a maximum of 500 characters.
# Spaces behave like the boolean "AND" operator. To search for tweets containing at least one of multiple possible terms,
# separate each search term with spaces and "OR" (in caps).
# For example, the search q = "data science" looks for tweets containing both "data" and "science",
# located anywhere in the tweet and in any order.
# When "OR" is entered between search terms, e.g. q = "data OR science", Twitter returns any tweet that contains
# either "data" or "science".
# It is also possible to search for exact phrases using double quotes.
# To do this, either wrap single quotes around a search query using double quotes, e.g., q = '"data science"',
# or escape each internal double quote with a single backslash, e.g., q = "\"data science\"".

# example:
rt <- search_tweets('"data science"', n = 100, lang = "en", include_rts = FALSE)
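# A minimal sketch of the "AND" vs. "OR" behaviour described above (the terms "data" and
# "science" are just illustrative query words; any two terms would do):
rt_and <- search_tweets("data science", n = 100, lang = "en", include_rts = FALSE)     # tweets with both terms
rt_or <- search_tweets("data OR science", n = 100, lang = "en", include_rts = FALSE)   # tweets with either term
print(rt_and$text[1:3])
print(rt_or$text[1:3])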
# Twitter rate limits cap the number of search results returned to 18,000 every 15 minutes.
# To request more than that, simply set retryonratelimit = TRUE and rtweet will wait for rate-limit resets for you.

## search for 20,000 tweets containing the word "data" (do not run it!)
## rt <- search_tweets("data", n = 20000, retryonratelimit = TRUE)

# another interesting option is the "filter" operator in a search query.
# For example, let's retrieve only those tweets including some type of media (mainly photo or video) and discussing "war"
rt1 <- search_tweets("war filter:media", n = 200, include_rts = FALSE)
length(rt1$text)
# print tweet text
strwrap(print(rt1$text[1:5]))
rt1$media_url[1:5]
rt1$status_url[1:5]

# just tweets including a link to news articles
rt1 <- search_tweets("war filter:news", n = 200, include_rts = FALSE)
rt1$urls_expanded_url[1:10]
rt1$status_url[1:10]

# you can also run a query with both requests (i.e., tweets including either a media or a link; look at OR below)
# rt1 <- search_tweets("war filter:news OR media", n = 200, include_rts = FALSE)
# or you can filter in a negative way
# rt1 <- search_tweets("war -filter:media", n = 200, include_rts = FALSE)

# other possible filters (a combined example is sketched at the end of this REST API section):
# Exclude quotes via "-filter:quote"
# Exclude replies via "-filter:replies"
# Filter (return only) verified via "filter:verified"
# Filter (return only) retweets via "filter:retweets"

# plot time series of tweets by 1-hour intervals
ts_plot(rt, "1 hours") +
  theme_minimal() +
  theme(plot.title = element_text(face = "bold")) +
  labs(
    x = NULL, y = NULL,
    title = "Frequency of #rstats Twitter statuses from past 6-9 days",
    subtitle = "Twitter status (tweet) counts aggregated using one-hour intervals",
    caption = "Source: Data collected from Twitter's REST API via rtweet"
  )

# plot time series of tweets by changing the time-frequency
ts_plot(rt, by = "mins")
ts_plot(rt, by = "days")

# You can use the search_users() function to just see which users are tweeting with a particular hashtag.
# This function returns a data.frame of the users and information about their accounts.

# the last 200 users who tweeted using #rstats
users <- search_users("#rstats", n = 200)

# What's the difference with search_tweets? With search_tweets you retrieve a given amount of tweets;
# with search_users you retrieve a given amount of UNIQUE users. If a user tweets a lot about #rstats,
# he/she counts as "1 author" when using search_users, but his/her tweets will appear several times
# in the data.frame you get out of search_tweets. Indeed, compare the two results:
length(unique(users$user_id))
length(unique(rt$user_id))

## search for tweets containing "#rstats", including retweets
rtR <- search_tweets("#rstats", n = 100)

## plot multiple time series - retweets vs. non-retweets: retweets will typically outnumber non-retweets!
ts_plot(group_by(rtR, is_retweet), "mins")

# What is the most retweeted tweet?
x <- rtR[which.max(rtR$retweet_count), ]
print(x$retweet_count)
print(x$text)
print(x$screen_name)

# Get timelines
# Get the most recent 100 tweets from some important US political figures.
# You need to pass your token to rtweet.
tmls <- get_timeline(c("BernieSanders", "JoeBiden"), n = 100)
table(tmls$name)

## group by screen name and plot each time series [same two variants!]
ts_plot(group_by(tmls, screen_name), "days")

# The cap limit for a user timeline is 3,200 tweets.
# How to move beyond this limit? You can apply for an academic license on Twitter!
# Please take a look here:
# https://blog.twitter.com/developer/en_us/topics/tools/2021/enabling-the-future-of-academic-research-with-the-twitter-api
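# Before moving on: a minimal sketch combining some of the filter operators listed earlier in this
# section (the query "war" and the particular filters chosen are just illustrative):
rt_filters <- search_tweets("war filter:verified -filter:replies -filter:quote",
                            n = 100, include_rts = FALSE)
print(rt_filters$text[1:5])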
#########################
## Passing your rtweet results to Quanteda
#########################

rt <- search_tweets("#rstats", n = 1000, include_rts = FALSE, lang = "en")

myCorpusTwitter <- corpus(rt)
as.character(myCorpusTwitter)[1:2]

# number of documents
ndoc(myCorpusTwitter)

# inspect the document-level variables
head(docvars(myCorpusTwitter))

tok <- tokens(myCorpusTwitter, remove_punct = TRUE, remove_numbers = TRUE, remove_symbols = TRUE,
              split_hyphens = TRUE, remove_separators = TRUE, remove_url = TRUE)
tok <- tokens_remove(tok, stopwords("english"))
tok <- tokens_wordstem(tok, language = "english")
myDfm <- dfm(tok)
topfeatures(myDfm, 20)  # 20 top words

# let's see the topfeatures for each day
myDfm$date <- as.Date(myDfm$created_at)
str(myDfm$created_at)
str(myDfm$date)
table(myDfm$date)
topfeatures(myDfm, 5, groups = date)

# Let's see the document-feature matrix for the first four documents and the first 10 words
myDfm[1:4, 1:10]

# We already saw how to answer these questions above, but now we can do it directly with Quanteda functions.

# What are the most popular hashtags at the moment?
dfm_hashtag <- dfm_select(myDfm, pattern = c("#*"))
topfeatures(dfm_hashtag, 20)  # 20 top hashtags

# And who are the most frequently mentioned users?
dfm_at <- dfm_select(myDfm, pattern = c("@*"))
topfeatures(dfm_at, 20)  # 20 top accounts

# How many tweets mention "data" - including all possible different expressions, like "BigData" or "DataScience"?
kwic(tokens(tok), "*data*", window = 1)
x <- kwic(tokens(tok), "*data*")
length(unique(x$docname))

# Before creating your DFM, you may want to clean your tweets
txt <- "This is a @username and #hashtag. https://twitter.com/home"
txt

# preserve social media tags (default)
tokens(txt, remove_punct = TRUE)

# remove social media tags
# (i.e., replace each "#" with ""). Once you have created a dfm, you cannot do that anymore! Or at least
# not in a straightforward way!
x2 <- gsub("#", "", txt)
# then tokens
x2
tokens(x2, remove_punct = TRUE)

# remove social media tags and @ (in words: replace each "#" and "@" with "")
x2 <- gsub("\\#|@", "", txt)
# then tokens
x2
tokens(x2, remove_punct = TRUE)

# remove URLs
x2 <- gsub("http.*", "", txt)
# then tokens
x2
tokens(x2, remove_punct = TRUE)

# NOTE: you can remove URLs directly via the tokens command as we did above
tokens(txt, remove_punct = TRUE, remove_url = TRUE)

# remove social media tags and URLs at the same time
x2 <- gsub("\\#|@|http.*", "", txt)
# then tokens
x2
tokens(x2, remove_punct = TRUE)
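# A minimal sketch applying the same clean-up to the real tweets collected above, before building
# the corpus (the column name "text_clean" is just an illustrative choice; note that the greedy
# pattern "http.*" deletes everything from the first URL onwards, exactly as in the toy example):
rt$text_clean <- gsub("\\#|@|http.*", "", rt$text)
head(rt$text_clean)
# you could then build your corpus from the cleaned text, e.g. corpus(rt, text_field = "text_clean")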
#########################
## An example of a possible analysis applying something we learnt in Lab 1!
#########################

# let's make two queries. The first uses the query "liberal*" [excluding the tweets that also use the word
# "conservative*" in the same tweet: note the "-" in front of "conservative*"], and the second uses the
# query "conservative*"
lib <- search_tweets("liberal* -conservative*", n = 1000, include_rts = FALSE, lang = "en")
cons <- search_tweets("conservative* -liberal*", n = 1000, include_rts = FALSE, lang = "en")
print(lib$text[1:10])
print(cons$text[1:10])

## create a query variable
lib$query <- "Liberal"
cons$query <- "Conservative"

## row-bind the two queries into a single data frame
df <- rbind(lib, cons)

# let's graph the time-trend
ts_plot(group_by(df, query), by = "15 mins")

myCorpusTwitter <- corpus(df)
tok <- tokens(myCorpusTwitter, remove_punct = TRUE, remove_numbers = TRUE, remove_symbols = TRUE,
              split_hyphens = TRUE, remove_separators = TRUE, remove_url = TRUE)
tok <- tokens_remove(tok, stopwords("english"))
tok <- tokens_wordstem(tok, language = "english")
myDfm <- dfm(tok)

# let's see the difference in language between Conservative and Liberal tweets

# FIRST: let's compute the lexical diversity
myDfm2 <- dfm_group(myDfm, groups = query)
lexdiv <- textstat_lexdiv(myDfm2)
str(lexdiv)

ggplot(data = lexdiv, aes(x = document, y = TTR)) +
  geom_bar(stat = "identity", color = "blue", fill = "white") +
  coord_flip()

# SECOND: let's employ a comparison tag-cloud
set.seed(123)
textplot_wordcloud(myDfm2, min_count = 20, comparison = TRUE)

# let's drop the two features "liber" and "conserv" from the DfM to make the graph much more interpretable!
myDfmRed <- dfm_remove(myDfm2, c("liber", "conserv"))
set.seed(123)
textplot_wordcloud(myDfmRed, min_count = 20, comparison = TRUE)

# THIRD: let's use "Conservative" as the target vs. "Liberal"
tstat_key <- textstat_keyness(myDfm2, target = "Conservative")
textplot_keyness(tstat_key)

# let's drop once again the two features "liber" and "conserv" from the DfM to make the graph much more interpretable!
tstat_key <- textstat_keyness(myDfmRed, target = "Conservative")
textplot_keyness(tstat_key)

# let's plot just the statistically significant results
tstat_key2 <- tstat_key[which(tstat_key$p <= 0.05), ]
# let's add the attributes
attr(tstat_key2, "groups") <- c("Conservative", "Liberal")
textplot_keyness(tstat_key2)

# we could also easily pass our dfm to the topicmodels package, or to the stm package,
# to fit a (structural) topic model! (see the sketch right below)
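# A minimal sketch of that hand-over via quanteda's convert(); the number of topics (K = 10 / k = 10)
# is an arbitrary illustrative choice, and the stm / topicmodels packages must be installed:
library(stm)
stm_input <- convert(myDfm, to = "stm")   # a list with $documents, $vocab and $meta
stm_fit <- stm(documents = stm_input$documents, vocab = stm_input$vocab,
               K = 10, data = stm_input$meta, verbose = FALSE)
labelTopics(stm_fit)
# or, with the topicmodels package:
# library(topicmodels)
# lda_fit <- LDA(convert(myDfm, to = "topicmodels"), k = 10)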
########################################################
### Search with a geocode
########################################################

library(ggmap)
library(maps)
library(leaflet)

# You can also run search_tweets with a geocode. For doing that you have two options:
# 1) either employ a Google API key (if you have one) and pass the geocoding info obtained with it to search_tweets;
# 2) pass such info directly yourself to search_tweets. For doing that you need two sets of information:
# a) the latitude/longitude bounding-box points related to the geographical search you are interested in.
#    The geocode info must be provided via a vector of length 4, e.g., c(-125, 26, -65, 49).
#    Why a vector of length 4? These 4 numbers refer to the South-West lng, South-West lat, North-East lng, North-East lat
#    that identify the two diagonally opposite points of the (geographical) polygon you want to use for your query.
#    How to recover such info? You can check it here:
#    http://boundingbox.klokantech.com/ and select "csv"
# b) the centroid of the polygon you recovered under point a).
#    How to recover such info? You can check it here:
#    https://www.latlong.net/
# Then you have to include the info about a) and b), together with the name of the geographical location you are
# interested in, in a list, and pass it to search_tweets.

# For example: let's say we want to run a query about Giorgia Meloni centered on Italy

# FIRST OPTION: let's employ your Google API key (if you have it!)
# api <- "YOUR GOOGLE MAP API"
# in my case I have saved it in a txt file
myText <- readtext("Google API.txt", encoding = "UTF-8")
api <- myText$text
lookup_coords("Italy", apikey = api)
rt <- search_tweets("meloni", n = 1000, geocode = lookup_coords("Italy", apikey = api))

# SECOND OPTION: let's create a list with the geographical information that you have extracted from
# the two previous URLs and pass it to search_tweets
my_list <- list(place = "Italy", box = c(6.62672, 35.48970, 18.79760, 47.09200), point = c(41.87194, 12.56738))
str(my_list)
attr(my_list, "class") <- c("coords", "list")
attributes(my_list)
rt2 <- search_tweets("meloni", n = 1000, geocode = my_list)

table(rt$country)
table(rt2$country)

# NOTE that if you want to run a geographical query centered on the US you can directly write geocode = lookup_coords("United States"),
# even if you do not have any Google API key (such geographical info is already included in rtweet)
lookup_coords("United States")
rtUSA <- search_tweets("trump", n = 1000, geocode = lookup_coords("United States"))

# let's go back now to the example about Italy
rt <- search_tweets("meloni", n = 1000, geocode = lookup_coords("Italy", apikey = api))

# Note that rtweet will not just look for data with bbox_coords; it will do the following:
# FIRST, check if the tweet is geocoded (i.e., if it has bbox_coords).
# If not, rtweet will check whether a place can be extrapolated from the user's profile information.
# BUT a user can write whatever she wants in her profile.
# Having said that, this is the ONLY WAY to recover tweets with a geocode PLUS a query.

table(rt$country)
print(rt$location)  # this info is defined by the user in her profile

# this info relates to the POI selected by the user, which identifies the geographical coordinates of the POI
print(rt$place_name)
print(rt$place_full_name)
print(rt$place_type)

# these are the coordinates of the POI
print(rt$bbox_coords)  # a polygon!

table(rt$place_name)

# how many non-missing bbox_coords have we recovered from our query?
table(!is.na(rt$place_name))

## create lat/lng variables using all available tweet and profile geo-location data
rtll_it <- lat_lng(rt)

# plot the results
map.data <- map_data("world", "Italy")
str(map.data)
points <- data.frame(x = rtll_it$lng, y = rtll_it$lat)

ggplot(map.data) +
  geom_map(aes(map_id = region), map = map.data, fill = "white", color = "grey20", size = 0.25) +
  expand_limits(x = map.data$long, y = map.data$lat) +
  theme(axis.line = element_blank(), axis.text = element_blank(), axis.ticks = element_blank(),
        axis.title = element_blank(), plot.margin = unit(0 * c(-1.5, -1.5, -1.5, -1.5), "lines")) +
  geom_point(data = points, aes(x = x, y = y), size = 3, alpha = 1/3, color = "darkblue")

# alternative plot via leaflet
m2 <- leaflet(rtll_it)
m2 <- addTiles(m2)  # Add default OpenStreetMap map tiles
m2 <- addMarkers(m2, lng = rtll_it$lng, lat = rtll_it$lat, popup = rtll_it$text)
m2

# let's change the colour of the tweets in the map according to some exogenous info included in the dataset,
# such as the number of followers (a metadata NOT available in the 1.0.2 version of rtweet, btw).
# Of course, we could have coloured the tweets employing some more interesting meta-data, for example extracted from a topic model!

summary(rtll_it$followers_count)

# let's create 3 groups out of the number of followers
getColor <- function(rtll_it) {
  sapply(rtll_it$followers_count, function(followers_count) {
    if (followers_count <= quantile(rtll_it$followers_count, 0.5)) {
      "green"
    } else if (followers_count > quantile(rtll_it$followers_count, 0.5) &
               followers_count <= quantile(rtll_it$followers_count, 0.75)) {
      "orange"
    } else {
      "red"
    }
  })
}

icons <- awesomeIcons(
  icon = 'ios-close',
  iconColor = 'black',
  library = 'ion',
  markerColor = getColor(rtll_it)
)

m <- leaflet(rtll_it)
m <- addTiles(m)
m <- addAwesomeMarkers(m, lng = rtll_it$lng, lat = rtll_it$lat, icon = icons,
                       popup = rtll_it$text, label = ~as.character(followers_count))
m

# let's go back now to the example about the United States
lookup_coords("United States")
rtUSA <- search_tweets("trump", n = 1000, geocode = lookup_coords("United States"))

# how many non-missing bbox_coords have we recovered from our query?
table(!is.na(rtUSA$place_name))

## create lat/lng variables using all available tweet and profile geo-location data
rtll_usa <- lat_lng(rtUSA)

# plot the results
map.data <- map_data("usa")  # in the case of the US you just specify "usa" without "world"
str(map.data)
points <- data.frame(x = rtll_usa$lng, y = rtll_usa$lat)

ggplot(map.data) +
  geom_map(aes(map_id = region), map = map.data, fill = "white", color = "grey20", size = 0.25) +
  expand_limits(x = map.data$long, y = map.data$lat) +
  theme(axis.line = element_blank(), axis.text = element_blank(), axis.ticks = element_blank(),
        axis.title = element_blank(), plot.margin = unit(0 * c(-1.5, -1.5, -1.5, -1.5), "lines")) +
  geom_point(data = points, aes(x = x, y = y), size = 3, alpha = 1/3, color = "darkblue")

# alternative plot via leaflet
m2 <- leaflet(rtll_usa)
m2 <- addTiles(m2)  # Add default OpenStreetMap map tiles
m2 <- addMarkers(m2, lng = rtll_usa$lng, lat = rtll_usa$lat, popup = rtll_usa$text)
m2