rm(list = ls(all = TRUE))
setwd("C:/Users/luigi/Dropbox/TOPIC MODEL")
getwd()

library(rtweet)
library(readtext)
library(quanteda)
library(ggplot2)
library(ggmap)
library(httpuv)
library(dplyr)
library(stringr)
library(quanteda.textstats)
library(quanteda.textplots)

# The "search_tweets" command implements the REST API search:
# it returns Twitter statuses matching a user-provided search query.
# It ONLY RETURNS DATA FROM THE PAST 6-9 DAYS
# (exception: user timelines, as we will see - the 3,200 most recent tweets are available).

# Search for up to 200 tweets written in English containing the #rstats hashtag, excluding retweets.
rt <- search_tweets("#rstats", n = 200, lang = "en", include_rts = FALSE)
length(rt$text)

# days covered by our analysis
since <- rt$created_at[nrow(rt)]
latest <- rt$created_at[1]
cat("Twitter data", "\n", paste("From:", since), "\n", paste(" To:", latest))

# The 'created_at' field provides a timestamp for when the tweet was authored.
# This timestamp is useful, but remember: it is expressed in Greenwich Mean Time!
# Unless the tweet happens to come from that timezone, its time needs to be adjusted
# to account for this discrepancy. Not always an easy task!

# By default, search_tweets returns the most recent tweets first. You can instead specify
# type = "mixed" to get a blend of recent and popular tweets that spreads over a couple of
# days, or type = "popular" to retrieve only popular tweets.

# print tweet text
strwrap(print(rt$text[1:5]))

# lots of info about each single tweet
dplyr::glimpse(rt)
colnames(rt)

# What are the most popular hashtags at the moment? We can use regular expressions to extract them
ht <- str_extract_all(rt$text, '#[A-Za-z0-9_]+')
ht <- unlist(ht)
head(sort(table(ht), decreasing = TRUE))

# And who are the most frequently mentioned users?
handles <- str_extract_all(rt$text, '@[0-9_A-Za-z]+')
handles_vector <- unlist(handles)
head(sort(table(handles_vector), decreasing = TRUE), n = 10)

# How many tweets mention "data"? With ignore.case = FALSE the pattern matching is case
# sensitive; with ignore.case = TRUE it is not. Note that grep also matches "data" inside
# longer words such as "datascience".
length(grep("data", rt$text, ignore.case = TRUE))
grep("data", rt$text, ignore.case = TRUE, value = TRUE)
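# An optional sketch building on the hashtag counts computed above: plot the 10 most
# frequent hashtags with ggplot2 (already loaded). The object 'ht' is the vector of
# extracted hashtags created earlier; the object names and labels here are just illustrative.
ht_top <- as.data.frame(head(sort(table(ht), decreasing = TRUE), 10))
colnames(ht_top) <- c("hashtag", "n")
ggplot(ht_top, aes(x = reorder(hashtag, n), y = n)) +
  geom_col() +
  coord_flip() +
  labs(x = NULL, y = "Frequency", title = "Top 10 hashtags in the #rstats sample")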
# The search query must be a character string of at most 500 characters.
# Spaces behave like the boolean "AND" operator. To search for tweets containing at least
# one of several possible terms, separate the terms with "OR" (in caps).
# For example, the query q = "data science" looks for tweets containing both "data" and
# "science", located anywhere in the tweet and in any order.
# When "OR" is entered between search terms, e.g. q = "data OR science", Twitter returns
# any tweet that contains either "data" or "science".
# It is also possible to search for exact phrases using double quotes. To do this, either
# wrap single quotes around a query that uses double quotes, e.g., q = '"data science"',
# or escape each internal double quote with a backslash, e.g., q = "\"data science\"".
# example: rt <- search_tweets('"data science"', n = 100, lang = "en", include_rts = FALSE)

# An obvious (but still very important) point about the query you make:
# as a researcher, when you acquire your corpus you need to ensure that the texts under
# examination are related to the research question you are interested in and are
# theoretically consistent.
# You therefore want to generate a list of keywords for your query that includes all
# relevant texts in the corpus (i.e., minimizes false negatives) and excludes irrelevant
# texts (i.e., minimizes false positives).

# you can then save your results as a csv file
# write_as_csv(rt, "twitter.csv", prepend_ids = TRUE, na = "", fileEncoding = "UTF-8")

# Another interesting option is the "filter" operator in a search query.
# For example, let's retrieve only tweets that include some type of media (mainly photos)
# and discuss "war"
rt1 <- search_tweets("war filter:media", n = 200, include_rts = FALSE)
length(rt1$text)
# print tweet text
strwrap(print(rt1$text[1:5]))
dplyr::glimpse(rt1)
rt1$status_url[1:5]

# just tweets including a link to news articles
rt1 <- search_tweets("war filter:news", n = 200, include_rts = FALSE)
rt1$status_url[1:10]

# you can also combine both requests (i.e., tweets including either media or a news link;
# note the OR below)
rt1 <- search_tweets("war filter:news OR media", n = 200, include_rts = FALSE)

# or you can filter in a negative way
rt1 <- search_tweets("war -filter:media", n = 200, include_rts = FALSE)
rt1$status_url[1:5]

# other possible filters:
# Exclude quotes via "-filter:quote"
# Exclude replies via "-filter:replies"
# Return only verified accounts via "filter:verified"
# Return only retweets via "filter:retweets"

## plot time series of tweets
ts_plot(rt, "hours") +
  theme_minimal() +
  theme(plot.title = element_text(face = "bold")) +
  labs(
    x = NULL, y = NULL,
    title = "Frequency of #rstats Twitter statuses from past 6-9 days",
    subtitle = "Twitter status (tweet) counts aggregated using one-hour intervals",
    caption = "Source: Data collected from Twitter's REST API via rtweet"
  )

## plot time series of tweet frequency at other resolutions
ts_plot(rt, by = "mins")
ts_plot(rt, by = "days")

# Next, let's figure out who is tweeting about R using the #rstats hashtag.
# You can access the user-level data behind these tweets via users_data()
users_data(rt)

# view column with screen names
head(rt$screen_name)
# get a list of unique usernames
unique(rt$screen_name)

# You can similarly use the search_users() function to download a given number of users
# with #rstats in their profiles (max = 1,000)
users <- search_users("#rstats", n = 100)
# once again, you can then save your results as a csv file
# write_as_csv(users, "users.csv", prepend_ids = TRUE, na = "", fileEncoding = "UTF-8")

# What's the difference with search_tweets? With search_tweets you retrieve a given number
# of tweets; with search_users you retrieve a given number of UNIQUE users. If a user
# tweets a lot about #rstats, he/she counts as one author when using search_users, but
# his/her tweets appear several times in the data frame returned by search_tweets.
# Indeed, compare the two results:
length(unique(users$user_id))
length(unique(rt$user_id))

# Let's learn a bit more about these people tweeting about R.
# How many languages are represented? (und = undetermined)
length(unique(users$lang))
count(users, lang, sort = TRUE)

# And from which locations are they tweeting? More on this next week!
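# An optional sketch building on the language counts just above: a bar chart of the
# languages declared by the #rstats users. This assumes the 'lang' column returned by
# search_users() is populated (on newer API responses it may be mostly NA).
lang_counts <- count(users, lang, sort = TRUE)
lang_counts <- filter(lang_counts, !is.na(lang))  # drop missing languages, if any
ggplot(head(lang_counts, 10), aes(x = reorder(lang, n), y = n)) +
  geom_col() +
  coord_flip() +
  labs(x = NULL, y = "Number of users", title = "Top languages among #rstats users")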
# Twitter rate limits cap the number of search results returned to 18,000 every 15 minutes.
# To request more than that, simply set retryonratelimit = TRUE and rtweet will wait for
# rate limit resets for you.
## search for 20,000 tweets containing the word data (do not run it!)
## rt <- search_tweets("data", n = 20000, retryonratelimit = TRUE)

# As an alternative: use the bearer_token()! It only works with valid tokens generated from
# a user-created Twitter app (which requires a Twitter developer account). Unlike the default
# token returned by create_token, bearer-token requests cannot perform user actions (e.g.,
# posting tweets, reading DMs), and the information returned by Twitter will not include
# user-specific variables (e.g., whether the user follows a certain account).
# The upside of this authentication method is that it affords more generous rate limits.
# For example, the rate limit for the standard search API is 18,000 tweets per fifteen
# minutes; with a bearer token it is 45,000 tweets per fifteen minutes.
# rt <- search_tweets("#rstats", n = 30000, token = bearer_token())

## search for tweets containing "#rstats", this time including retweets
rtR <- search_tweets("#rstats", n = 100)
## plot multiple time series: retweets vs non-retweets
ts_plot(group_by(rtR, is_retweet), "mins")

# What is the most retweeted tweet?
x <- rtR[which.max(rtR$retweet_count), ]
print(x$retweet_count)
print(x$text)
print(x$screen_name)

## Get friends
# Retrieve a list of all the accounts a user follows.
## get user IDs of accounts followed by CNN
cnn_fds <- get_friends("cnn")
str(cnn_fds)
length(cnn_fds$user_id)
## look up data on those accounts
cnn_fds_data <- lookup_users(cnn_fds$user_id)
head(cnn_fds_data$name)

## Get followers
# Retrieve a list of the accounts following a user.
## get user IDs of accounts following CNN (just the first 100 in this example)
cnn_flw <- get_followers("cnn", n = 100)
## look up data on those accounts
cnn_flw_data <- lookup_users(cnn_flw$user_id)
head(cnn_flw_data$name)

# Or if you really want ALL of their followers:
## how many followers does Curini have?
curini_flw <- get_followers("Curini", retryonratelimit = TRUE)
length(curini_flw$user_id)
# If you want the list of them, you can use lookup_users (but it will take a while)
# Curini <- lookup_users(curini_flw$user_id)
# head(Curini$name)

## Get favorites
# Get the 10 most recently favorited statuses by Joe Biden.
fav <- get_favorites("JoeBiden", n = 10)
print(fav$text[1:10])
print(fav$lang[1:10])

## Get trends
# Discover what's currently trending in London.
trends_london <- get_trends("london")
trends_london$trend

## Get timelines: for example, let's get the most recent 100 tweets from some important
## US political figures.
# N.B. to do that, you need to pass your token to rtweet.
# calling my personal token
tokenTwitter <- readRDS("twittertoken.rds")
# You should create your own along the lines of the example below:
# tokenTwitter <- create_token(
#   app = [your_twitter_api_app],
#   consumer_key = [your_api_consumer_key],
#   consumer_secret = [your_api_consumer_secret])
# How to retrieve this info? Read this link:
# https://cran.r-project.org/web/packages/rtweet/vignettes/auth.html

tmls <- get_timeline(c("BernieSanders", "JoeBiden"), token = tokenTwitter, n = 100)
table(tmls$name)
## group the timelines by screen name and plot each time series
ts_plot(group_by(tmls, screen_name), "days")
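# A small optional follow-up sketch using the 'tmls' data frame built above: find each
# account's single most retweeted status in the sample (column names are assumed to follow
# the rtweet conventions used earlier in this script).
tmls %>%
  group_by(screen_name) %>%
  slice_max(retweet_count, n = 1, with_ties = FALSE) %>%
  select(screen_name, retweet_count, text)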
# The cap for a user timeline is 3,200 tweets.
# How to move beyond this limit? You can apply for an academic license on Twitter!
# Please take a look here:
# https://blog.twitter.com/developer/en_us/topics/tools/2021/enabling-the-future-of-academic-research-with-the-twitter-api

#########################
## A search query with an emoji
#########################
# for the list of emoji: https://github.com/hadley/emo
library(emo)
x <- emo::ji("smile")
x
smile <- search_tweets(x, n = 200, lang = "en", include_rts = FALSE)
print(smile$text[1:5])

#########################
## Passing your rtweet results to Quanteda
#########################
rt <- search_tweets("#rstats", n = 100, include_rts = FALSE, lang = "en")
myCorpusTwitter <- corpus(rt)
as.character(myCorpusTwitter)[1:2]
# number of documents
ndoc(myCorpusTwitter)
# inspect the document-level variables
head(docvars(myCorpusTwitter))

tok <- tokens(myCorpusTwitter, remove_punct = TRUE, remove_numbers = TRUE,
              remove_symbols = TRUE, split_hyphens = TRUE,
              remove_separators = TRUE, remove_url = TRUE)
tok <- tokens_remove(tok, stopwords("english"))
tok <- tokens_wordstem(tok, language = "english")
myDfm <- dfm(tok)
topfeatures(myDfm, 20)  # 20 top words

# Let's see the document-feature matrix for the first four documents and first 10 features
myDfm[1:4, 1:10]

# We already saw how to answer these questions above, but now we can do it directly with
# quanteda functions.
# What are the most popular hashtags at the moment?
dfm_hashtag <- dfm_select(myDfm, pattern = c("#*"))
topfeatures(dfm_hashtag, 20)  # 20 top hashtags

# And who are the most frequently mentioned users?
dfm_at <- dfm_select(myDfm, pattern = c("@*"))
topfeatures(dfm_at, 20)  # 20 top accounts

# How many tweets mention "data"?
kwic(tok, "*data*")
x <- kwic(tok, "*data*")
unique(x$docname)

# Before creating your DfM, you may want to clean up the tweets (if you need it...)
txt <- "This is a @username and #hashtag. https://twitter.com/home"
txt

# preserve social media tags (default)
tokens(txt, remove_punct = TRUE)

# remove hashtag tags
# (in words: replace each "#" with "")
x2 <- gsub("#", "", txt)
# then tokenize x2
tokens(x2, remove_punct = TRUE)

# remove hashtag tags and "@" (in words: replace each "#" and "@" with "")
x2 <- gsub("\\#|@", "", txt)
# then tokenize x2
tokens(x2, remove_punct = TRUE)

# remove URLs
x2 <- gsub("http.*", "", txt)
# then tokenize x2
tokens(x2, remove_punct = TRUE)

# NOTE: you can remove URLs directly via the tokens command, as we did above
tokens(txt, remove_punct = TRUE, remove_url = TRUE)

# remove social media tags and URLs at the same time
x2 <- gsub("\\#|@|http.*", "", txt)
# then tokenize x2
tokens(x2, remove_punct = TRUE)
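# An alternative, token-level sketch: instead of editing the raw text with gsub(), note
# that quanteda keeps "#" and "@" attached to their words (as shown above), so hashtags
# and mentions can simply be dropped after tokenization using glob patterns.
toks_txt <- tokens(txt, remove_punct = TRUE, remove_url = TRUE)
tokens_remove(toks_txt, pattern = c("#*", "@*"))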
#########################
## An example of a possible analysis applying something we learnt in Lab 1!
#########################
# Let's make two queries: one using the query "liberal*" and the other using the query
# "conservative*"
lib <- search_tweets("liberal*", n = 1000, include_rts = FALSE, lang = "en")
cons <- search_tweets("conservative*", n = 1000, include_rts = FALSE, lang = "en")
print(lib$text[1:10])
print(cons$text[1:10])

## create a query variable
lib$query <- "Liberal"
cons$query <- "Conservative"

## row bind into a single data frame
df <- rbind(lib, cons)

# let's graph the time trend
ts_plot(group_by(df, query), by = "15 mins")

myCorpusTwitter <- corpus(df)
tok <- tokens(myCorpusTwitter, remove_punct = TRUE, remove_numbers = TRUE,
              remove_symbols = TRUE, split_hyphens = TRUE,
              remove_separators = TRUE, remove_url = TRUE)
tok <- tokens_remove(tok, stopwords("english"))
tok <- tokens_wordstem(tok, language = "english")
myDfm <- dfm(tok)
topfeatures(myDfm, 20)

# Let's look at the differences in language between Conservative and Liberal tweets.
# FIRST: compute the lexical diversity
myDfm2 <- dfm_group(myDfm, groups = query)
lexdiv <- textstat_lexdiv(myDfm2)
str(lexdiv)

ggplot(data = lexdiv, aes(x = document, y = TTR)) +
  geom_bar(stat = "identity", color = "blue", fill = "white") +
  coord_flip()

# SECOND: a comparison tag-cloud
set.seed(123)
textplot_wordcloud(dfm_trim(myDfm2, min_termfreq = 20, verbose = FALSE), comparison = TRUE)

# let's drop the two features "liber" and "conserv" from the DfM to make the graph much
# more interpretable!
myDfm2 <- dfm_remove(myDfm2, c("liber", "conserv"))
set.seed(123)
textplot_wordcloud(dfm_trim(myDfm2, min_termfreq = 20, verbose = FALSE), comparison = TRUE)

# THIRD: a keyness analysis using the "Conservative" tweets as the target group
table(myDfm$query)
tstat_key <- textstat_keyness(myDfm, target = myDfm$query == "Conservative")
textplot_keyness(tstat_key)

# let's drop once again the two features "liber" and "conserv" from the DfM to make the
# graph much more interpretable!
myDfm2 <- dfm_remove(myDfm, c("liber", "conserv"))
tstat_key <- textstat_keyness(myDfm2, target = myDfm2$query == "Conservative")
textplot_keyness(tstat_key)
head(tstat_key, 10)
tail(tstat_key, 10)
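# A final optional sketch: as a complement to the keyness analysis, compare the 15 most
# frequent (stemmed) features within each query group using textstat_frequency() from
# quanteda.textstats (already loaded).
freq_by_query <- textstat_frequency(myDfm2, n = 15, groups = query)
subset(freq_by_query, group == "Conservative")
subset(freq_by_query, group == "Liberal")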