rm(list = ls(all = TRUE))
getwd()
# setwd("C:/Users/mw/Dropbox (VOICES)/TOPIC MODEL")
setwd("C:/Users/luigi/Dropbox/TOPIC MODEL")
getwd()

library(rtweet)
library(readtext)
library(quanteda)
library(ggplot2)
library(ggmap)
library(httpuv)
library(dplyr)
library(stringr)

# "search_tweets" command: it implements the REST API search.
# It returns Twitter statuses matching a user-provided search query.
# It ONLY RETURNS DATA FROM THE PAST 6-9 DAYS
# (exception: user timelines, as we will see - the 3,200 most recent tweets are available).

# Search for up to 100 (non-retweeted) tweets written in English containing the #rstats hashtag.
rt <- search_tweets("#rstats", n = 100, lang = "en", include_rts = FALSE)
length(rt$text)

# days covered by our analysis
since <- rt$created_at[100]
latest <- rt$created_at[1]
cat("Twitter data", "\n", paste("From:", since), "\n", paste("  To:", latest))

# print tweet text
print(rt$text[1:5])

# lots of info about each single tweet
colnames(rt)

# What are the most popular hashtags at the moment? We'll use regular expressions to extract them.
ht <- str_extract_all(rt$text, '#[A-Za-z0-9_]+')
ht <- unlist(ht)
head(sort(table(ht), decreasing = TRUE))

# And who are the most frequently mentioned users?
handles <- str_extract_all(rt$text, '@[0-9_A-Za-z]+')
handles_vector <- unlist(handles)
head(sort(table(handles_vector), decreasing = TRUE), n = 10)

# How many tweets mention "data"?
length(grep("data", rt$text, ignore.case = TRUE))

# The query to be searched must be a character string of at most 500 characters.
# Spaces behave like the boolean "AND" operator. To search for tweets containing at least one of
# multiple possible terms, separate the search terms with "OR" (in caps).
# For example, the query q = "data science" looks for tweets containing both "data" and "science",
# located anywhere in the tweet and in any order.
# When "OR" is entered between search terms, e.g. q = "data OR science", Twitter returns any tweet
# that contains either "data" or "science".
# It is also possible to search for exact phrases using double quotes.
# To do this, either wrap single quotes around a search query using double quotes, e.g., q = '"data science"',
# or escape each internal double quote with a single backslash, e.g., q = "\"data science\"".

# example:
rt <- search_tweets('"data science"', n = 100, lang = "en", include_rts = FALSE)

# you can then save your results as a csv file
write_as_csv(rt, "twitter.csv", prepend_ids = TRUE, na = "", fileEncoding = "UTF-8")

## plot time series of tweets
ts_plot(rt, "1 hours") +
  theme_minimal() +
  theme(plot.title = element_text(face = "bold")) +
  labs(
    x = NULL, y = NULL,
    title = "Frequency of #rstats Twitter statuses from past 6-9 days",
    subtitle = "Twitter status (tweet) counts aggregated using one-hour intervals",
    caption = "Source: Data collected from Twitter's REST API via rtweet"
  )

## plot the time series of tweet frequency at other resolutions
ts_plot(rt, by = "mins")
ts_plot(rt, by = "days")

# Next, let's figure out who is tweeting about R using the #rstats hashtag.
# you can access the data on the users discussing #rstats via users_data()
users_data(rt)

# view column with screen names
head(rt$screen_name)

# get a list of unique usernames
unique(rt$screen_name)
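# A small optional sketch (base R only, reusing the "rt" object from above): how many tweets does
# each unique author contribute? A frequency table on screen_name shows how skewed authorship can be.
head(sort(table(rt$screen_name), decreasing = TRUE), n = 10)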
# You can similarly use the search_users() function to see which users are tweeting with a particular hashtag.
# This function returns just a data.frame of the users and information about their accounts.

# what users are tweeting with #rstats (max = 100)
users <- search_users("#rstats", n = 100)

# once again, you can then save your results as a csv file
write_as_csv(users, "users.csv", prepend_ids = TRUE, na = "", fileEncoding = "UTF-8")

# What's the difference with search_tweets? With search_tweets you retrieve a given number of tweets;
# with search_users you retrieve a given number of UNIQUE users. If a user tweets a lot about #rstats,
# he/she counts as "1 author" when using search_users, but his/her tweets appear several times
# in the data.frame you get out of search_tweets. Indeed, compare the two results:
length(unique(users$user_id))
length(unique(rt$user_id))

# Let's learn a bit more about these people tweeting about R. First, where are they from?

# how many languages are represented (und = undetermined)
length(unique(users$lang))
count(users, lang, sort = TRUE)

# how many locations are represented
length(unique(users$location))
count(users, location, sort = TRUE)

# Let's sort by count and just plot the top 4 locations. To do this you use top_n().
# Note that count() here groups the data by location.
count <- count(users, location, sort = TRUE)
str(count)
# drop users with an empty location field
count <- count[-which(count$location == ""), ]
str(count)
count <- mutate(count, location = reorder(location, n))
count <- top_n(count, 4)
ggplot(count, aes(x = location, y = n)) +
  geom_col() +
  coord_flip() +
  labs(x = "Location", y = "Count",
       title = "Where Twitter users are from - unique locations")

# Twitter rate limits cap the number of search results returned to 18,000 every 15 minutes.
# To request more than that, simply set retryonratelimit = TRUE and rtweet will wait out the
# rate-limit resets for you.

## search for 20,000 tweets containing the word data (do not run it!)
## rt <- search_tweets("data", n = 20000, retryonratelimit = TRUE)

## search for tweets containing "#rstats", this time including retweets
rtR <- search_tweets("#rstats", n = 100)

## plot multiple time series - retweets vs non-retweets
ts_plot(group_by(rtR, is_retweet), "mins")

# What is the most retweeted tweet?
x <- rtR[which.max(rtR$retweet_count), ]
print(x$retweet_count)
print(x$text)
print(x$screen_name)

## Get friends
# Retrieve a list of all the accounts a user follows.

## get user IDs of accounts followed by CNN
cnn_fds <- get_friends("cnn")
str(cnn_fds)
length(cnn_fds$user_id)

## lookup data on those accounts
cnn_fds_data <- lookup_users(cnn_fds$user_id)
head(cnn_fds_data$name)

## Get followers
# Retrieve a list of the accounts following a user.

## get user IDs of accounts following CNN (just the first 100 in this example)
cnn_flw <- get_followers("cnn", n = 100)

## lookup data on those accounts
cnn_flw_data <- lookup_users(cnn_flw$user_id)
head(cnn_flw_data$name)

# Or if you really want ALL of their followers:

## how many followers does Curini have?
curini_flw <- get_followers("Curini", retryonratelimit = TRUE)
length(curini_flw$user_id)

# and if you want the full list of them you can use lookup_users (but it will take you a while)
# Curini <- lookup_users(curini_flw$user_id)
# head(Curini$name)

## Get favorites
# Get the 10 most recently favorited statuses by Joe Biden.
fav <- get_favorites("JoeBiden", n = 10)
print(fav$text[1:10])
print(fav$lang[1:10])
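# A minimal optional sketch (base R intersect() only, reusing the CNN objects created above):
# how many of the accounts CNN follows are also among the 100 followers we just fetched?
# With only a 100-follower sample this is merely a lower bound on reciprocal ties.
reciprocal <- intersect(cnn_fds$user_id, cnn_flw$user_id)
length(reciprocal)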
## Get trends
# Discover what's currently trending in San Francisco.
sf <- get_trends("san francisco")
sf$trend

## Get timelines
# Get the most recent 100 tweets from some important US political figures
tmls <- get_timeline(c("BernieSanders", "JoeBiden"), n = 100)
table(tmls$name)

## group by screen name and plot each time series (the "mins"/"days" variants seen above work here too)
ts_plot(group_by(tmls, screen_name), "days")

#########################
## Passing your rtweet results to Quanteda
#########################

rt <- search_tweets("#rstats", n = 100, include_rts = FALSE, lang = "en")
print(rt$lang[1:20])
colnames(rt)

# I want to convert the POSIXct time format to a date; here I choose Greenwich Mean Time (GMT)
# (you could also change the time zone, e.g. by selecting tz = "Hongkong")
str(rt$created_at)
rt$date <- as.Date(rt$created_at, "GMT")
str(rt$date)

myCorpusTwitter <- corpus(rt)
texts(myCorpusTwitter)[1:2]

# number of documents
ndoc(myCorpusTwitter)

# inspect the document-level variables
head(docvars(myCorpusTwitter))

myDfm <- dfm(myCorpusTwitter, remove = stopwords("english"), remove_punct = TRUE,
             remove_numbers = TRUE, tolower = TRUE, stem = TRUE, remove_url = TRUE)
topfeatures(myDfm, 20)  # 20 top words

# Let's see our document-feature matrix for the first four documents and first 10 words
myDfm[1:4, 1:10]
topfeatures(myDfm)

# let's draw a network of the 50 most used features in our dfm (you will learn A LOT about network analysis with Rob Fahey!)
top_feat <- names(topfeatures(myDfm, 50))
dfm_select <- dfm_select(myDfm, pattern = top_feat, selection = "keep")
str(dfm_select)  # 50 features
topfeatures(dfm_select)

# let's construct a feature co-occurrence matrix of the most recurring features (measuring the co-occurrences of features)
tag_fcm <- fcm(dfm_select)
head(tag_fcm)

# Let's plot the results!
# textplot_network allows you to plot a network directly from Quanteda.
# Currently the size of the network is limited to 1000 features. To draw more complex networks
# you should use the package "igraph" (see below).
jpeg('network1.jpg')
set.seed(144)
textplot_network(tag_fcm, min_freq = 0.8)
dev.off()

# if you are using RStudio, rather than the old-school R GUI as I do,
# then the graph should also appear by simply typing textplot_network(tag_fcm, min_freq = 0.8)

# N.B. you could also decide to plot the dfm "dfm_select" directly, rather than the fcm "tag_fcm", by writing:
# textplot_network(dfm_select, min_freq = 0.8)
# You will get the same graph as above. In this case textplot_network first builds an fcm for you, and then plots it!
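# A quick optional sketch (quanteda functions already used above; assuming your quanteda version
# keeps #-prefixed tokens in the dfm, as older releases did unless remove_twitter = TRUE):
# hashtag frequencies can be pulled straight from the dfm with a wildcard pattern.
tag_dfm <- dfm_select(myDfm, pattern = "#*", selection = "keep")
topfeatures(tag_dfm, 10)  # most frequent hashtags in our #rstats sample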
#########################
## An example of a possible analysis [review it by yourself!]
#########################

rt <- search_tweets("liberal OR conservative", n = 1000, include_rts = FALSE, lang = "en")
print(rt$text[1:10])

myCorpusTwitter <- corpus(rt)
myDfmTwitter <- dfm(myCorpusTwitter, remove = stopwords("english"), remove_punct = TRUE,
                    remove_numbers = TRUE, tolower = TRUE, stem = FALSE, remove_url = TRUE)
topfeatures(myDfmTwitter)

dict <- dictionary(list(lib = c("liberal*"), cons = c("conservativ*")))
dict

# let's see how many times the words liberal* and conservativ* appear in each tweet
dfm_lookup(myDfmTwitter, dict)[, 1]
dfm_lookup(myDfmTwitter, dict)[, 2]

# let's save this info and merge the two counts
liberal <- dfm_lookup(myDfmTwitter, dict)[, 1]
conservative <- dfm_lookup(myDfmTwitter, dict)[, 2]
df_tot <- merge(liberal, conservative, by = "doc_id")
str(df_tot)

# let's label a tweet as a "conservative" one if it uses the word conservativ* more often than liberal*
df_tot <- mutate(df_tot, conservative = if_else(cons > lib, 1, 0))
str(df_tot)

# let's add this info back to our dfm as a document-level variable
docvars(myDfmTwitter)$conservative <- df_tot$conservative

# let's see the difference in the language
tstat_key <- textstat_keyness(myDfmTwitter, target = myDfmTwitter$conservative == 1)
textplot_keyness(tstat_key)
head(tstat_key, 10)
tail(tstat_key, 10)

# let's plot two different feature networks
dfm_cons <- dfm_subset(myDfmTwitter, conservative == 1)
dfm_lib <- dfm_subset(myDfmTwitter, conservative == 0)
feat_cons <- names(topfeatures(dfm_cons, 50))
feat_lib <- names(topfeatures(dfm_lib, 50))
dfm_cons_select <- dfm_select(dfm_cons, pattern = feat_cons, selection = "keep")
dfm_lib_select <- dfm_select(dfm_lib, pattern = feat_lib, selection = "keep")
tag_fcm_con <- fcm(dfm_cons_select)
head(tag_fcm_con)
tag_fcm_lib <- fcm(dfm_lib_select)
head(tag_fcm_lib)

library(cowplot)
pdf("network together.pdf")
set.seed(144)
a <- textplot_network(tag_fcm_con, min_freq = 0.8)
b <- textplot_network(tag_fcm_lib, min_freq = 0.8)
plot_grid(a, b, labels = c("Conservative", "Liberal"))
dev.off()
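# A short optional sanity check (base R only, using the objects created above): how many tweets
# ended up labeled "conservative" (1) versus not (0), in counts and in shares?
table(docvars(myDfmTwitter)$conservative)
prop.table(table(docvars(myDfmTwitter)$conservative))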