# Start from an empty workspace (hidden objects included) and move to the
# project folder.
# NOTE(review): the path below is machine-specific; the relative file paths
# used later in this script (e.g. "tmls.rds") are resolved against it.
rm(list = ls(all.names = TRUE))
setwd("C:/Users/luigi/Dropbox/TOPIC MODEL")
getwd()

# devtools::install_version("rtweet", version = "0.7.0", repos = "http://cran.us.r-project.org")
library(rtweet)
packageVersion("rtweet")

library(httpuv)
library(readtext)
library(quanteda)
library(ggplot2)
library(dplyr)
library(stringr)
library(quanteda.textstats)
library(quanteda.textplots )

# let's see if there is any difference between liberal and non-liberal US newspapers when covering a set
# of keyword-topics

# I ran the query below on Monday, 20 March, at 12:20 pm.
# tmls <- get_timeline(
#   c("nytimes", "washingtonpost", "nypost", "WSJ") , 
# n = 400
# )
# saveRDS(tmls , file = "tmls.rds")

# Load the saved timelines and count the tweets available per outlet.
tmls <- readRDS("tmls.rds")
table(tmls$name)

# Document-level variable separating liberal from non-liberal newspapers.
# NOTE: the label "NotLberal" (sic) is kept as is on purpose: the name
# "liberalNotLberal" used further down in this script contains it, so the
# two have to be changed together.
tmls$liberal <- with(
  tmls,
  ifelse(
    name == "The New York Times" | name == "The Washington Post",
    "Liberal",
    "NotLberal"
  )
)
table(tmls$liberal)
colnames(tmls)

# Build the corpus and peek at the document variables attached to it.
myCorpusTwitter <- corpus(tmls)
head(docvars(myCorpusTwitter))

# Tokenise the tweets, dropping punctuation, numbers, symbols, separators and
# URLs, and splitting hyphenated words.
# No stemming here! REMEMBER: stemming does not play well with keyATM unless
# the keyword list is written in stemmed form too.
tok <- myCorpusTwitter %>%
  tokens(
    remove_punct = TRUE,
    remove_numbers = TRUE,
    remove_symbols = TRUE,
    split_hyphens = TRUE,
    remove_separators = TRUE,
    remove_url = TRUE
  ) %>%
  tokens_remove(stopwords("english"))
myDfm <- dfm(tok)

# Drop stopwords (again) and one-character features, then keep only features
# with a total frequency of at least 15.
# NOTE(review): the stopword pass here looks redundant with the one applied to
# the tokens above, and docfreq_type presumably has no effect because no
# document-frequency threshold is given - confirm before simplifying.
dfmt <- myDfm %>%
  dfm_remove(stopwords('en'), min_nchar = 2) %>%
  dfm_trim(min_termfreq = 15, docfreq_type = "prop")

# Trimming can leave documents without any token: count them, drop them,
# and check that none is left.
table(ntoken(dfmt) > 0)
dfmt <- dfm_subset(dfmt, ntoken(dfmt) > 0)
table(ntoken(dfmt) > 0)

library(keyATM)
library(magrittr)

# Hand the trimmed dfm over to keyATM.
keyATM_docs <- keyATM_read(texts = dfmt)
summary(keyATM_docs)

# Seed words for the three keyword-topics (biden, war, bank).
keywords <- list(
  biden = c("joe", "biden", "president", "government"),
  war   = c("ukraine", "putin", "war", "russia", "zelensky"),
  bank  = c("bank", "savings", "svb", "credit", "suisse")
)
keywords

# Visual check of the chosen keywords against the documents.
key_viz <- visualize_keywords(docs = keyATM_docs, keywords = keywords)
key_viz

# adding covariates
# The covariate is read through the public docvars() accessor instead of the
# internal @docvars slot of the dfm, whose layout is not part of the API.
table(docvars(dfmt, "liberal"))

# One-column data frame holding the "liberal" covariate. It is taken from
# dfmt (not from tmls) so that its rows match the documents that survived the
# removal of empty documents.
vars_selected <- data.frame(liberal = docvars(dfmt, "liberal"))
str(vars_selected)

# Let's suppose that no_keyword_topics = 2 is a good value. How can we be sure about it?
# You should estimate the average topic coherence and exclusivity and compare them across
# different model specifications, as we did for topic models and STM. This is highly recommended!
# Take a look at the EXTRA script on the home page of the course to see how to do it.

# Fit the covariate keyATM (3 keyword-topics + 2 no-keyword topics) with
# "liberal" as the only covariate, and report how long the estimation takes.
# The seed is fixed for reproducibility; "Z" and "S" are kept in the output.
system.time({
  out <- keyATM(
    docs = keyATM_docs,
    no_keyword_topics = 2,
    keywords = keywords,
    model = "covariates",
    model_settings = list(
      covariates_data = vars_selected,
      covariates_formula = ~ liberal
    ),
    options = list(seed = 123),
    keep = c("Z", "S")
  )
})

# Diagnostics
plot_modelfit(out)
plot_pi(out)

# Ten top words per topic
top_words(out, n = 10)

# Impact of the covariate
covariates_info(out)

# Compute and plot the average estimated thetas (i.e. the salience) of the
# 3 keyword-topics for Liberal and non-Liberal newspapers.
# "liberalNotLberal" (sic) has to match the spelling of the label stored in
# the "liberal" covariate.
strata_topic <- by_strata_DocTopic(
  out,
  by_var = "liberalNotLberal",
  labels = c("Liberal", "Not Liberal")
)
plot(strata_topic, var_name = "liberalNotLberal", show_topic = 1:3)

# We could stop here, or try to understand whether the difference highlighted above between the thetas of liberal
# and non-liberal newspapers is statistically significant. Note, however, that we do not have a coefficient and a s.e.
# from which to compute such a difference! Under a Bayesian framework we have a posterior distribution of the mean thetas
# for each value of our covariate (i.e., for liberal and non-liberal newspapers). We therefore have to understand whether
# there is a significant difference between these two distributions.
# Let's, for example, estimate a 95% credible interval of such a difference.
# Remember: the 95% credible interval is simply the central portion of the posterior distribution
# (in our case, of the difference between liberal and non-liberal thetas) that contains 95% of the values.

# Posterior distribution of the mean thetas for liberal newspapers
# (first element of strata_topic$theta):
str( strata_topic$theta[[1]])
# let's plot this distribution for keyword-topic 1 (i.e., Biden)
# NOTE(review): `[1]` presumably selects the first column - confirm the type
# of the elements of strata_topic$theta with the str() output above.
plot( strata_topic$theta[[1]][1])
# Posterior distribution of the mean thetas for non-liberal newspapers
# (second element of strata_topic$theta):
str( strata_topic$theta[[2]])
# let's plot the distribution of the same keyword-topic 1 (i.e., Biden),
# this time for non-liberal newspapers
plot( strata_topic$theta[[2]][1])

# 2.5%, 50% and 97.5% quantiles, column by column, of the element-wise
# difference between the thetas of the first stratum (liberal) and those of
# the second one (non-liberal).
theta_diff_quantile <- apply(
  strata_topic$theta[[1]] - strata_topic$theta[[2]],
  2,
  quantile,
  probs = c(0.025, 0.5, 0.975)
)
# there is a statistically significant difference at 95% c.i. only for the bank (i.e., liberal newspapers talk less about it
# compared to non-liberal ones)
theta_diff_quantile

# let's plot it (for our 3 keyword-topics)
lower <- theta_diff_quantile[1, 1:3]
# Row 2 is the 0.5 quantile, i.e. the median of the difference (not its mean).
# Keeping it under its own name also avoids masking base::mean() for the rest
# of the session.
median_diff <- theta_diff_quantile[2, 1:3]
higher <- theta_diff_quantile[3, 1:3]
res <- as.data.frame(cbind(lower, median = median_diff, higher))
str(res)
# The row names carry the topic labels; turn them into a plotting variable.
res$topic <- as.factor(row.names(res))

library(ggplot2)

# Point = median difference, range = 95% credible interval; the dashed red
# line marks "no difference" between liberal and non-liberal newspapers.
ggplot(res, aes(topic, median)) +
  geom_point() +
  geom_linerange(aes(ymin = lower, ymax = higher)) +
  geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
  theme_bw() +
  ylab("Median of the difference with 95% credible interval: Liberal - Non Liberal Media") +
  coord_flip()

# Difference in the language: top words of each topic, computed separately
# for the two groups of newspapers.
strata_tw <- by_strata_TopicWord(
  out,
  keyATM_docs,
  by = as.vector(vars_selected$liberal)
)
top_words(strata_tw, n = 10)