# Start from a clean workspace so objects left over from a previous session
# cannot leak into this analysis. `all.names` is spelled out instead of relying
# on partial matching of `all`.
# NOTE(review): restarting R gives the same guarantee without deleting objects
# the user may still need -- consider dropping this line.
rm(list = ls(all.names = TRUE))
# Project folder used as working directory (tmls.rds is read from it below via
# a relative path). Switch to it only when it exists, so the script does not
# abort with "cannot change working directory" on other machines; in that case
# the current working directory is used as is.
project_dir <- "C:/Users/luigi/Dropbox/TOPIC MODEL"
if (dir.exists(project_dir)) {
  setwd(project_dir)
}
getwd()
# devtools::install_version("rtweet", version = "0.7.0", repos = "http://cran.us.r-project.org")
library(rtweet)
packageVersion("rtweet")
library(httpuv)
library(readtext)
library(quanteda)
library(ggplot2)
library(dplyr)
library(stringr)
library(quanteda.textstats)
library(quanteda.textplots )
# let's see if there is any difference between liberal and non-liberal US newspapers when covering a set
# of keyword-topics
# I ran the query below on Monday, 20 March, at 12:20 pm.
# tmls <- get_timeline(
# c("nytimes", "washingtonpost", "nypost", "WSJ") ,
# n = 400
# )
# saveRDS(tmls , file = "tmls.rds")
# Load the cached timelines instead of re-running the (commented-out) query.
tmls <- readRDS("tmls.rds")
table(tmls[["name"]])
# Document-level variable separating liberal from non-liberal newspapers.
# The "NotLberal" spelling is kept on purpose: later code refers to the
# covariate by the name "liberalNotLberal", which appears to be derived from
# this label.
tmls[["liberal"]] <- ifelse(
  tmls[["name"]] == "The New York Times" |
    tmls[["name"]] == "The Washington Post",
  "Liberal",
  "NotLberal"
)
table(tmls[["liberal"]])
colnames(tmls)
# Build the corpus; the columns of tmls travel along as document variables.
myCorpusTwitter <- corpus(tmls)
head(docvars(myCorpusTwitter))
# Tokenise the tweets: drop punctuation, numbers, symbols, separators and URLs,
# split hyphenated words, then remove English stopwords.
# No stemming here! REMEMBER! Stemming is not a good idea with keyATM unless the
# keyword list is written in stemmed form as well.
tok <- myCorpusTwitter %>%
  tokens(
    remove_punct = TRUE,
    remove_numbers = TRUE,
    remove_symbols = TRUE,
    split_hyphens = TRUE,
    remove_separators = TRUE,
    remove_url = TRUE
  ) %>%
  tokens_remove(stopwords("english"))
myDfm <- dfm(tok)
# Second stopword pass (this time together with the min_nchar = 2 length
# setting), followed by trimming features with a total count below 15.
# NOTE(review): docfreq_type = "prop" looks unused because no document-frequency
# threshold is supplied -- confirm against the dfm_trim() documentation.
dfmt <- myDfm %>%
  dfm_remove(stopwords("en"), min_nchar = 2) %>%
  dfm_trim(min_termfreq = 15, docfreq_type = "prop")
# Trimming can leave documents without any feature: count them, drop them and
# check again.
table(ntoken(dfmt) > 0)
dfmt <- dfm_subset(dfmt, ntoken(dfmt) > 0)
table(ntoken(dfmt) > 0)
library(keyATM)
library(magrittr)
# Convert the trimmed dfm into the document format keyATM works with.
keyATM_docs <- keyATM_read(texts = dfmt)
summary(keyATM_docs)
# Seed words for the three keyword topics, in this order: biden, war, bank.
keywords <- list()
keywords$biden <- c("joe", "biden", "president", "government")
keywords$war <- c("ukraine", "putin", "war", "russia", "zelensky")
keywords$bank <- c("bank", "savings", "svb", "credit", "suisse")
keywords
# Inspect the chosen keywords against the documents before fitting.
key_viz <- visualize_keywords(keywords = keywords, docs = keyATM_docs)
key_viz
# adding covariates
# Read the "liberal" document variable through quanteda's docvars() accessor
# rather than the dfm's internal @docvars slot, whose layout is not part of the
# public API.
table(docvars(dfmt, "liberal"))
# One-column data frame, already named "liberal", with one row per document
# still present in dfmt; this is the covariate data handed to keyATM.
vars_selected <- data.frame(liberal = docvars(dfmt, "liberal"))
str(vars_selected)
# let's suppose that no_keyword_topics = 2 is a good value. How to be sure about it?
# You should estimate the average value for topic coherence and exclusivity to compare
# different model specifications as we did for topic models and STM! This is highly recommended!
# Take a look at the EXTRA script in the home-page of the course for doing it
# Fit the covariate keyATM (3 keyword topics + 2 no-keyword topics) with the
# "liberal" covariate, and report how long the estimation takes.
# keep = c("Z", "S") stores extra model output in `out`.
# NOTE(review): presumably required by by_strata_TopicWord() later on -- confirm.
system.time(
  out <- keyATM(
    docs = keyATM_docs,
    keywords = keywords,
    no_keyword_topics = 2,
    model = "covariates",
    model_settings = list(
      covariates_data = vars_selected,
      covariates_formula = ~ liberal
    ),
    options = list(seed = 123),
    keep = c("Z", "S")
  )
)
# Diagnostics: model-fit plot, pi plot and the 10 top words of each topic.
plot_modelfit(out)
plot_pi(out)
top_words(out, n = 10)
# Information about the covariate used in the model.
covariates_info(out)
# Compute and plot the average estimated thetas (i.e. the salience) of the
# 3 keyword-topics for Liberal and non-Liberal newspapers.
strata_topic <- by_strata_DocTopic(
  out,
  by_var = "liberalNotLberal",
  labels = c("Liberal", "Not Liberal")
)
plot(strata_topic, var_name = "liberalNotLberal", show_topic = 1:3)
# We could stop it here or try to understand if there is any statistically significant difference between the thetas for liberal
# and non-liberal newspapers as we highlighted above. In doing that however note that we don't have a coefficient and a s.e. to compute
# such difference! Under a bayesian framework we have a distribution in the posterior mean of the thetas for each value of our
# covariate (i.e., for liberal and non-liberal newspapers). We have therefore to understand if there is a significant difference
# between these two distributions.
# Let's for example estimate a 95% credible interval of such difference.
# Remember: the 95% credible interval is simply the central portion of the posterior distribution
# (in our case the difference between liberal and not liberal thetas) that contains 95% of the values.
# Posterior distribution of the mean thetas for liberal newspapers
# (first element of strata_topic$theta):
str(strata_topic[["theta"]][[1]])
# plot that distribution for keyword-topic 1 (i.e., Biden)
plot(strata_topic[["theta"]][[1]][1])
# Posterior distribution of the mean thetas for non-liberal newspapers
# (second element of strata_topic$theta):
str(strata_topic[["theta"]][[2]])
# plot the distribution of keyword-topic 1 (i.e., Biden) again, this time for
# the non-liberal newspapers
plot(strata_topic[["theta"]][[2]][1])
# Column-wise 2.5%, 50% and 97.5% quantiles of the difference
# Liberal - Not Liberal: a 95% credible interval plus the median per topic.
theta_diff_quantile <- apply(
  X = strata_topic[["theta"]][[1]] - strata_topic[["theta"]][[2]],
  MARGIN = 2,
  FUN = quantile,
  probs = c(0.025, 0.5, 0.975)
)
# Reported result: the 95% credible interval shows a statistically significant
# difference only for the bank topic (i.e., liberal newspapers talk less about
# it compared to non-liberal ones).
theta_diff_quantile
# let's plot it (for our 3 keyword-topics)
# Rows of theta_diff_quantile are the 2.5%, 50% and 97.5% quantiles, so the
# middle row is the posterior MEDIAN of the difference, not its mean. The
# columns are built directly here, so no global variable called `mean` is
# created to mask base::mean().
res <- as.data.frame(cbind(
  lower = theta_diff_quantile[1, 1:3],
  median = theta_diff_quantile[2, 1:3],
  higher = theta_diff_quantile[3, 1:3]
))
str(res)
# Row names carry the topic names; turn them into the plotting variable.
res$topic <- as.factor(row.names(res))
library(ggplot2)
# Point = posterior median, range = 95% credible interval. The dashed red line
# marks zero, i.e. no difference between the two groups of newspapers.
ggplot(res, aes(topic, median)) +
  geom_point() +
  geom_linerange(aes(ymin = lower, ymax = higher)) +
  geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
  theme_bw() +
  ylab("Median of the difference with 95% credible interval: Liberal - Non Liberal Media") +
  coord_flip()
# Difference in the language: top words of each topic, computed separately for
# every value of the "liberal" covariate.
strata_tw <- by_strata_TopicWord(
  out,
  keyATM_docs,
  by = as.vector(vars_selected[["liberal"]])
)
top_words(strata_tw, n = 10)