# keyATM with covariates: do liberal and non-liberal US newspapers differ in
# how much they talk about a set of keyword-topics (Biden, war, banks)?
#
# Workflow: load saved timelines -> build a quanteda corpus/dfm -> fit a
# covariate keyATM -> compare the estimated topic proportions (thetas) across
# the two groups of newspapers.

# NOTE(review): clearing the workspace and hard-coding a working directory are
# kept only because the rest of the script (readRDS("tmls.rds")) relies on
# them. The path is machine-specific; adjust it before running elsewhere.
rm(list = ls(all = TRUE))
setwd("C:/Users/luigi/Dropbox/TOPIC MODEL")
getwd()

# devtools::install_version("rtweet", version = "0.7.0",
#                           repos = "http://cran.us.r-project.org")
library(rtweet)
packageVersion("rtweet")
library(httpuv)
library(readtext)
library(quanteda)
library(ggplot2)
library(dplyr)
library(stringr)
library(quanteda.textstats)
library(quanteda.textplots)
library(keyATM)
library(magrittr)

# Data ----

# Let's see if there is any difference between liberal and non-liberal US
# newspapers when covering a set of keyword-topics.

# I ran the query below on Monday 20 March at 12:20 pm.
# tmls <- get_timeline(
#   c("nytimes", "washingtonpost", "nypost", "WSJ"),
#   n = 400
# )
# saveRDS(tmls, file = "tmls.rds")

tmls <- readRDS("tmls.rds")
table(tmls$name)

# Create a new document-level variable that differentiates liberal vs.
# non-liberal newspapers.
# NOTE: the label "NotLberal" (sic) is kept as is, because the dummy name
# "liberalNotLberal" used further below is derived from it.
tmls$liberal <- ifelse(
  tmls$name == "The New York Times" | tmls$name == "The Washington Post",
  "Liberal",
  "NotLberal"
)
table(tmls$liberal)
colnames(tmls)

# Corpus, tokens and document-feature matrix ----

myCorpusTwitter <- corpus(tmls)
head(docvars(myCorpusTwitter))

tok <- tokens(
  myCorpusTwitter,
  remove_punct = TRUE,
  remove_numbers = TRUE,
  remove_symbols = TRUE,
  split_hyphens = TRUE,
  remove_separators = TRUE,
  remove_url = TRUE
)

# No stemming here! REMEMBER! Stemming is not a good thing with keyATM unless
# you write the list of keywords accordingly (i.e., as stems).
tok <- tokens_remove(tok, stopwords("english"))
myDfm <- dfm(tok)

dfmt <- dfm_remove(myDfm, stopwords("en"), min_nchar = 2)
# Keep only features that occur at least 15 times overall.
# NOTE(review): docfreq_type only governs min_docfreq/max_docfreq, which are
# not set here, so it appears to have no effect -- confirm against dfm_trim().
dfmt <- dfm_trim(dfmt, min_termfreq = 15, docfreq_type = "prop")

# Trimming can leave some documents empty: drop them and check the result.
table(ntoken(dfmt) > 0)
dfmt <- dfm_subset(dfmt, ntoken(dfmt) > 0)
table(ntoken(dfmt) > 0)

# keyATM input and keywords ----

keyATM_docs <- keyATM_read(texts = dfmt)
summary(keyATM_docs)

keywords <- list(
  biden = c("joe", "biden", "president", "government"),
  war = c("ukraine", "putin", "war", "russia", "zelensky"),
  bank = c("bank", "savings", "svb", "credit", "suisse")
)
keywords

key_viz <- visualize_keywords(docs = keyATM_docs, keywords = keywords)
key_viz

# Covariates ----

# Extract the "liberal" covariate into a data frame, using the public
# docvars() accessor rather than the internal @docvars slot. It is taken from
# the subsetted dfm so that its rows match the documents passed to keyATM.
table(docvars(dfmt, "liberal"))
vars_selected <- data.frame(liberal = docvars(dfmt, "liberal"))
str(vars_selected)

# Model ----

# Let's suppose that no_keyword_topics = 2 is a good value. How to be sure
# about it? You should estimate the average value for topic coherence and
# exclusivity to compare different model specifications, as we did for topic
# models and STM! This is highly recommended! Take a look at the EXTRA script
# in the home-page of the course for doing it.
system.time(
  out <- keyATM(
    docs = keyATM_docs,
    no_keyword_topics = 2,
    keywords = keywords,
    model = "covariates",
    model_settings = list(
      covariates_data = vars_selected,
      covariates_formula = ~ liberal
    ),
    options = list(seed = 123),
    keep = c("Z", "S")
  )
)

# Diagnostics
plot_modelfit(out)
plot_pi(out)
top_words(out, 10)

# Covariate's impact
covariates_info(out)

# Thetas by group ----

# Compute and plot the average estimated thetas (i.e., the salience) for
# Liberal and non-Liberal newspapers when discussing the 3 keyword-topics.
# "liberalNotLberal" is the dummy built from the formula term `liberal` and
# its level "NotLberal".
strata_topic <- by_strata_DocTopic(
  out,
  by_var = "liberalNotLberal",
  labels = c("Liberal", "Not Liberal")
)
plot(strata_topic, var_name = "liberalNotLberal", show_topic = 1:3)

# We could stop here, or try to understand whether there is any statistically
# significant difference between the thetas for liberal and non-liberal
# newspapers. Note that we do not have a coefficient and a s.e. to compute
# such a difference! Under a Bayesian framework we have a posterior
# distribution of the mean thetas for each value of our covariate (i.e., for
# liberal and non-liberal newspapers), so we have to check whether these two
# distributions differ.
# Let's estimate a 95% credible interval of such difference.
# Remember: the 95% credible interval is simply the central portion of the
# posterior distribution (here, of the difference between liberal and
# non-liberal thetas) that contains 95% of the values.

# Posterior distribution of the mean thetas for liberal newspapers:
str(strata_topic$theta[[1]])
# ... and its plot for keyword-topic 1 (i.e., Biden)
plot(strata_topic$theta[[1]][1])

# Posterior distribution of the mean thetas for non-liberal newspapers:
str(strata_topic$theta[[2]])
# ... and its plot for keyword-topic 1 (i.e., Biden) as well
plot(strata_topic$theta[[2]][1])

# Column-wise (i.e., per topic) 2.5%, 50% and 97.5% quantiles of the
# difference Liberal - Not Liberal.
theta_diff_quantile <- apply(
  strata_topic$theta[[1]] - strata_topic$theta[[2]],
  2,
  quantile,
  c(0.025, 0.5, 0.975)
)

# In the run recorded by the author, there is a statistically significant
# difference at the 95% c.i. only for the bank topic (i.e., liberal newspapers
# talk less about it compared to non-liberal ones).
theta_diff_quantile

# Plot it for our 3 keyword-topics (the first three columns).
# The middle row is the 0.5 quantile, i.e. the posterior MEDIAN (not the
# mean), so it is named and labelled accordingly; this also avoids masking
# base::mean().
res <- data.frame(
  lower = theta_diff_quantile[1, 1:3],
  median_diff = theta_diff_quantile[2, 1:3],
  higher = theta_diff_quantile[3, 1:3]
)
str(res)
res$topic <- as.factor(row.names(res))

ggplot(res, aes(topic, median_diff)) +
  geom_point() +
  geom_linerange(aes(ymin = lower, ymax = higher)) +
  geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
  theme_bw() +
  ylab(
    "Median of the difference with 95% credible interval: Liberal - Non Liberal Media"
  ) +
  coord_flip()

# Difference in the language ----

strata_tw <- by_strata_TopicWord(
  out,
  keyATM_docs,
  by = as.vector(vars_selected$liberal)
)
top_words(strata_tw, n = 10)