# ------------------------------------------------------------------------
# Setup
# NOTE(review): rm(list = ls()) and setwd() in scripts are discouraged
# (they wipe the user's session and hard-code a machine-specific path);
# kept here because the rest of the script relies on relative paths.
# ------------------------------------------------------------------------
rm(list=ls(all=TRUE))
setwd("C:/Users/luigi/Dropbox/TOPIC MODEL/")

library(readtext)
library(quanteda)
library(quanteda.textmodels)
library(cowplot)
library(PerformanceAnalytics)
library(psych)
library(quanteda.textplots)
library(quanteda.textstats)
library(ggplot2)
library(ggrepel)
library(dplyr)

#########################################################################
#########################################################################
# Creating the Corpus of the UK electoral programs 1992, 1997
#########################################################################
#########################################################################

# Party and year are recovered from the file names ("PARTY YEAR.txt")
myText <- readtext("Lecture 2/Wordscores manifestos/UK/*.txt",
                   docvarsfrom = "filenames", dvsep = " ",
                   docvarnames = c("Party", "Year"))
str(myText)

# Replace apostrophes with spaces so contractions split into separate tokens
myText$text <- gsub("'", " ", myText$text)

testCorpus <- corpus(myText)
summary(testCorpus)

# Rename the documents by stripping the ".txt" extension.
# BUG FIX: the original call lacked fixed = TRUE, so the unescaped "."
# acted as a regex wildcard (any character followed by "txt" matched,
# e.g. "atxt" would also have been removed).
docnames(testCorpus) <- gsub(".txt", "", docnames(testCorpus), fixed = TRUE)
summary(testCorpus)

# Tokenize, drop punctuation/numbers/symbols, remove stopwords, and stem
tok2 <- tokens(testCorpus, remove_punct = TRUE, remove_numbers = TRUE,
               remove_symbols = TRUE, split_hyphens = TRUE,
               remove_separators = TRUE)
tok2 <- tokens_remove(tok2, stopwords("en"))
tok2 <- tokens_wordstem(tok2)

myDfm <- dfm(tok2)
topfeatures(myDfm, 20)  # 20 top words

# Keep only features with at least 2 characters (removes e.g. the stray "s")
myDfm <- dfm_remove(myDfm, min_nchar = 2)
topfeatures(myDfm, 20)  # 20 top words

#########################################################################
#########################################################################
# Using Wordfish
#########################################################################
#########################################################################

# dir indicates which two documents are used for global identification
# purposes (the first document to the left of the second one); this matters
# ONLY for the interpretation of the results (i.e., for the direction of
# the scores along the latent dimension - which positive / which negative),
# not for the estimation per se
docvars(myDfm)

# Here: LAB 92 (doc 3) constrained to the left of CONS 92 (doc 1)
wfm <- textmodel_wordfish(myDfm, dir = c(3, 1))
summary(wfm)

# Here: CONS 92 to the left of LAB 92 (the constraint is reversed)
wfm2 <- textmodel_wordfish(myDfm, dir = c(1, 3))
summary(wfm2)

# Compare with the previous case: the document scores are simply mirrored
summary(wfm)
summary(wfm2)
str(wfm)
cor(wfm$theta, wfm2$theta)

# What happens if we now put LIB 92 (doc 5) to the left of CONS 92?
summary(testCorpus)
wfm3 <- textmodel_wordfish(myDfm, dir = c(5, 1))
summary(wfm3)

# Nothing! dir only fixes the direction of the scale, not the estimates
cor(wfm$theta, wfm3$theta)
identical(wfm$theta, wfm3$theta)

# Always run some diagnostics on the word discrimination parameters (beta).
# Features with large |beta| are estimated to lie at the extremes of the
# latent dimension. Start by extracting the model estimates; note that
# data.frame() auto-names the columns wfm.features / wfm.beta / wfm.psi.
str(wfm)
scores_words <- data.frame(wfm$features, wfm$beta, wfm$psi)
str(scores_words)

# Correlation between word fixed effects (psi) and |beta|: as expected it is
# strongly negative - the larger psi is, the smaller |beta|, and vice versa
cor(abs(scores_words$wfm.beta), scores_words$wfm.psi)

# Plot estimated word positions
textplot_scale1d(wfm, margin = "features")

# Alternative psi/beta plot via ggplot: the typical triangle shape appears.
# First with plain points...
ggplot(scores_words, aes(wfm.beta, wfm.psi)) +
  geom_point() +
  theme_light() +
  labs(title = "Scatterplot for UK-manifestoes",
       x = "Estimated Beta", y = "Estimated Psi")

# ...then with feature labels
ggplot(scores_words, aes(wfm.beta, wfm.psi, label = wfm.features)) +
  geom_point() +
  geom_text(hjust = 0, vjust = 0) +
  theme_light() +
  labs(title = "Scatterplot for UK-manifestoes",
       x = "Estimated Beta", y = "Estimated Psi")

# Let's try to interpret the results by focusing on the words with the
# largest betas in absolute value.
# Top 40 features for negative beta: renounce, democrat, banks,
# enterprises, pollutants, nationals
head(scores_words[order(scores_words$wfm.beta), ], 40)

# Top 40 words for positive beta: producers, preservation, efforts, enrich,
# illegal, socialism
tail(scores_words[order(scores_words$wfm.beta), ], 40)

# With just 6 documents the meaning of the latent dimension is not very
# clear from the betas alone (at least the first 40 features). Perhaps
# progressive vs. conservative?

# Visual representation: extract the 10 words with the largest positive and
# the 10 words with the largest negative beta.
# FIX: slice_max()/slice_min() replace the superseded dplyr::top_n();
# the default with_ties = TRUE matches top_n()'s tie handling, so the same
# rows are selected.
scores_words2 <- slice_max(scores_words, wfm.beta, n = 10)
scores_words2
scores_words3 <- slice_min(scores_words, wfm.beta, n = 10)
scores_words3
scores_words_new <- rbind(scores_words2, scores_words3)

# Reorder the features by beta so the bar chart is sorted
scores_words_new <- mutate(scores_words_new,
                           Feature = reorder(wfm.features, wfm.beta))

ggplot(scores_words_new, aes(Feature, wfm.beta)) +
  geom_col(aes(fill = wfm.psi)) +
  scale_fill_viridis_c() +
  coord_flip() +
  theme_light() +
  labs(title = "\nTop 10 words with the highest/lowest beta-value\n",
       x = "", y = "Beta", fill = "Psi")

# Plot estimated word positions, highlighting specific words
textplot_scale1d(wfm, margin = "features",
                 highlighted = c("renounce", "pollutants", "democrat",
                                 "producers", "illegal", "enrich"),
                 highlighted_color = "red", alpha = 0.3)

# Alternatively, label only the features with abs(beta) > 3.
# geom_text_repel (rather than geom_text) avoids overlapping labels.
ggplot(scores_words, aes(wfm.beta, wfm.psi)) +
  geom_point() +
  theme_light() +
  labs(title = "Scatterplot for UK-manifestoes",
       x = "Estimated Beta", y = "Estimated Psi") +
  geom_text_repel(data = filter(scores_words, abs(wfm.beta) > 3),
                  aes(label = wfm.features))

# Plot estimated document positions
summary(testCorpus)
textplot_scale1d(wfm, margin = "documents")
textplot_scale1d(wfm, margin = "documents",
                 groups = docvars(testCorpus, "Party"))
textplot_scale1d(wfm, margin = "documents", groups =
docvars(testCorpus, "Year")) ######################################################################### ######################################################################### # Let's compare the results we get from Wordfish with the raw score ones # we get from Wordscores using the economic policy position ######################################################################### ######################################################################### docvars(myDfm) # Wordscores ws <- textmodel_wordscores(myDfm, c(17.21, rep(NA,1), 5.35, rep(NA,1), 8.21, rep(NA,1))) pr_all <- predict(ws, interval = "confidence") # Comparing wordscores vs wordfish wordscores <- textplot_scale1d(pr_all) wordfish <- textplot_scale1d(wfm, margin = "documents") plot_grid(wordscores , wordfish , labels = c('Wordscores', 'Wordfish')) # check for the correlation party <- wfm$docs score_wf <-wfm$theta score_ws <- pr_all$fit scores_texts <-data.frame(party, score_wf, score_ws) str(scores_texts) colnames(scores_texts)[3] <- "score_ws" str(scores_texts) # high but not perfect correlation. Two different dimensions? cor(scores_texts$score_ws, scores_texts$score_wf) # you can also draw a scatter, with a fit lines and party names ggplot(scores_texts, aes(score_ws, score_wf, label=party)) + geom_point() +geom_text(hjust=0, vjust=0) + theme_light() + labs(title = "Scatterplot for UK-manifestoes", x = "Wordscores Economy", y = "Wordfish") + geom_smooth(method = "lm", se = TRUE) ######################################################################### ######################################################################### # Using wordfish: US Presidential Inaugural Speech after 1980 ######################################################################### ######################################################################### # apply wordfish by first considering Reagan 1981 to the right of Obama 2009; # and then Trump 2017 to the right of Obama 2009: any change? 
corpus_Pres <- corpus_subset(data_corpus_inaugural, Year > 1980)

# Same preprocessing pipeline as for the UK manifestos
tok2 <- tokens(corpus_Pres, remove_punct = TRUE, remove_numbers = TRUE,
               remove_symbols = TRUE, split_hyphens = TRUE,
               remove_separators = TRUE)
tok2 <- tokens_remove(tok2, stopwords("en"))
tok2 <- tokens_wordstem(tok2)
presDfm <- dfm(tok2)
docvars(presDfm)

# Obama 2009 (doc 8) constrained to the left of Reagan 1981 (doc 1)
wfm <- textmodel_wordfish(presDfm, dir = c(8, 1))
summary(wfm)

# Extract the word-level estimates into a data frame
words2 <- wfm$features
beta2 <- wfm$beta
psi2 <- wfm$psi
scores_words2 <- data.frame(words2, beta2, psi2)
str(scores_words2)

# Top 40 words for negative beta: tyranni; ideolog; murder; tyrant; unlimit
head(scores_words2[order(scores_words2$beta2), ], 40)

# Top 40 words for positive beta: rainbow; western; sovereignti; cheer
tail(scores_words2[order(scores_words2$beta2), ], 40)

# Possible meaning of the latent dimension: a pessimistic/realistic vs.
# optimistic side of power? To us it does not appear related to ideology,
# as the document plot below also suggests (face validity).

# Highlight specific words
textplot_scale1d(wfm, margin = "features",
                 highlighted = c("tyranni", "murder", "mortal",
                                 "cheer", "rainbow", "sovereignti"),
                 highlighted_color = "red", alpha = 0.3)

# Plot estimated document positions
textplot_scale1d(wfm, margin = "documents")

# Obama 2009 (doc 8) to the left of Trump 2017 (doc 10)
wfm2 <- textmodel_wordfish(presDfm, dir = c(8, 10))
summary(wfm2)

# Plot estimated document positions
textplot_scale1d(wfm2, margin = "documents")

# Check the correlation between the two runs.
# NOTE(review): identical() on floating-point vectors is an exact
# byte-for-byte comparison; it is used here deliberately to show whether
# the two runs produce literally the same estimates.
score_reagan <- wfm$theta
score_trump <- wfm2$theta
cor(score_reagan, score_trump)
identical(score_reagan, score_trump)