rm(list=ls(all=TRUE))

getwd()
# set here the folder where you saved the datasets you are going to use in this class
# setwd("*****")
# in my case:
# setwd("C:/Users/mw/Dropbox (VOICES)/TOPIC MODEL/")
setwd("C:/Users/luigi/Dropbox/TOPIC MODEL/")
getwd()

library(readtext)
library(quanteda)
library(quanteda.textmodels)
library(cowplot)
library(PerformanceAnalytics)
library(psych)

#########################################################################
#########################################################################
# Creating the corpus of the UK electoral programs, 1992 and 1997
#########################################################################
#########################################################################

myText <- readtext("Lecture 2/Wordscores manifestos/UK/*.txt",
                   docvarsfrom = "filenames", dvsep = " ",
                   docvarnames = c("Party", "Year"))
str(myText)

testCorpus <- corpus(myText)
summary(testCorpus)

# rename the documents by dropping the ".txt" extension
docnames(testCorpus) <- gsub(".txt", "", docnames(testCorpus))
summary(testCorpus)

#####################
# Remember! Comparing the results with and without stopwords, and with and
# without stemming, is good practice.
#####################

myDfm <- dfm(testCorpus, remove = stopwords("english"), tolower = TRUE,
             stem = TRUE, remove_punct = TRUE, remove_numbers = TRUE)
topfeatures(myDfm, 20) # 20 top words

#########################################################################
#########################################################################
# Using wordfish
#########################################################################
#########################################################################

# "dir" indicates which two documents are used for global identification purposes
# (the first document is constrained to lie to the left of the second one).
# This usually matters more for the interpretation of the results (i.e., the
# direction of the scores along the latent dimension: which documents get
# negative scores and which get positive ones) than for the estimation per se.

# here: LAB 92 to the left of CONS 92
summary(testCorpus)
wfm <- textmodel_wordfish(myDfm, dir = c(3, 1))
summary(wfm)

# here: CONS 92 to the left of LAB 92
wfm2 <- textmodel_wordfish(myDfm, dir = c(1, 3))
summary(wfm2)

# compare with the previous case! The scores are the same, just mirrored.
summary(wfm)
summary(wfm2)
cor(wfm$theta, wfm2$theta)

# what happens if you now put LIB 92 to the left of CONS 92?
summary(testCorpus)
wfm3 <- textmodel_wordfish(myDfm, dir = c(5, 1))
summary(wfm3)

# Nothing! The estimates are unchanged, because both anchorings imply the
# same orientation of the latent dimension.
cor(wfm$theta, wfm3$theta)
identical(wfm$theta, wfm3$theta)
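#####################
# A minimal robustness sketch (not part of the original exercise), following up on
# the "good practice" note above: re-estimate wordfish on a dfm that keeps stopwords
# and skips stemming, then correlate the document scores with those from myDfm.
# The object names myDfm_raw and wfm_raw are my own additions.
#####################

myDfm_raw <- dfm(testCorpus, tolower = TRUE,
                 remove_punct = TRUE, remove_numbers = TRUE)
wfm_raw <- textmodel_wordfish(myDfm_raw, dir = c(3, 1))
# a correlation close to 1 suggests the scaling is robust to these preprocessing choices
cor(wfm$theta, wfm_raw$theta)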
# Always run diagnostics!
# A good starting point is the analysis of the word discrimination parameters (beta):
# words with large absolute weights are estimated to lie at the extremes of the
# latent dimension.

# Plot estimated word positions
textplot_scale1d(wfm, margin = "features")

# Highlight specific words
textplot_scale1d(wfm, margin = "features",
                 highlighted = c("government", "global", "children", "bank",
                                 "economy", "citizenship", "productivity",
                                 "deficit", "nation", "freedom", "histor", "inflat"),
                 highlighted_color = "red")

# let's learn how to extract the estimates of the model and save them
# let's start with the word-level estimates
str(wfm)
words2 <- wfm$features
beta2 <- wfm$beta
psi2 <- wfm$psi
scores_words2 <- data.frame(words2, beta2, psi2)
str(scores_words2)

# top 20 words with the most negative beta
head(scores_words2[order(scores_words2$beta2), ], 20)
# top 20 words with the most positive beta
tail(scores_words2[order(scores_words2$beta2), ], 20)

write.csv(scores_words2, "result_wordfish_words.csv")

# Plot estimated document positions
summary(testCorpus)
textplot_scale1d(wfm, margin = "documents")
textplot_scale1d(wfm, margin = "documents", groups = docvars(testCorpus, "Party"))
textplot_scale1d(wfm, margin = "documents", groups = docvars(testCorpus, "Year"))

# extract the document-level estimates of the model and save them
party <- wfm$docs
theta <- wfm$theta
se.theta <- wfm$se.theta
scores_texts <- data.frame(party, theta, se.theta)
str(scores_texts)

# 95% confidence intervals around the document positions
scores_texts$lower <- scores_texts$theta - 1.96 * scores_texts$se.theta
scores_texts$upper <- scores_texts$theta + 1.96 * scores_texts$se.theta
str(scores_texts)

write.csv(scores_texts, "result_wordfish_texts.csv")

#########################################################################
#########################################################################
# Using wordfish: US presidential inaugural speeches after 1980
#########################################################################
#########################################################################

# apply wordfish by first considering Reagan 1981 to the right of Obama 2009,
# and then Trump 2017 to the right of Obama 2009: any change?

# create a dfm from the inaugural addresses from Reagan onwards
presDfm <- dfm(corpus_subset(data_corpus_inaugural, Year > 1980),
               remove = stopwords("english"), stem = TRUE, remove_punct = TRUE)
presDfm[, 1:10]
str(presDfm)
docnames(presDfm)

# Obama 2009 to the left of Reagan 1981
wfm <- textmodel_wordfish(presDfm, dir = c(8, 1))
summary(wfm)

# Plot estimated word positions
textplot_scale1d(wfm, margin = "features")
# Plot estimated document positions
textplot_scale1d(wfm, margin = "documents")

# Obama 2009 to the left of Trump 2017
wfm2 <- textmodel_wordfish(presDfm, dir = c(8, 10))
summary(wfm2)
# Plot estimated document positions
textplot_scale1d(wfm2, margin = "documents")

# check the correlation between the two sets of document scores
score_reagan <- wfm$theta
score_trump <- wfm2$theta
cor(score_reagan, score_trump)
identical(score_reagan, score_trump)

# Comparing the wordfish results with Reagan or Trump in "dir"
library(cowplot)
reagan <- textplot_scale1d(wfm, margin = "documents")
trump <- textplot_scale1d(wfm2, margin = "documents")
plot_grid(reagan, trump, labels = c('Reagan=1', 'Trump=1'))
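#####################
# A small follow-up sketch (my addition, not part of the original script): scatter
# the two sets of document estimates against each other to confirm visually that
# changing "dir" affects at most the orientation of the scale, not the relative
# ordering of the speeches.
#####################
plot(score_reagan, score_trump,
     xlab = "theta (Reagan 1981 anchoring)",
     ylab = "theta (Trump 2017 anchoring)")
text(score_reagan, score_trump, labels = docnames(presDfm), pos = 3, cex = 0.7)
abline(0, 1, lty = 2) # points on the 45-degree line imply identical estimates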