rm(list = ls(all = TRUE))
getwd()
# set here the folder where you have saved the datasets you are going to use in this class
# setwd("*****")
# in my case
setwd("C:/Users/mw/Dropbox (VOICES)/TOPIC MODEL/")
getwd()

library(readtext)
library(quanteda)
# NOTE: with quanteda >= 2.0, textmodel_wordfish() lives in the quanteda.textmodels
# package and textplot_scale1d() in quanteda.textplots; load those packages as well
# if you are running a recent quanteda version

#########################################################################
#########################################################################
# Creating the Corpus of the UK electoral programs 1992, 1997
#########################################################################
#########################################################################

myText <- readtext("Lecture 2/Wordscores manifestos/UK/*.txt",
                   docvarsfrom = "filenames", dvsep = " ",
                   docvarnames = c("Party", "Year"))
str(myText)
testCorpus <- corpus(myText)
summary(testCorpus)

# I rename the texts
docnames(testCorpus) <- gsub(".txt", "", docnames(testCorpus))
summary(testCorpus)

#####################
# Remember! Comparing the results with and w/o stopwords, and with and w/o stemming, is always good practice.
#####################

myDfm <- dfm(testCorpus, remove = stopwords("english"), tolower = TRUE,
             stem = TRUE, remove_punct = TRUE, remove_numbers = TRUE)
topfeatures(myDfm, 20)  # 20 top words

#########################################################################
#########################################################################
# Using wordfish
#########################################################################
#########################################################################

# dir indicates which two documents are used for global identification purposes
# (the first document is placed to the left of the second one);
# this usually matters more for the interpretation of the results (i.e., for the
# direction of the scores along the latent dimension: which ones are positive,
# which ones negative) than for the estimation per se

# here: LAB 92 to the left of CONS 92
summary(testCorpus)
wfm <- textmodel_wordfish(myDfm, dir = c(3, 1))
summary(wfm)
str(wfm)

# here: CONS 92 to the left of LAB 92
wfm2 <- textmodel_wordfish(myDfm, dir = c(1, 3))
summary(wfm2)

# compare with the previous case!
summary(wfm)
summary(wfm2)
cor(wfm$theta, wfm2$theta)

# what happens if you now put LIB 92 to the left of CONS 92?
summary(testCorpus)
wfm3 <- textmodel_wordfish(myDfm, dir = c(5, 1))
summary(wfm3)
# Nothing!
cor(wfm$theta, wfm3$theta)
identical(wfm$theta, wfm3$theta)

# Always run diagnostics!
# A good start for diagnostics is the analysis of the word discrimination parameters.
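# A minimal numeric sketch of that diagnostic (the diag_words name is just
# illustrative; it relies on the wfm object estimated above): list the 20 words
# with the largest absolute discrimination parameter (beta)
diag_words <- data.frame(word = wfm$features, beta = wfm$beta, psi = wfm$psi)
diag_words <- diag_words[order(abs(diag_words$beta), decreasing = TRUE), ]
head(diag_words, 20)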
# Weights (beta) with large absolute values mean that these words are estimated
# to be on the extremes of the latent dimension

# Plot estimated word positions
textplot_scale1d(wfm, margin = "features")
textplot_scale1d(wfm, margin = "features",
                 highlighted = c("government", "global", "children", "bank",
                                 "economy", "citizenship", "productivity",
                                 "deficit", "nation", "freedom", "histor", "inflat"),
                 highlighted_color = "red")

# Plot estimated document positions
summary(testCorpus)
textplot_scale1d(wfm, margin = "documents")
textplot_scale1d(wfm, margin = "documents", groups = docvars(testCorpus, "Party"))
textplot_scale1d(wfm, margin = "documents", groups = docvars(testCorpus, "Year"))

# extract the estimates of the model and save them
str(wfm)
words2 <- wfm$features
beta2 <- wfm$beta
psi2 <- wfm$psi
scores_words2 <- data.frame(words2, beta2, psi2)
str(scores_words2)
write.csv(scores_words2, "result_wordfish_words.csv")

party <- wfm$docs
theta <- wfm$theta
se.theta <- wfm$se.theta
scores_texts <- data.frame(party, theta, se.theta)
str(scores_texts)
# 95% confidence intervals around theta
scores_texts$lower <- scores_texts$theta - 1.96 * scores_texts$se.theta
scores_texts$upper <- scores_texts$theta + 1.96 * scores_texts$se.theta
str(scores_texts)
write.csv(scores_texts, "result_wordfish_texts.csv")

#########################################################################
#########################################################################
# Obtaining bootstrapped c.i.
#########################################################################
#########################################################################

# here: LAB 92 to the left of CONS 92
summary(testCorpus)
wfm <- textmodel_wordfish(myDfm, dir = c(3, 1))
textplot_scale1d(wfm, margin = "documents")

set.seed(10)  # set a seed so that we always get the same results (for replicability)
# let's resample 10 dfms, i.e., n = 10 - usually the number of samples should be much larger (500 or more)!
bt <- bootstrap_dfm(testCorpus, n = 10, verbose = TRUE)
str(bt)
for (p in bt) {
  print(p)
}

library(magicfor)
magic_for(print, silent = TRUE)  # call magic_for()

# let's iterate wordfish across our samples
for (p in bt) {
  bbb <- textmodel_wordfish(p, dir = c(3, 1))
  print(bbb$theta)
}
x2 <- magic_result_as_vector()  # save the estimated thetas in a vector
str(x2)
magic_free()  # close magic!
is_magicalized()

# split the vector according to the number of documents in the corpus
x3 <- split(x2, ceiling(seq_along(x2) / length(docnames(testCorpus))))
x3
# the number of columns in the matrix equals the total number of wordfish estimates
# (= total number of resampled dfms + 1, since bootstrap_dfm also returns the original dfm)
p <- matrix(unlist(x3), ncol = length(bt))
rownames(p) <- docnames(testCorpus)  # name the rows after the documents in the corpus
p
emnB <- apply(p, 1, mean)  # the average theta for each document across samples
emnB
conf.level <- 0.95
eLB <- apply(p, 1, function(x) quantile(x, (1 - conf.level) / 2))      # lower bound of the 95% c.i.
eUB <- apply(p, 1, function(x) quantile(x, 1 - (1 - conf.level) / 2))  # upper bound of the 95% c.i.
id <- order(emnB)  # order the thetas from the lowest to the largest
emnB <- emnB[id]
eLB <- eLB[id]
eUB <- eUB[id]

dotchart(emnB, main = "Documents' position\n(Bootstrapped c.i.)",
         xlab = "Wordfish dimension",
         xlim = c(min(eLB) * 0.9, max(eUB) * 1.1), pch = 19)
for (i in 1:length(emnB)) {
  lines(x = c(eLB[i], eUB[i]), y = c(i, i), col = "red")
}

#########################################################################
#########################################################################
# Using wordfish: US Presidential Inaugural Speeches after 1980
#########################################################################
#########################################################################

# Exercise: apply wordfish by first considering Reagan 1981 to the right of Obama 2009,
# and then Trump 2017 to the right of Obama 2009: any change? (see the sketch below)
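# A possible sketch for this exercise (it assumes quanteda's built-in
# data_corpus_inaugural, whose documents are named "YYYY-President",
# e.g. "1981-Reagan"; the inaug* and wf_inaug* names are just illustrative)
inaugCorpus <- corpus_subset(data_corpus_inaugural, Year > 1980)
summary(inaugCorpus)
inaugDfm <- dfm(inaugCorpus, remove = stopwords("english"), tolower = TRUE,
                stem = TRUE, remove_punct = TRUE, remove_numbers = TRUE)

# first: Reagan 1981 to the right of Obama 2009 (i.e., Obama 2009 to the left)
dir1 <- c(which(docnames(inaugDfm) == "2009-Obama"),
          which(docnames(inaugDfm) == "1981-Reagan"))
wf_inaug1 <- textmodel_wordfish(inaugDfm, dir = dir1)
summary(wf_inaug1)

# then: Trump 2017 to the right of Obama 2009
dir2 <- c(which(docnames(inaugDfm) == "2009-Obama"),
          which(docnames(inaugDfm) == "2017-Trump"))
wf_inaug2 <- textmodel_wordfish(inaugDfm, dir = dir2)
summary(wf_inaug2)

# as with the UK manifestos above, compare the two sets of thetas:
# a correlation close to |1| means the choice of identification documents
# did not change the substantive results
cor(wf_inaug1$theta, wf_inaug2$theta)
textplot_scale1d(wf_inaug1, margin = "documents")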