rm(list=ls(all=TRUE))
setwd("C:/Users/luigi/Dropbox/TOPIC MODEL/")
getwd()

library(readtext)
library(quanteda)
library(quanteda.textmodels)
library(quanteda.textplots)
library(quanteda.textstats)
library(cowplot)
library(psych)
library(PerformanceAnalytics)

#########################################################################
#########################################################################
# Creating the corpus of the UK electoral programs, 1992 and 1997
#########################################################################
#########################################################################

myText <- readtext("Lecture 2/Wordscores manifestos/UK/*.txt",
                   docvarsfrom = "filenames", dvsep = " ",
                   docvarnames = c("Party", "Year"))
str(myText)
testCorpus <- corpus(myText)
summary(testCorpus)

# Rename the documents by dropping the .txt extension
docnames(testCorpus) <- gsub(".txt", "", docnames(testCorpus))
summary(testCorpus)

tok2 <- tokens(testCorpus,
               remove_punct = TRUE,
               remove_numbers = TRUE,
               remove_symbols = TRUE,
               split_hyphens = TRUE,
               remove_separators = TRUE)
tok2 <- tokens_remove(tok2, stopwords("en"))
tok2 <- tokens_wordstem(tok2)

#####################
# Remember! Comparing the results with and without stopwords, and with and
# without stemming, is good practice.
#####################

myDfm <- dfm(tok2)
topfeatures(myDfm, 20)  # 20 top features

# Let's keep just those features with at least 2 characters
# (dfm_keep() with min_nchar does this; dfm_remove() would not)
myDfm <- dfm_keep(myDfm, min_nchar = 2)

#########################################################################
#########################################################################
# Obtaining bootstrapped c.i.
#########################################################################
#########################################################################

# here: LAB 92 (document 3) to the left of CONS 92 (document 1)
summary(testCorpus)
wfm <- textmodel_wordfish(myDfm, dir = c(3, 1))
textplot_scale1d(wfm, margin = "documents")

# Let's set a seed so that we always get the same results (for replicability)
set.seed(1234)

# Let's resample 10 dfms, i.e., n = 10 - usually the number of samples
# should be much larger! 500 or more
# NB: newer quanteda versions may deprecate passing a corpus with
# tokenization arguments here; if so, check ?bootstrap_dfm for the
# current interface
bt <- bootstrap_dfm(testCorpus, n = 10, verbose = TRUE,
                    remove = stopwords("english"), tolower = TRUE,
                    stem = TRUE, remove_punct = TRUE, remove_numbers = TRUE)
str(bt)

# Run wordfish on each resampled dfm (the first element of bt is the dfm
# from the original, unresampled text) and save the document estimates
# (thetas) in a separate object, wordfish1, wordfish2, ...
for (i in seq_along(bt)) {
  wf <- textmodel_wordfish(bt[[i]], dir = c(3, 1))  # run wordfish
  df <- wf$theta                                    # save the theta estimates
  df.name <- paste0("wordfish", i)                  # unique name per sample
  assign(df.name, df)
}

# Check that you have created 10 wordfish estimates (plus the original one)
ls()

# Let's create a list from the bootstrapped estimates you created...
x <- lapply(ls(pattern = "wordfish"), get)
# ...and bind them into a single data frame
y <- as.data.frame(do.call(cbind, x))
# Let's name the rows according to the names of the parties in the corpus
rownames(y) <- docnames(testCorpus)
y

# Let's estimate the average thetas across samples
emnB <- apply(y, 1, mean)
emnB

# Let's estimate the lower and upper bounds for the 95% c.i.
conf.level <- 0.95
eLB <- apply(y, 1, function(x) quantile(x, (1 - conf.level) / 2))
eUB <- apply(y, 1, function(x) quantile(x, 1 - (1 - conf.level) / 2))
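#####################
# A quick cross-check (a sketch, not part of the original script): wordfish
# also reports analytical standard errors for the document positions.
# Assuming the fitted object stores them as se.theta (as in
# quanteda.textmodels), we can build asymptotic 95% c.i. via a normal
# approximation and compare them with the bootstrapped ones. With only
# n = 10 resamples the two can differ noticeably.
#####################
analytical <- data.frame(theta = wfm$theta,
                         lb = wfm$theta - 1.96 * wfm$se.theta,  # normal approx.
                         ub = wfm$theta + 1.96 * wfm$se.theta,
                         row.names = docnames(myDfm))
analytical
cbind(boot.mean = emnB, boot.lb = eLB, boot.ub = eUB)  # bootstrap counterpart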
# Let's order the thetas from the smallest to the largest one
id <- order(emnB)
emnB <- emnB[id]
eLB <- eLB[id]
eUB <- eUB[id]

# Plot the mean thetas and add the bootstrapped c.i. as horizontal segments
dotchart(emnB,
         main = "Documents' position\n(Bootstrapped c.i.)",
         xlab = "Wordfish dimension",
         xlim = c(min(eLB) * 0.9, max(eUB) * 1.1),
         pch = 19)
for (i in seq_along(emnB)) {
  lines(x = c(eLB[i], eUB[i]), y = c(i, i), col = "red")
}
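#####################
# Optional: the same plot with ggplot2 (a sketch; cowplot, loaded above,
# builds on ggplot2, but we attach ggplot2 explicitly to be safe).
# This reproduces the dotchart above with points and horizontal error bars.
#####################
library(ggplot2)
ci.df <- data.frame(doc = factor(names(emnB), levels = names(emnB)),
                    theta = emnB, lb = eLB, ub = eUB)
ggplot(ci.df, aes(x = theta, y = doc)) +
  geom_errorbarh(aes(xmin = lb, xmax = ub), height = 0, colour = "red") +
  geom_point() +
  labs(title = "Documents' position\n(Bootstrapped c.i.)",
       x = "Wordfish dimension", y = NULL) +
  theme_minimal()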