rm(list=ls(all=TRUE))
setwd("C:/Users/luigi/Dropbox/TOPIC MODEL/")
getwd()

library(readtext)
library(quanteda)
library(quanteda.textmodels)
library(quanteda.textplots)
library(quanteda.textstats)
library(cowplot)
library(psych)
library(PerformanceAnalytics)

#########################################################################
#########################################################################
# Creating the corpus of the UK electoral programs, 1992 and 1997
#########################################################################
#########################################################################

myText <- readtext("Lecture 2/Wordscores manifestos/UK/*.txt",
                   docvarsfrom = "filenames", dvsep = " ",
                   docvarnames = c("Party", "Year"))
str(myText)
testCorpus <- corpus(myText)
summary(testCorpus)

# Rename the documents by dropping the .txt extension
docnames(testCorpus) <- gsub(".txt", "", docnames(testCorpus))
summary(testCorpus)

tok2 <- tokens(testCorpus,
               remove_punct = TRUE,
               remove_numbers = TRUE,
               remove_symbols = TRUE,
               split_hyphens = TRUE,
               remove_separators = TRUE)
tok2 <- tokens_remove(tok2, stopwords("en"))
tok2 <- tokens_wordstem(tok2)

#####################
# Remember! Comparing the results with and without stopwords, and with and
# without stemming, is good practice.
#####################

myDfm <- dfm(tok2)
topfeatures(myDfm, 20)  # 20 top features

# Let's keep just those features with at least 2 characters
# (dfm_keep() with min_nchar does this; dfm_remove() would not)
myDfm <- dfm_keep(myDfm, min_nchar = 2)

#########################################################################
#########################################################################
# Obtaining bootstrapped c.i.
#########################################################################
#########################################################################

# here: LAB 92 (document 3) to the left of CONS 92 (document 1)
summary(testCorpus)
wfm <- textmodel_wordfish(myDfm, dir = c(3, 1))
textplot_scale1d(wfm, margin = "documents")

# Let's set a seed so that we always get the same results (for replicability)
set.seed(1234)

# Let's resample 10 dfms, i.e., n = 10 - usually the number of samples
# should be much larger! 500 or more
# NB: newer quanteda versions may deprecate passing a corpus with
# tokenization arguments here; if so, check ?bootstrap_dfm for the
# current interface
bt <- bootstrap_dfm(testCorpus, n = 10, verbose = TRUE,
                    remove = stopwords("english"), tolower = TRUE,
                    stem = TRUE, remove_punct = TRUE, remove_numbers = TRUE)
str(bt)

# Run wordfish on each resampled dfm (the first element of bt is the dfm
# from the original, unresampled text) and save the document estimates
# (thetas) in a separate object, wordfish1, wordfish2, ...
for (i in seq_along(bt)) {
  wf <- textmodel_wordfish(bt[[i]], dir = c(3, 1))  # run wordfish
  df <- wf$theta                                    # save the theta estimates
  df.name <- paste0("wordfish", i)                  # unique name per sample
  assign(df.name, df)
}

# Check that you have created 10 wordfish estimates (plus the original one)
ls()

# Let's create a list from the bootstrapped estimates you created...
x <- lapply(ls(pattern = "wordfish"), get)
# ...and bind them into a single data frame
y <- as.data.frame(do.call(cbind, x))
# Let's name the rows according to the names of the parties in the corpus
rownames(y) <- docnames(testCorpus)
y

# Let's estimate the average thetas across samples
emnB <- apply(y, 1, mean)
emnB

# Let's estimate the lower and upper bounds for the 95% c.i.
conf.level <- 0.95
eLB <- apply(y, 1, function(x) quantile(x, (1 - conf.level) / 2))
eUB <- apply(y, 1, function(x) quantile(x, 1 - (1 - conf.level) / 2))
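#####################
# A quick cross-check (a sketch, not part of the original script): wordfish
# also reports analytical standard errors for the document positions.
# Assuming the fitted object stores them as se.theta (as in
# quanteda.textmodels), we can build asymptotic 95% c.i. via a normal
# approximation and compare them with the bootstrapped ones. With only
# n = 10 resamples the two can differ noticeably.
#####################
analytical <- data.frame(theta = wfm$theta,
                         lb = wfm$theta - 1.96 * wfm$se.theta,  # normal approx.
                         ub = wfm$theta + 1.96 * wfm$se.theta,
                         row.names = docnames(myDfm))
analytical
cbind(boot.mean = emnB, boot.lb = eLB, boot.ub = eUB)  # bootstrap counterpart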
# Let's order the thetas from the smallest to the largest one
id <- order(emnB)
emnB <- emnB[id]
eLB <- eLB[id]
eUB <- eUB[id]

# Plot the mean thetas and add the bootstrapped c.i. as horizontal segments
dotchart(emnB,
         main = "Documents' position\n(Bootstrapped c.i.)",
         xlab = "Wordfish dimension",
         xlim = c(min(eLB) * 0.9, max(eUB) * 1.1),
         pch = 19)
for (i in seq_along(emnB)) {
  lines(x = c(eLB[i], eUB[i]), y = c(i, i), col = "red")
}
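#####################
# Optional: the same plot with ggplot2 (a sketch; cowplot, loaded above,
# builds on ggplot2, but we attach ggplot2 explicitly to be safe).
# This reproduces the dotchart above with points and horizontal error bars.
#####################
library(ggplot2)
ci.df <- data.frame(doc = factor(names(emnB), levels = names(emnB)),
                    theta = emnB, lb = eLB, ub = eUB)
ggplot(ci.df, aes(x = theta, y = doc)) +
  geom_errorbarh(aes(xmin = lb, xmax = ub), height = 0, colour = "red") +
  geom_point() +
  labs(title = "Documents' position\n(Bootstrapped c.i.)",
       x = "Wordfish dimension", y = NULL) +
  theme_minimal()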