rm(list = ls(all = TRUE))
getwd()
# set here the folder where you have saved the datasets you are going to use in this class
# setwd("*****")
# in my case:
setwd("C:/Users/mw/Dropbox (VOICES)/TOPIC MODEL/Lecture 2/Wordscores manifestos/UK")
getwd()

library(readtext)
library(quanteda)
library(cowplot)

#########################################################################
#########################################################################
# Creating and Working with a Corpus
#########################################################################
#########################################################################

myText <- readtext("C:/Users/mw/Dropbox (VOICES)/TOPIC MODEL/Lecture 2/Wordscores manifestos/UK/*.txt",
                   docvarsfrom = "filenames", dvsep = " ", docvarnames = c("Party", "Year"))
str(myText)
testCorpus <- corpus(myText)
summary(testCorpus)
testCorpus <- corpus(myText, docid_field = "doc_id")
summary(testCorpus)
docnames(testCorpus) <- gsub(".txt", "", docnames(testCorpus), fixed = TRUE)
summary(testCorpus)

# Comparing the results with and without stopwords, and with and without stemming, is always good practice.
# For example, because Wordfish estimates word fixed effects, removing stopwords is less critical
# than it is for other scaling methods
# (a hedged sketch implementing this check appears after the "dir" comparisons below).
myDfm <- dfm(testCorpus, remove = stopwords("english"), tolower = TRUE, stem = TRUE,
             remove_punct = TRUE, remove_numbers = TRUE)
topfeatures(myDfm, 20) # 20 top words

#########################################################################
#########################################################################
# Using wordfish
#########################################################################
#########################################################################

# READ the pdf file!
# "dir" indicates which two documents are used for global identification purposes
# (the first document is placed to the left of the second one). This usually matters more for the
# interpretation of the results (i.e., for the direction of the scores along the latent dimension:
# which ones are positive, which ones negative) than for the estimation per se.

# here: LAB 92 to the left of CONS 92
summary(testCorpus)
wfm <- textmodel_wordfish(myDfm, dir = c(3, 1))
summary(wfm)
str(wfm)

# here: CONS 92 to the left of LAB 92
wfm2 <- textmodel_wordfish(myDfm, dir = c(1, 3))
summary(wfm2)

# compare with the previous case!
summary(wfm)
summary(wfm2)
cor(wfm$theta, wfm2$theta)

# and what if now LIB 97 is to the left of CONS 97?
summary(testCorpus)
wfm3 <- textmodel_wordfish(myDfm, dir = c(6, 2))
summary(wfm3)

# compare with the first case:
cor(wfm$theta, wfm3$theta)
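# A minimal sketch (not part of the original script; myDfm_raw and wfm_raw are
# illustrative names) of the preprocessing robustness check suggested above:
# re-estimate the model on a dfm that keeps stopwords and unstemmed word forms,
# then correlate the document positions with those obtained from myDfm.
myDfm_raw <- dfm(testCorpus, tolower = TRUE, remove_punct = TRUE, remove_numbers = TRUE)
wfm_raw <- textmodel_wordfish(myDfm_raw, dir = c(3, 1))
cor(wfm$theta, wfm_raw$theta) # a high correlation suggests robustness to preprocessing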
# Always run diagnostics!
# A good start for diagnostics is the analysis of the word discrimination parameters.
# Weights with large values mean that these words are estimated to be at the extremes of the dimension.

# Plot estimated word positions
textplot_scale1d(wfm, margin = "features")
textplot_scale1d(wfm, margin = "features",
                 highlighted = c("government", "global", "children", "bank", "economy",
                                 "citizenship", "productivity", "deficit", "april"),
                 highlighted_color = "red")

# Plot estimated document positions
textplot_scale1d(wfm, margin = "documents")
textplot_scale1d(wfm, margin = "documents", groups = docvars(testCorpus, "Party"))
textplot_scale1d(wfm, margin = "documents", groups = docvars(testCorpus, "Year"))

# extract the estimates of the model and save them
str(wfm)
words <- wfm$features
beta <- wfm$beta
psi <- wfm$psi
scores_words <- data.frame(words, beta, psi)
str(scores_words)
head(scores_words)

# show the 10 words with the largest positive beta values
step <- order(-scores_words$beta)
scores_words <- scores_words[step, ]
head(scores_words, 10)
# show the 10 words with the largest negative beta values
tail(scores_words, 10)

# save the results in a csv file
write.csv(scores_words, "result_wordfish_words.csv")

party <- wfm$docs
theta <- wfm$theta
se.theta <- wfm$se.theta
scores_texts <- data.frame(party, theta, se.theta)
str(scores_texts)
# 95% confidence intervals: lower = theta - 1.96*se, upper = theta + 1.96*se
scores_texts$lower <- scores_texts$theta - 1.96 * scores_texts$se.theta
scores_texts$upper <- scores_texts$theta + 1.96 * scores_texts$se.theta
str(scores_texts)

# save the results in a csv file
write.csv(scores_texts, "result_wordfish_texts.csv")

#########################################################################
#########################################################################
# Let's compare the results we get from Wordfish with the raw score ones
# we get from Wordscores using the economic policy positions
#########################################################################
#########################################################################

# set the reference scores
ws <- textmodel_wordscores(myDfm, c(17.21, NA, 5.35, NA, 8.21, NA))
summary(ws)
pr_all <- predict(ws, se.fit = TRUE)
pr_all
doclab <- docnames(testCorpus)
doclab

# Comparing wordscores vs wordfish
wordscores <- textplot_scale1d(pr_all, margin = "documents", doclabels = doclab)
wordfish <- textplot_scale1d(wfm, margin = "documents")
plot_grid(wordscores, wordfish, labels = c('Wordscores', 'Wordfish'))

# insights: a) the parties move in the same direction between 1992 and 1997: that's a good thing!;
# b) the Conservatives hold the same relative position, but the other two parties do not:
# perhaps what we are measuring with Wordfish goes beyond "economic policy" issues?
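# A minimal sketch (not part of the original script; theta_z and ws_z are illustrative
# names) for reading the two panels above on a common scale: Wordfish thetas are
# standardized by the identification constraints, while Wordscores raw scores live on
# the metric of the reference values, so standardize both before placing them side by side.
theta_z <- as.numeric(scale(wfm$theta))
ws_z <- as.numeric(scale(pr_all$fit))
data.frame(doc = doclab, wordfish = round(theta_z, 2), wordscores = round(ws_z, 2))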
# check for the correlation
party <- wfm$docs
score_wf <- wfm$theta
score_ws <- pr_all$fit
scores_texts <- data.frame(party, score_wf, score_ws)
str(scores_texts)
cor(scores_texts$score_ws, scores_texts$score_wf)

# you can also draw a scatterplot, with a fit line and party names
plot(scores_texts$score_ws, scores_texts$score_wf, main = "Scatterplot",
     xlab = "Wordscores", ylab = "Wordfish", pch = 19)
text(scores_texts$score_ws, scores_texts$score_wf, labels = scores_texts$party,
     pos = 4, col = "royalblue", cex = 0.8)
abline(lm(scores_texts$score_wf ~ scores_texts$score_ws), col = "red") # regression line (y~x)

#########################################################################
#########################################################################
# Using wordfish: US Presidential Inaugural Speeches after 1980
#########################################################################
#########################################################################

# Apply wordfish by first placing Reagan 1981 to the right of Obama 2009,
# and then Trump 2017 to the right of Obama 2009: any change?
# Compare these results to what you got with wordscores when
# Obama 2009 = -1 and either Reagan 1981 = 1 OR Trump 2017 = 1
# (a sketch at the end of this section shows how to look up the "dir" indices by docname).

# create a dfm from the inaugural addresses from Reagan onwards
presDfm <- dfm(corpus_subset(data_corpus_inaugural, Year > 1980),
               remove = stopwords("english"), stem = TRUE, remove_punct = TRUE)
presDfm[, 1:5]

# Obama 2009 to the left of Reagan 1981
wfm <- textmodel_wordfish(presDfm, dir = c(8, 1))
summary(wfm)
# Plot estimated word positions
textplot_scale1d(wfm, margin = "features")
# Plot estimated document positions
textplot_scale1d(wfm, margin = "documents")

# Obama 2009 to the left of Trump 2017
wfm2 <- textmodel_wordfish(presDfm, dir = c(8, 10))
summary(wfm2)
# Plot estimated document positions
textplot_scale1d(wfm2, margin = "documents")

# Comparing wordfish with Reagan or Trump in "dir"
# library(cowplot)
reagan <- textplot_scale1d(wfm, margin = "documents")
trump <- textplot_scale1d(wfm2, margin = "documents")
plot_grid(reagan, trump, labels = c('Reagan=on the right of Obama', 'Trump=on the right of Obama'))

# check for the correlation
score_reagan <- wfm$theta
score_trump <- wfm2$theta
cor(score_reagan, score_trump)

# estimate wordscores with Reagan 1981 = 1, Clinton 1993 = 0, and Obama 2009 = -1
ws_pres <- textmodel_wordscores(presDfm, c(1, NA, NA, 0, NA, NA, NA, -1, NA, NA))
summary(ws_pres)
# scale all the texts (including the reference ones)
pr_all_pres <- predict(ws_pres, se.fit = TRUE)
pr_all_pres

# estimate wordscores with Trump 2017 = 1, Clinton 1993 = 0, and Obama 2009 = -1
ws_pres2 <- textmodel_wordscores(presDfm, c(NA, NA, NA, 0, NA, NA, NA, -1, NA, 1))
summary(ws_pres2)
# scale all the texts (including the reference ones)
pr_all_pres2 <- predict(ws_pres2, se.fit = TRUE)
pr_all_pres2

summary(corpus_subset(data_corpus_inaugural, Year > 1980))
doclab <- docnames(corpus_subset(data_corpus_inaugural, Year > 1980))
doclab

# Comparing wordscores with Reagan or Trump as the reference text set to 1
reagan_ws <- textplot_scale1d(pr_all_pres, margin = "documents", doclabels = doclab)
trump_ws <- textplot_scale1d(pr_all_pres2, margin = "documents", doclabels = doclab)
plot_grid(reagan, trump, reagan_ws, trump_ws,
          labels = c('Reagan_WF', 'Trump_WF', "Reagan_WS=1", "Trump_WS=1"))

# check for the correlation
score_ws_reagan <- pr_all_pres$fit
score_ws_trump <- pr_all_pres2$fit
set_scores <- data.frame(score_reagan, score_trump, score_ws_reagan, score_ws_trump)
str(set_scores)
cor(set_scores, use = "complete.obs", method = "kendall")
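# A minimal sketch (not part of the original script) of how to look up the "dir"
# indices used above by document name instead of counting rows in summary();
# the quoted docnames follow the standard naming of data_corpus_inaugural.
which(docnames(presDfm) %in% c("1981-Reagan", "2009-Obama", "2017-Trump")) # 1, 8, 10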
# Compared to wordscores, you see that: a) the wordfish results do not change with the
# document selection in "dir" (beyond the direction of the scale); b) on the contrary,
# the results you get with wordscores are highly sensitive to that choice;
# c) finally, the wordscores results with Reagan = 1 much more closely resemble
# the results you get via Wordfish.
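# A minimal sketch (not part of the original script; wfm_flip is an illustrative name)
# of point a): flipping the two documents passed to "dir" only reverses the direction
# of the latent dimension, leaving the configuration of the estimates unchanged.
wfm_flip <- textmodel_wordfish(presDfm, dir = c(1, 8)) # Reagan 1981 to the left of Obama 2009
cor(wfm$theta, wfm_flip$theta) # expected to be close to -1: same scale, opposite sign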