# Start from a clean workspace. `all.names = TRUE` is spelled out in full
# (instead of the partially matched `all = TRUE`) so that hidden,
# dot-prefixed objects are removed as well.
rm(list = ls(all.names = TRUE))

# Show the current working directory, switch to the folder that contains the
# manifesto .txt files read below, then show it again to confirm the change.
# NOTE: this absolute path is specific to the original author's machine and
# has to be adapted before running the script elsewhere.
getwd()
setwd("C:/Users/mw/Dropbox (VOICES)/TOPIC MODEL/Lecture 2/Wordscores manifestos/IE")
getwd()

library(readtext)
library(quanteda)
library(Hmisc)
library(cowplot)

#########################################################################
#########################################################################
# Creating and Working with a Corpus
#########################################################################
#########################################################################

# Read every .txt file in the working directory. The file names are split on
# blanks and their parts are stored as the document variables Year and Party.
myText <- readtext(
  "*.txt",
  docvarsfrom = "filenames",
  dvsep = " ",
  docvarnames = c("Year", "Party")
)
str(myText)

# Turn the texts into a quanteda corpus and inspect it
testCorpus <- corpus(myText)
summary(testCorpus)

# Relabel the documents as "<Party> <Year>" (paste() separates with a blank
# by default) and inspect the corpus again
docnames(testCorpus) <- paste(myText[["Party"]], myText[["Year"]])
summary(testCorpus)

# Build the document-feature matrix: lower-cased, stemmed, without English
# stopwords, punctuation or numbers.
# Good practice: compare the results with and without stopword removal and
# with and without stemming.
myDfm <- dfm(
  testCorpus,
  tolower = TRUE,
  stem = TRUE,
  remove = stopwords("english"),
  remove_punct = TRUE,
  remove_numbers = TRUE
)

# The 20 most frequent features
topfeatures(myDfm, n = 20)

# Alternative way to set the reference scores: pass them positionally, one per
# document. The first five documents are the reference texts; the remaining
# five get NA and are therefore treated as texts to be scored.
ws <- textmodel_wordscores(
  myDfm,
  y = c(4.5, 13.13, 15, 6.88, 17.63, rep(NA, 5))
)
summary(ws)

# Score all texts (the reference texts included), with standard errors
pr_all <- predict(ws, se.fit = TRUE)
pr_all

# Keep the document names for labelling plots later on
doclab <- docnames(testCorpus)
doclab

# Wordfish
# Print the corpus summary to check the document order: `dir = c(1, 3)` fixes
# the direction of the scale so that document 1 is placed to the left of
# document 3 (here: DL 92 to the left of FG 92).
summary(testCorpus)
wfm <- textmodel_wordfish(myDfm, dir = c(1, 3))
summary(wfm)

# Plot the estimated positions: first the words (features) ...
textplot_scale1d(wfm, margin = "features")
# ... then the documents
textplot_scale1d(wfm, margin = "documents")

# Wordscores vs. Wordfish: document positions side by side.
# plot_grid() comes from cowplot, which is loaded at the top of the script.
wordscores <- textplot_scale1d(
  pr_all,
  doclabels = doclab,
  margin = "documents"
)
wordfish <- textplot_scale1d(wfm, margin = "documents")
plot_grid(
  wordscores,
  wordfish,
  labels = c("Wordscores", "Wordfish")
)

# Correlation check: collect the document names, the Wordfish positions
# (theta) and the Wordscores predictions in one data frame ...
party <- wfm[["docs"]]
score_wf <- wfm[["theta"]]
score_ws <- pr_all[["fit"]]
scores_texts <- data.frame(party, score_wf, score_ws)
str(scores_texts)

# ... and correlate the two sets of scores
with(scores_texts, cor(score_ws, score_wf))

# Scatter plot of Wordscores (x) against Wordfish (y) for all texts, with the
# party names as point labels and a fitted regression line
plot(
  x = scores_texts[["score_ws"]],
  y = scores_texts[["score_wf"]],
  main = "Scatterplot",
  xlab = "Wordscores",
  ylab = "Wordfish",
  pch = 19
)
text(
  x = scores_texts[["score_ws"]],
  y = scores_texts[["score_wf"]],
  labels = scores_texts[["party"]],
  pos = 4,
  col = "royalblue",
  cex = 0.8
)
# Regression line (y ~ x), i.e. Wordfish regressed on Wordscores
abline(lm(score_wf ~ score_ws, data = scores_texts), col = "red")

# https://en.wikipedia.org/wiki/24th_Government_of_Ireland
# hint: government situation before 1992 elections (FF); before 1997 elections (rainbow coalition between FG, Labour and DL,
# before that, FF cabinet)

# Rescale with the LBG transformation. Only documents 6 to 10 are scored
# (those that received NA instead of a reference score when the model was
# fitted), and confidence intervals are requested.
pr_lbg <- predict(
  ws,
  newdata = myDfm[6:10, ],
  rescaling = "lbg",
  interval = "confidence"
)
pr_lbg

# Turn the fitted values into a data frame and copy the row names into a
# `party` column
rescaled <- as.data.frame(pr_lbg[["fit"]])
rescaled[["party"]] <- rownames(rescaled)
str(rescaled)

################################
# Wordfish vs. Wordscores comparison: 1997 only!
################################

# Join the LBG-rescaled Wordscores with the table of Wordfish/Wordscores
# scores on the shared `party` column; only texts present in both are kept
ch <- merge(rescaled, scores_texts, by = "party")
str(ch)

# Correlation between the Wordfish positions and the rescaled Wordscores
cor(ch[["score_wf"]], ch[["fit"]])

# Scatter plot of the rescaled Wordscores (x) against Wordfish (y) for the
# merged texts, with the party names as point labels and a fitted regression
# line
plot(
  x = ch[["fit"]],
  y = ch[["score_wf"]],
  main = "Scatterplot",
  xlab = "Wordscores",
  ylab = "Wordfish",
  pch = 19
)
text(
  x = ch[["fit"]],
  y = ch[["score_wf"]],
  labels = ch[["party"]],
  pos = 4,
  col = "royalblue",
  cex = 0.8
)
# Regression line (y ~ x), i.e. Wordfish regressed on rescaled Wordscores
abline(lm(score_wf ~ fit, data = ch), col = "red")