rm(list=ls(all=TRUE))
getwd()
# set here the folder where you have saved the datasets you are going to use in this class
# setwd("*****")
# in my case
setwd("C:/Users/mw/Dropbox (VOICES)/TOPIC MODEL/Lecture 2/Wordscores manifestos/UK")
getwd()

library(readtext)
library(quanteda)
library(ggplot2)
library(cowplot)

#########################################################################
#########################################################################
# Creating the Corpus of the UK electoral programs 1992, 1997
#########################################################################
#########################################################################

myText <- readtext("C:/Users/mw/Dropbox (VOICES)/TOPIC MODEL/Lecture 2/Wordscores manifestos/UK/*.txt",
docvarsfrom = "filenames", dvsep = " ", docvarnames = c("Party", "Year"))
str(myText)
testCorpus <- corpus(myText)
summary(testCorpus)

# I want to name the texts in my just created corpus in a more proper way!
# several possibilities: 
# first one
testCorpus <- corpus(myText, docid_field = "doc_id")
summary(testCorpus)
docnames(testCorpus) <- gsub(".txt", "", docnames(testCorpus ))
summary(testCorpus)

# second one
testCorpus2 <- corpus(myText)
summary(testCorpus2)
docnames(testCorpus2) <- paste(myText$Party, myText$Year, sep = " ")
summary(testCorpus2)

# As discussed, always a good idea to delete stopwords when using Wordscores!
myDfm <- dfm(testCorpus , remove = stopwords("english"), tolower = TRUE, stem = TRUE,
                remove_punct = TRUE, remove_numbers=TRUE)

#########################################################################
#########################################################################
# Checking for length of texts, drop unique words and check correlation across documents 
#########################################################################
#########################################################################
summary(testCorpus)

# keep only words occurring >1 times
myDfm <- dfm_trim(myDfm, min_termfreq = 1)

# compute some document similarities [
Simil <- textstat_simil(myDfm , margin = "documents", method = "correlation")
Simil

#########################################################################
#########################################################################
# Using wordscores: UK example with economic policy positions 
#########################################################################
#########################################################################

# reference texts: 1992 parties manifestos
# reference texts scores: 1992 parties manifestos. Lab: 5.35; LibDem: 8.21; Cons: 17.21
# reference scores derived from an expert survey

# FIRST step:
# Set reference scores 
refscores <- c(17.21, NA, 5.35, NA, 8.21, NA)
refscores

# SECOND step: 
# Assign the reference scores to your dfm
ws <- textmodel_wordscores(myDfm, refscores)
summary(ws) 

# Plot estimated word positions in the reference texts (highlight words and print them in red) 
# it shows the frequency vs. the word-score

textplot_scale1d(ws)
textplot_scale1d(ws, 
                 highlighted = c( "budget", "green", "millennium"), 
                 highlighted_color = "red")

# Doing FIRST and SECOND step in one single step
ws2 <- textmodel_wordscores(myDfm, c(17.21, NA, 5.35, NA, 8.21, NA))
summary(ws2) 

# alternative way to set reference scores
ws3 <- textmodel_wordscores(myDfm, c(17.21, rep(NA,1), 5.35, rep(NA,1), 8.21, rep(NA,1)))
summary(ws3) 

# THIRD step: we predict the raw Wordscores for all the texts (reference and virgin ones)
pr_raw <- predict(ws, se.fit = TRUE, newdata = myDfm)
pr_raw
textplot_scale1d(pr_raw)

# alternative way (with c.i. rather than with s.e.)
pr_all2 <- predict(ws, interval = "confidence")
pr_all2
textplot_scale1d(pr_all2)

# Plot estimated document positions and group by "party" or "year" variable 
summary(testCorpus)
textplot_scale1d(pr_all2, margin = "documents",  groups = docvars(testCorpus, "Party"))
textplot_scale1d(pr_all2, margin = "documents",  groups = docvars(testCorpus, "Year"))

# we want to predict only the virgin texts using the rescaling LGB option
summary(ws) 
pr_lbg <- predict(ws, rescaling = "lbg", newdata = myDfm[c(2, 4, 6), ])
pr_lbg

# obtaining the corresponding confidence interval
pr_lbg <- predict(ws, rescaling = "lbg", newdata = myDfm[c(2, 4, 6), ], interval = "confidence")
pr_lbg

# save the estimates - features scores
words <- ws$wordscores
str(words)
write.csv(words, "result_wordscores_words.csv")

# save the estimates - document scores
str(pr_all2)
pr_all2
pr_all2$fit
ch <- as.data.frame(pr_all2$fit)
ch$Party <- rownames(ch)
str(ch)
write.csv(ch, "parties.csv")

#### ALTERNATIVE ways to plot confidence intervals with ggplot2 [raw scores]

str(ch)
ch$ci <- ch$fit-ch$lwr
str(ch)

ggplot(ch, aes(x=reorder (Party, fit), y=fit, group=1)) +
    geom_point(aes()) +
    geom_errorbar(width=.1, aes(ymin=fit-ci, ymax=fit+ci), colour="darkred") + coord_flip() + xlab("Position") + ylab("Party")

#### ALTERNATIVE ways to plot confidence intervals with ggplot2 [LBG transformed scores]

# Plot estimated document positions [transformed scores - LBG]
pr2 <- predict(ws, rescaling = "lbg", newdata = myDfm[c(2, 4, 6), ], interval = "confidence")
pr2
str(pr2)
pr2$fit

rescaled <- as.data.frame(pr2$fit)
rescaled$Party <- rownames(rescaled)
str(rescaled )

str(rescaled )
rescaled $ci <- rescaled $fit-rescaled $lwr
str(rescaled )

ggplot(rescaled , aes(x=reorder (Party, fit), y=fit, group=1)) +
    geom_point(aes()) +
    geom_errorbar(width=.1, aes(ymin=fit-ci, ymax=fit+ci), colour="darkred") + coord_flip() + xlab("Position") + ylab("Party")

# if you want to add to the previous graph also the reference scores:
str(ws) 
fit <- ws$y
Party <-  ws$x@Dimnames$docs
original_scores <-data.frame(Party , fit)
original_scores$Party <- as.character(original_scores$Party )
str(original_scores)
original_scores <- na.omit(original_scores)
original_scores$lwr <- 0
original_scores$upr <- 0
str(original_scores)
original_scores$ci <- original_scores$upr-original_scores$lwr
str(original_scores)
str(rescaled)

rescaled  <- rbind(rescaled, original_scores) 
str(rescaled )

ggplot(rescaled , aes(x=reorder (Party, fit), y=fit, group=1)) +
    geom_point(aes()) +
    geom_errorbar(width=.1, aes(ymin=fit-ci, ymax=fit+ci), colour="darkred") + coord_flip() + xlab("Position") + ylab("Party")

#########################################################################
#########################################################################
# Using wordscores: UK example with social policy positions 
#########################################################################
#########################################################################

# reference texts: 1992 parties manifestos
# reference texts scores: 1992 parties manifestos. Lab: 6.87; LibDem: 6.53; Cons: 15.34
# Run the analysis by focusing on the raw scores

#########################################################################
#########################################################################
# Wordscores vs. Wordfish
#########################################################################
#########################################################################

#########################################################################
# Let's compare the results we get from Wordfish with the raw score ones 
# we get from Wordscores using the economic policy position
#########################################################################
#########################################################################

# Wordscores
ws <- textmodel_wordscores(myDfm, c(17.21, rep(NA,1), 5.35, rep(NA,1), 8.21, rep(NA,1)))
pr_all <- predict(ws, interval = "confidence")

# Wordfish with LAB 92 to the left of CONS 92
wfm <- textmodel_wordfish(myDfm, dir = c(3, 1))

# Comparing wordscores vs wordfish
wordscores <- textplot_scale1d(pr_all)
wordfish <- textplot_scale1d(wfm, margin = "documents")
plot_grid(wordscores , wordfish , labels = c('Wordscores', 'Wordfish'))

# insights: a) same movement of parties between 1992 and 1997: that's a good thing!; 
# b) same position of Cons but not of other 2 parties: what are we measuring with wordfish goes beyond "economic policy" issues?

# check for the correlation
party <- wfm$docs
score_wf <-wfm$theta
score_ws <- pr_all$fit

scores_texts <-data.frame(party, score_wf, score_ws)
str(scores_texts) 
colnames(scores_texts)[3] <- "score_ws"
str(scores_texts) 

cor(scores_texts$score_ws, scores_texts$score_wf)

# you can also draw a scatter, with a fit lines and party names
plot(scores_texts$score_ws, scores_texts$score_wf, main="Scatterplot", 
  	xlab="Wordscores", ylab="Wordfish", pch=19)
text(scores_texts$score_ws, scores_texts$score_wf, labels = scores_texts$party, pos = 4,  col = "royalblue" , cex = 0.8)
abline(lm(scores_texts$score_wf ~scores_texts$score_ws ), col="red") # regression line (y~x) 

#########################################################################
#########################################################################
# Replicate the comparison between Wordfish and Wordscores estimates using
# for Wordscores the social policy position
#########################################################################
#########################################################################

abline(lm(scores_texts_soc$score_wf ~scores_texts_soc$score_ws ), col="red") # regression line (y~x)