rm(list=ls(all=TRUE))
setwd("C:/Users/luigi/Dropbox/TOPIC MODEL/Lecture 2/Wordscores manifestos/UK")
getwd()

library(readtext)
library(quanteda)
library(quanteda.textmodels)
library(cowplot)
library(PerformanceAnalytics)
library(psych)
library(quanteda.textplots)
library(quanteda.textstats)
library(ggplot2)

#########################################################################
#########################################################################
# Creating and Working with a Corpus
#########################################################################
#########################################################################

myText <- readtext("C:/Users/luigi/Dropbox/TOPIC MODEL/Lecture 2/Wordscores manifestos/UK/*.txt", 
docvarsfrom = "filenames", dvsep = " ", docvarnames = c("Party", "Year"))
str(myText)
# Text pre-processing: let's make some cleaning;
# for example the apostrophe (Quanteda struggles with that...)
myText$text <- gsub("'"," ",myText$text)

testCorpus <- corpus(myText )
summary(testCorpus)
# I rename the name of the documents
docnames(testCorpus) <- gsub(".txt", "", docnames(testCorpus ))
summary(testCorpus)

# Alternatively I could create a new variable
myText$code <- paste(myText$Party, as.character(myText$Year), sep = " ")
# and the pass it as the name of the documents to be considered
testCorpus2 <- corpus(myText, docid_field = "code")
summary(testCorpus2)

# Comparing the results with and w/o stopwords, with and w/o stemming is always a good practice.
tok2 <- tokens(testCorpus, remove_punct = TRUE, remove_numbers=TRUE, remove_symbols = TRUE, split_hyphens = TRUE, remove_separators = TRUE)
tok2 <- tokens_remove(tok2, stopwords("en"))
tok2 <- tokens_wordstem (tok2)
myDfm <- dfm(tok2 )
topfeatures(myDfm , 20)  # 20 top words
# let's keep just those features with at least 2 characters (to remove for example the "s")
myDfm <- dfm_remove(myDfm, min_nchar=2)
topfeatures(myDfm , 20)  # 20 top words

#########################################################################
#########################################################################
# Drop unique words and check similarity across documents 
#########################################################################
#########################################################################
summary(testCorpus)

# keep only words occurring >1 times
myDfm <- dfm_trim(myDfm, min_termfreq = 2)

# compute some document similarities
Simil <- textstat_simil(myDfm , method = "cosine")
Simil

#########################################################################
#########################################################################
# Using wordscores: UK example with economic policy positions 
#########################################################################
#########################################################################

# reference texts: 1992 parties manifestos
# reference texts scores: 1992 parties manifestos. Lab: 5.35; LibDem: 8.21; Cons: 17.21
# reference scores derived from an expert survey

# FIRST step:
# Set reference scores 
refscores <- c(17.21, NA, 5.35, NA, 8.21, NA)
refscores

# SECOND step: 
# Assign the reference scores to your dfm
ws <- textmodel_wordscores(myDfm, refscores)
summary(ws) 

# Plot estimated word positions in the reference texts (highlight words and print them in red) 
# it shows the frequency vs. the word-score

textplot_scale1d(ws, 
                 highlighted = c( "budget", "green", "millennium"), 
                 highlighted_color = "red")

# Doing FIRST and SECOND step in one single step
ws2 <- textmodel_wordscores(myDfm, c(17.21, NA, 5.35, NA, 8.21, NA))
summary(ws2) 

# alternative way to set reference scores
ws3 <- textmodel_wordscores(myDfm, c(17.21, rep(NA,1), 5.35, rep(NA,1), 8.21, rep(NA,1)))
summary(ws3) 

# THIRD step: we predict the raw Wordscores for all the texts (reference and virgin ones)
pr_raw <- predict(ws, se.fit = TRUE, newdata = myDfm)
pr_raw
textplot_scale1d(pr_raw)

# alternative way (with c.i. rather than with s.e.)
pr_all2 <- predict(ws, interval = "confidence", newdata = myDfm)
pr_all2
textplot_scale1d(pr_all2)

# Plot estimated document positions and group by "party" or "year" variable 
summary(testCorpus)
textplot_scale1d(pr_all2, margin = "documents",  groups = docvars(testCorpus, "Party"))
textplot_scale1d(pr_all2, margin = "documents",  groups = docvars(testCorpus, "Year"))

# we want to predict only the virgin texts using the rescaling LGB option
summary(ws) 
pr_lbg <- predict(ws, rescaling = "lbg", newdata = myDfm[c(2, 4, 6), ])
pr_lbg

# obtaining the corresponding confidence interval
pr_lbg <- predict(ws, rescaling = "lbg", newdata = myDfm[c(2, 4, 6), ], interval = "confidence")
pr_lbg

#############################################
#### ALTERNATIVE ways to plot confidence intervals with ggplot2 [raw scores]
#############################################

str(pr_all2)
pr_all2
pr_all2$fit
ch <- as.data.frame(pr_all2$fit)
ch$Party <- rownames(ch)
str(ch)
ch$ci <- ch$fit-ch$lwr
str(ch)

ggplot(ch, aes(x=reorder (Party, fit), y=fit, group=1)) +
    geom_point(aes()) +
    geom_errorbar(width=.1, aes(ymin=fit-ci, ymax=fit+ci), colour="darkred") + coord_flip() + xlab("Position") + ylab("Party")

#### ALTERNATIVE ways to plot confidence intervals with ggplot2 [LBG transformed scores]

# Plot estimated document positions [transformed scores - LBG]
pr2 <- predict(ws, rescaling = "lbg", newdata = myDfm[c(2, 4, 6), ], interval = "confidence")
pr2
str(pr2)
pr2$fit

rescaled <- as.data.frame(pr2$fit)
rescaled$Party <- rownames(rescaled)
str(rescaled )

str(rescaled )
rescaled $ci <- rescaled $fit-rescaled $lwr
str(rescaled )

ggplot(rescaled , aes(x=reorder (Party, fit), y=fit, group=1)) +
    geom_point(aes()) +
    geom_errorbar(width=.1, aes(ymin=fit-ci, ymax=fit+ci), colour="darkred") + coord_flip() + xlab("Position") + ylab("Party")

# if you want to add to the previous graph also the reference scores:
str(ws) 
fit <- ws$y
Party <-  ws$x@Dimnames$docs
original_scores <-data.frame(Party , fit)
original_scores$Party <- as.character(original_scores$Party )
str(original_scores)
original_scores <- na.omit(original_scores)
original_scores$lwr <- 0
original_scores$upr <- 0
str(original_scores)
original_scores$ci <- original_scores$upr-original_scores$lwr
str(original_scores)
str(rescaled)

rescaled  <- rbind(rescaled, original_scores) 
str(rescaled )

ggplot(rescaled , aes(x=reorder (Party, fit), y=fit, group=1)) +
    geom_point(aes()) +
    geom_errorbar(width=.1, aes(ymin=fit-ci, ymax=fit+ci), colour="darkred") + coord_flip() + xlab("Position") + ylab("Party")

#########################################################################
#########################################################################
# Using wordscores: UK example with social policy positions 
#########################################################################
#########################################################################

# reference texts: 1992 parties manifestos
# reference texts scores: 1992 parties manifestos. Lab: 6.87; LibDem: 6.53; Cons: 15.34
# Run the analysis by focusing on the raw scores

# FIRST step:
# Set reference scores 
refscores <- c(15.34, NA, 6.87, NA, 6.53, NA)
refscores

# SECOND step: 
# Assign the reference scores to your dfm
ws <- textmodel_wordscores(myDfm, refscores)
summary(ws) 

# THIRD step: we predict the raw Wordscores for all the texts (reference and virgin ones)
pr_all2 <- predict(ws, interval = "confidence")
pr_all2
textplot_scale1d(pr_all2)

# let's compare results we got via economic vs. social policy dimension
# Wordscores
ws <- textmodel_wordscores(myDfm, c(17.21, rep(NA,1), 5.35, rep(NA,1), 8.21, rep(NA,1)))
pr_eco <- predict(ws, interval = "confidence")
eco <- textplot_scale1d(pr_eco)
soc <- textplot_scale1d(pr_all2)

plot_grid(eco , soc , labels = c('Economic', 'Social'))

str(ws)
str(pr_all2)
str(pr_eco)

# check for the correlation
party <- ws$x@Dimnames$docs
score_soc <- pr_all2$fit
score_eco <- pr_eco$fit

scores_texts <-data.frame(party, score_soc, score_eco )
str(scores_texts) 
colnames(scores_texts)[2] <- "scoreSOC"
colnames(scores_texts)[5] <- "scoreECO"
str(scores_texts) 

cor(scores_texts$scoreSOC, scores_texts$scoreECO)

# Plotting the 2-D policy space

plot(scores_texts$scoreECO, scores_texts$scoreSOC, main = "UK 2-D policy space",
     xlab = "Economic scale", ylab = "Social Scale",
     pch = 19, frame = FALSE, xlim=c(9,12), ylim=c(9,11))
text(scores_texts$scoreECO, scores_texts$scoreSOC, labels = scores_texts$party, pos = 3)

#########################################################################
#########################################################################
# Wordscores vs. Wordfish
#########################################################################
#########################################################################

#########################################################################
# Let's compare the results we get from Wordfish with the raw score ones 
# we get from Wordscores using the economic policy position
#########################################################################
#########################################################################

# Wordscores
ws <- textmodel_wordscores(myDfm, c(17.21, rep(NA,1), 5.35, rep(NA,1), 8.21, rep(NA,1)))
pr_all <- predict(ws, interval = "confidence")

# Wordfish with LAB 92 to the left of CONS 92
wfm <- textmodel_wordfish(myDfm, dir = c(3, 1))

# Comparing wordscores vs wordfish
wordscores <- textplot_scale1d(pr_all)
wordfish <- textplot_scale1d(wfm, margin = "documents")
plot_grid(wordscores , wordfish , labels = c('Wordscores', 'Wordfish'))

# insights: a) same movement of parties between 1992 and 1997: that's a good thing!; 
# b) same position of Cons but not of other 2 parties: what are we measuring with wordfish goes beyond "economic policy" issues?

# check for the correlation
party <- wfm$docs
score_wf <-wfm$theta
score_ws <- pr_all$fit

scores_texts <-data.frame(party, score_wf, score_ws)
str(scores_texts) 
colnames(scores_texts)[3] <- "score_ws"
str(scores_texts) 

cor(scores_texts$score_ws, scores_texts$score_wf)

# you can also draw a scatter, with a fit lines and party names
plot(scores_texts$score_ws, scores_texts$score_wf, main="Scatterplot", 
  	xlab="Wordscores", ylab="Wordfish", pch=19)
text(scores_texts$score_ws, scores_texts$score_wf, labels = scores_texts$party, pos = 4,  col = "royalblue" , cex = 0.8)
abline(lm(scores_texts$score_wf ~scores_texts$score_ws ), col="red") # regression line (y~x) 

#########################################################################
#########################################################################
# Replicate the comparison between Wordfish and Wordscores estimates using
# for Wordscores the social policy position
#########################################################################
#########################################################################

# reference texts: 1992 parties manifestos
# reference texts scores: 1992 parties manifestos. Lab: 6.87; LibDem: 6.53; Cons: 15.34
# Run the analysis by focusing on the raw scores

# Wordscores
ws <- textmodel_wordscores(myDfm, c(15.34, rep(NA,1), 6.87, rep(NA,1), 6.53, rep(NA,1)))
pr_all <- predict(ws, interval = "confidence")

# Wordfish with LAB 92 to the left of CONS 92
wfm <- textmodel_wordfish(myDfm, dir = c(3, 1))

# Comparing wordscores vs wordfish
wordscores <- textplot_scale1d(pr_all)
wordfish <- textplot_scale1d(wfm, margin = "documents")
plot_grid(wordscores , wordfish , labels = c('Wordscores', 'Wordfish'))

# check for the correlation
party <- wfm$docs
score_wf <-wfm$theta
score_ws <- pr_all$fit

scores_texts_soc <-data.frame(party, score_wf, score_ws)
str(scores_texts_soc) 
colnames(scores_texts_soc)[3] <- "score_ws"
str(scores_texts_soc) 

# higher correlation with social policy rather than economic policy (.88 vs. .81). Wordfish scale more related to this dimension?
cor(scores_texts_soc$score_ws, scores_texts_soc$score_wf)

# you can also draw a scatter, with a fit lines and party names
plot(scores_texts_soc$score_ws, scores_texts_soc$score_wf, main="Scatterplot", 
  	xlab="Wordscores", ylab="Wordfish", pch=19)
text(scores_texts_soc$score_ws, scores_texts_soc$score_wf, labels = scores_texts_soc$party, pos = 4,  col = "royalblue" , cex = 0.8)
abline(lm(scores_texts_soc$score_wf ~scores_texts_soc$score_ws ), col="red") # regression line (y~x)