rm(list = ls(all = TRUE))
setwd("C:/Users/luigi/Dropbox/TOPIC MODEL/")

library(readtext)
library(quanteda)
library(quanteda.textmodels)
library(ggplot2)
library(cowplot)
library(PerformanceAnalytics)
library(psych)

#########################################################################
#########################################################################
# Creating the corpus of the UK electoral programs 1992, 1997
#########################################################################
#########################################################################

myText <- readtext("Lecture 2/Wordscores manifestos/UK/*.txt",
                   docvarsfrom = "filenames", dvsep = " ",
                   docvarnames = c("Party", "Year"))
str(myText)

testCorpus <- corpus(myText)
summary(testCorpus)

# rename the documents by dropping the ".txt" extension
docnames(testCorpus) <- gsub(".txt", "", docnames(testCorpus), fixed = TRUE)
summary(testCorpus)

#####################
# Remember! Comparing the results with and without stopwords, and with and
# without stemming, is always good practice
#####################

myDfm <- dfm(testCorpus, remove = stopwords("english"), tolower = TRUE,
             stem = TRUE, remove_punct = TRUE, remove_numbers = TRUE,
             split_hyphens = TRUE)
topfeatures(myDfm, 20)  # 20 top features

# keep only features that are at least 2 characters long
# (the nchar filter belongs to dfm_select, not dfm_remove)
myDfm <- dfm_select(myDfm, min_nchar = 2)

#########################################################################
#########################################################################
# Using Wordfish
#########################################################################
#########################################################################

# dir indicates which two documents are used for global identification purposes
# (the first document lies to the left of the second one).
# This usually matters more for the interpretation of the results (i.e., for the
# direction of the scores along the latent dimension: which are positive, which
# negative) than for the estimation per se.

# here: LAB 92 to the left of CONS 92
summary(testCorpus)
wfm <- textmodel_wordfish(myDfm, dir = c(3, 1))
summary(wfm)

# here: CONS 92 to the left of LAB 92
wfm2 <- textmodel_wordfish(myDfm, dir = c(1, 3))
summary(wfm2)

# compare with the previous case!
summary(wfm)
summary(wfm2)
cor(wfm$theta, wfm2$theta)

# what happens if you now put LIB 92 to the left of CONS 92?
summary(testCorpus)
wfm3 <- textmodel_wordfish(myDfm, dir = c(5, 1))
summary(wfm3)
# Nothing!
cor(wfm$theta, wfm3$theta)
identical(wfm$theta, wfm3$theta)

# Always run diagnostics!
# A good starting point is the analysis of the word discrimination parameters (beta):
# features with large absolute weights are estimated to lie at the extremes
# of the latent dimension.

# let's learn how to extract the estimates of the model and save them,
# starting with the features
str(wfm)
words2 <- wfm$features
beta2 <- wfm$beta
psi2 <- wfm$psi
scores_words2 <- data.frame(words2, beta2, psi2)
str(scores_words2)

# top 40 features for negative beta
head(scores_words2[order(scores_words2$beta2), ], 40)
# top 40 features for positive beta
tail(scores_words2[order(scores_words2$beta2), ], 40)

# with just 3 documents, the meaning of the latent dimension is not obvious
# from the betas alone (at least from the first 40 features). Perhaps pessimism
# vs. optimism about the economy?
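# A further diagnostic: a minimal sketch (using only the scores_words2 data
# frame built above) of the classic beta-vs-psi plot. Words in the lower-left
# and lower-right corners discriminate strongly but are rare, so their
# estimates should be treated with caution. The 10-word cutoff is arbitrary.
plot(scores_words2$beta2, scores_words2$psi2,
     xlab = "Beta (discrimination)", ylab = "Psi (word fixed effect)",
     pch = 19, col = "grey70", cex = 0.5)
extreme <- scores_words2[order(scores_words2$beta2), ]
extreme <- rbind(head(extreme, 10), tail(extreme, 10))  # 10 most extreme words per side
text(extreme$beta2, extreme$psi2, labels = extreme$words2,
     col = "darkred", cex = 0.7)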
# Plot estimated word positions, highlighting specific words
textplot_scale1d(wfm, margin = "features",
                 highlighted = c("inflationari", "renov", "petrol",
                                 "supremaci", "dream", "renaiss"),
                 highlighted_color = "red")

# Plot estimated document positions
summary(testCorpus)
textplot_scale1d(wfm, margin = "documents")
textplot_scale1d(wfm, margin = "documents", groups = docvars(testCorpus, "Party"))
textplot_scale1d(wfm, margin = "documents", groups = docvars(testCorpus, "Year"))

#########################################################################
#########################################################################
# Using Wordfish: US presidential inaugural speeches after 1980
#########################################################################
#########################################################################

# apply Wordfish by first considering Reagan 1981 to the right of Obama 2009,
# and then Trump 2017 to the right of Obama 2009: any change?

# create a dfm from the inaugural addresses from Reagan onwards
presDfm <- dfm(corpus_subset(data_corpus_inaugural, Year > 1980),
               remove = stopwords("english"), stem = TRUE,
               remove_punct = TRUE, remove_numbers = TRUE,
               split_hyphens = TRUE)
presDfm[, 1:10]
str(presDfm)
docnames(presDfm)

# Obama 2009 to the left of Reagan 1981
wfm <- textmodel_wordfish(presDfm, dir = c(8, 1))
summary(wfm)

words2 <- wfm$features
beta2 <- wfm$beta
psi2 <- wfm$psi
scores_words2 <- data.frame(words2, beta2, psi2)
str(scores_words2)

# top 40 words for negative beta
head(scores_words2[order(scores_words2$beta2), ], 40)
# top 40 words for positive beta
tail(scores_words2[order(scores_words2$beta2), ], 40)
# meaning of the latent dimension: dark vs. bright side of power?

# Highlight specific words
textplot_scale1d(wfm, margin = "features",
                 highlighted = c("tyranni", "murder", "mortal",
                                 "cheer", "rainbow", "sovereignti"),
                 highlighted_color = "red")

# Plot estimated document positions
textplot_scale1d(wfm, margin = "documents")

# Obama 2009 to the left of Trump 2017
wfm2 <- textmodel_wordfish(presDfm, dir = c(8, 10))
summary(wfm2)

# Plot estimated document positions
textplot_scale1d(wfm2, margin = "documents")

# check for the correlation
score_reagan <- wfm$theta
score_trump <- wfm2$theta
cor(score_reagan, score_trump)
identical(score_reagan, score_trump)

#########################################################################
#########################################################################
# Using Wordscores
#########################################################################
#########################################################################

#########################################################################
#########################################################################
# Drop unique words and check correlations across documents
#########################################################################
#########################################################################

summary(testCorpus)
# keep only words occurring more than once
myDfm <- dfm_trim(myDfm, min_termfreq = 2)
# compute some document similarities
Simil <- textstat_simil(myDfm, method = "correlation")
Simil
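# A minimal sketch (nothing new is estimated here): the textstat_simil object
# can be coerced to a plain matrix, which makes the pairwise document
# correlations easier to inspect or export.
round(as.matrix(Simil), 2)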
#########################################################################
#########################################################################
# Using Wordscores: UK example with economic policy positions
#########################################################################
#########################################################################

# reference texts: the 1992 party manifestos
# reference text scores (derived from an expert survey):
# Lab: 5.35; LibDem: 8.21; Cons: 17.21

###############
# FIRST step:
# Set the reference scores
###############

# the positions follow the document order in the dfm:
# Con 1992, Con 1997, Lab 1992, Lab 1997, LibDem 1992, LibDem 1997
refscores <- c(17.21, NA, 5.35, NA, 8.21, NA)
refscores

###############
# SECOND step:
# Assign the reference scores to your dfm
###############

ws <- textmodel_wordscores(myDfm, refscores)
summary(ws)

# Plot estimated word positions in the reference texts
# (it shows word frequency vs. word score)
textplot_scale1d(ws)

# Doing the FIRST and SECOND steps in one single step
ws2 <- textmodel_wordscores(myDfm, c(17.21, NA, 5.35, NA, 8.21, NA))
summary(ws2)

# an alternative (equivalent) way to set the reference scores
ws3 <- textmodel_wordscores(myDfm, c(17.21, rep(NA, 1), 5.35, rep(NA, 1), 8.21, rep(NA, 1)))
summary(ws3)

###############
# THIRD step:
# Predict the raw Wordscores for all the texts (reference and virgin ones)
###############

pr_raw <- predict(ws, se.fit = TRUE, newdata = myDfm)
pr_raw
textplot_scale1d(pr_raw)

# alternative way (with confidence intervals rather than standard errors)
pr_all2 <- predict(ws, interval = "confidence", newdata = myDfm)
pr_all2
textplot_scale1d(pr_all2)

# Plot estimated document positions, grouped by the "Party" or "Year" variable
summary(testCorpus)
textplot_scale1d(pr_all2, margin = "documents", groups = docvars(testCorpus, "Party"))
textplot_scale1d(pr_all2, margin = "documents", groups = docvars(testCorpus, "Year"))

# predict only the virgin texts using the LBG rescaling option
summary(ws)
pr_lbg <- predict(ws, rescaling = "lbg", newdata = myDfm[c(2, 4, 6), ])
pr_lbg

# obtaining the corresponding confidence intervals
pr_lbg <- predict(ws, rescaling = "lbg", newdata = myDfm[c(2, 4, 6), ],
                  interval = "confidence")
pr_lbg

#############################################
#### ALTERNATIVE ways to plot confidence intervals with ggplot2 [raw scores]
#############################################

# save the estimates - document scores
str(pr_all2)
pr_all2
pr_all2$fit
ch <- as.data.frame(pr_all2$fit)
ch$Party <- rownames(ch)
str(ch)
ch$ci <- ch$fit - ch$lwr
str(ch)

ggplot(ch, aes(x = reorder(Party, fit), y = fit, group = 1)) +
  geom_point() +
  geom_errorbar(width = .1, aes(ymin = fit - ci, ymax = fit + ci), colour = "darkred") +
  coord_flip() +
  xlab("Party") + ylab("Position")

#############################################
#### ALTERNATIVE ways to plot confidence intervals with ggplot2 [LBG transformed scores]
#############################################

# Plot estimated document positions [transformed scores - LBG]
pr2 <- predict(ws, rescaling = "lbg", newdata = myDfm[c(2, 4, 6), ],
               interval = "confidence")
pr2
str(pr2)
pr2$fit
rescaled <- as.data.frame(pr2$fit)
rescaled$Party <- rownames(rescaled)
str(rescaled)
rescaled$ci <- rescaled$fit - rescaled$lwr
str(rescaled)

ggplot(rescaled, aes(x = reorder(Party, fit), y = fit, group = 1)) +
  geom_point() +
  geom_errorbar(width = .1, aes(ymin = fit - ci, ymax = fit + ci), colour = "darkred") +
  coord_flip() +
  xlab("Party") + ylab("Position")

# if you also want to add the reference scores to the previous graph:
str(ws)
fit <- ws$y
Party <- docnames(ws$x)
original_scores <- data.frame(Party, fit)
original_scores$Party <- as.character(original_scores$Party)
str(original_scores)
original_scores <- na.omit(original_scores)
original_scores$lwr <- 0
original_scores$upr <- 0
str(original_scores)
original_scores$ci <- original_scores$upr - original_scores$lwr
str(original_scores)
str(rescaled)
rescaled <- rbind(rescaled, original_scores)
str(rescaled)

ggplot(rescaled, aes(x = reorder(Party, fit), y = fit, group = 1)) +
  geom_point() +
  geom_errorbar(width = .1, aes(ymin = fit - ci, ymax = fit + ci), colour = "darkred") +
  coord_flip() +
  xlab("Party") + ylab("Position")
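# Besides "lbg", predict() also accepts rescaling = "mv" (the Martin-Vanberg
# transformation). A minimal sketch for comparison with the LBG scores above;
# note that with more than two reference scores the MV rescaling uses only the
# min and max reference values (quanteda warns about this).
pr_mv <- predict(ws, rescaling = "mv", newdata = myDfm[c(2, 4, 6), ])
pr_mv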
#########################################################################
#########################################################################
# Using Wordscores: UK example with social policy positions
#########################################################################
#########################################################################

# reference texts: the 1992 party manifestos
# reference text scores: Lab: 6.87; LibDem: 6.53; Cons: 15.34
# Run the analysis by focusing on the raw scores

# FIRST step:
# Set the reference scores
refscores <- c(15.34, NA, 6.87, NA, 6.53, NA)
refscores

# SECOND step:
# Assign the reference scores to your dfm
ws <- textmodel_wordscores(myDfm, refscores)
summary(ws)

# Plot estimated word positions in the reference texts
# (it shows word frequency vs. word score)
textplot_scale1d(ws)

# THIRD step: predict the raw Wordscores for all the texts (reference and virgin ones)
pr_all2 <- predict(ws, interval = "confidence")
pr_all2
textplot_scale1d(pr_all2)

# let's compare the results we got via the economic vs. the social policy dimension

# Wordscores
ws <- textmodel_wordscores(myDfm, c(17.21, NA, 5.35, NA, 8.21, NA))
pr_eco <- predict(ws, interval = "confidence")
eco <- textplot_scale1d(pr_eco)
soc <- textplot_scale1d(pr_all2)
plot_grid(eco, soc, labels = c('Economic', 'Social'))

str(ws)
str(pr_all2)
str(pr_eco)

# check for the correlation
party <- docnames(ws$x)
score_soc <- pr_all2$fit
score_eco <- pr_eco$fit
scores_texts <- data.frame(party, score_soc, score_eco)
str(scores_texts)
colnames(scores_texts)[2] <- "scoreSOC"
colnames(scores_texts)[5] <- "scoreECO"
str(scores_texts)
cor(scores_texts$scoreSOC, scores_texts$scoreECO)

# Plotting the 2-D policy space
plot(scores_texts$scoreECO, scores_texts$scoreSOC,
     main = "UK 2-D policy space",
     xlab = "Economic scale", ylab = "Social scale",
     pch = 19, frame = FALSE, xlim = c(9, 12), ylim = c(9, 11))
text(scores_texts$scoreECO, scores_texts$scoreSOC,
     labels = scores_texts$party, pos = 3)
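# PerformanceAnalytics is loaded at the top of the script but not used so far.
# A minimal sketch using its chart.Correlation() to plot the two policy
# dimensions together with their correlation; with only 6 documents, take the
# significance stars with a grain of salt.
chart.Correlation(scores_texts[, c("scoreECO", "scoreSOC")], histogram = TRUE)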
#########################################################################
#########################################################################
# Wordscores vs. Wordfish
#########################################################################
#########################################################################

#########################################################################
# Let's compare the results we get from Wordfish with the raw-score ones
# we get from Wordscores using the economic policy positions
#########################################################################
#########################################################################

# Wordscores
ws <- textmodel_wordscores(myDfm, c(17.21, NA, 5.35, NA, 8.21, NA))
pr_all <- predict(ws, interval = "confidence")

# Wordfish with LAB 92 to the left of CONS 92
wfm <- textmodel_wordfish(myDfm, dir = c(3, 1))

# Comparing Wordscores vs. Wordfish
wordscores <- textplot_scale1d(pr_all)
wordfish <- textplot_scale1d(wfm, margin = "documents")
plot_grid(wordscores, wordfish, labels = c('Wordscores', 'Wordfish'))

# insights:
# a) the parties move the same way between 1992 and 1997: that's a good thing!
# b) the Cons position is stable, but not those of the other 2 parties:
#    does what we measure with Wordfish go beyond "economic policy" issues?

# check for the correlation
party <- wfm$docs
score_wf <- wfm$theta
score_ws <- pr_all$fit
scores_texts <- data.frame(party, score_wf, score_ws)
str(scores_texts)
colnames(scores_texts)[3] <- "score_ws"
str(scores_texts)
cor(scores_texts$score_ws, scores_texts$score_wf)

# you can also draw a scatterplot, with a fit line and party names
plot(scores_texts$score_ws, scores_texts$score_wf,
     main = "Scatterplot", xlab = "Wordscores", ylab = "Wordfish", pch = 19)
text(scores_texts$score_ws, scores_texts$score_wf,
     labels = scores_texts$party, pos = 4, col = "royalblue", cex = 0.8)
abline(lm(scores_texts$score_wf ~ scores_texts$score_ws), col = "red")  # regression line (y ~ x)

#########################################################################
#########################################################################
# Replicate the comparison between the Wordfish and Wordscores estimates,
# using for Wordscores the social policy positions
#########################################################################
#########################################################################

# reference texts: the 1992 party manifestos
# reference text scores: Lab: 6.87; LibDem: 6.53; Cons: 15.34
# Run the analysis by focusing on the raw scores

# Wordscores
ws <- textmodel_wordscores(myDfm, c(15.34, NA, 6.87, NA, 6.53, NA))
pr_all <- predict(ws, interval = "confidence")

# Wordfish with LAB 92 to the left of CONS 92
wfm <- textmodel_wordfish(myDfm, dir = c(3, 1))

# Comparing Wordscores vs. Wordfish
wordscores <- textplot_scale1d(pr_all)
wordfish <- textplot_scale1d(wfm, margin = "documents")
plot_grid(wordscores, wordfish, labels = c('Wordscores', 'Wordfish'))

# check for the correlation
party <- wfm$docs
score_wf <- wfm$theta
score_ws <- pr_all$fit
scores_texts_soc <- data.frame(party, score_wf, score_ws)
str(scores_texts_soc)
colnames(scores_texts_soc)[3] <- "score_ws"
str(scores_texts_soc)

# higher correlation with the social than with the economic policy dimension
# (.89 vs. .82): is the Wordfish scale more related to this dimension?
cor(scores_texts_soc$score_ws, scores_texts_soc$score_wf)

# you can also draw a scatterplot, with a fit line and party names
plot(scores_texts_soc$score_ws, scores_texts_soc$score_wf,
     main = "Scatterplot", xlab = "Wordscores", ylab = "Wordfish", pch = 19)
text(scores_texts_soc$score_ws, scores_texts_soc$score_wf,
     labels = scores_texts_soc$party, pos = 4, col = "royalblue", cex = 0.8)
abline(lm(scores_texts_soc$score_wf ~ scores_texts_soc$score_ws), col = "red")  # regression line (y ~ x)
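# psych is also loaded at the top but unused so far. As a final check, a
# minimal sketch putting the Wordfish scale and both Wordscores dimensions
# side by side: scores_texts holds the economic comparison built above,
# scores_texts_soc the social one.
all_scales <- data.frame(wordfish = scores_texts_soc$score_wf,
                         ws_economic = scores_texts$score_ws,
                         ws_social = scores_texts_soc$score_ws)
round(cor(all_scales), 2)
pairs.panels(all_scales)  # psych: scatterplots, histograms and correlations in one plot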