# ------------------------------------------------------------------------
# Setup
# NOTE(review): rm(list = ls()) and setwd() in scripts are discouraged
# (they wipe the user's session and hard-code a machine-specific path);
# kept here because the rest of the script relies on relative paths.
# ------------------------------------------------------------------------
rm(list=ls(all=TRUE))
setwd("C:/Users/luigi/Dropbox/TOPIC MODEL/")

library(readtext)
library(quanteda)
library(quanteda.textmodels)
library(cowplot)
library(PerformanceAnalytics)
library(psych)
library(quanteda.textplots)
library(quanteda.textstats)
library(ggplot2)
library(ggrepel)
library(dplyr)

#########################################################################
#########################################################################
# Creating the Corpus of the UK electoral programs 1992, 1997
#########################################################################
#########################################################################

# Party and year are recovered from the file names ("PARTY YEAR.txt")
myText <- readtext("Lecture 2/Wordscores manifestos/UK/*.txt",
                   docvarsfrom = "filenames", dvsep = " ",
                   docvarnames = c("Party", "Year"))
str(myText)

# Replace apostrophes with spaces so contractions split into separate tokens
myText$text <- gsub("'", " ", myText$text)

testCorpus <- corpus(myText)
summary(testCorpus)

# Rename the documents by stripping the ".txt" extension.
# BUG FIX: the original call lacked fixed = TRUE, so the unescaped "."
# acted as a regex wildcard (any character followed by "txt" matched,
# e.g. "atxt" would also have been removed).
docnames(testCorpus) <- gsub(".txt", "", docnames(testCorpus), fixed = TRUE)
summary(testCorpus)

# Tokenize, drop punctuation/numbers/symbols, remove stopwords, and stem
tok2 <- tokens(testCorpus, remove_punct = TRUE, remove_numbers = TRUE,
               remove_symbols = TRUE, split_hyphens = TRUE,
               remove_separators = TRUE)
tok2 <- tokens_remove(tok2, stopwords("en"))
tok2 <- tokens_wordstem(tok2)

myDfm <- dfm(tok2)
topfeatures(myDfm, 20)  # 20 top words

# Keep only features with at least 2 characters (removes e.g. the stray "s")
myDfm <- dfm_remove(myDfm, min_nchar = 2)
topfeatures(myDfm, 20)  # 20 top words

#########################################################################
#########################################################################
# Using Wordfish
#########################################################################
#########################################################################

# dir indicates which two documents are used for global identification
# purposes (the first document to the left of the second one); this matters
# ONLY for the interpretation of the results (i.e., for the direction of
# the scores along the latent dimension - which positive / which negative),
# not for the estimation per se
docvars(myDfm)

# Here: LAB 92 (doc 3) constrained to the left of CONS 92 (doc 1)
wfm <- textmodel_wordfish(myDfm, dir = c(3, 1))
summary(wfm)

# Here: CONS 92 to the left of LAB 92 (the constraint is reversed)
wfm2 <- textmodel_wordfish(myDfm, dir = c(1, 3))
summary(wfm2)

# Compare with the previous case: the document scores are simply mirrored
summary(wfm)
summary(wfm2)
str(wfm)
cor(wfm$theta, wfm2$theta)

# What happens if we now put LIB 92 (doc 5) to the left of CONS 92?
summary(testCorpus)
wfm3 <- textmodel_wordfish(myDfm, dir = c(5, 1))
summary(wfm3)

# Nothing! dir only fixes the direction of the scale, not the estimates
cor(wfm$theta, wfm3$theta)
identical(wfm$theta, wfm3$theta)

# Always run some diagnostics on the word discrimination parameters (beta).
# Features with large |beta| are estimated to lie at the extremes of the
# latent dimension. Start by extracting the model estimates; note that
# data.frame() auto-names the columns wfm.features / wfm.beta / wfm.psi.
str(wfm)
scores_words <- data.frame(wfm$features, wfm$beta, wfm$psi)
str(scores_words)

# Correlation between word fixed effects (psi) and |beta|: as expected it is
# strongly negative - the larger psi is, the smaller |beta|, and vice versa
cor(abs(scores_words$wfm.beta), scores_words$wfm.psi)

# Plot estimated word positions
textplot_scale1d(wfm, margin = "features")

# Alternative psi/beta plot via ggplot: the typical triangle shape appears.
# First with plain points...
ggplot(scores_words, aes(wfm.beta, wfm.psi)) +
  geom_point() +
  theme_light() +
  labs(title = "Scatterplot for UK-manifestoes",
       x = "Estimated Beta", y = "Estimated Psi")

# ...then with feature labels
ggplot(scores_words, aes(wfm.beta, wfm.psi, label = wfm.features)) +
  geom_point() +
  geom_text(hjust = 0, vjust = 0) +
  theme_light() +
  labs(title = "Scatterplot for UK-manifestoes",
       x = "Estimated Beta", y = "Estimated Psi")

# Let's try to interpret the results by focusing on the words with the
# largest betas in absolute value.
# Top 40 features for negative beta: renounce, democrat, banks,
# enterprises, pollutants, nationals
head(scores_words[order(scores_words$wfm.beta), ], 40)

# Top 40 words for positive beta: producers, preservation, efforts, enrich,
# illegal, socialism
tail(scores_words[order(scores_words$wfm.beta), ], 40)

# With just 6 documents the meaning of the latent dimension is not very
# clear from the betas alone (at least the first 40 features). Perhaps
# progressive vs. conservative?

# Visual representation: extract the 10 words with the largest positive and
# the 10 words with the largest negative beta.
# FIX: slice_max()/slice_min() replace the superseded dplyr::top_n();
# the default with_ties = TRUE matches top_n()'s tie handling, so the same
# rows are selected.
scores_words2 <- slice_max(scores_words, wfm.beta, n = 10)
scores_words2
scores_words3 <- slice_min(scores_words, wfm.beta, n = 10)
scores_words3
scores_words_new <- rbind(scores_words2, scores_words3)

# Reorder the features by beta so the bar chart is sorted
scores_words_new <- mutate(scores_words_new,
                           Feature = reorder(wfm.features, wfm.beta))

ggplot(scores_words_new, aes(Feature, wfm.beta)) +
  geom_col(aes(fill = wfm.psi)) +
  scale_fill_viridis_c() +
  coord_flip() +
  theme_light() +
  labs(title = "\nTop 10 words with the highest/lowest beta-value\n",
       x = "", y = "Beta", fill = "Psi")

# Plot estimated word positions, highlighting specific words
textplot_scale1d(wfm, margin = "features",
                 highlighted = c("renounce", "pollutants", "democrat",
                                 "producers", "illegal", "enrich"),
                 highlighted_color = "red", alpha = 0.3)

# Alternatively, label only the features with abs(beta) > 3.
# geom_text_repel (rather than geom_text) avoids overlapping labels.
ggplot(scores_words, aes(wfm.beta, wfm.psi)) +
  geom_point() +
  theme_light() +
  labs(title = "Scatterplot for UK-manifestoes",
       x = "Estimated Beta", y = "Estimated Psi") +
  geom_text_repel(data = filter(scores_words, abs(wfm.beta) > 3),
                  aes(label = wfm.features))

# Plot estimated document positions
summary(testCorpus)
textplot_scale1d(wfm, margin = "documents")
textplot_scale1d(wfm, margin = "documents",
                 groups = docvars(testCorpus, "Party"))
textplot_scale1d(wfm, margin = "documents", groups =
docvars(testCorpus, "Year")) ######################################################################### ######################################################################### # Let's compare the results we get from Wordfish with the raw score ones # we get from Wordscores using the economic policy position ######################################################################### ######################################################################### docvars(myDfm) # Wordscores ws <- textmodel_wordscores(myDfm, c(17.21, rep(NA,1), 5.35, rep(NA,1), 8.21, rep(NA,1))) pr_all <- predict(ws, interval = "confidence") # Comparing wordscores vs wordfish wordscores <- textplot_scale1d(pr_all) wordfish <- textplot_scale1d(wfm, margin = "documents") plot_grid(wordscores , wordfish , labels = c('Wordscores', 'Wordfish')) # check for the correlation party <- wfm$docs score_wf <-wfm$theta score_ws <- pr_all$fit scores_texts <-data.frame(party, score_wf, score_ws) str(scores_texts) colnames(scores_texts)[3] <- "score_ws" str(scores_texts) # high but not perfect correlation. Two different dimensions? cor(scores_texts$score_ws, scores_texts$score_wf) # you can also draw a scatter, with a fit lines and party names ggplot(scores_texts, aes(score_ws, score_wf, label=party)) + geom_point() +geom_text(hjust=0, vjust=0) + theme_light() + labs(title = "Scatterplot for UK-manifestoes", x = "Wordscores Economy", y = "Wordfish") + geom_smooth(method = "lm", se = TRUE) ######################################################################### ######################################################################### # Using wordfish: US Presidential Inaugural Speech after 1980 ######################################################################### ######################################################################### # apply wordfish by first considering Reagan 1981 to the right of Obama 2009; # and then Trump 2017 to the right of Obama 2009: any change? 
corpus_Pres <- corpus_subset(data_corpus_inaugural, Year > 1980)

# Same preprocessing pipeline as for the UK manifestos
tok2 <- tokens(corpus_Pres, remove_punct = TRUE, remove_numbers = TRUE,
               remove_symbols = TRUE, split_hyphens = TRUE,
               remove_separators = TRUE)
tok2 <- tokens_remove(tok2, stopwords("en"))
tok2 <- tokens_wordstem(tok2)
presDfm <- dfm(tok2)
docvars(presDfm)

# Obama 2009 (doc 8) constrained to the left of Reagan 1981 (doc 1)
wfm <- textmodel_wordfish(presDfm, dir = c(8, 1))
summary(wfm)

# Extract the word-level estimates into a data frame
words2 <- wfm$features
beta2 <- wfm$beta
psi2 <- wfm$psi
scores_words2 <- data.frame(words2, beta2, psi2)
str(scores_words2)

# Top 40 words for negative beta: tyranni; ideolog; murder; tyrant; unlimit
head(scores_words2[order(scores_words2$beta2), ], 40)

# Top 40 words for positive beta: rainbow; western; sovereignti; cheer
tail(scores_words2[order(scores_words2$beta2), ], 40)

# Possible meaning of the latent dimension: a pessimistic/realistic vs.
# optimistic side of power? To us it does not appear related to ideology,
# as the document plot below also suggests (face validity).

# Highlight specific words
textplot_scale1d(wfm, margin = "features",
                 highlighted = c("tyranni", "murder", "mortal",
                                 "cheer", "rainbow", "sovereignti"),
                 highlighted_color = "red", alpha = 0.3)

# Plot estimated document positions
textplot_scale1d(wfm, margin = "documents")

# Obama 2009 (doc 8) to the left of Trump 2017 (doc 10)
wfm2 <- textmodel_wordfish(presDfm, dir = c(8, 10))
summary(wfm2)

# Plot estimated document positions
textplot_scale1d(wfm2, margin = "documents")

# Check the correlation between the two runs.
# NOTE(review): identical() on floating-point vectors is an exact
# byte-for-byte comparison; it is used here deliberately to show whether
# the two runs produce literally the same estimates.
score_reagan <- wfm$theta
score_trump <- wfm2$theta
cor(score_reagan, score_trump)
identical(score_reagan, score_trump)