rm(list=ls(all=TRUE))

getwd()
# set here the folder where you saved the datasets you are going to use in this class
# setwd("*****")
# in my case:
# setwd("C:/Users/mw/Dropbox (VOICES)/TOPIC MODEL/")
setwd("C:/Users/luigi/Dropbox/TOPIC MODEL/")
getwd()

library(readtext)
library(quanteda)
library(quanteda.textmodels)
library(cowplot)
library(PerformanceAnalytics)
library(psych)

#########################################################################
#########################################################################
# Creating the corpus of the UK electoral programs, 1992 and 1997
#########################################################################
#########################################################################

myText <- readtext("Lecture 2/Wordscores manifestos/UK/*.txt",
                   docvarsfrom = "filenames", dvsep = " ",
                   docvarnames = c("Party", "Year"))
str(myText)

testCorpus <- corpus(myText)
summary(testCorpus)

# rename the documents by dropping the ".txt" extension
docnames(testCorpus) <- gsub(".txt", "", docnames(testCorpus))
summary(testCorpus)

#####################
# Remember! Comparing the results with and without stopwords, and with and
# without stemming, is good practice.
#####################

myDfm <- dfm(testCorpus, remove = stopwords("english"), tolower = TRUE,
             stem = TRUE, remove_punct = TRUE, remove_numbers = TRUE)
topfeatures(myDfm, 20) # 20 top words

#########################################################################
#########################################################################
# Using wordfish
#########################################################################
#########################################################################

# "dir" indicates which two documents are used for global identification purposes
# (the first document is constrained to lie to the left of the second one).
# This usually matters more for the interpretation of the results (i.e., the
# direction of the scores along the latent dimension: which documents get
# negative scores and which get positive ones) than for the estimation per se.

# here: LAB 92 to the left of CONS 92
summary(testCorpus)
wfm <- textmodel_wordfish(myDfm, dir = c(3, 1))
summary(wfm)

# here: CONS 92 to the left of LAB 92
wfm2 <- textmodel_wordfish(myDfm, dir = c(1, 3))
summary(wfm2)

# compare with the previous case! The scores are the same, just mirrored.
summary(wfm)
summary(wfm2)
cor(wfm$theta, wfm2$theta)

# what happens if you now put LIB 92 to the left of CONS 92?
summary(testCorpus)
wfm3 <- textmodel_wordfish(myDfm, dir = c(5, 1))
summary(wfm3)

# Nothing! The estimates are unchanged, because both anchorings imply the
# same orientation of the latent dimension.
cor(wfm$theta, wfm3$theta)
identical(wfm$theta, wfm3$theta)
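#####################
# A minimal robustness sketch (not part of the original exercise), following up on
# the "good practice" note above: re-estimate wordfish on a dfm that keeps stopwords
# and skips stemming, then correlate the document scores with those from myDfm.
# The object names myDfm_raw and wfm_raw are my own additions.
#####################

myDfm_raw <- dfm(testCorpus, tolower = TRUE,
                 remove_punct = TRUE, remove_numbers = TRUE)
wfm_raw <- textmodel_wordfish(myDfm_raw, dir = c(3, 1))
# a correlation close to 1 suggests the scaling is robust to these preprocessing choices
cor(wfm$theta, wfm_raw$theta)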
# Always run diagnostics!
# A good starting point is the analysis of the word discrimination parameters (beta):
# words with large absolute weights are estimated to lie at the extremes of the
# latent dimension.

# Plot estimated word positions
textplot_scale1d(wfm, margin = "features")

# Highlight specific words
textplot_scale1d(wfm, margin = "features",
                 highlighted = c("government", "global", "children", "bank",
                                 "economy", "citizenship", "productivity",
                                 "deficit", "nation", "freedom", "histor", "inflat"),
                 highlighted_color = "red")

# let's learn how to extract the estimates of the model and save them
# let's start with the word-level estimates
str(wfm)
words2 <- wfm$features
beta2 <- wfm$beta
psi2 <- wfm$psi
scores_words2 <- data.frame(words2, beta2, psi2)
str(scores_words2)

# top 20 words with the most negative beta
head(scores_words2[order(scores_words2$beta2), ], 20)
# top 20 words with the most positive beta
tail(scores_words2[order(scores_words2$beta2), ], 20)

write.csv(scores_words2, "result_wordfish_words.csv")

# Plot estimated document positions
summary(testCorpus)
textplot_scale1d(wfm, margin = "documents")
textplot_scale1d(wfm, margin = "documents", groups = docvars(testCorpus, "Party"))
textplot_scale1d(wfm, margin = "documents", groups = docvars(testCorpus, "Year"))

# extract the document-level estimates of the model and save them
party <- wfm$docs
theta <- wfm$theta
se.theta <- wfm$se.theta
scores_texts <- data.frame(party, theta, se.theta)
str(scores_texts)

# 95% confidence intervals around the document positions
scores_texts$lower <- scores_texts$theta - 1.96 * scores_texts$se.theta
scores_texts$upper <- scores_texts$theta + 1.96 * scores_texts$se.theta
str(scores_texts)

write.csv(scores_texts, "result_wordfish_texts.csv")

#########################################################################
#########################################################################
# Using wordfish: US presidential inaugural speeches after 1980
#########################################################################
#########################################################################

# apply wordfish by first considering Reagan 1981 to the right of Obama 2009,
# and then Trump 2017 to the right of Obama 2009: any change?

# create a dfm from the inaugural addresses from Reagan onwards
presDfm <- dfm(corpus_subset(data_corpus_inaugural, Year > 1980),
               remove = stopwords("english"), stem = TRUE, remove_punct = TRUE)
presDfm[, 1:10]
str(presDfm)
docnames(presDfm)

# Obama 2009 to the left of Reagan 1981
wfm <- textmodel_wordfish(presDfm, dir = c(8, 1))
summary(wfm)

# Plot estimated word positions
textplot_scale1d(wfm, margin = "features")
# Plot estimated document positions
textplot_scale1d(wfm, margin = "documents")

# Obama 2009 to the left of Trump 2017
wfm2 <- textmodel_wordfish(presDfm, dir = c(8, 10))
summary(wfm2)
# Plot estimated document positions
textplot_scale1d(wfm2, margin = "documents")

# check the correlation between the two sets of document scores
score_reagan <- wfm$theta
score_trump <- wfm2$theta
cor(score_reagan, score_trump)
identical(score_reagan, score_trump)

# Comparing the wordfish results with Reagan or Trump in "dir"
library(cowplot)
reagan <- textplot_scale1d(wfm, margin = "documents")
trump <- textplot_scale1d(wfm2, margin = "documents")
plot_grid(reagan, trump, labels = c('Reagan=1', 'Trump=1'))
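#####################
# A small follow-up sketch (my addition, not part of the original script): scatter
# the two sets of document estimates against each other to confirm visually that
# changing "dir" affects at most the orientation of the scale, not the relative
# ordering of the speeches.
#####################
plot(score_reagan, score_trump,
     xlab = "theta (Reagan 1981 anchoring)",
     ylab = "theta (Trump 2017 anchoring)")
text(score_reagan, score_trump, labels = docnames(presDfm), pos = 3, cex = 0.7)
abline(0, 1, lty = 2) # points on the 45-degree line imply identical estimates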