rm(list=ls(all=TRUE))
setwd("C:/Users/luigi/Dropbox/TOPIC MODEL/Lecture 2/Wordscores manifestos/UK")
getwd()

library(readtext)
library(quanteda)
library(quanteda.textmodels)
library(quanteda.textplots)
library(quanteda.textstats)
library(ggplot2)
library(cowplot)   # needed later for plot_grid()

#########################################################################
#########################################################################
# Creating and Working with a Corpus
# Let's create a corpus based on the UK parties' manifestos from the 1992 and 1997 general elections
#########################################################################
#########################################################################

myText <- readtext("C:/Users/luigi/Dropbox/TOPIC MODEL/Lecture 2/Wordscores manifestos/UK/*.txt",
                   docvarsfrom = "filenames", dvsep = " ", docvarnames = c("Party", "Year"))
str(myText)

# Text pre-processing: let's do some cleaning;
# for example, replace the apostrophe (quanteda struggles with it...)
myText$text <- gsub("'", " ", myText$text)

testCorpus <- corpus(myText)
summary(testCorpus)

# Rename the documents by dropping the ".txt" extension
docnames(testCorpus) <- gsub(".txt", "", docnames(testCorpus))
summary(testCorpus)

# Alternatively, I could create a new variable
myText$code <- paste(myText$Party, as.character(myText$Year), sep = " ")
# and then pass it as the document identifier
testCorpus2 <- corpus(myText, docid_field = "code")
summary(testCorpus2)

# Comparing the results with and w/o stopwords, and with and w/o stemming, is always good practice.
tok2 <- tokens(testCorpus, remove_punct = TRUE, remove_numbers = TRUE, remove_symbols = TRUE,
               split_hyphens = TRUE, remove_separators = TRUE)
tok2 <- tokens_remove(tok2, stopwords("en"))
tok2 <- tokens_wordstem(tok2)
myDfm <- dfm(tok2)

topfeatures(myDfm, 20)   # 20 top words

# Let's keep only those features with at least 2 characters (to remove, for example, the "s")
myDfm <- dfm_select(myDfm, min_nchar = 2)
topfeatures(myDfm, 20)   # 20 top words

#########################################################################
#########################################################################
# Check similarity across documents
#########################################################################
#########################################################################

summary(testCorpus)

# compute some document similarities
Simil <- textstat_simil(myDfm, method = "cosine")
Simil

#########################################################################
#########################################################################
# Using Wordscores: UK example with economic policy positions
#########################################################################
#########################################################################

# Reference texts: the 1992 party manifestos
# Reference text scores: Lab: 5.35; LibDem: 8.21; Cons: 17.21
# (the lower (higher) the score, the more (less) pro-state intervention in the economy)
# Reference scores derived from the Laver-Garry expert survey

# FIRST step:
# Set the reference scores
refscores <- c(17.21, NA, 5.35, NA, 8.21, NA)
refscores

# SECOND step:
# Assign the reference scores to your dfm
ws <- textmodel_wordscores(myDfm, refscores)
summary(ws)
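# Optional sanity check (a minimal sketch): textmodel_wordscores() matches the
# reference scores to documents by position, so refscores must follow the
# document order of myDfm (assumed here to be alphabetical by party, with the
# 1992 manifesto before the 1997 one for each party).
# Printing them side by side makes any misalignment easy to spot.
data.frame(document = docnames(myDfm), reference_score = refscores)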
# Plot the estimated word positions in the reference texts (highlight some words and print them in red);
# the plot shows word frequency vs. word score
textplot_scale1d(ws, margin = "features",
                 highlighted = c("budget", "green", "millennium"),
                 highlighted_color = "red")

# Doing the FIRST and SECOND steps in one single step
ws2 <- textmodel_wordscores(myDfm, c(17.21, NA, 5.35, NA, 8.21, NA))
summary(ws2)

# alternative way to set the reference scores
ws3 <- textmodel_wordscores(myDfm, c(17.21, rep(NA, 1), 5.35, rep(NA, 1), 8.21, rep(NA, 1)))
summary(ws3)

# THIRD step: predict the raw Wordscores for all the texts (reference and virgin ones)
pr_raw <- predict(ws, se.fit = TRUE, newdata = myDfm)
# Why that warning? Because the dfm contains more features than the reference texts,
# so some features included in the dfm cannot get a wordscore from the reference texts!
nfeat(myDfm)              # 6559 features in the dfm of our corpus
length(ws$wordscores)     # 5061 features scored in the reference texts
nfeat(myDfm) - length(ws$wordscores)

# let's see the results
pr_raw
textplot_scale1d(pr_raw)

# alternative way (with c.i. rather than with s.e.)
pr_all2 <- predict(ws, interval = "confidence", newdata = myDfm)
pr_all2
textplot_scale1d(pr_all2)

# Plot the estimated document positions, grouped by the "Party" or the "Year" variable
summary(testCorpus)
textplot_scale1d(pr_all2, margin = "documents", groups = docvars(testCorpus, "Party"))
textplot_scale1d(pr_all2, margin = "documents", groups = docvars(testCorpus, "Year"))

# Predict only the virgin texts using the LBG rescaling option
summary(ws)
pr_lbg <- predict(ws, rescaling = "lbg", newdata = myDfm[c(2, 4, 6), ])
pr_lbg

# obtain the corresponding confidence intervals
pr_lbg <- predict(ws, rescaling = "lbg", newdata = myDfm[c(2, 4, 6), ],
                  interval = "confidence")
pr_lbg

#########################################################################
#########################################################################
# Using Wordscores: UK example with decentralization positions
#########################################################################
#########################################################################

# Reference texts: the 1992 party manifestos
# Reference text scores: Lab: 10.21; LibDem: 5.26; Cons: 15.61
# (the higher (lower) the score, the less (more) pro-decentralization)

# Run the analysis focusing on the raw scores

# FIRST step:
# Set the reference scores
refscores <- c(15.61, NA, 10.21, NA, 5.26, NA)
refscores

# SECOND step:
# Assign the reference scores to your dfm
ws <- textmodel_wordscores(myDfm, refscores)
summary(ws)

# THIRD step: predict the raw Wordscores for all the texts (reference and virgin ones)
pr_all2 <- predict(ws, interval = "confidence")
pr_all2
textplot_scale1d(pr_all2)
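# Side illustration (a sketch, not part of the main analysis): besides "lbg",
# predict.textmodel_wordscores() also offers the Martin-Vanberg ("mv") rescaling,
# applied here to the decentralization model just fitted. With three reference
# texts it anchors the rescaling on the minimum and maximum reference scores
# only, and it may warn about that.
pr_mv <- predict(ws, rescaling = "mv", newdata = myDfm[c(2, 4, 6), ],
                 interval = "confidence")
pr_mv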
# let's compare the results we got for the economic vs. the decentralization policy dimension

# Wordscores for the economic dimension (re-estimated)
ws <- textmodel_wordscores(myDfm, c(17.21, rep(NA, 1), 5.35, rep(NA, 1), 8.21, rep(NA, 1)))
pr_eco <- predict(ws, interval = "confidence")

eco <- textplot_scale1d(pr_eco)
dec <- textplot_scale1d(pr_all2)
plot_grid(eco, dec, labels = c('Economic', 'Decentralization'))

str(ws)
str(pr_all2)
str(pr_eco)

# check the correlation between the two sets of document scores
party <- ws$x@Dimnames$docs
score_dec <- pr_all2$fit
score_eco <- pr_eco$fit
scores_texts <- data.frame(party, score_dec, score_eco)
str(scores_texts)

colnames(scores_texts)[2] <- "scoreDEC"
colnames(scores_texts)[5] <- "scoreECO"
str(scores_texts)

cor(scores_texts$scoreDEC, scores_texts$scoreECO)

# Plotting the 2-D policy space
ggplot(scores_texts, aes(x = scoreECO, y = scoreDEC)) +
  geom_point() +
  geom_text(label = scores_texts$party, vjust = -1) +
  ylab(label = "Decentralization policy") +
  xlab("Economic policy") +
  theme_light()
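# Optional follow-up (a small sketch): with only six documents the correlation
# above rests on very few points. cor.test() from base R reports the same Pearson
# correlation together with a confidence interval and a p-value, which helps keep
# the estimate in perspective.
cor.test(scores_texts$scoreDEC, scores_texts$scoreECO)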