rm(list = ls(all = TRUE))

getwd()
# set here the folder where you have saved the datasets you are going to use in this class
# setwd("*****")
# in my case
setwd("C:/Users/mw/Dropbox (VOICES)/TOPIC MODEL/Lecture 2/Wordscores manifestos/UK")
getwd()

library(readtext)
library(quanteda)

#########################################################################
#########################################################################
# Creating the corpus of the UK electoral programs 1992 and 1997
#########################################################################
#########################################################################

myText <- readtext("C:/Users/mw/Dropbox (VOICES)/TOPIC MODEL/Lecture 2/Wordscores manifestos/UK/*.txt",
                   docvarsfrom = "filenames", dvsep = " ",
                   docvarnames = c("Party", "Year"))
str(myText)
testCorpus <- corpus(myText)
summary(testCorpus)

# I want to give the texts in the just-created corpus more informative names.
# There are several possibilities:

# first one
testCorpus <- corpus(myText, docid_field = "doc_id")
summary(testCorpus)
docnames(testCorpus) <- gsub(".txt", "", docnames(testCorpus), fixed = TRUE)
summary(testCorpus)

# second one
testCorpus2 <- corpus(myText)
summary(testCorpus2)
docnames(testCorpus2) <- paste(myText$Party, myText$Year, sep = " ")
summary(testCorpus2)

# Comparing the results with and without stopwords, and with and without
# stemming, is always good practice
myDfm <- dfm(testCorpus, remove = stopwords("english"), tolower = TRUE,
             stem = TRUE, remove_punct = TRUE, remove_numbers = TRUE)
topfeatures(myDfm, 20)  # 20 top words

#########################################################################
#########################################################################
# Using Wordscores: UK example with economic policy positions
#########################################################################
#########################################################################

# reference texts: the 1992 party manifestos
# reference scores: Lab: 5.35; LibDem: 8.21; Cons: 17.21

# Set the reference scores (the 1997 "virgin" texts, to be scored, get NA)
refscores <- c(17.21, NA, 5.35, NA, 8.21, NA)
refscores
ws <- textmodel_wordscores(myDfm, refscores)
summary(ws)

# alternatively
ws2 <- textmodel_wordscores(myDfm, c(17.21, NA, 5.35, NA, 8.21, NA))
summary(ws2)

# alternative way to set the reference scores
ws3 <- textmodel_wordscores(myDfm, c(17.21, rep(NA, 1), 5.35, rep(NA, 1), 8.21, rep(NA, 1)))
summary(ws3)

# predict the raw scores for the virgin (1997) texts
pr_raw <- predict(ws, newdata = myDfm[c(2, 4, 6), ])
pr_raw

# rescale using the LBG transformation
pr <- predict(ws, rescaling = "lbg", newdata = myDfm[c(2, 4, 6), ])
pr

# obtain the corresponding confidence intervals
pr2 <- predict(ws, rescaling = "lbg", newdata = myDfm[c(2, 4, 6), ],
               interval = "confidence")
pr2

# scale all texts (including the reference ones) and get the corresponding
# confidence intervals. Look at the raw score of LibDem 97 vs. LibDem 92 now,
# and contrast it with the previous situation; do the same with Cons 92 vs. Cons 97
pr_all <- predict(ws, interval = "confidence")
pr_all

# Plot the estimated word positions in the reference texts (frequency vs. word
# score); highlight selected words and print them in red
textplot_scale1d(ws)
textplot_scale1d(ws, highlighted = c("budget", "green", "millennium"),
                 highlighted_color = "red")
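# A quick way to inspect the estimated word scores directly, rather than
# through the plot. This is a minimal sketch; it assumes a coef() method is
# available for the fitted wordscores object (otherwise use ws$wordscores,
# as we do further below):
word_scores <- sort(coef(ws))
head(word_scores, 10)  # words pulling towards the low end of the scale
tail(word_scores, 10)  # words pulling towards the high end of the scale
word_scores[c("budget", "green", "millennium")]  # the words highlighted above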
# Plot the estimated document positions, grouped by the "Party" or "Year"
# variable [raw scores: this is easy!]
summary(testCorpus)
doclab <- docnames(testCorpus)
doclab

# Get the predictions
pred <- predict(ws, se.fit = TRUE)
pred
textplot_scale1d(pred, margin = "documents", doclabels = doclab)
textplot_scale1d(pred, margin = "documents", doclabels = doclab,
                 groups = docvars(testCorpus, "Party"))
textplot_scale1d(pred, margin = "documents", doclabels = doclab,
                 groups = docvars(testCorpus, "Year"))

# save the estimates - feature (word) scores
ws <- textmodel_wordscores(myDfm, refscores)
str(ws)
words <- ws$wordscores
str(words)
write.csv(words, "result_wordscores_words.csv")

# save the estimates - document scores
str(pr_all)
pr_all
pr_all$fit
ch <- as.data.frame(pr_all$fit)
ch$Party <- rownames(ch)
str(ch)
ch$se <- pr_all$se.fit
str(ch)
write.csv(ch, "parties.csv")

# Alternative way of plotting the estimated document positions [raw scores]
# order the parties in ascending order (for descending order: step <- order(-ch$fit))
step <- order(ch$fit)
ch <- ch[step, ]
str(head(ch))

# Plot the graph
title <- "Parties positions with 95% Confidence Intervals"
dotchart(ch$fit, labels = ch$Party, col = "blue",
         xlim = c(floor(min(ch$lwr)), ceiling(max(ch$upr))), main = title)
for (i in 1:nrow(ch)) {
  lines(x = c(ch$lwr[i], ch$upr[i]), y = c(i, i))
}

# Plot the estimated document positions [transformed scores - LBG]
pr2 <- predict(ws, rescaling = "lbg", newdata = myDfm[c(2, 4, 6), ],
               interval = "confidence")
pr2
str(pr2)
pr2$fit
rescaled <- as.data.frame(pr2$fit)
rescaled$Party <- rownames(rescaled)
str(rescaled)

# order the parties in ascending order
step <- order(rescaled$fit)
rescaled <- rescaled[step, ]
str(head(rescaled))

# Plot the graph
title <- "Parties positions with 95% Confidence Intervals"
dotchart(rescaled$fit, labels = rescaled$Party, col = "blue",
         xlim = c(floor(min(rescaled$lwr)), ceiling(max(rescaled$upr))), main = title)
for (i in 1:nrow(rescaled)) {
  lines(x = c(rescaled$lwr[i], rescaled$upr[i]), y = c(i, i))
}

# if you also want to add the reference scores to the previous graph:
ws <- textmodel_wordscores(myDfm, refscores)
str(ws)
fit <- ws$y
Party <- ws$x@Dimnames$docs
original_scores <- data.frame(Party, fit)
original_scores$Party <- as.character(original_scores$Party)
str(original_scores)
original_scores <- na.omit(original_scores)
str(original_scores)
str(rescaled)
original_scores$lwr <- 0
original_scores$upr <- 0
str(original_scores)
mydata <- rbind(rescaled, original_scores)
str(mydata)

# order the parties in ascending order (for descending order: step <- order(-mydata$fit))
step <- order(mydata$fit)
mydata <- mydata[step, ]
str(head(mydata))

# Plot the graph
title <- "Parties positions with 95% Confidence Intervals"
dotchart(mydata$fit, labels = mydata$Party, col = "blue",
         xlim = c(floor(min(mydata$lwr)), ceiling(max(mydata$upr))), main = title)
for (i in 1:nrow(mydata)) {
  lines(x = c(mydata$lwr[i], mydata$upr[i]), y = c(i, i))
}

#########################################################################
#########################################################################
# Using Wordscores: UK example with social policy positions
#########################################################################
#########################################################################

# reference texts: the 1992 party manifestos
# reference scores: Lab: 6.87; LibDem: 6.53; Cons: 15.34

# Run the analysis (with stemming!), focusing on the raw scores
# (a minimal sketch follows)
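# A minimal sketch of this exercise. Hedged: it assumes the same corpus, the
# same stemmed dfm (myDfm), and the same document ordering as in the economic
# policy example above; only the reference scores change:
refscores_social <- c(15.34, NA, 6.87, NA, 6.53, NA)
ws_social <- textmodel_wordscores(myDfm, refscores_social)
summary(ws_social)
# raw scores (with standard errors) for the virgin (1997) manifestos
pr_social <- predict(ws_social, newdata = myDfm[c(2, 4, 6), ], se.fit = TRUE)
pr_social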
#########################################################################
#########################################################################
# Using Wordscores: US presidential inaugural speeches after 1980
#########################################################################
#########################################################################

# apply Wordscores, estimating both the raw scores and the LBG-rescaled ones
# reference scores: Reagan 1981: +1; Obama 2009: -1; Clinton 1993: 0
# (a sketch of this exercise appears at the end of the script)

#########################################################################
#########################################################################
# Using Wordscores: UK example with economic policy positions but without stemming
#########################################################################
#########################################################################

# replicate the previous analysis to estimate the raw scores with all parties,
# and compare the results with those you get with stemming
# Note: this is just suggested homework! (see the final sketch below)
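# A minimal sketch for the US inaugural-speech exercise above. Hedged: it
# assumes the data_corpus_inaugural corpus shipped with quanteda, with
# documents named in the "1981-Reagan" style, and reuses the same dfm
# options as in the UK example:
inaugCorpus <- corpus_subset(data_corpus_inaugural, Year > 1980)
inaugDfm <- dfm(inaugCorpus, remove = stopwords("english"), tolower = TRUE,
                stem = TRUE, remove_punct = TRUE, remove_numbers = TRUE)
# reference scores: +1 (Reagan 1981), -1 (Obama 2009), 0 (Clinton 1993), NA elsewhere
inaugScores <- rep(NA, ndoc(inaugDfm))
inaugScores[docnames(inaugDfm) == "1981-Reagan"] <- 1
inaugScores[docnames(inaugDfm) == "2009-Obama"] <- -1
inaugScores[docnames(inaugDfm) == "1993-Clinton"] <- 0
ws_inaug <- textmodel_wordscores(inaugDfm, inaugScores)
predict(ws_inaug)                     # raw scores
predict(ws_inaug, rescaling = "lbg")  # LBG-rescaled scores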
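# And a minimal sketch for the no-stemming homework. Hedged: it is identical
# to the economic-policy analysis above except that stem = FALSE, and it
# reuses testCorpus, refscores, and the fitted model ws from that analysis:
myDfm_nostem <- dfm(testCorpus, remove = stopwords("english"), tolower = TRUE,
                    stem = FALSE, remove_punct = TRUE, remove_numbers = TRUE)
ws_nostem <- textmodel_wordscores(myDfm_nostem, refscores)
# compare the raw scores for all parties with those from the stemmed dfm
predict(ws_nostem, se.fit = TRUE)
predict(ws, se.fit = TRUE)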