rm(list=ls(all=TRUE))
getwd()
setwd("C:/Users/mw/Dropbox (VOICES)/TOPIC MODEL/Lecture 2/Wordscores manifestos/UK")
getwd()

library(readtext)
library(quanteda)

#########################################################################
#########################################################################
# The Irish case
#########################################################################
#########################################################################

myText <- readtext("C:/Users/mw/Dropbox (VOICES)/TOPIC MODEL/Lecture 2/Wordscores manifestos/IE/*.txt",
docvarsfrom = "filenames", dvsep = " ", docvarnames = c("Year", "Party"))
str(myText)
testCorpus <- corpus(myText)
summary(testCorpus)

# Comparing the results with and w/o stopwords, with and w/o stemming is always a good practice
myDfm <- dfm(testCorpus , remove = stopwords("english"), tolower = TRUE, stem = TRUE,
                remove_punct = TRUE, remove_numbers=TRUE)
topfeatures(myDfm , 20)  # 20 top words

# set reference scores
ws <- textmodel_wordscores(myDfm, c(4.5, 13.13, 15, 6.88, 17.63, rep(NA, 5)))
summary(ws) 

# rescale using the LBG transformation
pr <- predict(ws, rescaling = "lbg", newdata = myDfm[c(6:10), ], interval = "confidence")
pr

# scaling all texts (including the reference ones) 
pr_all <- predict(ws, newdata = myDfm)
pr_all

# Plot estimated document positions 
# Plot estimated document positions and group by "party" or "year" variable [row scores: this is easy!]

summary(testCorpus)
doclab <- docnames(testCorpus)
doclab

# Get predictions
pred <- predict(ws, se.fit = TRUE)
pred

textplot_scale1d(pred, margin = "documents",  doclabels = doclab)

textplot_scale1d(pred, margin = "documents",
                 doclabels = doclab,
                 groups = docvars(testCorpus, "Party"))

textplot_scale1d(pred, margin = "documents",
                 doclabels = doclab,
                 groups = docvars(testCorpus, "Year"))

# Plot estimated document positions [transformed scores - LBG]

pr2 <- predict(ws, rescaling = "lbg", newdata = myDfm[c(6:10), ], interval = "confidence")
pr2
str(pr2)
pr2$fit

# first way of plotting a graph
rescaled <- as.data.frame(pr2$fit)
rescaled$Party <- rownames(rescaled)
str(rescaled )

# ordering the parties in a ascending order (otherwise: step <-order(-scores_texts$score)
step <-order(rescaled$fit )
rescaled<- rescaled[step, ]
str(head(rescaled))

# Plotting the graph
title <- "Parties positions with 95% Confidence Intervals"
dotchart(rescaled$fit, labels=rescaled$Party , col="blue", xlim=c(floor(min(rescaled$lwr)), ceiling(max(rescaled$upr))), main=title )
for (i in 1:nrow(rescaled)){
    lines(x=c(rescaled$lwr[i],rescaled$upr[i]), y=c(i,i))
}

# if you want to add to the previous graph also the reference-scores:
ws <- textmodel_wordscores(myDfm)
str(ws) 
fit <- ws$y
Party <-  ws$x@Dimnames$docs

original_scores <-data.frame(Party , fit)
original_scores$Party <- as.character(original_scores$Party )
str(original_scores)
original_scores <- na.omit(original_scores)
str(original_scores)
str(rescaled)

original_scores$lwr <- 0
original_scores$upr <- 0
str(original_scores)

mydata <- rbind(rescaled, original_scores) 
str(mydata)

# ordering the parties in a ascending order (otherwise: step <-order(-scores_texts$score)
step <-order(mydata $fit )
mydata <- mydata [step, ]
str(head(mydata ))

# Plotting the graph
title <- "Parties positions with 95% Confidence Intervals"
dotchart(mydata $fit, labels=mydata $Party , col="blue", xlim=c(floor(min(mydata $lwr)), ceiling(max(mydata $upr))), main=title )
for (i in 1:nrow(mydata )){
    lines(x=c(mydata $lwr[i],mydata $upr[i]), y=c(i,i))
}

# second way of plotting a graph
str(mydata)
mydata$ci <- mydata$fit-mydata$lwr
str(mydata)
mydata$ci[mydata$lwr==0] <- 0
str(head(mydata ))

library(ggplot2)

ggplot(mydata, aes(x=reorder (Party, fit), y=fit, group=1)) +
    geom_point(aes()) +
    geom_errorbar(width=.1, aes(ymin=fit-ci, ymax=fit+ci), colour="darkred") + coord_flip() + xlab("Position") + ylab("Party") + theme_classic()

# comparing raw with transformed scores
library(cowplot)

x <- textplot_scale1d(pred, margin = "documents",  doclabels = doclab)
y <- ggplot(mydata, aes(x=reorder (Party, fit), y=fit, group=1)) +
    geom_point(aes()) +
    geom_errorbar(width=.1, aes(ymin=fit-ci, ymax=fit+ci), colour="darkred") + coord_flip() + xlab("Position") + ylab("Party") + theme_classic()

plot_grid(x , y, labels = c('Raw', 'Transformed') )

#########################################################################
#########################################################################
# Using wordscores: US Presidential Inaugural Speech after 1980
#########################################################################
#########################################################################

# apply wordscores by estimating both the row scores as well as the LBG method
# reference scores: Reagan 1981: +1; Obama 2009: -1; Clinton 1993: 0

summary(corpus_subset(data_corpus_inaugural, Year > 1980))

# create a dfm from inaugural addresses from Reagan onwards
presDfm <- dfm(corpus_subset(data_corpus_inaugural, Year > 1980), 
               remove = stopwords("english"), stem = TRUE, remove_punct = TRUE)
presDfm [, 1:5]

ws_pres <- textmodel_wordscores(presDfm, c(1, NA, NA, 0, NA, NA, NA, -1, NA, NA))
summary(ws_pres) 

################################
# scaling all texts (including the reference ones) 
################################

pr_all_pres <- predict(ws_pres, interval = "confidence" )
pr_all_pres

# Plot estimated document positions 
summary(data_corpus_inaugural)
doclab <- docnames(corpus_subset(data_corpus_inaugural, Year > 1980))
doclab

pred <- predict(ws_pres, se.fit = TRUE)
pred

textplot_scale1d(pred, margin = "documents",  doclabels = doclab)

################################
# Trump 2017: +1; Obama 2009: -1; Clinton 1993: 0
################################

ws_pres2 <- textmodel_wordscores(presDfm, c(rep(NA, 3), 0, rep(NA, 3), -1, rep(NA, 1), 1))
summary(ws_pres2) 

# scaling all texts (including the reference ones) 
pr_all_pres2 <- predict(ws_pres2, interval = "confidence" )
pr_all_pres2

pred2 <- predict(ws_pres2, se.fit = TRUE)
pred2

textplot_scale1d(pred2, margin = "documents",  doclabels = doclab)

################################
# comparison!
################################

library(cowplot)
x <-textplot_scale1d(pred, margin = "documents",  doclabels = doclab)
y <-textplot_scale1d(pred2, margin = "documents",  doclabels = doclab)
plot_grid(x , y, labels = c('Reagan1981:+1', 'Trump2017:+1') )

ch <- as.data.frame(pr_all_pres$fit)
ch$Party <- rownames(ch)
str(ch)

ch2 <- as.data.frame(pr_all_pres2$fit)
str(ch2)
ch$fit2 <- ch2$fit
str(ch)

cor(ch$fit, ch$fit2)

plot(ch$fit, ch$fit2 , main="Scatterplot Example", 
  	xlab="Reagan81=1 ", ylab="Trump2017=1", pch=19)
text(ch$fit, ch$fit2 , labels = ch$Party, pos = 4,  col = "royalblue" , cex = 0.8)
abline(lm(ch$fit2 ~ ch$fit ), col="red") # regression line (y~x) 

################################
# using the LBG method
################################

pr_all_pres_lbg <- predict(ws_pres, rescaling = "lbg", newdata = presDfm[c(2:3, 5:7, 9:10), ],  interval = "confidence")
pr_all_pres_lbg

rescaled <- as.data.frame(pr_all_pres_lbg$fit)
rescaled$Party <- rownames(rescaled)
str(rescaled )

# ordering the parties in a ascending order (otherwise: step <-order(-scores_texts$score)
step <-order(rescaled$fit )
rescaled<- rescaled[step, ]
str(head(rescaled))

# Plotting the graph
title <- "Presidents positions with 95% Confidence Intervals"
dotchart(rescaled$fit, labels=rescaled$Party , col="blue", xlim=c(floor(min(rescaled$lwr)), ceiling(max(rescaled$upr))), main=title )
for (i in 1:nrow(rescaled)){
    lines(x=c(rescaled$lwr[i],rescaled$upr[i]), y=c(i,i))
}

# if you want to add to the previous graph also the reference-scores:
ws_pres <- textmodel_wordscores(presDfm, c(1, NA, NA, 0, NA, NA, NA, -1, NA, NA))
summary(ws_pres) 
fit <- ws_pres $y
Party <-  ws_pres $x@Dimnames$docs

original_scores <-data.frame(Party , fit)
original_scores$Party <- as.character(original_scores$Party )
str(original_scores)
original_scores <- na.omit(original_scores)
str(original_scores)
str(rescaled)

original_scores$lwr <- 0
original_scores$upr <- 0
str(original_scores)

mydata <- rbind(rescaled, original_scores) 
str(mydata)

# ordering the parties in a ascending order (otherwise: step <-order(-scores_texts$score)
step <-order(mydata $fit )
mydata <- mydata [step, ]
str(head(mydata ))

# Plotting the graph
title <- "Presidents positions with 95% Confidence Intervals"
dotchart(mydata $fit, labels=mydata $Party , col="blue", xlim=c(floor(min(mydata $lwr)), ceiling(max(mydata $upr))), main=title )
for (i in 1:nrow(mydata )){
    lines(x=c(mydata $lwr[i],mydata $upr[i]), y=c(i,i))
}

################################
# Trump 2017: +1; Obama 2009: -1; Clinton 1993: 0
################################

summary(corpus_subset(data_corpus_inaugural, Year > 1980))

pr_all_pres_lbg2 <- predict(ws_pres2, rescaling = "lbg", newdata = presDfm[c(1:3, 5:7, 9), ],  interval = "confidence")
pr_all_pres_lbg2

rescaled2 <- as.data.frame(pr_all_pres_lbg2$fit)
rescaled2$Party <- rownames(rescaled2)
str(rescaled2 )

# ordering the parties in a ascending order (otherwise: step <-order(-scores_texts$score)
step2 <-order(rescaled2$fit )
rescaled2<- rescaled2[step2, ]
str(head(rescaled2))

# Plotting the graph
title <- "Presidents positions with 95% Confidence Intervals"
dotchart(rescaled2$fit, labels=rescaled2$Party , col="blue", xlim=c(floor(min(rescaled2$lwr)), ceiling(max(rescaled2$upr))), main=title )
for (i in 1:nrow(rescaled2)){
    lines(x=c(rescaled2$lwr[i],rescaled2$upr[i]), y=c(i,i))
}

################################
# comparisons!
################################

str(rescaled )
str(rescaled2 )
colnames(rescaled2 )[1] <- "fit2"
str(rescaled2 )
ch <- merge(rescaled , rescaled2 , by=c("Party")) 
edit(ch )
str(ch )

cor(ch$fit, ch$fit2)

plot(ch$fit, ch$fit2 , main="Scatterplot Example", 
  	xlab="Reagan81=1 ", ylab="Trump2017=1", pch=19)
text(ch$fit, ch$fit2 , labels = ch$Party, pos = 4,  col = "royalblue" , cex = 0.8)
abline(lm(ch$fit2 ~ ch$fit ), col="red") # regression line (y~x)