rm(list=ls(all=TRUE))
getwd()
setwd("C:/Users/luigi/Dropbox/TOPIC MODEL/")
library(readtext)
library(quanteda)
library(wordshoal)
library(ggplot2)
library(plotrix)

###########################
###########################
# WORDSHOAL
###########################
###########################

# Irish Dail speeches from 2007-2011
head(docvars(data_corpus_irish30))

tok2 <- tokens(data_corpus_irish30, remove_punct = TRUE, remove_numbers=TRUE, remove_symbols = TRUE, split_hyphens = TRUE, remove_separators = TRUE)
tok2 <- tokens_remove(tok2, stopwords("en"))
tok2 <- tokens_wordstem (tok2)
iedfm<- dfm(tok2 )
topfeatures(iedfm, 20)  # 20 top words

# 72 MPs Members...
length(unique(iedfm@docvars$memberID))
length(unique(iedfm@docvars$member.name)) 

# ...that speak over 10 debates at least once
length(unique(iedfm@docvars$debateID))

# overall the 72 Members speak 127 times - some MP speaks in more than 1 speech
length(iedfm@docvars$debateID) 

# take a look at the matrix
x <- iedfm@docvars
fix(x)

# let's run wordshoal by identifying the polarity as in a usual wordfish (here Costello to the left of Ahern)
# "groups" refers to the debates; "authors" refers to the single MP

unique(iedfm@docvars$member.name)

wordshoalfit <- textmodel_wordshoal(iedfm, dir = c(7,1),
                        groups = docvars(data_corpus_irish30, "debateID"), 
                        authors = docvars(data_corpus_irish30, "member.name"))

summary(wordshoalfit) # the overall theta for each MPs along the single overall latent dimension
# Let's see the columns we get out of the textmodel_wordshoal command
str(wordshoalfit)   
# theta refers to document positions [in this case Members position across debates];
# beta refers to debate marginal effects [the higher the absolute value of beta, the more a debate is associate with 
# the overall scale] you can use these values (and its positive/negative sign) 
# to understand which debates are most related to the latent dimension extracted
# Let's see and example:

debateID <- unique(wordshoalfit$groups)
beta <- wordshoalfit$beta
matrix <- as.data.frame(cbind(debateID , beta))
# as you can see, debate 1 and 6 are negatively correlated with the thetas, while debate 7 and 10 are positively correlated
matrix [order(matrix $beta),]

# alpha refers to group/debate fixed effects;
# psi	refers to author group-level positions [position of legislator i in debate j]
# note that "psi*beta" = speech position of legislator i on debate j (calibrated to the general scale)

# let's plot the positions of Authors along the single latent dimension (w/o and with the c.i.)

scores <- wordshoalfit$theta
se <- wordshoalfit$se.theta
ci <- se*1.96
mp <- as.character(unique(wordshoalfit$authors))

ch <-as.data.frame(cbind(mp, scores, ci))
str(ch)
ch$ scores<- as.numeric(paste(ch$ scores))
ch$ci <- as.numeric(paste(ch$ci))
str(ch)

ggplot(ch, aes(x=reorder (mp, scores), y=scores, group=1)) +
    geom_point(aes()) +
    coord_flip() + xlab("Position") + ylab("MP")

ggplot(ch, aes(x=reorder (mp, scores), y=scores, group=1)) +
    geom_point(aes()) +
    geom_errorbar(width=.1, aes(ymin=scores-ci, ymax=scores+ci), colour="darkred") + coord_flip() + xlab("Position") + ylab("MP")

# let's extract the thetas
author_positions <- summary(wordshoalfit)$estimated.author.positions
author_positions$row_names <- rownames(author_positions)
str(author_positions)

# let's merge with the metadata present in the original corpus
head(docvars(data_corpus_irish30))
fitdf <- merge(author_positions,
               docvars(data_corpus_irish30), 
               by.x = "row_names", by.y = "member.name")

str(fitdf)

# let's estimate the average position of each party along theta and let's plot it
x <- as.matrix(aggregate(theta ~ party.name, data = fitdf, 
          FUN = function(x) c(mean = mean(x), se = std.error(x))))
x2 <- as.data.frame(x)
str(x2)
x2$theta.mean<- as.numeric(x2$theta.mean)
x2$theta.se <- as.numeric(x2$theta.se)
str(x2)
x2$ci <- x2$theta.se*1.96
str(x2)

ggplot(x2, aes(x=reorder (party.name, theta.mean), y=theta.mean, group=1)) +
    geom_point(aes()) +
   coord_flip() + xlab("Position") + ylab("Party")

ggplot(x2, aes(x=reorder (party.name, theta.mean), y=theta.mean, group=1)) +
    geom_point(aes()) +
    geom_errorbar(width=.1, aes(ymin=theta.mean-ci, ymax=theta.mean+ci), colour="darkred") + coord_flip() 
+ xlab("Position") + ylab("Party")

# let's extract also the psi and let's plot it

document_id <- as.numeric(as.character(wordshoalfit$groups)) 
document_id # 127 repeated speeches
debateID <- unique(document_id ) # 10 debates
debateID
document_beta <- as.data.frame(cbind(wordshoalfit$beta,debateID))
str(document_beta)
names(document_beta)[1] <- "beta"
str(document_beta)

fitdf2 <- merge(fitdf,
               document_beta, 
               by = "debateID")

str(fitdf2)
# note that "psi*beta" = speech position of legislator i on debate j (calibrated to the general scale)
fitdf2$psi <- wordshoalfit$psi
str(fitdf2)
fitdf2$mp_debate_pos <- fitdf2$psi*fitdf2$beta # rescaling values to compare with wordfish
str(fitdf2)

fitdf2$debateIDstr <- as.character(fitdf2$debateID)
str(fitdf2)
fitdf2$MPdebate <- paste(fitdf2$row_names, fitdf2$debateIDstr, sep="")
str(fitdf2)

ggplot(fitdf2, aes(x=reorder (MPdebate , mp_debate_pos), y=mp_debate_pos, group=1)) +
    geom_point(aes()) +
    coord_flip() + xlab("Position") + ylab("MPdebate")

# too messy! so let's focus only on "Mr. Fergus O'Dowd"
newdata <- fitdf2[ which(fitdf2$row_names=="Mr. Fergus O'Dowd"), ]
str(newdata)

# we can see the change across debates of the position of "Mr. Fergus O'Dowd"
ggplot(newdata, aes(x=reorder (MPdebate , mp_debate_pos), y=mp_debate_pos, group=1)) +
    geom_point(aes()) +
    coord_flip() + xlab("Position") + ylab("MPdebate")