rm(list=ls(all=TRUE))
getwd()
setwd("C:/Users/mw/Dropbox (VOICES)/TOPIC MODEL/Lecture 2/Wordscores manifestos/UK")
getwd()

library(readtext)
library(quanteda)
library(stm)

#########################################################################
#########################################################################
# Creating the Corpus and the DfM
#########################################################################
#########################################################################

myText <- readtext("C:/Users/mw/Dropbox (VOICES)/TOPIC MODEL/Lecture 2/Wordscores manifestos/UK/*.txt", 
docvarsfrom = "filenames", dvsep = " ", docvarnames = c("Party", "Year"))
testCorpus <- corpus(myText )
summary(testCorpus)
docnames(testCorpus) <- gsub(".txt", "", docnames(testCorpus ))
summary(testCorpus)

myDfm <- dfm(testCorpus , remove = stopwords("english"), tolower = TRUE, stem = TRUE,
                remove_punct = TRUE, remove_numbers=TRUE)

#########################################################################
#########################################################################
# Using wordfish 
#########################################################################
#########################################################################

# here: LAB 92 to the left of CONS 92
wfm <- textmodel_wordfish(myDfm, dir = c(3, 1))
summary(wfm)
textplot_scale1d(wfm, margin = "documents")
str(wfm)
theta <-wfm$theta # save the thetas

#########################################################################
#########################################################################
# STM
#########################################################################
#########################################################################
summary(testCorpus)
head(docvars(testCorpus))
docvars(testCorpus, "LR") <- theta # passing the thetas to the corpus and let's call them "LR"
summary(testCorpus)
head(docvars(testCorpus))

# convert from quanteda to stm
Dfmstm2 <- convert(myDfm, to = "stm", docvars = docvars(testCorpus ))
str(Dfmstm2)

# Searching for topic number
K <-c(3, 4, 5, 10, 20) 
storage  <- searchK(Dfmstm2$documents, Dfmstm2$vocab, K,  data = Dfmstm2$meta,
N = floor(0.2 * length(Dfmstm2$documents)), prevalence = ~ LR +Year, init.type = "Spectral")

  plot(storage$results$semcoh, storage$results$exclus,
           xlab= "Semantic coherence",
      ylab= "Exclusivity",
       col= "blue", pch = 19, cex = 1, lty = "solid", lwd = 2)
 text(storage$results$semcoh, storage$results$exclus, labels=storage$results$K, cex= 1, pos=4)

# I decide to keep 5 topics
# Let's estimate the STM with LR and Year as our covariates
stmFitted  <- stm(Dfmstm2$documents, Dfmstm2$vocab, K=5, max.em.its = 500,
prevalence = ~ LR +Year, data = Dfmstm2$meta, init.type = "Spectral")

#########################################################################
# Label meaning
#########################################################################

plot(stmFitted, type = "summary", labeltype = c("frex"), n=5)
plot(stmFitted, type = "labels", labeltype = c("frex"), n=15)
# Topic 1: economy
# Topic 2: public services
# Topic 3: work
# Topic 4: democratic involvement
# Topic 5: welfare

# which are the the most likely topics across our documents?
apply(stmFitted$theta,1,which.max) 
# you can save them as a document-level variable.
docvars(myDfm , 'STMtopic') <- apply(stmFitted$theta,1,which.max) 
docvars(myDfm)

#########################################################################
# Topic correlation
#########################################################################

library(igraph)
mod.out.corr <- topicCorr(stmFitted)
plot(mod.out.corr)
mod.out.corr$cor # only negative correlation

library(corrplot)
corrplot(mod.out.corr$cor, method="pie")

#########################################################################
# Estimating topical prevalence using STM with LR and Year as our covariates
#########################################################################

prep <- estimateEffect(1:5 ~ LR + Year,  stmFitted, meta = Dfmstm2$meta, uncertainty = "Global")

# example with Topic 5 and LR
plot(prep, "LR", method = "continuous", topics = 5,
     model = stmFitted, printlegend = FALSE, xaxt = "n", xlab = "Left-Right")
seq <- seq(from = as.numeric("1970"), to = as.numeric("2017"))
axis(1, at = seq)
title("Topic 5 - Welfare")
abline(h=0, col="blue")

# the impact of Year
plot(prep, covariate = "Year", topics = c(1, 2, 3, 4, 5),
model = stmFitted, method = "difference",
cov.value1 = "97", cov.value2 = "92", xlim = c(-1, 1),
xlab = "More 1992... More 1997",
main = "Effect of Year",
labeltype = "custom",
custom.labels = c('Topic 1 (Economy)', 'Topic 2 (Public services)','Topic 3 (Work)', 'Topic 4 (Democratic involvement)', 'Topic 5 (Welfare)'))

#########################################################################
# Topical content  using STM Year as our covariate
#########################################################################

stmFitted_content <- stm(Dfmstm2$documents, Dfmstm2$vocab, K = 5, max.em.its = 75, 
prevalence = ~ LR +Year, content = ~ Year,  data = Dfmstm2$meta, init.type = "Spectral")

# example with Topic 5 
plot(stmFitted_content, type = "perspectives", topics = 5, main="Topic 5 (Welfare)")