rm(list=ls(all=TRUE))
getwd()
setwd("C:/Users/mw/Dropbox (VOICES)/TOPIC MODEL/Lecture 2/Wordscores manifestos/UK")
getwd()

library(readtext)
library(quanteda)
library(stm)

#########################################################################
#########################################################################
# Creating and Working with a Corpus
#########################################################################
#########################################################################

myText <- readtext("C:/Users/mw/Dropbox (Personale)/TOPIC MODEL/Lecture 2/Wordscores manifestos/UK/*.txt", 
docvarsfrom = "filenames", dvsep = " ", docvarnames = c("Party", "Year"))
str(myText)
testCorpus <- corpus(myText )
summary(testCorpus)
testCorpus <- corpus(myText, docid_field = "doc_id")
summary(testCorpus)
docnames(testCorpus) <- gsub(".txt", "", docnames(testCorpus ))
summary(testCorpus)

myDfm <- dfm(testCorpus , remove = stopwords("english"), tolower = TRUE, stem = TRUE,
                remove_punct = TRUE, remove_numbers=TRUE)
topfeatures(myDfm , 20)  # 20 top words

#########################################################################
#########################################################################
# Using wordfish 
#########################################################################
#########################################################################

# here: LAB 92 to the left of CONS 92
summary(testCorpus)
wfm <- textmodel_wordfish(myDfm, dir = c(3, 1))
summary(wfm)
str(wfm)
textplot_scale1d(wfm, margin = "documents")

party <- wfm$docs
theta <-wfm$theta
se.theta <-wfm$se.theta
scores_texts <-data.frame(party , theta , se.theta)
str(scores_texts) 

#########################################################################
#########################################################################
# STM
#########################################################################
#########################################################################

head(docvars(testCorpus))
docvars(testCorpus, "LR") <- theta
summary(testCorpus)
head(docvars(testCorpus))

# convert from quanteda to stm
Dfmstm2 <- convert(myDfm, to = "stm", docvars = docvars(testCorpus ))
str(Dfmstm2)
str(Dfmstm2$vocab)
str(Dfmstm2$documents)

# Searching for topic number
K <-c(3, 4, 5, 10, 20) 
storage  <- searchK(Dfmstm2$documents, Dfmstm2$vocab, K,  data = Dfmstm2$meta,
N = floor(0.2 * length(Dfmstm2$documents)), prevalence = ~ LR +Year, init.type = "Spectral")

  plot(storage$results$semcoh, storage$results$exclus,
           xlab= "Semantic coherence",
      ylab= "Exclusivity",
       col= "blue", pch = 19, cex = 1, lty = "solid", lwd = 2)
 text(storage$results$semcoh, storage$results$exclus, labels=storage$results$K, cex= 1, pos=4)

# I decide to keep 5 topics
stmFitted  <- stm(Dfmstm2$documents, Dfmstm2$vocab, K=5, max.em.its = 500,
prevalence = ~ LR +Year, data = Dfmstm2$meta, init.type = "Spectral")

labelTopics(stmFitted)

library(igraph)
mod.out.corr <- topicCorr(stmFitted)
plot(mod.out.corr)
mod.out.corr$cor

#########################################################################
# Label meaning
#########################################################################
plot(stmFitted, type = "labels", labeltype = c("frex"))
cloud(stmFitted, topic = 1) # economy
cloud(stmFitted, topic = 2) # public services
cloud(stmFitted, topic = 3) # work
cloud(stmFitted, topic = 4) # democratic involvement
cloud(stmFitted, topic = 5) # welfare

plot(stmFitted, type = "perspectives", labeltype = c("frex"), topics = c(1, 4))

#########################################################################
# Estimating topical prevalence
#########################################################################

prep <- estimateEffect(1:5 ~ LR + Year,  stmFitted, meta = Dfmstm2$meta, uncertainty = "Global")

head(docvars(testCorpus))
str(Dfmstm2)

# example with Topic 5
plot(prep, "LR", method = "continuous", topics = 5,
     model = stmFitted, printlegend = FALSE, xaxt = "n", xlab = "Left-Right")
seq <- seq(from = as.numeric("1970"), to = as.numeric("2017"))
axis(1, at = seq)
title("Topic 5 - Welfare")
abline(h=0, col="blue")

plot(prep, covariate = "Year", topics = c(1, 2, 3, 4, 5),
model = stmFitted, method = "difference",
cov.value1 = "97", cov.value2 = "92", xlim = c(-1, 1),
xlab = "More 1992... More 1997",
main = "Effect of Year",
labeltype = "custom",
custom.labels = c('Topic 1 (Economy)', 'Topic 2 (Public services)','Topic 3 (Work)', 'Topic 4 (Democratic involvement)', 'Topic 5 (Welfare)'))

#########################################################################
# Plotting covariate interactions [just a plus, not needed]
#########################################################################

stmFitted_interaction <- stm(Dfmstm2$documents, Dfmstm2$vocab, K = 5, max.em.its = 75, 
prevalence = ~ LR * Year,  data = Dfmstm2$meta, init.type = "Spectral")

prep <- estimateEffect(c(5) ~ LR * Year, 
stmFitted_interaction, meta = Dfmstm2$meta, uncertainty = "Global")

 plot(prep, covariate = "LR", model = stmFitted_interaction,
 method = "continuous", xlab = "LR", moderator = "Year",
 moderator.value = 92, linecol = "blue", ylim = c(-1, 1),
 printlegend = F)
 plot(prep, covariate = "LR", model = stmFitted_interaction,
method = "continuous", xlab = "LR", moderator = "Year",
 moderator.value = 97, linecol = "red", add = T,
 printlegend = F)
 legend(2000, -0.7, c("92", "97"),
 lwd = 2, col = c("blue", "red"))
abline(h=0, col="black")

#########################################################################
# Topical content
#########################################################################

stmFitted_content <- stm(Dfmstm2$documents, Dfmstm2$vocab, K = 5, max.em.its = 75, 
prevalence = ~ LR +Year, content = ~ Year,  data = Dfmstm2$meta, init.type = "Spectral")

plot(stmFitted_content, type = "perspectives", topics = 5, main="Topic 5 (Welfare)")