rm(list=ls(all=TRUE)) getwd() setwd("C:/Users/mw/Dropbox (VOICES)/TOPIC MODEL/Lecture 2/Wordscores manifestos/UK") getwd() library(readtext) library(quanteda) library(stm) ######################################################################### ######################################################################### # Creating the Corpus and the DfM ######################################################################### ######################################################################### myText <- readtext("C:/Users/mw/Dropbox (VOICES)/TOPIC MODEL/Lecture 2/Wordscores manifestos/UK/*.txt", docvarsfrom = "filenames", dvsep = " ", docvarnames = c("Party", "Year")) testCorpus <- corpus(myText ) summary(testCorpus) docnames(testCorpus) <- gsub(".txt", "", docnames(testCorpus )) summary(testCorpus) myDfm <- dfm(testCorpus , remove = stopwords("english"), tolower = TRUE, stem = TRUE, remove_punct = TRUE, remove_numbers=TRUE) ######################################################################### ######################################################################### # Using wordfish ######################################################################### ######################################################################### # here: LAB 92 to the left of CONS 92 wfm <- textmodel_wordfish(myDfm, dir = c(3, 1)) summary(wfm) textplot_scale1d(wfm, margin = "documents") str(wfm) theta <-wfm$theta # save the thetas ######################################################################### ######################################################################### # STM ######################################################################### ######################################################################### summary(testCorpus) head(docvars(testCorpus)) docvars(testCorpus, "LR") <- theta # passing the thetas to the corpus and let's call them "LR" summary(testCorpus) head(docvars(testCorpus)) # convert from quanteda to stm Dfmstm2 <- convert(myDfm, to = "stm", docvars = docvars(testCorpus )) str(Dfmstm2) # Searching for topic number K <-c(3, 4, 5, 10, 20) storage <- searchK(Dfmstm2$documents, Dfmstm2$vocab, K, data = Dfmstm2$meta, N = floor(0.2 * length(Dfmstm2$documents)), prevalence = ~ LR +Year, init.type = "Spectral") plot(storage$results$semcoh, storage$results$exclus, xlab= "Semantic coherence", ylab= "Exclusivity", col= "blue", pch = 19, cex = 1, lty = "solid", lwd = 2) text(storage$results$semcoh, storage$results$exclus, labels=storage$results$K, cex= 1, pos=4) # I decide to keep 5 topics # Let's estimate the STM with LR and Year as our covariates stmFitted <- stm(Dfmstm2$documents, Dfmstm2$vocab, K=5, max.em.its = 500, prevalence = ~ LR +Year, data = Dfmstm2$meta, init.type = "Spectral") ######################################################################### # Label meaning ######################################################################### plot(stmFitted, type = "summary", labeltype = c("frex"), n=5) plot(stmFitted, type = "labels", labeltype = c("frex"), n=15) # Topic 1: economy # Topic 2: public services # Topic 3: work # Topic 4: democratic involvement # Topic 5: welfare # which are the the most likely topics across our documents? apply(stmFitted$theta,1,which.max) # you can save them as a document-level variable. docvars(myDfm , 'STMtopic') <- apply(stmFitted$theta,1,which.max) docvars(myDfm) ######################################################################### # Topic correlation ######################################################################### library(igraph) mod.out.corr <- topicCorr(stmFitted) plot(mod.out.corr) mod.out.corr$cor # only negative correlation library(corrplot) corrplot(mod.out.corr$cor, method="pie") ######################################################################### # Estimating topical prevalence using STM with LR and Year as our covariates ######################################################################### prep <- estimateEffect(1:5 ~ LR + Year, stmFitted, meta = Dfmstm2$meta, uncertainty = "Global") # example with Topic 5 and LR plot(prep, "LR", method = "continuous", topics = 5, model = stmFitted, printlegend = FALSE, xaxt = "n", xlab = "Left-Right") seq <- seq(from = as.numeric("1970"), to = as.numeric("2017")) axis(1, at = seq) title("Topic 5 - Welfare") abline(h=0, col="blue") # the impact of Year plot(prep, covariate = "Year", topics = c(1, 2, 3, 4, 5), model = stmFitted, method = "difference", cov.value1 = "97", cov.value2 = "92", xlim = c(-1, 1), xlab = "More 1992... More 1997", main = "Effect of Year", labeltype = "custom", custom.labels = c('Topic 1 (Economy)', 'Topic 2 (Public services)','Topic 3 (Work)', 'Topic 4 (Democratic involvement)', 'Topic 5 (Welfare)')) ######################################################################### # Topical content using STM Year as our covariate ######################################################################### stmFitted_content <- stm(Dfmstm2$documents, Dfmstm2$vocab, K = 5, max.em.its = 75, prevalence = ~ LR +Year, content = ~ Year, data = Dfmstm2$meta, init.type = "Spectral") # example with Topic 5 plot(stmFitted_content, type = "perspectives", topics = 5, main="Topic 5 (Welfare)")