rm(list=ls(all=TRUE)) getwd() setwd("C:/Users/mw/Dropbox (VOICES)/TOPIC MODEL/Lecture 2/Wordscores manifestos/UK") getwd() library(readtext) library(quanteda) library(stm) ######################################################################### ######################################################################### # Creating and Working with a Corpus ######################################################################### ######################################################################### myText <- readtext("C:/Users/mw/Dropbox (Personale)/TOPIC MODEL/Lecture 2/Wordscores manifestos/UK/*.txt", docvarsfrom = "filenames", dvsep = " ", docvarnames = c("Party", "Year")) str(myText) testCorpus <- corpus(myText ) summary(testCorpus) testCorpus <- corpus(myText, docid_field = "doc_id") summary(testCorpus) docnames(testCorpus) <- gsub(".txt", "", docnames(testCorpus )) summary(testCorpus) myDfm <- dfm(testCorpus , remove = stopwords("english"), tolower = TRUE, stem = TRUE, remove_punct = TRUE, remove_numbers=TRUE) topfeatures(myDfm , 20) # 20 top words ######################################################################### ######################################################################### # Using wordfish ######################################################################### ######################################################################### # here: LAB 92 to the left of CONS 92 summary(testCorpus) wfm <- textmodel_wordfish(myDfm, dir = c(3, 1)) summary(wfm) str(wfm) textplot_scale1d(wfm, margin = "documents") party <- wfm$docs theta <-wfm$theta se.theta <-wfm$se.theta scores_texts <-data.frame(party , theta , se.theta) str(scores_texts) ######################################################################### ######################################################################### # STM ######################################################################### ######################################################################### head(docvars(testCorpus)) docvars(testCorpus, "LR") <- theta summary(testCorpus) head(docvars(testCorpus)) # convert from quanteda to stm Dfmstm2 <- convert(myDfm, to = "stm", docvars = docvars(testCorpus )) str(Dfmstm2) str(Dfmstm2$vocab) str(Dfmstm2$documents) # Searching for topic number K <-c(3, 4, 5, 10, 20) storage <- searchK(Dfmstm2$documents, Dfmstm2$vocab, K, data = Dfmstm2$meta, N = floor(0.2 * length(Dfmstm2$documents)), prevalence = ~ LR +Year, init.type = "Spectral") plot(storage$results$semcoh, storage$results$exclus, xlab= "Semantic coherence", ylab= "Exclusivity", col= "blue", pch = 19, cex = 1, lty = "solid", lwd = 2) text(storage$results$semcoh, storage$results$exclus, labels=storage$results$K, cex= 1, pos=4) # I decide to keep 5 topics stmFitted <- stm(Dfmstm2$documents, Dfmstm2$vocab, K=5, max.em.its = 500, prevalence = ~ LR +Year, data = Dfmstm2$meta, init.type = "Spectral") labelTopics(stmFitted) library(igraph) mod.out.corr <- topicCorr(stmFitted) plot(mod.out.corr) mod.out.corr$cor ######################################################################### # Label meaning ######################################################################### plot(stmFitted, type = "labels", labeltype = c("frex")) cloud(stmFitted, topic = 1) # economy cloud(stmFitted, topic = 2) # public services cloud(stmFitted, topic = 3) # work cloud(stmFitted, topic = 4) # democratic involvement cloud(stmFitted, topic = 5) # welfare plot(stmFitted, type = "perspectives", labeltype = c("frex"), topics = c(1, 4)) ######################################################################### # Estimating topical prevalence ######################################################################### prep <- estimateEffect(1:5 ~ LR + Year, stmFitted, meta = Dfmstm2$meta, uncertainty = "Global") head(docvars(testCorpus)) str(Dfmstm2) # example with Topic 5 plot(prep, "LR", method = "continuous", topics = 5, model = stmFitted, printlegend = FALSE, xaxt = "n", xlab = "Left-Right") seq <- seq(from = as.numeric("1970"), to = as.numeric("2017")) axis(1, at = seq) title("Topic 5 - Welfare") abline(h=0, col="blue") plot(prep, covariate = "Year", topics = c(1, 2, 3, 4, 5), model = stmFitted, method = "difference", cov.value1 = "97", cov.value2 = "92", xlim = c(-1, 1), xlab = "More 1992... More 1997", main = "Effect of Year", labeltype = "custom", custom.labels = c('Topic 1 (Economy)', 'Topic 2 (Public services)','Topic 3 (Work)', 'Topic 4 (Democratic involvement)', 'Topic 5 (Welfare)')) ######################################################################### # Plotting covariate interactions [just a plus, not needed] ######################################################################### stmFitted_interaction <- stm(Dfmstm2$documents, Dfmstm2$vocab, K = 5, max.em.its = 75, prevalence = ~ LR * Year, data = Dfmstm2$meta, init.type = "Spectral") prep <- estimateEffect(c(5) ~ LR * Year, stmFitted_interaction, meta = Dfmstm2$meta, uncertainty = "Global") plot(prep, covariate = "LR", model = stmFitted_interaction, method = "continuous", xlab = "LR", moderator = "Year", moderator.value = 92, linecol = "blue", ylim = c(-1, 1), printlegend = F) plot(prep, covariate = "LR", model = stmFitted_interaction, method = "continuous", xlab = "LR", moderator = "Year", moderator.value = 97, linecol = "red", add = T, printlegend = F) legend(2000, -0.7, c("92", "97"), lwd = 2, col = c("blue", "red")) abline(h=0, col="black") ######################################################################### # Topical content ######################################################################### stmFitted_content <- stm(Dfmstm2$documents, Dfmstm2$vocab, K = 5, max.em.its = 75, prevalence = ~ LR +Year, content = ~ Year, data = Dfmstm2$meta, init.type = "Spectral") plot(stmFitted_content, type = "perspectives", topics = 5, main="Topic 5 (Welfare)")