# ======================================================================
# Topic-model analysis of @realDonaldTrump tweets (STM part only)
# NOTE(review): rm(list = ls()) was removed (script anti-pattern; run in
# a fresh session instead). The hard-coded setwd() is kept but should be
# replaced by an RStudio project / here::here() — TODO confirm no later
# code relies on this working directory.
# ======================================================================
setwd("C:/Users/mw/Dropbox (VOICES)/TOPIC MODEL")

library(rtweet)
library(ggplot2)
library(dplyr)
library(readtext)
library(quanteda)
library(httpuv)
library(maps)
library(leaflet)
library(lattice)
library(cowplot)
library(gridExtra)
library(plyr)      # NOTE(review): loading plyr after dplyr masks dplyr verbs
library(syuzhet)
library(wordcloud)
library(stm)       # moved up here with the other library() calls

# I report here only the second part, related to the STM

# Download the 2000 most recent tweets (retweets included)
tmls <- get_timeline(c("realDonaldTrump"), n = 2000, include_rts = TRUE)

# Convert the POSIXct timestamp to a Date (the time zone can be changed,
# e.g. tz = "Hongkong"); here Greenwich Mean Time (GMT) is used
str(tmls$created_at)
tmls$date <- as.Date(tmls$created_at, "GMT")
tmls$date_number <- as.numeric(tmls$date)  # numeric version of the date
tmls$date_factor <- as.factor(tmls$date)   # factor version of the date

myCorpusTwitter <- corpus(tmls)

# Document-feature matrix: drop English stopwords plus the account's own
# name, punctuation, numbers, Twitter artefacts and URLs; lowercase and
# stem the remaining features (quanteda < 3 API: remove_twitter etc.)
myDfm <- dfm(myCorpusTwitter,
             remove = c(stopwords("english"), "realdonaldtrump", "trump"),
             remove_punct = TRUE,
             remove_numbers = TRUE,
             tolower = TRUE,
             stem = TRUE,
             remove_twitter = TRUE,
             remove_url = TRUE)
topfeatures(myDfm, 20)  # 20 top words

# Document-term matrix for the first 10 documents and first 10 features
myDfm[1:10, 1:10]

# Keep only features occurring in at least 5% of documents.
# docfreq_type = "prop" makes the 0.05 threshold an explicit proportion:
# in quanteda >= 1.4 a bare 0.05 would otherwise be treated as a count
# (i.e. "at least 0.05 documents"), silently trimming nothing.
myDfm.trim <- dfm_trim(myDfm, min_docfreq = 0.05, docfreq_type = "prop")

# Convert to the list format (documents / vocab / meta) expected by stm
Dfmstm2 <- convert(myDfm.trim, to = "stm", docvars = docvars(myCorpusTwitter))
str(Dfmstm2)

# ######################################################################
# Topic analysis
# ######################################################################
# How many K?
# I choose K = 10 (according to the analysis of tweets back to 4 November)

# Search over candidate numbers of topics and collect diagnostics
K <- 5:20  # candidate K values (redundant c() around the sequence removed)
storage <- searchK(Dfmstm2$documents, Dfmstm2$vocab, K,
                   max.em.its = 500,
                   prevalence = ~ date_number,
                   data = Dfmstm2$meta,
                   init.type = "Spectral")
str(storage)

# Exclusivity vs. semantic coherence: preferred K values sit toward the
# upper-right corner (high on both diagnostics)
plot(storage$results$semcoh, storage$results$exclus,
     xlab = "Semantic coherence", ylab = "Exclusivity",
     col = "blue", pch = 19, cex = 1, lty = "solid", lwd = 2)
text(storage$results$semcoh, storage$results$exclus,
     labels = storage$results$K, cex = 1, pos = 4)

# ######################################################################
# Running the STM with K = 10 topics, tweet date as prevalence covariate
# ######################################################################
stmFitted <- stm(Dfmstm2$documents, Dfmstm2$vocab, K = 10,
                 max.em.its = 75,
                 prevalence = ~ date_number,
                 data = Dfmstm2$meta,
                 init.type = "Spectral")
str(stmFitted)

# ######################################################################
# Interpreting the STM by plotting and inspecting results
# ######################################################################
# Summary visualization: understanding the extracted topics through
# words and example documents.
# First: which words/topic association?
labelTopics(stmFitted, n = 20)
plot(stmFitted, labeltype = c("frex"))

# ######################################################################
# Additional tools for interpretation and visualization
# ######################################################################
# Next: a topic correlation graph (topicCorr + igraph) plotting a network
# where nodes are topics and edges indicate a positive correlation.
# If there are no edges, only negative correlations exist!
library(igraph)

# Topic correlation network: nodes are topics, edges mark positive
# correlations between topic proportions
mod.out.corr <- topicCorr(stmFitted)
str(mod.out.corr)
mod.out.corr
set.seed(12345)
par(mar = c(0, 0, 0, 0))  # NOTE(review): par() is not restored afterwards
plot(mod.out.corr)

# ######################################################################
# Estimating topic prevalence over time (all 10 topics ~ date)
# ######################################################################
prep <- estimateEffect(1:10 ~ date_number, stmFitted,
                       meta = Dfmstm2$meta, uncertainty = "Global")
summary(prep)

# Helper: plot the expected prevalence trend of one topic over time.
# BUG FIX: the date axis must be drawn from the Date column (tmls$date);
# the original passed the factor version (tmls$date_factor), which
# axis.Date cannot place correctly.
plot_topic_trend <- function(topic) {
  plot(prep, "date_number", xaxt = "n", printlegend = FALSE,
       method = "continuous", topics = topic, model = stmFitted)
  title(paste("Topic", topic))
  abline(h = 0, col = "blue")
  axis.Date(side = 1, tmls$date, format = "%d/%m/%Y")
}

# First two topics as an example
plot_topic_trend(1)
plot_topic_trend(2)

# BUG FIX: get_effects() is exported by the stminsights package, which
# was never loaded above — without this the call fails with
# "could not find function".
library(stminsights)
effects <- get_effects(estimates = prep, variable = "date_number",
                       type = "continuous")
str(effects)