# ======================================================================
# Topic-model analysis of @realDonaldTrump tweets (STM part only)
# NOTE(review): rm(list = ls()) was removed (script anti-pattern; run in
# a fresh session instead). The hard-coded setwd() is kept but should be
# replaced by an RStudio project / here::here() — TODO confirm no later
# code relies on this working directory.
# ======================================================================
setwd("C:/Users/mw/Dropbox (VOICES)/TOPIC MODEL")

library(rtweet)
library(ggplot2)
library(dplyr)
library(readtext)
library(quanteda)
library(httpuv)
library(maps)
library(leaflet)
library(lattice)
library(cowplot)
library(gridExtra)
library(plyr)      # NOTE(review): loading plyr after dplyr masks dplyr verbs
library(syuzhet)
library(wordcloud)
library(stm)       # moved up here with the other library() calls

# I report here only the second part, related to the STM

# Download the 2000 most recent tweets (retweets included)
tmls <- get_timeline(c("realDonaldTrump"), n = 2000, include_rts = TRUE)

# Convert the POSIXct timestamp to a Date (the time zone can be changed,
# e.g. tz = "Hongkong"); here Greenwich Mean Time (GMT) is used
str(tmls$created_at)
tmls$date <- as.Date(tmls$created_at, "GMT")
tmls$date_number <- as.numeric(tmls$date)  # numeric version of the date
tmls$date_factor <- as.factor(tmls$date)   # factor version of the date

myCorpusTwitter <- corpus(tmls)

# Document-feature matrix: drop English stopwords plus the account's own
# name, punctuation, numbers, Twitter artefacts and URLs; lowercase and
# stem the remaining features (quanteda < 3 API: remove_twitter etc.)
myDfm <- dfm(myCorpusTwitter,
             remove = c(stopwords("english"), "realdonaldtrump", "trump"),
             remove_punct = TRUE,
             remove_numbers = TRUE,
             tolower = TRUE,
             stem = TRUE,
             remove_twitter = TRUE,
             remove_url = TRUE)
topfeatures(myDfm, 20)  # 20 top words

# Document-term matrix for the first 10 documents and first 10 features
myDfm[1:10, 1:10]

# Keep only features occurring in at least 5% of documents.
# docfreq_type = "prop" makes the 0.05 threshold an explicit proportion:
# in quanteda >= 1.4 a bare 0.05 would otherwise be treated as a count
# (i.e. "at least 0.05 documents"), silently trimming nothing.
myDfm.trim <- dfm_trim(myDfm, min_docfreq = 0.05, docfreq_type = "prop")

# Convert to the list format (documents / vocab / meta) expected by stm
Dfmstm2 <- convert(myDfm.trim, to = "stm", docvars = docvars(myCorpusTwitter))
str(Dfmstm2)

# ######################################################################
# Topic analysis
# ######################################################################
# How many K?
# I choose K = 10 (according to the analysis of tweets back to 4 November)

# Search over candidate numbers of topics and collect diagnostics
K <- 5:20  # candidate K values (redundant c() around the sequence removed)
storage <- searchK(Dfmstm2$documents, Dfmstm2$vocab, K,
                   max.em.its = 500,
                   prevalence = ~ date_number,
                   data = Dfmstm2$meta,
                   init.type = "Spectral")
str(storage)

# Exclusivity vs. semantic coherence: preferred K values sit toward the
# upper-right corner (high on both diagnostics)
plot(storage$results$semcoh, storage$results$exclus,
     xlab = "Semantic coherence", ylab = "Exclusivity",
     col = "blue", pch = 19, cex = 1, lty = "solid", lwd = 2)
text(storage$results$semcoh, storage$results$exclus,
     labels = storage$results$K, cex = 1, pos = 4)

# ######################################################################
# Running the STM with K = 10 topics, tweet date as prevalence covariate
# ######################################################################
stmFitted <- stm(Dfmstm2$documents, Dfmstm2$vocab, K = 10,
                 max.em.its = 75,
                 prevalence = ~ date_number,
                 data = Dfmstm2$meta,
                 init.type = "Spectral")
str(stmFitted)

# ######################################################################
# Interpreting the STM by plotting and inspecting results
# ######################################################################
# Summary visualization: understanding the extracted topics through
# words and example documents.
# First: which words/topic association?
labelTopics(stmFitted, n = 20)
plot(stmFitted, labeltype = c("frex"))

# ######################################################################
# Additional tools for interpretation and visualization
# ######################################################################
# Next: a topic correlation graph (topicCorr + igraph) plotting a network
# where nodes are topics and edges indicate a positive correlation.
# If there are no edges, only negative correlations exist!
library(igraph)

# Topic correlation network: nodes are topics, edges mark positive
# correlations between topic proportions
mod.out.corr <- topicCorr(stmFitted)
str(mod.out.corr)
mod.out.corr
set.seed(12345)
par(mar = c(0, 0, 0, 0))  # NOTE(review): par() is not restored afterwards
plot(mod.out.corr)

# ######################################################################
# Estimating topic prevalence over time (all 10 topics ~ date)
# ######################################################################
prep <- estimateEffect(1:10 ~ date_number, stmFitted,
                       meta = Dfmstm2$meta, uncertainty = "Global")
summary(prep)

# Helper: plot the expected prevalence trend of one topic over time.
# BUG FIX: the date axis must be drawn from the Date column (tmls$date);
# the original passed the factor version (tmls$date_factor), which
# axis.Date cannot place correctly.
plot_topic_trend <- function(topic) {
  plot(prep, "date_number", xaxt = "n", printlegend = FALSE,
       method = "continuous", topics = topic, model = stmFitted)
  title(paste("Topic", topic))
  abline(h = 0, col = "blue")
  axis.Date(side = 1, tmls$date, format = "%d/%m/%Y")
}

# First two topics as an example
plot_topic_trend(1)
plot_topic_trend(2)

# BUG FIX: get_effects() is exported by the stminsights package, which
# was never loaded above — without this the call fails with
# "could not find function".
library(stminsights)
effects <- get_effects(estimates = prep, variable = "date_number",
                       type = "continuous")
str(effects)