rm(list=ls(all=TRUE))

getwd() # with setwd() you change the current working directory of the R process
setwd("C:/Users/mw/Dropbox (VOICES)/TOPIC MODEL")
getwd()

library(quanteda)
library(readtext)
library(ggplot2)
library(stm)
library(igraph)
library(wordcloud)

################################################################
# Example with U.S. Presidential Inaugural Speeches since the 1970s
################################################################

summary(data_corpus_inaugural)

# We could add some document-level variables (what quanteda calls docvars) to this corpus
meta.pr <- read.csv("C:/Users/mw/Dropbox (VOICES)/TOPIC MODEL/List President USA party.csv", 1)
str(meta.pr)
levels(meta.pr$Party)
table(meta.pr$Party)

# Suggestion: if you want to include in a corpus a factor variable that you are going to use as a covariate
# in a STM, be sure that ALL the levels of that categorical variable will actually appear in the analysis. This is NOT what
# happens in our case: by focusing only on speeches delivered after 1970 in the example below, we will have 0 observations
# for the levels Democratic-Republican, Federalist, Unaffiliated and Whig! So let's keep just the levels Democratic and Republican!
levels(meta.pr$Party) <- list(Democratic="Dem", Republican="Rep")
levels(meta.pr$Party)
table(meta.pr$Party)

docvars(data_corpus_inaugural, "Party") <- meta.pr$Party
head(summary(data_corpus_inaugural))

myCorpus <- corpus_subset(data_corpus_inaugural, Year > 1970)
summary(myCorpus)

# Inspect the document-level variables (that's a very important option, as we will see...)
head(docvars(myCorpus))

myDfm <- dfm(myCorpus, remove = stopwords("english"), tolower = TRUE, stem = TRUE,
             remove_punct = TRUE, remove_numbers = TRUE)

# If you want, you can decide to keep only words occurring >= 10 times and in >= 2 docs.
# Some form of trimming is HIGHLY suggested when you have many documents and you want to run a STM
myDfm.trim <- dfm_trim(myDfm, min_termfreq = 10, min_docfreq = 2)

# Or just drop the words that occur either too frequently (in more than 99% of documents)
# or too infrequently (in less than 1% of documents)
myDfm.trim2 <- dfm_trim(myDfm, max_docfreq = 0.99, min_docfreq = 0.01)

# Convert the dfm from quanteda to STM: this is a crucial step!!!
# When converting the dfm, you also have to list the document variables that are present in the corpus!
# Note that in this case we keep all the words for simplicity (no trimming!)
Dfmstm2 <- convert(myDfm, to = "stm", docvars = docvars(myCorpus))
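# A minimal optional check before moving on (purely illustrative, using only the objects
# created above): comparing the dimensions of the full and trimmed dfms shows how much of
# the vocabulary each trimming rule would drop, and topfeatures() lists the most frequent
# features that survive pre-processing.
dim(myDfm)        # documents x features, untrimmed
dim(myDfm.trim)   # after the frequency-based trim
dim(myDfm.trim2)  # after the document-proportion trim
topfeatures(myDfm, 20) # the 20 most frequent features after pre-processing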
################################################################
# STM run
################################################################

# 1) We want to extract 5 topics;
# 2) we want to control for the following covariates affecting topic prevalence: Party and Year.
#    We could also have run a model with covariates affecting topical content (see below for an example).
#    While including more covariates in topic prevalence will rarely affect the speed of the model,
#    including additional levels of the content covariates (topical content) can make the model much slower to converge.
#    This is because the model then operates in the much higher-dimensional space of words in the dictionary
#    (which tend to number in the thousands) as opposed to topics. That's why I suggest you first run a model
#    without covariates for topical content;
# 3) you can also run a model without covariates. In this case you have an unstructured
#    topic model, which amounts to fitting a Correlated Topic Model (see below for an example);
# 4) it is highly suggested to employ the "Spectral" initialization based on the method of moments, which is deterministic
#    and globally consistent under reasonable conditions. This means that no matter which seed is set, the SAME results will be generated.
#    As an alternative you could employ an LDA initialization. In this case, however, the answers the estimation procedure comes up with
#    may depend on the starting values of the parameters (e.g., the distribution over words for a particular topic).
#    So in this latter case, always remember to set a seed to be able to replicate your analysis!

str(Dfmstm2)

stmFitted <- stm(Dfmstm2$documents, Dfmstm2$vocab, K = 5, max.em.its = 75,
                 prevalence = ~ Party + Year, data = Dfmstm2$meta, init.type = "Spectral")

################################################################
# Interpreting the STM by plotting and inspecting results
################################################################

# Summary visualization: understanding the extracted topics through words and example documents.
# First: which words are associated with which topic?
labelTopics(stmFitted)
plot(stmFitted, type = "summary", labeltype = c("frex"))
plot(stmFitted, type = "labels", labeltype = c("frex"))
plot(stmFitted, type = "summary") # words with the highest prob.
cloud(stmFitted, topic = 3) # it works with the words with the highest prob.
# Note that these graphs also show you the frequency distribution of topics across the 12 documents
plot(stmFitted, type = "hist", labeltype = c("frex"))

# Let's read the documents most associated with each topic
docs <- myCorpus$documents$texts
str(docs)

# Let's focus on topic 2, for example...
thought <- findThoughts(stmFitted, texts=docs, topics=2, n=3)$docs[[1]]
par(mfrow = c(1, 2), mar = c(.5, .5, 1, .5))
plotQuote(thought, width = 30, main = "Topic 2") # you cannot read it properly!

# Keep only the first 200 characters of each document and display them!
docs2 <- substr(docs, start = 1, stop = 200)
thought2 <- findThoughts(stmFitted, texts=docs2, topics=2, n=3)$docs[[1]]
par(mfrow = c(1, 2), mar = c(.5, .5, 1, .5))
plotQuote(thought2, width = 30, main = "Topic 2")

# Let's focus on topic 5
thought5 <- findThoughts(stmFitted, texts=docs2, topics=5, n=3)$docs[[1]]
par(mfrow = c(1, 2), mar = c(.5, .5, 1, .5))
plotQuote(thought5, width = 30, main = "Topic 5")

# Let's plot Topics 2 and 5 together
par(mfrow = c(1, 2), mar = c(.5, .5, 1, .5))
plotQuote(thought2, width = 30, main = "Topic 2")
plotQuote(thought5, width = 30, main = "Topic 5")

# But to whom do those speeches belong? First let's extract the thetas
theta <- as.data.frame(stmFitted$theta)
str(theta)

# Then let's attach the President-Year labels to the thetas
combine <- paste(myCorpus$documents$President, myCorpus$documents$Year, sep = " ")
combine
rownames(theta) <- combine
rownames(theta)

# Which is the most likely topic in each of our documents?
apply(theta, 1, which.max)

# You can save them as a document-level variable
docvars(myDfm, 'STMtopic') <- apply(theta, 1, which.max)
docvars(myDfm)

# Which is the document with the highest value of Topic 2?
rownames(theta)[which.max(theta$V2)]

# Which are the documents with the highest value for each topic?
rownames(theta)[apply(theta, 2, which.max)]
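# A minimal optional sketch, back on the document side: rather than just the index of the
# most likely topic per document, you may want each document's top topic together with its
# estimated proportion. This uses only base R on the theta data frame built above.
top.topic <- apply(theta, 1, which.max)
top.prop <- apply(theta, 1, max)
round(data.frame(topic = top.topic, proportion = top.prop), 2)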
# And the 2nd and 3rd largest ones?
maxn <- function(n) function(x) order(x, decreasing = TRUE)[n]
rownames(theta)[apply(theta, 2, maxn(1))]
rownames(theta)[apply(theta, 2, maxn(2))]
rownames(theta)[apply(theta, 2, maxn(3))]

# I want to read the Obama speech of 2009 to better understand the content of Topic 4
Obama <- texts(myCorpus)[10]
strwrap(Obama[[1]])

################################################################
# Additional tools for interpretation and visualization
################################################################

# Use the topic correlation graph estimated by topicCorr and the igraph package to plot a network where nodes are topics
# and edges indicate a positive correlation. If there are no edges, there are only negative correlations!
mod.out.corr <- topicCorr(stmFitted)
plot(mod.out.corr)
mod.out.corr$cor

library(corrplot)
corrplot(mod.out.corr$cor, method="circle", type="lower")
corrplot(mod.out.corr$cor, method="circle", type="upper")
corrplot(mod.out.corr$cor, method="pie")
corrplot(mod.out.corr$cor, method="color")
corrplot(mod.out.corr$cor, method="number")

#########################################
# Estimating topic prevalence using as IVs: Party and Year
#########################################

prep <- estimateEffect(1:5 ~ Party + Year, stmFitted, meta = Dfmstm2$meta, uncertainty = "Global")
summary(prep)
summary(prep, topics=5)

# Methods available for plotting topic prevalence:
# a) "continuous" estimates how topic proportions vary over the support of a continuous covariate;
# b) "pointestimate" estimates mean topic proportions for each value of a categorical covariate;
# c) "difference" estimates the mean difference in topic proportions for two different values of the covariate
#    (cov.value1 and cov.value2 must be specified)
plot(prep, "Year", method = "continuous", topics = 5, model = stmFitted,
     printlegend = FALSE, xaxt = "n", xlab = "Year")
seq <- seq(from = as.numeric("1970"), to = as.numeric("2017"))
axis(1, at = seq)
title("Topic 5")
abline(h=0, col="blue")

# Topic 4 appears to be a Democratic topic, while Topic 5 appears to be a Republican one
plot(prep, "Party", model=stmFitted, method="pointestimate")

# However, only in the Topic 4 case is there a statistically significant difference between the two parties
plot(prep, covariate = "Party", topics = c(1, 2, 3, 4, 5), model = stmFitted,
     method = "difference", cov.value1 = "Democratic", cov.value2 = "Republican",
     xlim = c(-1, 1), xlab = "More Republican... More Democratic",
     main = "Effect of Democratic vs. Republican", labeltype = "custom",
     custom.labels = c('Topic 1', 'Topic 2', 'Topic 3', 'Topic 4', 'Topic 5'))

# Plotting covariate interactions
stmFitted_interaction <- stm(Dfmstm2$documents, Dfmstm2$vocab, K = 5, max.em.its = 75,
                             prevalence = ~ Party * Year, data = Dfmstm2$meta, init.type = "Spectral")
prep2 <- estimateEffect(c(5) ~ Party * Year, stmFitted_interaction, meta = Dfmstm2$meta, uncertainty = "Global")
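# A minimal optional sketch, assuming summary.estimateEffect stores its regression tables
# in a $tables list (one entry per requested topic): you can pull out the coefficient
# matrix if you want to report the estimates in a table rather than a plot.
s.prep2 <- summary(prep2)
s.prep2$tables[[1]] # coefficients, SEs, t-values and p-values for Topic 5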
# This plots the interaction between Year and Party (Democratic versus Republican). Topic 5 prevalence is plotted as a linear
# function of Year, holding Party at either Democratic or Republican. Were other
# variables included in the model, they would be held at their sample medians.
# As you can see, the effect of Year on Topic 5 prevalence is significant, but ONLY for Republican Presidents, and only up to the early 90s
plot(prep2, covariate = "Year", model = stmFitted_interaction, method = "continuous",
     xlab = "Year", ylab = "Expected Topic 5 Proportion", moderator = "Party",
     moderator.value = "Democratic", linecol = "blue", ylim = c(-1, 1), printlegend = F)
plot(prep2, covariate = "Year", model = stmFitted_interaction, method = "continuous",
     xlab = "Year", moderator = "Party", moderator.value = "Republican",
     linecol = "red", add = T, printlegend = F)
legend(2000, -0.7, c("Democratic", "Republican"), lwd = 2, col = c("blue", "red"))
abline(h=0, col="black")

#########################################
# Estimating topical content using as IV: Party
#########################################

# Let's run a topical content analysis including Party as a covariate.
# As currently implemented, the variable used for topical content must be a single variable that defines a discrete
# partition of the dataset (each document is in one and only one group)
stmFitted_content <- stm(Dfmstm2$documents, Dfmstm2$vocab, K = 5, max.em.its = 75,
                         prevalence = ~ Party + Year, content = ~ Party,
                         data = Dfmstm2$meta, init.type = "Spectral")

# This figure shows how Democratic and Republican Presidents talk about Topic 5 differently
plot(stmFitted_content, type = "perspectives", topics = 5)

# The same plot type can also be used to show the contrast in words across two topics.
# This plot calculates the difference in the probability of a word between the two topics, normalized by the maximum
# difference in probability of any word between the two topics.
plot(stmFitted, type = "perspectives", topics = c(2, 5))
plot(stmFitted, type = "perspectives", labeltype = c("frex"), topics = c(2, 5))

#########################################
# Model selection
#########################################

# Model search across numbers of topics: STM assumes a fixed, user-specified number of topics.
# There is no right answer to the number of topics that is appropriate for a given corpus, but
# the function searchK uses a data-driven approach to selecting the number of topics.
# The function performs several automated tests to help choose the number of topics.
# For example, one can estimate a STM model for 3, 4 and 5 topics and compare the results along each of the criteria.
# But REMEMBER: nothing replaces your interpretation!!!!
K <- c(3,4,5)

# You can also raise the maximum number of iterations if you want, but that will take (much) more
# time, especially with large datasets
storage <- searchK(Dfmstm2$documents, Dfmstm2$vocab, K, max.em.its = 500,
                   prevalence = ~ Party + Year, data = Dfmstm2$meta, init.type = "Spectral")
str(storage)
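# Optional: the searchK output also comes with a built-in plot method, which should display
# the main diagnostics (held-out likelihood, residuals, semantic coherence, lower bound)
# across the candidate values of K, as a quick first look before the frontier plot built
# by hand below.
plot(storage)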
# Important: whenever the documents you want to analyze are pretty long, when you employ the searchK
# command it is always advisable to also add this argument: N = floor(0.2 * length(Dfmstm2$documents)),
# where Dfmstm2$documents are the documents that you want to analyze. See below for an example!

# Do not run in the Lab:
# storage2 <- searchK(Dfmstm2$documents, Dfmstm2$vocab, K, data = Dfmstm2$meta, max.em.its = 500,
#                     N = floor(0.2 * length(Dfmstm2$documents)), prevalence = ~ Party + Year, init.type = "Spectral")
# str(storage2)

# You should focus on those models that lie on the semantic coherence-exclusivity 'frontier', that
# is, where no model strictly dominates another in terms of semantic coherence and
# exclusivity (i.e., models with average scores towards the upper right side of the plot).
# The figure below identifies the 4-topic model as the one that best satisfies the
# 'frontier' criterion, i.e., a model with desirable properties in both dimensions
plot(storage$results$semcoh, storage$results$exclus, xlab = "Semantic coherence",
     ylab = "Exclusivity", col = "blue", pch = 19, cex = 1, lty = "solid", lwd = 2)
text(storage$results$semcoh, storage$results$exclus, labels = storage$results$K, cex = 1, pos = 4)

# When init.type = "Spectral" and K = 0, the number of topics is set using the algorithm in Lee and Mimno (2014).
# This method does not estimate the "true" number of topics and does not necessarily have any particular statistical
# properties for consistently estimating the number of topics. It can, however, provide a useful starting point.
# The added randomness from the projection means that the algorithm is not deterministic,
# unlike the standard "Spectral" initialization. Running it with a different seed can result not only in different
# results but in a different number of topics.

# Do not run in the Lab:
# total <- stm(Dfmstm2$documents, Dfmstm2$vocab, K=0, max.em.its = 500,
#              prevalence = ~ Party + Year, data = Dfmstm2$meta, init.type = "Spectral", seed = 8458159)
# labelTopics(total)

# In addition to these functions, one can also explore whether there are words that are extremely highly associated
# with a single topic via the checkBeta function. The output gives the user lists of which topics have problems,
# which words in which topics have problems, as well as a count of the total problems in topics and the total number of problem words.
# In our case, we do not have this kind of problem
check <- checkBeta(stmFitted, tolerance = 0.01)
str(check)
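# A minimal optional sketch related to the frontier discussion above: semantic coherence
# and exclusivity can also be computed directly on an already-fitted model (not only inside
# searchK) via the stm functions semanticCoherence() and exclusivity(); both return one
# value per topic. (exclusivity() only works for models without content covariates.)
semanticCoherence(stmFitted, Dfmstm2$documents)
exclusivity(stmFitted)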
#########################################
##### Graphic extensions for STM (if you like them, take a look also at library(stmCorrViz), library(LDAvis)
##### and library(stminsights), among others)
#########################################

# Better to call this library after a STM with only topic prevalence (and not topical content)
library(stmBrowser)
Dfmstm2$meta$docs <- myCorpus$documents$texts
str(Dfmstm2$meta)
stmBrowser(stmFitted, data = Dfmstm2$meta, c("Year", "Party"), text = "docs", labeltype = "frex")

# Select in the HTML file you produced "X-axis=Topic 4", "Y-axis=Year", "Radius=none", "Color=party";
# by doing that you can clearly see that Topic 4 is more popular on average among Democratic Presidents
# (exactly as found in the STM analysis above!)
# If you select "Year" on the Y-axis, however, you only see some numbers, because "Year" in the metadata is a numeric, not a date.
# Let's then create a date out of the numeric variable Year (and call it "Year2"), and use it as well!
Dfmstm2$meta$Year2 <- Dfmstm2$meta$Year
Dfmstm2$meta$Year2 <- as.character(Dfmstm2$meta$Year2)
str(Dfmstm2$meta)
paste(Dfmstm2$meta$Year2, "-01", "-01", sep="")
Dfmstm2$meta$Year2 <- as.Date(paste(Dfmstm2$meta$Year2, "-01", "-01", sep=""))
str(Dfmstm2$meta)

# Now if you select "Year2" on the Y-axis you see the year, not numbers!
stmBrowser(stmFitted, data = Dfmstm2$meta, c("Year2", "Year", "Party"), text = "docs", labeltype = "frex")

#########################################
########## Topic model (LDA, without topic correlations)
#########################################

library(topicmodels)

# quanteda does not implement its own topic models, but you can easily access LDA() from the topicmodels package
# through convert(). k = 5 specifies the number of topics to be discovered.
dtm <- convert(myDfm, to = "topicmodels")
lda <- LDA(dtm, k = 5)

# You can extract the most important terms from the model using terms()
terms(lda, 5)

# Let's compare them with the results we got via our STM model
labelTopics(stmFitted, topic=1, n=5)
labelTopics(stmFitted, topic=2, n=5)
labelTopics(stmFitted, topic=3, n=5)
labelTopics(stmFitted, topic=4, n=5)
labelTopics(stmFitted, topic=5, n=5)

# You can then obtain the most likely topics using topics() and save them as a document-level variable
topics(lda)
docvars(myDfm, 'LDAtopic') <- topics(lda)

# Let's compare the results with the ones we got via STM
docvars(myDfm)

# You can also estimate a correlated topic model from topicmodels by typing the command below.
# But do NOT run it, unless you are ready to wait for a couple of hours...
# ctm <- CTM(dtm, k = 5)
# The above command would be substantially the same as running stm without any covariates:
# stmFitted <- stm(Dfmstm2$documents, Dfmstm2$vocab, K = 5, max.em.its = 75, init.type = "Spectral")
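# A minimal optional sketch: a quick cross-tabulation of the two topic assignments saved
# above shows how often the STM and LDA solutions put the same documents together. Note
# that topic numbers are arbitrary labels in each model, so agreement shows up as one
# dominant LDA topic per STM topic, not necessarily on the diagonal.
table(STM = docvars(myDfm, 'STMtopic'), LDA = docvars(myDfm, 'LDAtopic'))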