rm(list=ls(all=TRUE))
setwd("C:/Users/luigi/Dropbox/TOPIC MODEL/FOR NEXT YEAR 2020/Newsmap")
getwd()

library(quanteda)
library(readtext)
library(ggplot2)
library(ldatuning)
library(topicmodels)
library(lubridate)
library(topicdoc)
library(cowplot)
library(stm)
library(igraph)
library(dplyr)   # needed later for sample_n()

#########################################
#########################################
################################################################
########## Let's estimate a Topic model (LDA)
################################################################
#########################################
#########################################

# This data frame contains 1,959 Guardian news articles published in 2016
myText <- read.csv("guardian.csv", stringsAsFactors=FALSE)
str(myText)
# creating a new variable "text2"; it will be useful later on
myText$text2 <- myText$text

news_corp <- corpus(myText)
head(summary(news_corp))

# After removing function words and punctuation in dfm(), we keep only the top 5% most frequent features
# (min_termfreq = 0.95) that appear in less than 10% of all documents (max_docfreq = 0.1) using dfm_trim(),
# to focus on common but distinguishing features.
news_dfm <- dfm(news_corp, remove = stopwords("english"), tolower = TRUE, stem = TRUE,
                remove_punct = TRUE, remove_numbers = TRUE)
news_dfm <- dfm_remove(news_dfm, c('*-time', '*-timeUpdated', 'GMT', 'BST'))
news_dfm <- dfm_trim(news_dfm, min_termfreq = 0.95, termfreq_type = "quantile",
                     max_docfreq = 0.1, docfreq_type = "prop")
news_dfm[1:2, 1:5]

# 20 top words
topfeatures(news_dfm, 20)

# keeping only documents with number of tokens > 0
# (after the above reduction some articles could actually be empty - 8 here)
news_dfm[ntoken(news_dfm) == 0,]
news_dfm <- news_dfm[ntoken(news_dfm) > 0,]
# among the docvars there is the text as well (text2) - it will be helpful later on!
str(news_dfm)

# quanteda does not implement its own topic models, but you can easily access LDA() from the topicmodels package
# through convert(). k = 5 specifies the number of topics to be discovered.
dtm <- convert(news_dfm, to = "topicmodels")

# let's estimate a model with k = 5
# being a probabilistic model, it is always a good idea to declare the seed!
set.seed(123)
system.time(lda <- LDA(dtm, method= "Gibbs", k = 5))
# alternative way to define the seed:
# system.time(lda <- LDA(dtm, method= "Gibbs", k = 5, control = list(seed = 123)))

# We can use get_terms to get the top n terms from the topic model (i.e., the terms with the highest beta for each topic)
# and get_topics to get the most likely topic for each document. This will help us interpret the results of the model.

# Which are the most likely terms for each topic?
terms <- get_terms(lda, 10)
# 10 terms for topic 1
terms[,1]
# 10 terms for topic 2, and so on
terms[,2]
terms[,3]
terms[,4]
terms[,5]
# or we can extract the most important terms directly from the model using terms().
terms(lda, 15)
# what meaning would you assign to each topic?

# which is the most likely topic for each document?
topics <- get_topics(lda, 1)
head(topics, 10)
# or you can obtain the most likely topics using topics() and save them as a document-level variable.
head(topics(lda))
docvars(news_dfm, 'pred_topic') <- topics(lda)
str(news_dfm)

# Let's take a closer look at some of these topics. To help us interpret the output, we can look
# at the words associated with each topic and take a random sample of documents highly associated
# with each topic.
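# Before doing that, an optional side note (a minimal sketch, not part of the original lab): besides
# get_terms()/get_topics(), topicmodels also exposes the full posterior matrices via posterior(),
# which is handy if you want to work with the raw probabilities rather than only the top terms/topics.
post <- posterior(lda)
dim(post$terms)    # K topics x features: the word-topic probabilities (the "beta")
dim(post$topics)   # documents x K topics: the document-topic probabilities (the "gamma")
# e.g., the 10 highest-probability words under topic 1 (should broadly match terms[,1])
head(sort(post$terms[1, ], decreasing = TRUE), 10)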
# Topic 1
str(news_dfm)
paste(terms[,1], collapse=", ")
sample(news_dfm@docvars$text2[topics==1], 2)

# Topic 3
paste(terms[,3], collapse=", ")
sample(news_dfm@docvars$text2[topics==3], 2)

# Topic 5
paste(terms[,5], collapse=", ")
sample(news_dfm@docvars$text2[topics==5], 3)

# Looking at the evolution of certain topics over time can be interesting.
# Let's look for example at Topic 5, which appears to be related to Isis
str(news_dfm@docvars$date)
news_dfm@docvars$month <- substr(news_dfm@docvars$date, 1, 7)  # extract year and month (first 7 characters)
table(news_dfm@docvars$month)

# frequency table of articles with Isis as their MAIN topic, per month in 2016
tab <- table(news_dfm@docvars$month[news_dfm@docvars$pred_topic==5])
plot(tab)

# But we can actually do better than this. LDA is a probabilistic model,
# which means that for each document it actually computes a distribution over topics.
# In other words, each document is considered to be about a mixture of topics, as we have already discussed!
# This information is included in the matrix gamma in the LDA object (theta in the notation we used in the slides).
# For example, news 1 is 22% about topic 1, while news 3 is 38% about topic 1
round(lda@gamma[1,], 2)
round(lda@gamma[3,], 2)
round(lda@gamma[7,], 2)

# So we can take the information in this matrix and aggregate it to compute the average probability that
# a news article published in a given month is about a particular topic. Let's once again choose Topic 5.

# add the probability to the dfm
news_dfm@docvars$topic5 <- lda@gamma[,5]
str(news_dfm)
# now aggregate at the month level
agg <- aggregate(news_dfm@docvars$topic5, by=list(month=news_dfm@docvars$month), FUN=mean)
str(agg)
agg$date <- as.Date(paste(agg$month,"-01",sep=""))
str(agg)
# and plot it
plot(agg$date, agg$x, type="l", xlab="Month", ylab="Avg. prob. of article about topic 5",
     main="Estimated proportion of articles discussing ISIS")

##################################
# identifying the optimal number of topics (here, for example, between 4 and 15)
##################################

# other alternatives you could consider are the perplexity metric or the log-likelihood
# of your topic model given different values of K

# let's focus on the topic coherence and exclusivity of our model with k=5
topic_diagnostics(lda, dtm)
topic_coherence(lda, dtm)
topic_exclusivity(lda)
mean(topic_coherence(lda, dtm))
mean(topic_exclusivity(lda))
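# Optional side sketch (not part of the original lab; do not run it in the Lab, it can take several minutes):
# the ldatuning package loaded at the top offers another data-driven way to compare different values of K,
# as an alternative to the manual coherence/exclusivity loop below. FindTopicsNumber() fits an LDA for each
# candidate K and computes several published metrics; FindTopicsNumber_plot() then plots them.
# ldatuning_res <- FindTopicsNumber(dtm, topics = seq(4, 15, by = 1),
#                                   metrics = c("Griffiths2004", "CaoJuan2009", "Arun2010", "Deveaud2014"),
#                                   method = "Gibbs", control = list(seed = 123), verbose = TRUE)
# FindTopicsNumber_plot(ldatuning_res)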
# now let's let k vary (here between 4 and 15)
top <- c(4:15)
top
# let's create an empty data frame that we will fill later on
results <- data.frame(first=vector(), second=vector(), third=vector())
results

# we add iter=100, where iter is the number of Gibbs iterations, to make LDA faster; the default value is 2000
# [so this is not an advisable strategy for a serious project!]
system.time(
  for (i in top) {
    set.seed(123)
    lda <- LDA(dtm, method= "Gibbs", k = i, control=list(verbose=50L, iter=100))
    topic <- i
    coherence <- mean(topic_coherence(lda, dtm))
    exclusivity <- mean(topic_exclusivity(lda))
    results <- rbind(results, cbind(topic, coherence, exclusivity))
  }
)

results
str(results)

# k=9 seems to be the best choice
plot(results$coherence, results$exclusivity, main="Coherence vs. Exclusivity across k",
     xlab="Semantic Coherence", ylab="Exclusivity", pch=19)
text(results$coherence, results$exclusivity, labels=results$topic, cex=1, pos=4)

#########################################
#########################################
################################################################
# Let's estimate a STM
################################################################
#########################################
#########################################

setwd("C:/Users/luigi/Dropbox/TOPIC MODEL")
getwd()

# We will now use a dataset that contains the lead paragraph of around 5,000 articles about the economy published
# in the New York Times between 1980 and 2014.
nyt <- readtext("nyt.csv", text_field = "lead_paragraph")
str(nyt)
nyt$datetime <- as.Date(nyt$datetime, "%m/%d/%Y")  # let's convert the date string into a proper date
str(nyt)

# let's create a new variable according to the party of the president in charge during the period considered
year <- as.numeric(substr(nyt$datetime, 1, 4))  # let's extract the year (i.e., the first four characters)
table(year)
repub <- ifelse(year %in% c(1981:1992, 2000:2008), 1, 0)  # 1 for Republican; 0 for Democratic
table(repub)
nyt$repub <- repub
nyt$year <- year
str(nyt)

# let's extract a random sample of 1,000 texts (to make things faster in the Lab!)
set.seed(123)
nyt2 <- sample_n(nyt, 1000)
str(nyt2)

myCorpus <- corpus(nyt2)
head(summary(myCorpus))

myDfm <- dfm(myCorpus, remove = stopwords("english"), tolower = TRUE, stem = TRUE,
             remove_punct = TRUE, remove_numbers = TRUE)

# Some form of trimming is usually suggested when you have many documents and you want to run a STM.
# For example, we can decide to keep only words occurring in at least 2 documents
myDfm.trim <- dfm_trim(myDfm, min_docfreq = 2, verbose=TRUE)
length(myDfm@Dimnames$features)       # 6553 features
length(myDfm.trim@Dimnames$features)  # 3332 features
# In our case, given the relatively low number of documents and features, we are going to proceed with our
# original dfm (i.e., myDfm)

# Convert the dfm from quanteda to STM: this is a crucial step!!!
# When converting the dfm, you also have to list the document variables that are present in the corpus -
# in our case: datetime, repub, year
DfmStm <- convert(myDfm, to = "stm", docvars = docvars(myCorpus))
head(docvars(myCorpus))
str(DfmStm)
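# Optional side sketch (a minimal example, not part of the original lab): stm ships its own trimming utility,
# prepDocuments(), which drops infrequent terms while keeping documents, vocab and meta aligned. If you prefer
# trimming on the stm side rather than with dfm_trim(), something like this should work:
# out <- prepDocuments(DfmStm$documents, DfmStm$vocab, DfmStm$meta, lower.thresh = 1)
# str(out$vocab)   # the reduced vocabulary; out$documents and out$meta stay aligned with it
# We stick with the untrimmed DfmStm below, as discussed above.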
################################################################
# STM run
################################################################

# 1) we want to extract 15 topics (i.e., K=15)
# 2) we want to control for the following covariates affecting topic prevalence: repub and year.
# We could also have run a model with covariates affecting topical content (see below for an example).
# While including more covariates for topic prevalence will rarely affect the speed of the model,
# including additional levels of the content covariates (topical content) can make the model much slower to converge.
# This is because the model then operates in the much higher-dimensional space of words in the dictionary
# (which tend to number in the thousands) rather than topics. That's why I suggest first running a model without
# covariates for topical content. REMEMBER however: the results you get when you add variables for topic prevalence
# but no variables for topical content will be (slightly or more) different from the ones you get when you add
# variables for both topic prevalence and topical content together.
# 3) it is highly suggested to employ the "Spectral" initialization based on the method of moments, which is deterministic
# and globally consistent under reasonable conditions. This means that no matter which seed is set, the SAME results will be generated.
# As an alternative you could employ an LDA initialization. In this case, however, the answers the estimation procedure comes up with
# may depend on the starting values of the parameters (e.g., the distribution over words for a particular topic).
# So in this latter case always remember to set a seed to be able to replicate your analysis (as we did with Topic Models)!

# Note that by writing "s(year)" we use a spline function for a non-linear transformation of the time variable.
# If you want to add just a linear relationship between "year" and topics, simply write "year".
# You can also relax the maximum number of required iterations if you want, but that would require (much) more
# time, especially with large datasets. Note that 75 is a very small number! We use it here just to save time;
# the default value is 500.

system.time(stmFitted <- stm(DfmStm$documents, DfmStm$vocab, K = 15, max.em.its = 75,
                             prevalence = ~ repub + s(year), data = DfmStm$meta,
                             init.type = "Spectral"))
# around 28 seconds on my laptop

##############################################################################################################
# NB: running a model with or without covariates for topic prevalence produces of course different results;
# the same is true if you run a model with covariates just for the topic prevalence part, or for both the
# topic prevalence part and the topical content part
##############################################################################################################

################################################################
# Interpreting the STM by plotting and inspecting results
################################################################

# Summary visualization: understanding the extracted topics through words and example documents

# First: which words/topic association?
labelTopics(stmFitted, n=7)  # 7 features for each topic (including FREX words!)
# The frequency/exclusivity (FREX) scoring summarizes words according to their probability of appearance under a topic
# and their exclusivity to that topic. These words provide more semantically intuitive representations of each topic
labelTopics(stmFitted, n=7, topics=1)  # the same just for topic 1

plot(stmFitted, type = "labels", labeltype = c("frex"), n=5)   # plot just frex words
plot(stmFitted, type = "summary", labeltype = c("frex"), n=5)  # topic 5 is the most frequent one
# topic meaning: according to frex words, topic 5 seems to be related to the trend in the economy; topic 10 to inflation;
# topic 9 to politics; etc.
plot(stmFitted, type = "hist", labeltype = c("frex"))  # here topic 5 appears more "evenly" distributed across documents
                                                       # than, for example, topic 11
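# Optional side sketch (not part of the original lab; assumes the wordcloud package is installed): stm's cloud()
# draws a word cloud of the highest-probability words for a given topic, another quick way to eyeball a topic.
# cloud(stmFitted, topic = 5, max.words = 50)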
# Let's read the documents most associated with each topic.
# In particular, let's identify the most representative documents for a particular topic.
# We can also use this to get a better sense of the content of actual documents with a high topical content.
docs <- nyt2$text
str(docs)

# Let's focus on topic 5 for example: let's identify the three texts with the highest theta for that topic
# and read the first 200 characters of each of them
docs2 <- substr(docs, start = 1, stop = 200)
thought5 <- findThoughts(stmFitted, texts=docs2, topics=5, n=3)$docs[[1]]
par(mfrow = c(1, 2), mar = c(.5, .5, 1, .5))
plotQuote(thought5, width = 30, main = "Topic 5")

# Let's focus on topic 9 and do the same
thought9 <- findThoughts(stmFitted, texts=docs2, topics=9, n=3)$docs[[1]]
par(mfrow = c(1, 2), mar = c(.5, .5, 1, .5))
plotQuote(thought9, width = 30, main = "Topic 9")

# Let's plot Topic 5 and 9 together
par(mfrow = c(1, 2), mar = c(.5, .5, 1, .5))
plotQuote(thought5, width = 30, main = "Topic 5 - Economy")
plotQuote(thought9, width = 30, main = "Topic 9 - Politics")

# which are the documents with the highest theta for each topic k?
apply(stmFitted$theta, 2, which.max)

# let's read the entire document with the highest theta for topic=5
strwrap(texts(myCorpus)[485])
# same one as below
docs2 <- substr(docs, start = 1, stop = 200)
thought5 <- findThoughts(stmFitted, texts=docs2, topics=5, n=1)$docs[[1]]
par(mfrow = c(1, 2), mar = c(.5, .5, 1, .5))
plotQuote(thought5, width = 30, main = "Topic 5")

# which is the document with the 2nd largest theta for each topic?
maxn <- function(n) function(x) order(x, decreasing = TRUE)[n]
apply(stmFitted$theta, 2, maxn(2))
# let's read the entire document with the second highest theta for topic=5
strwrap(texts(myCorpus)[93])

# which is the document with the 3rd largest theta for each topic?
apply(stmFitted$theta, 2, maxn(3))
# let's read the entire document with the third highest theta for topic=5
strwrap(texts(myCorpus)[67])

# which is the most likely topic for each of our documents?
apply(stmFitted$theta, 1, which.max)
table(apply(stmFitted$theta, 1, which.max))

# you can save them as a document-level variable
docvars(myDfm, 'STMtopic') <- apply(stmFitted$theta, 1, which.max)
str(myDfm)
head(docvars(myDfm))
# or you can also save them back in the original data frame
nyt2$topic <- apply(stmFitted$theta, 1, which.max)
str(nyt2)

# Topic 5 - 5 random documents associated with it
set.seed(123)
sample(nyt2$text[nyt2$topic==5], 5)

# But we already know that STM actually computes a distribution over topics.
# In other words, each document is considered to be about a mixture of topics, as we have already discussed!
# This information is included in the matrix theta in the STM object.
# For example, news 1 is 43% about topic 7 and 28% about topic 4
round(stmFitted$theta[1,], 2)

################################################################
# Checking for topic correlation
################################################################

# Uses a topic correlation graph estimated by topicCorr() and the igraph package to plot a network where nodes are topics
# and edges indicate a positive correlation. If there are no edges = only negative correlations!
mod.out.corr <- topicCorr(stmFitted)
plot(mod.out.corr)
# in our case topics 8 and 15 are positively correlated
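# Optional side sketch (a minimal example, not part of the original lab): topicCorr() returns, among other things,
# the adjacency matrix of positive correlations (posadj) and the full correlation matrix (cor). With igraph
# (loaded at the top) you can rebuild and customize the network plot yourself, e.g. labelling nodes with topic numbers:
g <- graph_from_adjacency_matrix(mod.out.corr$posadj, mode = "undirected", weighted = TRUE, diag = FALSE)
plot(g, vertex.label = 1:15, vertex.size = 20)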
plot(stmFitted, type = "summary", labeltype = c("frex"), n=5)
# i.e., when an article discusses Reagan there is a positive
# probability of also discussing the public budget
mod.out.corr$cor

#########################################
# Estimating topic prevalence using as IVs: repub and year
#########################################

set.seed(123)
prep <- estimateEffect(1:15 ~ repub + s(year), stmFitted, meta = DfmStm$meta)
set.seed(123)
summary(prep)
# for example, there is a tendency to talk less about topic 10 under a Republican presidency

# Methods used for plotting topic prevalence:
# a) "continuous" estimates how topic proportions vary over the support of a continuous covariate
# b) "pointestimate" estimates mean topic proportions for each value of a categorical covariate
# c) "difference" estimates the mean difference in topic proportions for two different values of the covariate
#    (cov.value1 and cov.value2 must be specified)

# Let's start with year (i.e., a continuous variable):
# let's plot it! As you can see, topic 10 prevalence (inflation) is fairly stable but declines towards the end of the period
plot(prep, "year", method = "continuous", topics = 10, model = stmFitted, printlegend = FALSE,
     xaxt = "n", xlab = "Year")
seq <- seq(from = as.numeric("1980"), to = as.numeric("2014"))
axis(1, at = seq)
title("Topic 10 - Inflation")
abline(h=0, col="blue")

# the opposite happens with topic 9 (politics)
plot(prep, "year", method = "continuous", topics = 9, model = stmFitted, printlegend = FALSE,
     xaxt = "n", xlab = "Year")
seq <- seq(from = as.numeric("1980"), to = as.numeric("2014"))
axis(1, at = seq)
title("Topic 9 - Politics")
abline(h=0, col="blue")

# Let's now move to repub (i.e., a categorical variable) for topic 10 (inflation) and topic 14 (stock market)
plot(stmFitted, type = "summary", labeltype = c("frex"), n=5)
# the salience of topic 14 seems to be roughly the same under presidencies of either party, while there is a tendency
# to discuss topic 10 more under a Democratic presidency
plot(prep, "repub", model=stmFitted, method="pointestimate", topics=c(10, 14))

# let's see if the difference between the two coefficients is statistically different from 0.
# Always remember that you plot the difference "cov.value1 - cov.value2". In this case (see the script below)
# it implies "coeff. of Republican - coeff. of Democratic", given that repub is a 0/1 variable and cov.value1 = "1"
# (i.e., Republican) and cov.value2 = "0" (i.e., Democratic)
plot(prep, covariate = "repub", topics = c(10, 14), model = stmFitted,
     method = "difference", cov.value1 = "1", cov.value2 = "0",
     xlim = c(-1, 1), xlab = "More Democratic... More Republican",
     main = "Effect of Democratic vs. Republican Presidency",
     labeltype = "custom", custom.labels = c('Topic 10 - Inflation', 'Topic 14 - Stock-Market'))
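# Optional side sketch (not part of the original lab; a minimal example that assumes summary.estimateEffect()
# keeps its $tables component): if you want the numbers behind these plots rather than the graphics, you can
# pull the regression table for a single topic from the summary of the estimateEffect object, e.g. for topic 10:
# eff10 <- summary(prep, topics = 10)
# eff10$tables[[1]]   # coefficients, std. errors, t and p values for repub and the year spline terms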
#########################################
# Estimating topical content using as IV: repub
#########################################

# Let's run a topical content analysis by including repub as a covariate.
# As currently implemented, the variable used for topical content must be a single variable which defines
# a discrete partition of the dataset (each document is in one and only one group - so only categorical
# variables can be employed).
# You would need around 4 minutes to run it, so let's load the RData where the model is already fitted.
# system.time(stmFitted_topical <- stm(DfmStm$documents, DfmStm$vocab, K = 15, max.em.its = 75,
#                                      prevalence = ~ repub + s(year), content = ~ repub,
#                                      data = DfmStm$meta, init.type = "Spectral"))
# around 222 seconds [4 mins]
# load("C:\\Users\\mw\\Dropbox (VOICES)\\TOPIC MODEL\\Lecture 5\\Material 2020\\STM.RData")
load("C:\\Users\\luigi\\Dropbox\\TOPIC MODEL\\Lecture 5\\Material 2020\\STM.RData")
ls()

# You can see that the results you get with and without a topical content covariate are somewhat different
table(apply(stmFitted$theta, 1, which.max))
table(apply(stmFitted_topical$theta, 1, which.max))
apply(stmFitted$theta, 2, mean)          # mean distribution of topics
apply(stmFitted_topical$theta, 2, mean)  # mean distribution of topics
# high - but not perfect - correlation
cor(apply(stmFitted$theta, 2, mean), apply(stmFitted_topical$theta, 2, mean))

# This figure shows how the NYT articles talk about topic 10 (inflation) under a Democratic or Republican presidency
plot(stmFitted_topical, type = "perspectives", topics = 10)
# This function can also be used to plot the contrast in words across two topics.
# This plot calculates the difference in probability of a word for the two topics, normalized by the maximum
# difference in probability of any word between the two topics.
plot(stmFitted_topical, type = "perspectives", topics = c(10, 14))

#########################################
# Model selection
#########################################

# Model search across numbers of topics: STM assumes a fixed, user-specified number of topics.
# There is no right answer to the number of topics that is appropriate for a given corpus, but
# the function searchK uses a data-driven approach to selecting the number of topics.
# The function will perform several automated tests to help choose the number of topics.
# For example, one could estimate a STM model for 3, 4 and 5 topics (just as an example!)
# and compare the results along each of the criteria.
# But REMEMBER: nothing replaces your interpretation!!!!

######## ALWAYS add a set.seed here
set.seed(02138)
K <- c(3,4,5)
system.time(storage <- searchK(DfmStm$documents, DfmStm$vocab, K, max.em.its = 75,
                               prevalence = ~ repub + s(year), data = DfmStm$meta,
                               init.type = "Spectral"))
# around 1 minute

# important: whenever the documents you want to analyze are pretty long, it can be advisable, when you employ
# the searchK command, to also add the argument N = floor(0.2 * length(DfmStm$documents)) -
# where DfmStm$documents are the documents that you want to analyze. See below for an example
# storage2 <- searchK(DfmStm$documents, DfmStm$vocab, K, data = DfmStm$meta, max.em.its = 500,
#                     N = floor(0.2 * length(DfmStm$documents)), prevalence = ~ repub + s(year),
#                     init.type = "Spectral")
# str(storage2)
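# Optional side sketch (a minimal example, not part of the original lab): searchK objects also come with a
# default plot method that displays the diagnostics (held-out likelihood, residuals, semantic coherence,
# lower bound) across the candidate values of K, and the raw numbers are stored in storage$results.
plot(storage)
str(storage$results)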
#####################################################
# The semantic coherence-exclusivity 'frontier'
#####################################################

# You should focus on those models that lie on the semantic coherence-exclusivity 'frontier', that
# is, where no model strictly dominates another in terms of semantic coherence and
# exclusivity (i.e., models with average scores towards the upper right side of the plot).
# The figure below identifies model 4 as the one that best satisfies the
# 'frontier' criterion, i.e., a model with desirable properties in both dimensions
plot(storage$results$semcoh, storage$results$exclus, xlab= "Semantic coherence", ylab= "Exclusivity",
     col= "blue", pch = 19, cex = 1, lty = "solid", lwd = 2)
text(storage$results$semcoh, storage$results$exclus, labels=storage$results$K, cex= 1, pos=4)

# When init.type="Spectral" and K=0, the number of topics is set using the algorithm in Lee and Mimno (2014).
# This method does not estimate the "true" number of topics and does not necessarily have any particular statistical
# properties for consistently estimating the number of topics. It can however provide a useful starting point.
# The added randomness from the projection means that the algorithm is not deterministic
# as with the standard "Spectral" initialization type. Running it with a different seed can result not only in
# different results but in a different number of topics.
# Do not run in the Lab!!!
# library(geometry)
# library(Rtsne)
# library(rsvd)
# set.seed(123)
# system.time(total <- stm(DfmStm$documents, DfmStm$vocab, K=0, max.em.its = 500,
#                          prevalence = ~ repub + s(year), data = DfmStm$meta, init.type = "Spectral"))
# labelTopics(total)

#########################################
##### graphical extensions for STM. There are several of them!
##### Take a look at library(stmCorrViz) and library(stminsights), among others
#########################################

# Let's see two of them:

# First: LDAvis
library(LDAvis)
library(servr)
toLDAvis(mod=stmFitted, docs=DfmStm$documents, reorder.topics = FALSE)
# The distance between the topics is an approximation of the semantic relationship between them.
# Topics that share common words will overlap (be closer in distance) compared to topics that do not.
# On selecting a topic (clicking on a topic bubble), the top 30 words (with the red-shaded area) are shown.
# Finally, decreasing the lambda parameter gives more weight to the ratio between a word's frequency within
# a specific topic and its overall frequency in the corpus.
# Moving the slider adjusts the ranking of terms according to how discriminatory (or "relevant") they are
# for the specific topic: try to move lambda from 1 to 0 and check the results!

# Second: stmBrowser
library(stmBrowser)
str(DfmStm)
str(nyt2)
DfmStm$meta$docs <- nyt2$text
str(DfmStm$meta)
stmBrowser(stmFitted, data= DfmStm$meta, c("repub", "year"), text="docs", labeltype = "frex")
# you get an error! Why? Because the original nyt2$text contains some characters that are not valid UTF-8.
# This creates problems when transforming the object to JSON (using the rjson package).
# Therefore: let's use iconv
nyt2$text2 <- iconv(nyt2$text, "UTF-8", "UTF-8", sub='')
str(nyt2)
DfmStm$meta$docs <- nyt2$text2
str(DfmStm$meta)
stmBrowser(stmFitted, data= DfmStm$meta, c("repub", "datetime"), text="docs", labeltype = "frex")
# as an example: select in the HTML file you produced "X-axis=Topic 10", "Y-axis=datetime", "Radius=none", "Color=repub"