rm(list=ls(all=TRUE))
getwd()
setwd("C:/Users/luigi/Dropbox/TOPIC MODEL/")
getwd()

library(keyATM)
library(quanteda)
library(magrittr)

data("data_corpus_moviereviews", package = "quanteda.textmodels")
corp <- tail(data_corpus_moviereviews, 500) + head(data_corpus_moviereviews, 500) # keeping the last 500 and the first 500 texts
ndoc(corp)

# just an example here: I could also have removed stopwords, turned everything into lower case, etc.
# However, be careful with the stemming option! Read below
tok4 <- tokens(corp, remove_numbers = TRUE)
dfmt <- dfm(tok4)

# min_nchar: the minimum length in characters for features to be kept (shorter ones are dropped)
dfmt <- dfm_remove(dfmt, stopwords('en'), min_nchar = 2)

# I keep only words at or above the 90th percentile of term frequency that appear in at most 10% of documents
# (i.e., words that are frequent overall but concentrated in few documents)
dfmt <- dfm_trim(dfmt, min_termfreq = 0.90, termfreq_type = "quantile",
                 max_docfreq = 0.1, docfreq_type = "prop")
topfeatures(dfmt, 20) # 20 top words

###########################################################################
# keyATM
###########################################################################

# Researchers are required to remove any documents that do not contain any
# terms before using the keyATM_read() function (as we did for topic models and STM).
# If there are documents that do not contain any terms, the keyATM_read()
# function raises a warning. It is highly recommended to manually check
# documents with length 0 before fitting the model; otherwise, keyATM
# will automatically drop these documents when fitting the model.

# no problems here
table(ntoken(dfmt) > 0)
# if you have problems, you can use dfm_subset like this: "dfm_subset(dfmt, ntoken(dfmt) > 0)"

# the keyATM_read function reads your data for keyATM
keyATM_docs <- keyATM_read(texts = dfmt)
summary(keyATM_docs)

# let's identify our keywords
# very IMPORTANT: contrary to when you build a dictionary from within quanteda, in this case wildcards are not recognized!
# Therefore, when you define your dictionary with keyATM, either 1) you do not use stemming before creating your dfm, and therefore
# you must include among your keywords both the singular and the plural (kid and kids);
# or 2) you use stemming, but then among the words in the dictionary you must include the stemmed feature
# (for example, "govern" for "government", etc.)
# Let's follow the first route here (given that we have not stemmed our features)
keywords <- list(people = c("family", "couple", "kid", "kids", "children", "parents", "husband", "wife"),
                 space = c("alien", "aliens", "planet", "space"),
                 monster = c("monster", "monsters", "ghost", "ghosts", "zombie", "fear"),
                 war = c("war", "soldier", "soldiers", "tanks", "military"),
                 crime = c("crime", "crimes", "murder", "murders", "killer", "killers", "police"))
keywords

# Keywords should appear a reasonable number of times (typically more than 0.1% of the corpus) in the documents.
# In our example some keywords do not satisfy this benchmark (and the topic "monster" overall appears to have problems)
key_viz <- visualize_keywords(docs = keyATM_docs, keywords = keywords)
key_viz
values_fig(key_viz)
# Proportion is defined as the number of times a keyword occurs in the corpus divided by the total length of the documents.
# This measures the frequency of the keyword in the corpus.
# The prune argument of the visualize_keywords() function is TRUE by default. It drops keywords that do not appear
# in the corpus and raises a warning if any are dropped.
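
# A quick manual check of the 0.1% benchmark (a sketch, not part of the keyATM API;
# "key_prop" is an illustrative name). This only approximates what values_fig(key_viz) reports
key_prop <- colSums(dfm_select(dfmt, unlist(keywords))) / sum(ntoken(dfmt))
sort(key_prop, decreasing = TRUE) # keywords far below 0.001 deserve a second look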
keywords_2 <- list(crime = c("crime", "murder", "killer", "trump"))
key_viz2 <- visualize_keywords(docs = keyATM_docs, keywords = keywords_2)
key_viz2
values_fig(key_viz2)

## If all keywords assigned to a topic are pruned, it raises an error
keywords_3 <- list(crime = c("trump", "biden"))
key_viz3 <- visualize_keywords(docs = keyATM_docs, keywords = keywords_3)

# Fitting the model
# We pass the output of the keyATM_read function and the keywords to the keyATM function.
# Additionally, we need to specify the number of topics without keywords (the no_keyword_topics argument)
# and the model. Since this example does not use covariates or time stamps, "base" is the appropriate model.
# To guarantee replicability, let's set the random seed in the options argument
system.time(out <- keyATM(docs = keyATM_docs,     # text input
                          no_keyword_topics = 1,  # number of topics without keywords; in this case 1, but you can add
                                                  # any number you want, or decide NOT to include any no-keyword topic at all
                          keywords = keywords,    # keywords
                          model = "base",         # select the model
                          options = list(seed = 123)))

# There are two main quantities of interest in topic models (and you know that!):
# First, the "topic-word distribution" represents the relative frequency of words for each topic, characterizing the topic content.
# The top_words() function returns a table of top words for each estimated topic. Keywords assigned to a keyword topic are suffixed with a check mark.
# Keywords from another keyword topic are labeled with the topic id of that category.
top_words(out, 10)
# what do the square brackets [ ] stand for? They enclose the Unicode check mark "\u2713"
# check here: https://cran.r-project.org/web/packages/utf8/vignettes/utf8.html
# In the table above, "children" and "husband" are keywords of the "people" topic, for example.
# There are no keywords under the "monster" topic. Why? Because those keywords were rare in the corpus to begin with:
# see key_viz. In this case, we should probably go back to the "monster" topic and improve the quality of our keywords.

# Second, the "document-topic distribution" represents the topic prevalence.
# To explore documents that are highly associated with each topic, the top_docs() function returns a table of document indexes in which a topic has a high proportion.
# The table below indicates, for example, that the 447th document in the corpus has the highest proportion of the Space topic among all documents
top_docs(out)
as.character(corp)[447]

# You may want to obtain the entire document-topic distribution and topic-word distribution.
# The output of the keyATM() function contains both quantities
out$theta                  # document-topic distribution
apply(out$theta, 2, mean)  # mean distribution of topics
out$phi                    # topic-word distribution
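
# A small sketch (illustrative, not part of keyATM's API): label each document
# with its modal topic from theta and tabulate the assignments
modal_topic <- colnames(out$theta)[apply(out$theta, 1, which.max)]
table(modal_topic)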
###########################################
###########################################
# some diagnostics
###########################################
###########################################

# keyATM provides other functions to diagnose and explore the fitted model. First, it is important to check the model fit.
# If the model is working as expected, we should observe an increasing trend for the log-likelihood and a decreasing trend for the perplexity (remember?).
# Also, the fluctuation of these values gets smaller as the number of iterations increases.
# The plot_modelfit() function visualizes the within-sample log-likelihood and perplexity
plot_modelfit(out)

# You can use the value of perplexity to compare different models (with different numbers of no-keyword topics, for example)
# to understand which one fits your corpus best

# which is the minimum value of perplexity of my model?
min(out$model_fit$Perplexity)

# Furthermore, keyATM can visualize alpha, the prior for the document-topic distribution. The values of these parameters should stabilize over time.
# The same goes for pi, the probability that each topic uses the keyword topic-word distribution (as you can see, we have a bit of a problem with the "monster" class)
plot_alpha(out)
plot_pi(out)

############################################################
# We can also estimate the average value of topic coherence and exclusivity to compare different models, as we did for topic models and STM.
# Of course we do that by estimating the average only over the keyword topics (the ones we are mainly
# interested in; also because otherwise, by definition, it becomes more likely that,
# for example, the average topic coherence decreases as you increase the number of no-keyword topics...)
############################################################

# Let's first estimate the average topic coherence
words <- top_words(out, 10)
words

# let's get rid of the Unicode check marks and other markers
topic <- c(1:length(words))
topic
for (i in topic) {
  words[, i] <- gsub("\u2713", "", words[, i])
  words[, i] <- gsub("[[:punct:]]", "", words[, i])
  words[, i] <- gsub(" ", "", words[, i])
}
words

# let's estimate the coherence for each topic.
# The coherence function below needs a dfm similar to the one we employ in the topicmodels package
library(topicmodels)
dtm <- convert(dfmt, to = "topicmodels")
library(topicdoc)

# let's create an empty data frame that we will fill later on
results <- data.frame(first = vector())
results

# note: coherence() is used inside the function topic_coherence().
# Though it is documented, it is not exported for direct use.
# If you want, you can access it as topicdoc:::coherence.
# Note the three colons (:::) used to access internal functions
for (i in topic) {
  coherence <- topicdoc:::coherence(dtm, words[, i], 1)
  results <- rbind(results, cbind(coherence))
}
results
# average level of coherence (keyword topics only)
mean(results[1:5, 1])

# Let's now estimate the average topic exclusivity (following the same procedure employed in STM)
out$phi
str(out$phi)
tt <- t(out$phi)
str(tt)
apply(tt, 2, which.max)
rownames(tt)[c(3, 1137, 983, 334, 72, 78)]
top_words(out, 2)

w <- 0.7 # weight to apply (this is the default value in STM)
M <- 10  # let's focus just on the top 10 words
tbeta <- t(out$phi)
s <- rowSums(tbeta)
mat <- tbeta / s
ex <- apply(mat, 2, rank) / nrow(mat)
fr <- apply(tbeta, 2, rank) / nrow(mat)
frex <- 1 / (w / ex + (1 - w) / fr)
index <- apply(tbeta, 2, order, decreasing = TRUE)[1:M, ]
exclus <- vector(length = ncol(tbeta))
for (i in 1:ncol(frex)) {
  exclus[i] <- sum(frex[index[, i], i])
}
exclus
mean(exclus[1:5])
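
# Sketch: put the two diagnostics side by side, one row per keyword topic
# ("diag_df" is an illustrative name, not a keyATM object; keyword topics
# come first in keyATM's topic ordering, hence the 1:5 indexing)
diag_df <- data.frame(topic = names(keywords),
                      coherence = results[1:5, 1],
                      exclusivity = exclus[1:5])
diag_df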
#########################################
### Adding Covariates
#########################################

# in the dfm there is a variable named "sentiment", either positive or negative (for a positive or negative review of the movie)
str(dfmt)
table(dfmt@docvars$sentiment)

# IMPORTANT: Since keyATM does not accept missing values, you must remove any missing values from the covariates before extracting them.
# Moreover, I strongly recommend extracting covariates AFTER preprocessing the texts (i.e., after you have created your dfm),
# because preprocessing steps will remove documents without any terms, which may otherwise create discrepancies between documents and covariates.
# Therefore, the recommended procedure is: (1) remove missing values from covariates; (2) preprocess texts (including discarding documents
# that do not contain any words); and (3) extract covariates
vars_selected <- as.data.frame(dfmt@docvars$sentiment) # let's extract the "sentiment" covariate into a data frame
names(vars_selected)[1] <- "sentiment"                 # let's rename it
str(vars_selected)

# Note: given that we also want to run an analysis on topic-word distributions (see below), we have to use the "keep" argument in the keyATM() function
# to store Z and S. Otherwise, we could have dropped the last line of the command
system.time(out <- keyATM(docs = keyATM_docs,
                          no_keyword_topics = 1,
                          keywords = keywords,
                          model = "covariates",
                          model_settings = list(covariates_data = vars_selected,
                                                covariates_formula = ~ sentiment),
                          options = list(seed = 123),
                          keep = c("Z", "S")))

# keyATM automatically standardizes non-factor covariates (i.e., each covariate is transformed to have mean zero and standard deviation one),
# while by default it does not standardize factor covariates, as happens in our case.
# This transformation does not change the substantive results, as it is a linear transformation.
# The standardize option in the model_settings argument of the keyATM() function takes one of "all", "none", or "non-factor" (default):
# "all" standardizes all covariates (except the intercept), "none" does not standardize any covariates, and "non-factor" standardizes non-factor covariates.

# You can glance at covariate information with covariates_info(), and the covariates_get() function will return the covariates
# used in the fitted output. In this case, nothing exciting...
covariates_info(out)
head(covariates_get(out))
tail(covariates_get(out))
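
# A quick consistency check (a sketch, assuming standard R treatment coding of
# the factor): the design matrix used by the model should contain an intercept
# plus a 0/1 "sentimentpos" column matching the raw docvar
cov_used <- covariates_get(out)
table(cov_used[, "sentimentpos"]) # should match table(dfmt@docvars$sentiment)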
# The covariate keyATM can characterize the relationship between covariates and document-topic distributions.
# We can obtain the marginal posterior mean of the document-topic distribution conditioned on the covariates.
# Suppose that we want to know the document-topic distributions for each type of sentiment (pos/neg).
# We use a binary variable, sentimentpos, which indicates the sentiment for this purpose.
# The by_strata_DocTopic() function can display the marginal posterior means of the document-topic distributions
# for each value of (discrete) covariates (note that it also accepts continuous variables).
# Substantially, it does the same thing that the "estimateEffect" function (do you remember?) does with STM.
# In the by_strata_DocTopic() function, we specify the variable we focus on in the by_var argument
# and label each value of the variable (in ascending order).
# In this case, sentimentpos equal to 0 indicates negative reviews and equal to 1 positive reviews;
# if you are on a Mac, avoid adding "mc.cores = 1"
strata_topic <- by_strata_DocTopic(out, by_var = "sentimentpos",
                                   labels = c("Negative", "Positive"), mc.cores = 1)

# We can visualize the results with the plot() function. The figure shows the marginal posterior means of the document-topic distributions
# and their 90% credible intervals for each value of the covariates.
# The figure indicates that crime movies are more likely to receive a negative review, in contrast to people movies
fig_doctopic <- plot(strata_topic, var_name = "sentimentpos", show_topic = c(1:6))
fig_doctopic

# we can also change the credible intervals: below we use a 99% credible interval, for example
fig_doctopic <- plot(strata_topic, var_name = "sentimentpos", show_topic = c(2), ci = .99)
fig_doctopic

# all results together:
plot(strata_topic, var_name = "sentimentpos", show_topic = c(1:6), by = "covariate")

# Using the output of keyATM, we can calculate 95% (or any other) credible intervals of the differences in the means of the document-topic distributions
str(strata_topic)
theta1 <- strata_topic$theta[[1]] # negative reviews
str(theta1)
theta2 <- strata_topic$theta[[2]] # positive reviews
str(theta2)
theta_diff <- theta2[, c(1:6)] - theta1[, c(1:6)] # positive minus negative reviews
# we can see that there is a statistically significant difference for people movies (at least with a 95% credible interval),
# that is, people movies are more likely to get a positive rather than a negative review; this is not true, for example, for crime movies
theta_diff_quantile <- apply(theta_diff, 2, quantile, c(0.025, 0.5, 0.975)) # 2.5th, 50th, and 97.5th percentiles for a 95% interval
theta_diff_quantile

###############################################
##### Topic-word distributions
###############################################

# Although the covariate keyATM does not directly model topic-word distributions (i.e., topical content),
# the model can examine how topic-word distributions change across different values of document-level covariates.
# We can pass the output with our stored values (obtained via the "keep" argument above) to the by_strata_TopicWord() function.
# The example below shows the top words associated with each topic for positive and negative reviews, using the sentiment covariate
strata_tw <- by_strata_TopicWord(out, keyATM_docs,
                                 by = as.vector(vars_selected$sentiment))
# We check the result
top_words(strata_tw, n = 10)

# with keyATM you can also run a semi-supervised dynamic topic analysis (i.e., where the relationship between a given word and a given topic
# can change over time). We do not discuss it here
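
# For reference only (not run): a minimal sketch of what the dynamic call looks like,
# following the keyATM documentation. It assumes a docvar with consecutive integer
# time indexes starting from 1 (here called "time_var", a hypothetical name)
# out_dyn <- keyATM(docs = keyATM_docs,
#                   no_keyword_topics = 1,
#                   keywords = keywords,
#                   model = "dynamic",
#                   model_settings = list(time_index = docvars(dfmt)$time_var, # hypothetical docvar
#                                         num_states = 5),
#                   options = list(seed = 123))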