rm(list = ls(all = TRUE))
getwd()
setwd("C:/Users/luigi/Dropbox/TOPIC MODEL/")
getwd()

#####################################
# newsmap
#####################################
library(newsmap)
library(quanteda)

# Step 1: let's begin with 6 texts
text_en <- c(text1 = "This is an article about Italy",
             text2 = "Milan is a beautiful Italian city",
             text3 = "I miss the sea of Spain",
             text4 = "Granada is an ancient Spanish city",
             text5 = "We liked food more in Siena than in Milan",
             text6 = "In Granada you can feel history all around you")
corp <- corpus(text_en)
tok <- tokens(corp)
feat_dfm_en <- dfm(tok)
feat_dfm_en

# Step 2: let's apply a dictionary of seed words
# let's use the list of seed words about countries that comes with newsmap - note that it includes word stems/wildcards!
data_dictionary_newsmap_en
names(data_dictionary_newsmap_en)
names(data_dictionary_newsmap_en[["EUROPE"]])
names(data_dictionary_newsmap_en[["EUROPE"]][["SOUTH"]])
print(data_dictionary_newsmap_en[["EUROPE"]][["SOUTH"]], max_nkey = 15)

label <- dfm_lookup(feat_dfm_en, dictionary = data_dictionary_newsmap_en)
label[1:6, 190:205]
# i.e., the keywords of the "IT" topic of our dictionary appear twice in the corpus, and the same holds for "ES",
# while 2 texts contain no seed words at all

# Step 3: let's train the model
model_en <- textmodel_newsmap(feat_dfm_en, label)
# now all the words of our texts, including those not among the seed words, get a score!
# take a look at the scores for Milan and Granada (not originally included in the seed dictionary!)
model_en$model

# let's predict the labels - we also predicted (correctly) both text5 and text6 (i.e., the texts that did not
# contain any seed words). How was that possible?
predict(model_en)

# Step 4: we can now apply the trained model to other new texts
text_en2 <- c(text7 = "Make India great again",
              text8 = "I love the sea!")
corp2 <- tokens(text_en2)
feat_dfm_en2 <- dfm(corp2, tolower = TRUE)
# can you explain why you get these results?
predict(model_en, newdata = feat_dfm_en2)
# we cannot classify text7 because it does not contain any of the words seen in the texts considered during
# our training stage. On the other side, text8 has been classified as "Spain" thanks to the word "sea"
model_en$model

########################################
### another example with movie reviews
########################################

# Step 1: let's begin with the texts
data("data_corpus_moviereviews", package = "quanteda.textmodels")
corp <- tail(data_corpus_moviereviews, 500)  # keeping the last 500 texts
as.character(corp)[2]
ndoc(corp)

tok <- tokens(corp, remove_numbers = TRUE)
tok <- tokens_remove(tok, stopwords("en"))
dfmt <- dfm(tok)
# min_nchar: the minimum length in characters for features (shorter ones are dropped)
dfmt <- dfm_remove(dfmt, min_nchar = 2)
dfmt <- dfm_trim(dfmt, min_termfreq = 0.90, termfreq_type = "quantile",
                 max_docfreq = 0.1, docfreq_type = "prop")
topfeatures(dfmt, 50)

# Step 2: let's apply a dictionary of seed words - remember my dictionary option
dict <- dictionary(list(people = c("family", "couple", "kid*", "child*", "parents", "husband", "wife"),
                        space = c("alien*", "planet", "space"),
                        monster = c("monster*", "ghost*", "zombie*", "fear"),
                        war = c("war", "soldier*", "tanks", "military"),
                        crime = c("crime*", "murder*", "killer*", "police")))
dict

# dfm_lookup
label <- dfm_lookup(dfmt, dictionary = dict)
label[2:3, 1:5]
# i.e., the keywords of the "crime" topic of our dictionary appear 2 times in the second review, etc.
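# A quick extra check (not part of the original script, just an illustration): how many of these
# 500 reviews contain at least one seed word from any topic, and how many seed-word hits each
# topic collects overall. Documents with zero hits will be labeled by newsmap only through the
# words they share with the labeled documents.
table(rowSums(label) > 0)
colSums(label)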
# Step 3: let's train the model
model_en <- textmodel_newsmap(dfmt, label)
# now all the words of our texts, including those not among the seed words, get a score!
# take a look at words such as "dating" and "tarantino"
str(model_en$model)
model_en$model[c("people", "space", "monster", "war", "crime"), c("dating", "tarantino")]

# let's see the top ten features associated with each of the 5 topics we defined above
for (i in 1:5) {
  print(dict[i, ])
  print(sort(model_en$model[i, ], decreasing = TRUE)[1:10])
}

# let's predict the labels
predict(model_en)
prop.table(table(predict(model_en)))

# let's plot it
newsmap_pred <- as.data.frame(prop.table(table(predict(model_en))))
str(newsmap_pred)
names(newsmap_pred)[1] <- "Topics"
str(newsmap_pred)

library(ggplot2)
ggplot(newsmap_pred, aes(x = reorder(Topics, -Freq), y = Freq)) +
  geom_bar(stat = "identity") +
  xlab("Topics in the training-set (via Newsmap)")

# let's save the predicted topic proportions
newsmap_pred <- prop.table(table(predict(model_en)))
newsmap_pred

# Step 4: let's apply the trained model to another set of texts
# (note that the last 250 reviews are a subset of the 500 used above, so they are not truly new documents)
corp2 <- tail(data_corpus_moviereviews, 250)  # keeping the last 250 texts
ndoc(corp2)
tok2 <- tokens(corp2, remove_numbers = TRUE)
tok2 <- tokens_remove(tok2, stopwords("en"))
dfmt2 <- dfm(tok2)
# min_nchar: the minimum length in characters for features (shorter ones are dropped)
dfmt2 <- dfm_remove(dfmt2, min_nchar = 2)
dfmt2 <- dfm_trim(dfmt2, min_termfreq = 0.90, termfreq_type = "quantile",
                  max_docfreq = 0.1, docfreq_type = "prop")
predict(model_en, newdata = dfmt2)
prop.table(table(predict(model_en)))
prop.table(table(predict(model_en, newdata = dfmt2)))
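# As an extra illustration (not in the original script), we can put the predicted topic shares for
# the training set and for the new set side by side in a single bar plot. The object names below
# (pred_train, pred_new, pred_both) are just my own choices.
pred_train <- as.data.frame(prop.table(table(predict(model_en))))
pred_new <- as.data.frame(prop.table(table(predict(model_en, newdata = dfmt2))))
pred_train$set <- "Training texts"
pred_new$set <- "New texts"
pred_both <- rbind(pred_train, pred_new)
names(pred_both)[1] <- "Topics"
ggplot(pred_both, aes(x = Topics, y = Freq, fill = set)) +
  geom_bar(stat = "identity", position = "dodge") +
  xlab("Predicted topics (via Newsmap)")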
#####################################
# keyATM
#####################################
library(keyATM)
library(quanteda)
library(magrittr)

data("data_corpus_moviereviews", package = "quanteda.textmodels")
corp <- tail(data_corpus_moviereviews, 500) + head(data_corpus_moviereviews, 500)  # keeping the last and the first 500 texts
ndoc(corp)

# just an example here: I could also have removed stopwords, turned everything into lower case, etc.
# however, be careful with the stemming option! Read below
tok4 <- tokens(corp, remove_numbers = TRUE)
dfmt <- dfm(tok4)
# min_nchar: the minimum length in characters for features (shorter ones are dropped)
dfmt <- dfm_remove(dfmt, stopwords("en"), min_nchar = 2)
# I keep only words at or above the 90th percentile of term frequency and occurring in at most 10% of documents
# (i.e., very frequent but document-specific words)
dfmt <- dfm_trim(dfmt, min_termfreq = 0.90, termfreq_type = "quantile",
                 max_docfreq = 0.1, docfreq_type = "prop")
topfeatures(dfmt, 20)  # 20 top words

# It is highly suggested to remove documents that do not contain any terms (as we did for topic models and STM)
# before using the keyATM_read() function
# no problems here
table(ntoken(dfmt) > 0)
# if you do have problems, you can use dfm_subset as usual: dfm_subset(dfmt, ntoken(dfmt) > 0)

# the keyATM_read() function reads your data for keyATM
keyATM_docs <- keyATM_read(texts = dfmt)
summary(keyATM_docs)

# let's identify our keywords
# very IMPORTANT: contrary to when you build a dictionary from within quanteda, wildcards are NOT recognized here!
# Therefore, when you define your keywords for keyATM, either 1) you do not use stemming before creating your dfm,
# and then you must include in your keywords both the singular and the plural (kid and kids);
# or 2) you do use stemming, but then your keywords must be the stemmed features
# (for example, "govern" for "government", etc.)

# Let's follow the first route here (given that we have not stemmed our features)
keywords <- list(people = c("family", "couple", "kid", "kids", "children", "parents", "husband", "wife"),
                 space = c("alien", "aliens", "planet", "space"),
                 monster = c("monster", "monsters", "ghost", "ghosts", "zombie", "fear"),
                 war = c("war", "soldier", "soldiers", "tanks", "military"),
                 crime = c("crime", "crimes", "murder", "murders", "killer", "killers", "police"))
keywords

# Keywords should appear a reasonable number of times (typically more than 0.1% of the corpus) in the documents.
# In our example some keywords do not satisfy this benchmark (and the "monster" topic overall appears to have problems)
key_viz <- visualize_keywords(docs = keyATM_docs, keywords = keywords)
key_viz
values_fig(key_viz)
# Proportion is defined as the number of times a keyword occurs in the corpus divided by the total length of the documents.
# This measures the frequency of the keyword in the corpus.

# The prune argument of the visualize_keywords() function is TRUE by default. It drops keywords that do not appear
# in the corpus and raises a warning if there are any.
keywords_2 <- list(crime = c("crime", "murder", "killer", "trump"))
key_viz2 <- visualize_keywords(docs = keyATM_docs, keywords = keywords_2)
key_viz2
values_fig(key_viz2)

# If all keywords assigned to a topic are pruned, it raises an error.
keywords_3 <- list(crime = c("trump", "biden"))
key_viz3 <- visualize_keywords(docs = keyATM_docs, keywords = keywords_3)

# Fitting the model
# We pass the output of the keyATM_read() function and the keywords to the keyATM() function.
# Additionally, we need to specify the number of topics without keywords (the no_keyword_topics argument)
# and the model. Since this example does not use covariates or time stamps, "base" is the appropriate model.
# To guarantee replicability, let's set the random seed in the options argument
system.time(out <- keyATM(docs = keyATM_docs,       # text input
                          no_keyword_topics = 1,    # number of topics without keywords; here 1, but you can choose any number,
                                                    # including deciding NOT to include any no-keyword topic
                          keywords = keywords,      # keywords
                          model = "base",           # select the model
                          options = list(seed = 123)))

# There are two main quantities of interest in topic models (and you know that!):
# First, the "topic-word distribution" represents the relative frequency of words for each topic, characterizing the topic content.
# The top_words() function returns a table of top words for each estimated topic. Keywords assigned to a keyword topic are
# suffixed with a check mark. Keywords from another keyword topic are labeled with the topic id of that category.
top_words(out, 10)
# what does the symbol in brackets after some words stand for? It is the check mark "\u2713"
# check here: https://cran.r-project.org/web/packages/utf8/vignettes/utf8.html
# In the table above, "children" and "husband" are keywords of the "people" topic, for example.
# There are no keywords under the "monster" topic. Why? Because they were quite rare in the corpus to begin with
# (see key_viz above). In this case, we should probably go back to the "monster" topic and improve the quality
# of our keywords (one possible way to do that is sketched right below).
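# Purely as an illustration of how one might revise the "monster" keywords and re-check their
# coverage before refitting: the added words below are my own guesses, not tested choices, and
# visualize_keywords() will simply prune (with a warning) any of them that never occur in the corpus.
keywords_alt <- keywords
keywords_alt$monster <- c("monster", "monsters", "horror", "scary", "ghost", "fear")
key_viz_alt <- visualize_keywords(docs = keyATM_docs, keywords = keywords_alt)
key_viz_alt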
# There are two main quantities of interest in topic models (and you know that!):
# Second, the "document-topic distribution" represents topic prevalence.
# To explore documents that are highly associated with each topic, the top_docs() function returns a table of document
# indexes in which a topic has a high proportion.
# The table below indicates, for example, that document 447 in the corpus has the highest proportion
# of the "space" topic among all documents.
top_docs(out)
as.character(corp)[447]

# You may want to obtain the entire document-topic distribution and topic-word distribution.
# The output of the keyATM() function contains both quantities.
out$theta                   # document-topic distribution
apply(out$theta, 2, mean)   # mean distribution of topics
out$phi                     # topic-word distribution

###########################################
###########################################
# some diagnostics
###########################################
###########################################

# keyATM provides other functions to diagnose and explore the fitted model. First, it is important to check the model fit.
# If the model is working as expected, we should observe an increasing trend for the log-likelihood and a decreasing trend
# for the perplexity (remember?). Also, the fluctuation of these values gets smaller as the number of iterations increases.
# The plot_modelfit() function visualizes the within-sample log-likelihood and perplexity
plot_modelfit(out)

# You can use the value of the perplexity to compare different models (for example with different numbers of no-keyword topics)
# to understand which one fits your corpus better (see the sketch after this diagnostics block)
# which is the minimum value of the perplexity of our current model?
min(out$model_fit$Perplexity)

# Furthermore, keyATM can visualize alpha, the prior of the document-topic distribution. The values of these parameters should
# stabilize over time. The same holds for pi, the probability that each topic uses the keyword topic-word distribution
# (as you can see, we have a bit of a problem with the "monster" class)
plot_alpha(out)
plot_pi(out)
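# As mentioned above, the perplexity can be used to compare specifications. Below is a minimal,
# purely illustrative sketch: the candidate values and object names are my own choices, and each
# refit takes as long as the original model, so this loop can be slow.
candidate_k <- c(0, 1, 3, 5)  # different numbers of no-keyword topics to try
perp <- sapply(candidate_k, function(k) {
  fit <- keyATM(docs = keyATM_docs, no_keyword_topics = k,
                keywords = keywords, model = "base",
                options = list(seed = 123))
  min(fit$model_fit$Perplexity)
})
data.frame(no_keyword_topics = candidate_k, min_perplexity = perp)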
############################################################
# We do not discuss this in class, but if you are interested, take a look!
# We can also estimate the average topic coherence and exclusivity to compare different models, as we did for topic models and STM.
# Of course, we compute the average only over the keyword topics (the ones we are mainly interested in;
# also because otherwise, by definition, the average topic coherence is more likely to decrease
# as you increase the number of no-keyword topics...)
############################################################

# Let's first estimate the average topic coherence
words <- top_words(out, 10)
words
# let's get rid of the unicode check marks
topic <- c(1:length(words))
topic
for (i in topic) {
  words[, i] <- gsub("\u2713", "", words[, i])
  words[, i] <- gsub("[[:punct:]]", "", words[, i])
  words[, i] <- gsub(" ", "", words[, i])
}
words

# let's estimate the coherence for each topic
# the coherence function below needs a dtm similar to the one we employ with the topicmodels package
library(topicmodels)
dtm <- convert(dfmt, to = "topicmodels")

# let's create an empty data frame that we will fill later on
results <- data.frame()
results

# The coherence statistic can be computed with the helper used inside topicdoc's topic_coherence() function.
# Note the ":::" used to access internal functions of a package
library(topicdoc)
for (i in topic) {
  coherence <- topicdoc:::coherence(dtm, words[, i], 1)
  results <- rbind(results, cbind(coherence))
}
results
# average level of coherence
mean(results[, 1][1:5])

# Let's now estimate the average topic exclusivity (following the same procedure employed in STM)
out$phi
str(out$phi)
tt <- t(out$phi)
str(tt)
apply(tt, 2, which.max)
rownames(tt)[c(3, 1137, 983, 334, 72, 78)]
top_words(out, 2)

w <- 0.7  # weight to apply (this is the default value in STM)
M <- 10   # let's focus just on the top 10 words
tbeta <- t(out$phi)
s <- rowSums(tbeta)
mat <- tbeta / s
ex <- apply(mat, 2, rank) / nrow(mat)
fr <- apply(tbeta, 2, rank) / nrow(mat)
frex <- 1 / (w / ex + (1 - w) / fr)
index <- apply(tbeta, 2, order, decreasing = TRUE)[1:M, ]
exclus <- vector(length = ncol(tbeta))
for (i in 1:ncol(frex)) {
  exclus[i] <- sum(frex[index[, i], i])
}
exclus
# average level of exclusivity
mean(exclus[1:5])

#########################################
### Adding Covariates
#########################################

# in the dfm there is a docvar named "sentiment", either positive or negative (for a positive or negative review of the movie)
str(dfmt)
table(dfmt@docvars$sentiment)

# IMPORTANT: since keyATM does not accept missing values, you must remove any missing values from the covariates before
# extracting them. Moreover, I also strongly recommend extracting the covariates after preprocessing the texts (i.e., after
# you have created your dfm), because the preprocessing steps may remove documents without any terms, which could result in
# discrepancies between documents and covariates.
# Therefore, the recommended procedure is: (1) remove missing values from the covariates, (2) preprocess the texts
# (including discarding documents that do not contain any words), and (3) extract the covariates
vars_selected <- as.data.frame(dfmt@docvars$sentiment)  # let's extract the "sentiment" covariate into a data frame
names(vars_selected)[1] <- "sentiment"                  # let's rename it
str(vars_selected)

# Note: given that we also want to run an analysis on topic-word distributions, we have to use the "keep" argument in the
# keyATM() function to store Z and S (see below). Otherwise, we could have dropped the last line of the command without
# affecting any of the estimates of the main model
system.time(out <- keyATM(docs = keyATM_docs,
                          no_keyword_topics = 1,
                          keywords = keywords,
                          model = "covariates",
                          model_settings = list(covariates_data = vars_selected,
                                                covariates_formula = ~ sentiment),
                          options = list(seed = 123),
                          keep = c("Z", "S")))

# keyATM automatically standardizes non-factor covariates (i.e., each covariate is rescaled to have mean zero and standard
# deviation one), while by default it does not standardize factor covariates, as happens in our case.
# The standardize option in the model_settings argument of the keyATM() function takes one of "all", "none", or "non-factor" (default):
# "all" standardizes all covariates (except the intercept), "none" does not standardize any covariate,
# and "non-factor" standardizes non-factor covariates only (the default).
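# Just to illustrate the syntax of the standardize option (the object name out_nostd is my own;
# with a single factor covariate, "none" gives the same result as the "non-factor" default,
# so this refit is optional and only shows where the argument goes):
system.time(out_nostd <- keyATM(docs = keyATM_docs,
                                no_keyword_topics = 1,
                                keywords = keywords,
                                model = "covariates",
                                model_settings = list(covariates_data = vars_selected,
                                                      covariates_formula = ~ sentiment,
                                                      standardize = "none"),
                                options = list(seed = 123)))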
# You can glance at the covariate information with the covariates_info() function, while the covariates_get() function
# returns the covariates used in the fitted output. In this case, nothing exciting... still, this step is very important
# because we learn how keyATM has named our covariate (a name that we will use below), here "sentimentpos"
# (because sentiment gets a value of 1 if positive and 0 otherwise)
covariates_info(out)
head(covariates_get(out))
tail(covariates_get(out))

# The covariate keyATM can characterize the relationship between covariates and document-topic distributions.
# We can obtain the marginal posterior mean of the document-topic distribution conditioned on the covariates
# (in our specific case the binary variable sentimentpos, which indicates the sentiment of the review).
# We employ the by_strata_DocTopic() function for this purpose.
# Substantially, it does the same thing that the estimateEffect() function (do you remember?) does with STM.
# In the by_strata_DocTopic() function, we have to specify the variable we focus on in the by_var argument
# and label each value of the variable (in ascending order).
# In this case, sentimentpos equal to 0 indicates a negative review and equal to 1 a positive review.
# IMPORTANT: if you have a Mac, avoid adding "mc.cores = 1"
strata_topic <- by_strata_DocTopic(out, by_var = "sentimentpos",
                                   labels = c("Negative", "Positive"),
                                   mc.cores = 1)

# We can visualize the results with the plot() function. The figure shows the marginal posterior means of the
# document-topic distributions and their 90% credible intervals for each value of the covariate.
# The figure indicates that crime movies are more likely to receive a negative review, contrary to people movies
fig_doctopic <- plot(strata_topic, var_name = "sentimentpos", show_topic = c(1:6))
fig_doctopic

# we can also change the credible intervals: below we use a 99% credible interval, for example
fig_doctopic <- plot(strata_topic, var_name = "sentimentpos", show_topic = c(2), ci = .99)
fig_doctopic

# Using the output of keyATM, we can calculate 95% (or any other) credible intervals for the differences in the means
# of the document-topic distributions.
str(strata_topic)
theta1 <- strata_topic$theta[[1]]  # negative reviews
str(theta1)
theta2 <- strata_topic$theta[[2]]  # positive reviews
str(theta2)
theta_diff <- theta2[, c(1:6)] - theta1[, c(1:6)]  # positive minus negative reviews
# we can see that there is a statistically significant difference for people movies (at least with a 95% credible interval),
# that is, people movies tend to get a positive rather than a negative review with higher probability;
# this is not the case, for example, for crime movies
theta_diff_quantile <- apply(theta_diff, 2, quantile, c(0.025, 0.5, 0.975))
theta_diff_quantile

###############################################
##### Topic-word distributions
###############################################

# Although the covariate keyATM does not directly model topic-word distributions (i.e., topical content), contrary to STM,
# the model can examine how topic-word distributions change across different values of document-level covariates.
# We pass the output with our stored values (obtained via the "keep" argument above) to the by_strata_TopicWord() function.
# The example below shows the top words associated with each topic for positive and negative reviews, using the sentiment covariate.
strata_tw <- by_strata_TopicWord(out, keyATM_docs,
                                 by = as.vector(vars_selected$sentiment))
# We check the result
top_words(strata_tw, n = 10)

# with keyATM you can also run a semi-supervised dynamic topic analysis (i.e., one where the relationship between a given
# word and a given topic can change over time). We do not discuss it here
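# Going back to the theta_diff object computed above, a quick visual check (purely illustrative,
# not part of the original analysis): the posterior distribution of the positive-minus-negative
# difference for the first keyword topic ("people", given the order of the keyword list).
# Values concentrated above zero are what drives the credible-interval result discussed above.
hist(theta_diff[, 1], breaks = 30,
     main = "Posterior difference for the people topic (positive - negative)",
     xlab = "Difference in topic proportion")
abline(v = 0, lty = 2)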