rm(list=ls(all=TRUE))
getwd()
setwd("C:/Users/luigi/Dropbox/TOPIC MODEL/")
getwd()
library(keyATM)
library(quanteda)
library(magrittr)
data("data_corpus_moviereviews", package = "quanteda.textmodels")
corp <- tail(data_corpus_moviereviews, 500)+head(data_corpus_moviereviews, 500) # keeping the last and the first 500 texts
texts(corp )[2]
ndoc(corp )
dfmt <- dfm(corp, remove_number = TRUE)
# min_char: I specify the minimum length in characters for tokens to be removed
dfmt <- dfm_remove(dfmt, stopwords('en'), min_nchar = 2)
# I keep only words that occurr in the top 90% of the distribution and in less than 10% of documents
# (i.e., very frequent but document-specific words)
dfmt <- dfm_trim(dfmt, min_termfreq = 0.90, termfreq_type = "quantile",
max_docfreq = 0.1, docfreq_type = "prop")
###########################################################################
# keyATM
###########################################################################
# Researchers are required to remove any documents that do not contain any
# terms before using the keyATM_read() function.
# If there are documents that do not contain any terms, the keyATM_read()
# function raises a warning. It is highly recommended to manually check
# documents with length 0 before fitting the model, otherwise, the keyATM
# will automatically drop these documents when fitting a model.
# no problems here
table(ntoken(dfmt) > 0)
# if you have problems, you can use the dfm_subset such as this: "dfm_subset(dfmt, ntoken(dfmt) >0)"
# keyATM_read function reads your data for keyATM.
keyATM_docs <- keyATM_read(texts = dfmt)
summary(keyATM_docs)
# let's identify our keywords
keywords <- list(people = c("teen", "kid", "children", "husband"),
space = c("alien", "aliens", "planet", "space"),
monster = c("monster", "ghosts", "fear"),
war = c("war", "soldier", "military"),
crime = c("crime", "murder", "killer"))
# Keywords should appear a reasonable number of times (typically more than 0.1% of the corpus) in the documents.
# In our example some keywords do not satisfy such benchmark (and the topic "monster" overall apepars to have problems)
key_viz <- visualize_keywords(docs = keyATM_docs, keywords = keywords)
key_viz
values_fig(key_viz)
# Proportion is defined as a number of times a keyword occurs in the corpus divided by the total length of documents.
# This measures the frequency of the keyword in the corpus.
# The prune argument of the visualize_keywords() function is TRUE by default. It drops keywords that do not appear
# in the corpus and raises a warning if there is any.
keywords_2 <- list( crime = c("crime", "murder", "killer", "trump"))
key_viz2 <- visualize_keywords(docs = keyATM_docs, keywords = keywords_2)
key_viz2
values_fig(key_viz2)
## If all keywords assigned to a topic is pruned, it raises an error.
keywords_3 <- list(crime = c("trump", "biden"))
key_viz3 <- visualize_keywords(docs = keyATM_docs, keywords = keywords_3)
# Fitting the model
# We pass the output of the keyATM_read function and keywords to the keyATM function.
# Additionally, we need to specify the number of topics without keywords (the no_keyword_topics argument)
# and model. Since this example does not use covariates or time stamps, base is the appropriate model.
# To guarantee the replicability, let's set the random seed in the option argument
system.time(out <- keyATM(docs = keyATM_docs, # text input
no_keyword_topics = 1, # number of topics without keywords; in this case: 1; but you can add any number you want,
# including decide to NOT include any no-keyword topic
keywords = keywords, # keywords
model = "base", # select the model
options = list(seed = 123)))
# There are two main quantities of interest in topic models (and you know that!):
# First, "topic-word distribution" represents the relative frequency of words for each topics, characterizing the topic content
# The top_words() function returns a table of top words for each of estimated topics. Keywords assigned to a keyword topic are suffixed with a check mark.
# Keywords from another keyword topic are labeled with the topic id of that category.
top_words(out)
# what? [__] stands for?
"\u2713"
# check here? https://cran.r-project.org/web/packages/utf8/vignettes/utf8.html
# In the table above, "children" and "husband" are keywords of the "people" topic for example
# There are no keywords under the "monster" topic. Why? Cause they were not originally present that much already since the beginning:
# see: key_viz. In this case, we should probably go back to the "monster" topic and improve the qualities of our keywords
# The no-keyword topic probably is a topic discussing about Holidays movies?
# There are two main quantities of interest in topic models (and you know that!):
# Second, "document-topic distribution" represents the topic prevalence.
# To explore documents that are highly associated with each topic, the top_docs() function returns a table of document indexes in which a topic has high proportion.
# The table below indicates, for example, that the 447 document in the corpus has the highest proportion of the Space topic among all other documents.
top_docs(out)
texts(corp )[447]
# You may want to obtain the entire document-topic distribution and topic-word distribution
# The output of the keyATM() function contains both quantities.
out$theta # Document-topic distribution
apply(out$theta,2,mean) # mean distribution of topics
out$phi # Topic-word distribution
# The keyATM provides other functions to diagnose and explore the fitted model. First, it is important to check the model fitting.
# If the model is working as expected, we would observe an increase trend for the log-likelihood and an decrease trend for the perplexity (remember?).
# Also the fluctuation of these values get smaller as iteration increases. The plot_modelfit() function visualizes the within sample log-likelihood and perplexity
plot_modelfit(out)
# Furthermore, the keyATM can visualize alpha, the prior for the document-topic distribution. Values of these parameters should stabilize over time.
# As well as pi, the probability that each topic uses keyword topic-word distribution (as you can see, we have a bit of problem for the "monster" class)
plot_alpha(out)
plot_pi(out)
#########################################
### Let's compare the results with off-keywords topics =3 or off-keywords topics =5
#########################################
system.time(out2 <- keyATM(docs = keyATM_docs, # text input
no_keyword_topics = 3, # number of topics without keywords; in this case: 1; but you can add any number you want
keywords = keywords, # keywords
model = "base", # select the model
options = list(seed = 123)))
system.time(out3 <- keyATM(docs = keyATM_docs, # text input
no_keyword_topics = 5, # number of topics without keywords; in this case: 1; but you can add any number you want
keywords = keywords, # keywords
model = "base", # select the model
options = list(seed = 123)))
apply(out$theta,2,mean)
apply(out2$theta,2,mean)
apply(out3$theta,2,mean)
cor(apply(out$theta[,1:5],2,mean), apply(out2$theta[,1:5],2,mean))
cor(apply(out$theta[,1:5],2,mean), apply(out3$theta[,1:5],2,mean))
#########################################
### Adding Covariate
#########################################
# in the dfm there is a variable named "sentiment" either positive or negative (for a positive or negative review of the movie)
str(dfmt)
table(dfmt@docvars$sentiment)
# IMPORTANT: Since the keyATM does not take missing values, you must remove any missing values from covariates before extracting
# them. Moreover, I strongly recommend to extract covariates after preprocessing texts (i.e., after you have created your dfm)
# because preprocessing steps will remove documents without any terms, which may result in any discrepancies between documents and covariates.
# Therefore, the recommended procedure is (1) removing missing values from covariates, (2) preprocessing texts (including the process to discard documents
# that do not contain any words), and (3) extracting covariates
vars_selected <- as.data.frame(dfmt@docvars$sentiment) # let's extract in a data frame the "sentiment" covariate
names(vars_selected )[1] <- "sentiment" # let's rename it
str(vars_selected)
# Note: given that we want also to run an analysis on Topic-word distributions, we have to use the "keep" argument in the keyATM() function
# to store Z and S. See below. Otherwise, we could have dropped the last line of the command
system.time(out <- keyATM(docs = keyATM_docs,
no_keyword_topics = 1,
keywords = keywords,
model = "covariates",
model_settings = list(covariates_data = vars_selected,
covariates_formula = ~ sentiment),
options = list(seed = 123),
keep = c("Z", "S") ))
# keyATM automatically standardizes non-factor covariates (i.e., each covariates to have zero mean and a standard deviation one)
# while it does not standardize factor covariate (default) as it happens in our case
# This transformation does not change the substantive results as it is a linear transformation.
# The standardize option in model_settings argument of the keyATM() function takes one of "all", "none", or "non-factor" (default)
# "all" standardizes all covariates (except the intercept), "none" does not standardize any covariates, and "non-factor" standardizes non-factor covariates.
# You can glance covariate information with the covariates_info() and the covariates_get() function will return covariates
# used in the fitted output. In this case, nothing exciting...
covariates_info(out)
head(covariates_get(out))
# The covariate keyATM can characterize the relations between covariates and document-topic distributions.
# We can obtain the marginal posterior mean of document-topic distribution conditioned on the covariates.
# Suppose that we want to know document-topic distributions for each type of sentiment (pos/neg)
# We use a binary variable sentimentpos, which indicates the sentiment for this purpose.
# The by_strata_DocTopic() function can display the marginal posterior means of document-topic distributions
# for each value of (discrete) covariates (note that it also accepts continuous variables)
# In the by_strata_DocTopic() function, we specify the variable we focus on in by_var argument
# and label each value in the variable (ascending order).
# In this case, the sentimentpos equals 0 indicates negative reviews and equals 1 for positive reviews;
# if you have a mac, avoid to add "mc.cores = 1"
strata_topic <- by_strata_DocTopic(out, by_var = "sentimentpos", labels = c("Negative", "Positive"), mc.cores = 1)
# We can visualize results with the plot() function. The figure shows the marginal posterior means of document-topic distributions
# and the 90% credible intervals of them for each value of covariates.
# The figure indicates that crime movies are more likely to receive a negative review contrary to people movies
fig_doctopic <- plot(strata_topic, var_name = "sentimentpos", show_topic = c(1:6))
fig_doctopic
# we can also change the credible intervals: below we are using a 99% credible interval for example
fig_doctopic <- plot(strata_topic, var_name = "sentimentpos", show_topic = c(1:6), ci=.99)
fig_doctopic
# all results together:
plot(strata_topic, var_name = "sentimentpos", show_topic = c(1:6), by = "covariate")
# Using the output of the keyATM, we can calculate 90% (or any other) credible intervals of the differences in the mean of document-topic distribution.
str(strata_topic )
theta1 <- strata_topic$theta[[1]] # negative reviews
str(theta1)
theta2 <- strata_topic$theta[[2]] # positive reviews
str(theta2)
theta_diff <- theta2[, c(1:6)] - theta1[, c(1:6)] # positive minus negative reviews
# we can see that there is a statistical significant difference for people movies (at least with a 90% credible interval),
# that is, people movies tend to get with higher probability a positive rather than a negative review, but not for ex. for monster ones
theta_diff_quantile <- apply(theta_diff, 2, quantile, c(0.025, 0.5, 0.975))
theta_diff_quantile
###############################################
##### Topic-word distributions
###############################################
# Although the covariate keyATM does not directly model topic-word distributions (i.e., topical content),
# the model can examine how topic-word distributions change across different values of document-level covariates.
# We can pass the output with our stored values (obtained via the "keep" command above) to the by_strata_TopicWord() function.
# The example below demonstrates top words associated to each topic for positive and negative rewiews using the Sentiment covariate.
strata_tw <- by_strata_TopicWord(out, keyATM_docs, by = as.vector(vars_selected$sentiment))
# We check the result
top_words(strata_tw, n = 10)
# with keyATM you can also run a semi-supervised dynamic topic analysis (we do not consider it here)
__