rm(list=ls(all=TRUE))
getwd()
setwd("C:/Users/luigi/Dropbox/TOPIC MODEL/")
getwd()

library(keyATM)
library(quanteda)
library(magrittr)

data("data_corpus_moviereviews", package = "quanteda.textmodels")
corp <- tail(data_corpus_moviereviews, 500) + head(data_corpus_moviereviews, 500) # keeping the last 500 and the first 500 texts
ndoc(corp)

# just an example here: I could also have removed stopwords, turned everything into lower case, etc.
# However, be careful with the stemming option! Read below
tok4 <- tokens(corp, remove_numbers = TRUE)
dfmt <- dfm(tok4)

# min_nchar: the minimum length in characters for features to be kept (shorter ones are dropped)
dfmt <- dfm_remove(dfmt, stopwords('en'), min_nchar = 2)

# I keep only words at or above the 90th percentile of term frequency that appear in at most 10% of documents
# (i.e., words that are frequent overall but concentrated in few documents)
dfmt <- dfm_trim(dfmt, min_termfreq = 0.90, termfreq_type = "quantile",
                 max_docfreq = 0.1, docfreq_type = "prop")
topfeatures(dfmt, 20) # 20 top words

###########################################################################
# keyATM
###########################################################################

# Researchers are required to remove any documents that do not contain any
# terms before using the keyATM_read() function (as we did for topic models and STM).
# If there are documents that do not contain any terms, the keyATM_read()
# function raises a warning. It is highly recommended to manually check
# documents with length 0 before fitting the model; otherwise, keyATM
# will automatically drop these documents when fitting the model.

# no problems here
table(ntoken(dfmt) > 0)
# if you have problems, you can use dfm_subset like this: "dfm_subset(dfmt, ntoken(dfmt) > 0)"

# the keyATM_read function reads your data for keyATM
keyATM_docs <- keyATM_read(texts = dfmt)
summary(keyATM_docs)

# let's identify our keywords
# very IMPORTANT: contrary to when you build a dictionary from within quanteda, in this case wildcards are not recognized!
# Therefore, when you define your dictionary with keyATM, either 1) you do not use stemming before creating your dfm, and therefore
# you must include among your keywords both the singular and the plural (kid and kids);
# or 2) you use stemming, but then among the words in the dictionary you must include the stemmed feature
# (for example, "govern" for "government", etc.)
# Let's follow the first route here (given that we have not stemmed our features)
keywords <- list(people = c("family", "couple", "kid", "kids", "children", "parents", "husband", "wife"),
                 space = c("alien", "aliens", "planet", "space"),
                 monster = c("monster", "monsters", "ghost", "ghosts", "zombie", "fear"),
                 war = c("war", "soldier", "soldiers", "tanks", "military"),
                 crime = c("crime", "crimes", "murder", "murders", "killer", "killers", "police"))
keywords

# Keywords should appear a reasonable number of times (typically more than 0.1% of the corpus) in the documents.
# In our example some keywords do not satisfy this benchmark (and the topic "monster" overall appears to have problems)
key_viz <- visualize_keywords(docs = keyATM_docs, keywords = keywords)
key_viz
values_fig(key_viz)
# Proportion is defined as the number of times a keyword occurs in the corpus divided by the total length of the documents.
# This measures the frequency of the keyword in the corpus.
# The prune argument of the visualize_keywords() function is TRUE by default. It drops keywords that do not appear
# in the corpus and raises a warning if any are dropped.
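
# A quick manual check of the 0.1% benchmark (a sketch, not part of the keyATM API;
# "key_prop" is an illustrative name). This only approximates what values_fig(key_viz) reports
key_prop <- colSums(dfm_select(dfmt, unlist(keywords))) / sum(ntoken(dfmt))
sort(key_prop, decreasing = TRUE) # keywords far below 0.001 deserve a second look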
keywords_2 <- list(crime = c("crime", "murder", "killer", "trump"))
key_viz2 <- visualize_keywords(docs = keyATM_docs, keywords = keywords_2)
key_viz2
values_fig(key_viz2)

## If all keywords assigned to a topic are pruned, it raises an error
keywords_3 <- list(crime = c("trump", "biden"))
key_viz3 <- visualize_keywords(docs = keyATM_docs, keywords = keywords_3)

# Fitting the model
# We pass the output of the keyATM_read function and the keywords to the keyATM function.
# Additionally, we need to specify the number of topics without keywords (the no_keyword_topics argument)
# and the model. Since this example does not use covariates or time stamps, "base" is the appropriate model.
# To guarantee replicability, let's set the random seed in the options argument
system.time(out <- keyATM(docs = keyATM_docs,     # text input
                          no_keyword_topics = 1,  # number of topics without keywords; in this case 1, but you can add
                                                  # any number you want, or decide NOT to include any no-keyword topic at all
                          keywords = keywords,    # keywords
                          model = "base",         # select the model
                          options = list(seed = 123)))

# There are two main quantities of interest in topic models (and you know that!):
# First, the "topic-word distribution" represents the relative frequency of words for each topic, characterizing the topic content.
# The top_words() function returns a table of top words for each estimated topic. Keywords assigned to a keyword topic are suffixed with a check mark.
# Keywords from another keyword topic are labeled with the topic id of that category.
top_words(out, 10)
# what do the square brackets [ ] stand for? They enclose the Unicode check mark "\u2713"
# check here: https://cran.r-project.org/web/packages/utf8/vignettes/utf8.html
# In the table above, "children" and "husband" are keywords of the "people" topic, for example.
# There are no keywords under the "monster" topic. Why? Because those keywords were rare in the corpus to begin with:
# see key_viz. In this case, we should probably go back to the "monster" topic and improve the quality of our keywords.

# Second, the "document-topic distribution" represents the topic prevalence.
# To explore documents that are highly associated with each topic, the top_docs() function returns a table of document indexes in which a topic has a high proportion.
# The table below indicates, for example, that the 447th document in the corpus has the highest proportion of the Space topic among all documents
top_docs(out)
as.character(corp)[447]

# You may want to obtain the entire document-topic distribution and topic-word distribution.
# The output of the keyATM() function contains both quantities
out$theta                  # document-topic distribution
apply(out$theta, 2, mean)  # mean distribution of topics
out$phi                    # topic-word distribution
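
# A small sketch (illustrative, not part of keyATM's API): label each document
# with its modal topic from theta and tabulate the assignments
modal_topic <- colnames(out$theta)[apply(out$theta, 1, which.max)]
table(modal_topic)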
###########################################
###########################################
# some diagnostics
###########################################
###########################################

# keyATM provides other functions to diagnose and explore the fitted model. First, it is important to check the model fit.
# If the model is working as expected, we should observe an increasing trend for the log-likelihood and a decreasing trend for the perplexity (remember?).
# Also, the fluctuation of these values gets smaller as the number of iterations increases.
# The plot_modelfit() function visualizes the within-sample log-likelihood and perplexity
plot_modelfit(out)

# You can use the value of perplexity to compare different models (with different numbers of no-keyword topics, for example)
# to understand which one fits your corpus best

# which is the minimum value of perplexity of my model?
min(out$model_fit$Perplexity)

# Furthermore, keyATM can visualize alpha, the prior for the document-topic distribution. The values of these parameters should stabilize over time.
# The same goes for pi, the probability that each topic uses the keyword topic-word distribution (as you can see, we have a bit of a problem with the "monster" class)
plot_alpha(out)
plot_pi(out)

############################################################
# We can also estimate the average value of topic coherence and exclusivity to compare different models, as we did for topic models and STM.
# Of course we do that by estimating the average only over the keyword topics (the ones we are mainly
# interested in; also because otherwise, by definition, it becomes more likely that,
# for example, the average topic coherence decreases as you increase the number of no-keyword topics...)
############################################################

# Let's first estimate the average topic coherence
words <- top_words(out, 10)
words

# let's get rid of the Unicode check marks and other markers
topic <- c(1:length(words))
topic
for (i in topic) {
  words[, i] <- gsub("\u2713", "", words[, i])
  words[, i] <- gsub("[[:punct:]]", "", words[, i])
  words[, i] <- gsub(" ", "", words[, i])
}
words

# let's estimate the coherence for each topic.
# The coherence function below needs a dfm similar to the one we employ in the topicmodels package
library(topicmodels)
dtm <- convert(dfmt, to = "topicmodels")
library(topicdoc)

# let's create an empty data frame that we will fill later on
results <- data.frame(first = vector())
results

# note: coherence() is used inside the function topic_coherence().
# Though it is documented, it is not exported for direct use.
# If you want, you can access it as topicdoc:::coherence.
# Note the three colons (:::) used to access internal functions
for (i in topic) {
  coherence <- topicdoc:::coherence(dtm, words[, i], 1)
  results <- rbind(results, cbind(coherence))
}
results
# average level of coherence (keyword topics only)
mean(results[1:5, 1])

# Let's now estimate the average topic exclusivity (following the same procedure employed in STM)
out$phi
str(out$phi)
tt <- t(out$phi)
str(tt)
apply(tt, 2, which.max)
rownames(tt)[c(3, 1137, 983, 334, 72, 78)]
top_words(out, 2)

w <- 0.7 # weight to apply (this is the default value in STM)
M <- 10  # let's focus just on the top 10 words
tbeta <- t(out$phi)
s <- rowSums(tbeta)
mat <- tbeta / s
ex <- apply(mat, 2, rank) / nrow(mat)
fr <- apply(tbeta, 2, rank) / nrow(mat)
frex <- 1 / (w / ex + (1 - w) / fr)
index <- apply(tbeta, 2, order, decreasing = TRUE)[1:M, ]
exclus <- vector(length = ncol(tbeta))
for (i in 1:ncol(frex)) {
  exclus[i] <- sum(frex[index[, i], i])
}
exclus
mean(exclus[1:5])
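
# Sketch: put the two diagnostics side by side, one row per keyword topic
# ("diag_df" is an illustrative name, not a keyATM object; keyword topics
# come first in keyATM's topic ordering, hence the 1:5 indexing)
diag_df <- data.frame(topic = names(keywords),
                      coherence = results[1:5, 1],
                      exclusivity = exclus[1:5])
diag_df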
#########################################
### Adding Covariates
#########################################

# in the dfm there is a variable named "sentiment", either positive or negative (for a positive or negative review of the movie)
str(dfmt)
table(dfmt@docvars$sentiment)

# IMPORTANT: Since keyATM does not accept missing values, you must remove any missing values from the covariates before extracting them.
# Moreover, I strongly recommend extracting covariates AFTER preprocessing the texts (i.e., after you have created your dfm),
# because preprocessing steps will remove documents without any terms, which may otherwise create discrepancies between documents and covariates.
# Therefore, the recommended procedure is: (1) remove missing values from covariates; (2) preprocess texts (including discarding documents
# that do not contain any words); and (3) extract covariates
vars_selected <- as.data.frame(dfmt@docvars$sentiment) # let's extract the "sentiment" covariate into a data frame
names(vars_selected)[1] <- "sentiment"                 # let's rename it
str(vars_selected)

# Note: given that we also want to run an analysis on topic-word distributions (see below), we have to use the "keep" argument in the keyATM() function
# to store Z and S. Otherwise, we could have dropped the last line of the command
system.time(out <- keyATM(docs = keyATM_docs,
                          no_keyword_topics = 1,
                          keywords = keywords,
                          model = "covariates",
                          model_settings = list(covariates_data = vars_selected,
                                                covariates_formula = ~ sentiment),
                          options = list(seed = 123),
                          keep = c("Z", "S")))

# keyATM automatically standardizes non-factor covariates (i.e., each covariate is transformed to have mean zero and standard deviation one),
# while by default it does not standardize factor covariates, as happens in our case.
# This transformation does not change the substantive results, as it is a linear transformation.
# The standardize option in the model_settings argument of the keyATM() function takes one of "all", "none", or "non-factor" (default):
# "all" standardizes all covariates (except the intercept), "none" does not standardize any covariates, and "non-factor" standardizes non-factor covariates.

# You can glance at covariate information with covariates_info(), and the covariates_get() function will return the covariates
# used in the fitted output. In this case, nothing exciting...
covariates_info(out)
head(covariates_get(out))
tail(covariates_get(out))
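
# A quick consistency check (a sketch, assuming standard R treatment coding of
# the factor): the design matrix used by the model should contain an intercept
# plus a 0/1 "sentimentpos" column matching the raw docvar
cov_used <- covariates_get(out)
table(cov_used[, "sentimentpos"]) # should match table(dfmt@docvars$sentiment)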
# The covariate keyATM can characterize the relationship between covariates and document-topic distributions.
# We can obtain the marginal posterior mean of the document-topic distribution conditioned on the covariates.
# Suppose that we want to know the document-topic distributions for each type of sentiment (pos/neg).
# We use a binary variable, sentimentpos, which indicates the sentiment for this purpose.
# The by_strata_DocTopic() function can display the marginal posterior means of the document-topic distributions
# for each value of (discrete) covariates (note that it also accepts continuous variables).
# Substantially, it does the same thing that the "estimateEffect" function (do you remember?) does with STM.
# In the by_strata_DocTopic() function, we specify the variable we focus on in the by_var argument
# and label each value of the variable (in ascending order).
# In this case, sentimentpos equal to 0 indicates negative reviews and equal to 1 positive reviews;
# if you are on a Mac, avoid adding "mc.cores = 1"
strata_topic <- by_strata_DocTopic(out, by_var = "sentimentpos",
                                   labels = c("Negative", "Positive"), mc.cores = 1)

# We can visualize the results with the plot() function. The figure shows the marginal posterior means of the document-topic distributions
# and their 90% credible intervals for each value of the covariates.
# The figure indicates that crime movies are more likely to receive a negative review, in contrast to people movies
fig_doctopic <- plot(strata_topic, var_name = "sentimentpos", show_topic = c(1:6))
fig_doctopic

# we can also change the credible intervals: below we use a 99% credible interval, for example
fig_doctopic <- plot(strata_topic, var_name = "sentimentpos", show_topic = c(2), ci = .99)
fig_doctopic

# all results together:
plot(strata_topic, var_name = "sentimentpos", show_topic = c(1:6), by = "covariate")

# Using the output of keyATM, we can calculate 95% (or any other) credible intervals of the differences in the means of the document-topic distributions
str(strata_topic)
theta1 <- strata_topic$theta[[1]] # negative reviews
str(theta1)
theta2 <- strata_topic$theta[[2]] # positive reviews
str(theta2)
theta_diff <- theta2[, c(1:6)] - theta1[, c(1:6)] # positive minus negative reviews
# we can see that there is a statistically significant difference for people movies (at least with a 95% credible interval),
# that is, people movies are more likely to get a positive rather than a negative review; this is not true, for example, for crime movies
theta_diff_quantile <- apply(theta_diff, 2, quantile, c(0.025, 0.5, 0.975)) # 2.5th, 50th, and 97.5th percentiles for a 95% interval
theta_diff_quantile

###############################################
##### Topic-word distributions
###############################################

# Although the covariate keyATM does not directly model topic-word distributions (i.e., topical content),
# the model can examine how topic-word distributions change across different values of document-level covariates.
# We can pass the output with our stored values (obtained via the "keep" argument above) to the by_strata_TopicWord() function.
# The example below shows the top words associated with each topic for positive and negative reviews, using the sentiment covariate
strata_tw <- by_strata_TopicWord(out, keyATM_docs,
                                 by = as.vector(vars_selected$sentiment))
# We check the result
top_words(strata_tw, n = 10)

# with keyATM you can also run a semi-supervised dynamic topic analysis (i.e., where the relationship between a given word and a given topic
# can change over time). We do not discuss it here
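
# For reference only (not run): a minimal sketch of what the dynamic call looks like,
# following the keyATM documentation. It assumes a docvar with consecutive integer
# time indexes starting from 1 (here called "time_var", a hypothetical name)
# out_dyn <- keyATM(docs = keyATM_docs,
#                   no_keyword_topics = 1,
#                   keywords = keywords,
#                   model = "dynamic",
#                   model_settings = list(time_index = docvars(dfmt)$time_var, # hypothetical docvar
#                                         num_states = 5),
#                   options = list(seed = 123))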