rm(list=ls(all=TRUE)) getwd() setwd("C:/Users/luigi/Dropbox/TOPIC MODEL/") getwd() ##################################### # newsmap ##################################### library(newsmap) library(quanteda) # Step 1: let's begin with 6 texts text_en <- c(text1 = "This is an article about Italy", text2 = "Milan is a beautiful Italian city", text3= "I miss the sea of Spain", text4= "Granada is an ancient Spanish city", text5 = "We liked the fashion in Milan", text6 = "In Granada you can feel history all around you") text_en corp <- corpus(text_en) tok <- tokens(corp) feat_dfm_en <- dfm(tok) feat_dfm_en # Step 2: let's create a dictionary of seed-words via the "dictionary" function of Quanteda and let's apply it to our dfm dict <- dictionary(list(Spain = c("spain", "spanish", "spaniard*", "madrid"), Italy = c("italy", "italian*", "rome"))) dict label <- dfm_lookup(feat_dfm_en, dictionary = dict) label # i.e., the keywords of the topic "Italy" of our dictionary appears twice in the corpus, the same for the topic "Spain" # while 2 texts have no seed words # Step 3: let's train the model model_en <- textmodel_newsmap(feat_dfm_en, label) # now all the words of our two texts, including those not included in the seed-words, get a value! # take a look at the scores for Milan and Granada (not originally included in the seed dictionary!) model_en$model # let's predict the label - we also predicted (correctly) both text5 and text6 (i.e., those texts that did not # include any seed words originally). How was it possible? predict(model_en) # Step 4: we can now apply the trained model to other new texts text_en2 <- c(text7 = "Make India great again", text8 = "I love the sea!") text_en2 corp2 <- corpus(text_en2) tok2 <- tokens(text_en2) feat_dfm_en2 <- dfm(tok2, tolower = TRUE) # can you explain why you get such results? predict(model_en, newdata=feat_dfm_en2) # we cannot classify text5 because it does not include any words included in the texts included considered during # our training-stage. On the other side text6 has been classified as "Spain" thanks to the word sea model_en$model ######################################## ### another example with movie reviews ######################################## # Step 1: let's begin with the texts data("data_corpus_moviereviews", package = "quanteda.textmodels") corp <- head(data_corpus_moviereviews, 500) # keeping the first 500 texts as.character(corp )[2] ndoc(corp ) tok <- tokens(corp, remove_number = TRUE) tok <- tokens_remove(tok, stopwords("en")) dfmt <- dfm(tok ) # min_char: I specify the minimum length in characters for tokens to be removed dfmt <- dfm_remove(dfmt, min_nchar = 2) # I keep only words that occurr in the top 90% of the distribution and in less than 10% of documents dfmt <- dfm_trim(dfmt, min_termfreq = 0.90, termfreq_type = "quantile", max_docfreq = 0.1, docfreq_type = "prop") # It is highly suggested to remove documents that do not contain any terms (as we did for topic models and STM) # before using Newsmap (otherwise how can you predict a text with just a buch of 0s?) # no problems here table(ntoken(dfmt) > 0) # Step 2: let's apply a dictionary of seed-words - remember my dictionary option dict <- dictionary(list(people = c("family", "couple", "kid*", "child*", "parent*", "husband", "wife"), space = c("alien*", "planet", "space"), monster = c("monster*", "ghost*", "zombie*", "fear"), war = c("war", "soldier*", "tanks", "military"), crime = c("crime*", "murder*", "killer*", "police"))) dict # dfm_lookup label <- dfm_lookup(dfmt, dictionary = dict) label[4:5, 1:5] # i.e., the keywords of the topic "crime" of our dictionary appears 2 times in the second review, etc. # Step 3: let's train the model model_en <- textmodel_newsmap(dfmt, label) # now all the words of our texts, including those not included in the seed-words, get a value! # take a look at words such as "disco" and "clooney" str(model_en$model) model_en$model[c("people", "space", "monster", "war", "crime"), c("disco", "clooney")] # let's see the top ten features associated to each of the 5 topics we defined above for (i in 1:5) { print(dict[i,]) print(sort(model_en $model[i,], decreasing = T)[1:10]) } # let's predict the label predict(model_en) prop.table(table(predict(model_en))) # let's plot it newsmap_pred <- as.data.frame(prop.table(table(predict(model_en)))) str(newsmap_pred) names(newsmap_pred)[1] <- "Topics" str(newsmap_pred) library(ggplot2) ggplot(newsmap_pred, aes(x = reorder(Topics, -Freq), y = Freq)) + geom_bar(stat = "identity") + xlab("Topics in the training-set (via Newsmap)") # let's save the trained prediction newsmap_pred <- prop.table(table(predict(model_en))) newsmap_pred # Step 4: let's apply the trained model to completely new texts corp2 <- tail(data_corpus_moviereviews, 250) # keeping the last 250 texts ndoc(corp2) tok2 <- tokens(corp2, remove_number = TRUE) tok2 <- tokens_remove(tok2, stopwords("en")) dfmt2 <- dfm(tok2 ) # min_char: I specify the minimum length in characters for tokens to be removed dfmt2 <- dfm_remove(dfmt2, min_nchar = 2) dfmt2 <- dfm_trim(dfmt2, min_termfreq = 0.90, termfreq_type = "quantile", max_docfreq = 0.1, docfreq_type = "prop") predict(model_en, newdata=dfmt2) prop.table(table(predict(model_en))) prop.table(table(predict(model_en, newdata=dfmt2))) ##################################### # keyATM ##################################### library(keyATM) library(magrittr) data("data_corpus_moviereviews", package = "quanteda.textmodels") corp <- tail(data_corpus_moviereviews, 500)+head(data_corpus_moviereviews, 500) # keeping the last and the first 500 texts ndoc(corp ) # just an example here: I could have also removed stopwords, etc. # however be careful with the stemming option! Read below tok4 <- tokens(corp, remove_numbers=TRUE) dfmt <- dfm(tok4) # min_char: I specify the minimum length in characters for tokens to be removed dfmt <- dfm_remove(dfmt, stopwords('en'), min_nchar = 2) # I keep only words that occurr in the top 90% of the distribution and in less than 10% of documents # (i.e., very frequent but document-specific words) dfmt <- dfm_trim(dfmt, min_termfreq = 0.90, termfreq_type = "quantile", max_docfreq = 0.1, docfreq_type = "prop") # It is highly suggested to remove documents that do not contain any terms (as we did for topic models and STM) # before using the keyATM_read() function # no problems here table(ntoken(dfmt) > 0) # if you have problems here, you can use as usual the dfm_subset such as this: "dfm_subset(dfmt, ntoken(dfmt) >0)" # keyATM_read function reads your data for keyATM. keyATM_docs <- keyATM_read(texts = dfmt) summary(keyATM_docs) # let's identify our keywords. # very IMPORTANT: in this case wildcards (i.e., *) are not recognized! # Therefore, when you define your dictionary with keyATM, either 1) you do not use stemming before creating your dfm and therefore # you must include in your keywords both the singular and the plural (kid and kids); # or 2) you use the stemming, bu then among the words in the dictionary you must include the stemmed feature # (for example, "govern" for "government", etc.) # Let's follow here the first route (given we have not stemmed our features) keywords <- list(people = c("family", "couple", "kid", "kids", "children", "parent", "parents", "husband", "wife"), space = c("alien", "aliens", "planet", "space"), monster = c("monster", "monsters", "ghost", "ghosts", "zombie", "fear"), war = c("war", "soldier", "soldiers", "tanks", "military"), crime = c("crime", "crimes", "murder", "murders", "killer", "killers", "police")) keywords # Keywords should appear a reasonable number of times (typically more than 0.1% of the corpus) in the documents. # The plot below allows you to run the ex-ante check on the quality of your keywords. # In our example some keywords do not satisfy such benchmark (and the topic "monster" overall appears to have problems) key_viz <- visualize_keywords(docs = keyATM_docs, keywords = keywords) key_viz values_fig(key_viz) # Proportion is defined as a number of times a keyword occurs in the corpus divided by the total length of documents. # This measures the frequency of the keyword in the corpus. # The prune argument of the visualize_keywords() function is TRUE by default. It drops keywords that do not appear # in the corpus and raises a warning if there is any. keywords_2 <- list( crime = c("crime", "murder", "killer", "trump")) key_viz2 <- visualize_keywords(docs = keyATM_docs, keywords = keywords_2) ## If all keywords assigned to a topic is pruned, it raises an error. keywords_3 <- list(crime = c("trump", "biden")) key_viz3 <- visualize_keywords(docs = keyATM_docs, keywords = keywords_3) # Fitting the model # We pass the output of the keyATM_read function and keywords to the keyATM function. # Additionally, we need to specify the number of topics without keywords (the no_keyword_topics argument) # and model. Since this example does not use covariates, base is the appropriate model. # To guarantee the replicability, let's set the random seed in the option argument system.time(out <- keyATM(docs = keyATM_docs, # text input no_keyword_topics = 1, # number of topics without keywords; in this case: 1; but you can add any number you want, # including decide to NOT include any no-keyword topic keywords = keywords, # keywords model = "base", # select the model to implement options = list(seed = 123))) # always define the seed for replication purposes # There are two main quantities of interest in topic models (and you should already be familiar with them!): # FIRST, "topic-word distribution" represents the relative frequency of words for each topics, characterizing the topic content # The top_words() function returns a table of top words for each of estimated topic # Keywords assigned to a keyword topic are suffixed with a check mark. top_words(out, 10) # if you cannot see this unicode [], it stands for: "\u2713" # check here: https://cran.r-project.org/web/packages/utf8/vignettes/utf8.html # In the table above, "children" and "husband" are keywords of the "people" topic for example # There are no keywords under the "monster" topic. Why? Cause they were not originally present that much already since the beginning: # see: key_viz. In this case, we should probably go back to the "monster" topic and improve the qualities of our keywords # SECOND, "document-topic distribution" represents the topic prevalence. # To explore documents that are highly associated with each topic, the top_docs() function returns a table of document indexes in which a topic has high proportion. # The table below indicates, for example, that the 447 document in the corpus has the highest proportion of the Space topic among all other documents. top_docs(out) as.character(corp )[447] as.character(corp )[27] as.character(corp )[862] # You may want to obtain the entire document-topic distribution and topic-word distribution # The output of the keyATM() function contains both quantities. out$theta # Document-topic distribution apply(out$theta,2,mean) # mean distribution of topics out$phi # Topic-word distribution ########################################### ########################################### # some diagnostic ########################################### ########################################### # The keyATM provides other functions to diagnose and explore the fitted model. First, it is important to check the model fitting. # If the model is working as expected, we would observe an increase trend for the log-likelihood and an decrease trend for the perplexity (remember?). # Also the fluctuation of these values get smaller as iteration increases. # The plot_modelfit() function visualizes the within sample log-likelihood and perplexity plot_modelfit(out) # You can use the value of Perplexity to compare different models (with different number of no-keywords topics for example) # to understand which is the one that fits better your corpus # which is the minimum value of Perplexity of our actual model min(out$model_fit$Perplexity) # Note that we can also estimate the average value for topic coherence and exclusivity to compare # different model specifications as we did for topic models and STM! This is higly recommended! # Take a look at the EXTRA script in the home-page of the course for doing it # Furthermore, the keyATM can visualize alpha, the prior for the document-topic distribution. Values of these parameters should stabilize over time. # As well as pi, the probability that each topic uses keyword topic-word distribution (as you can see, we have a bit of problem even here for the "monster" class). # This latter plot allows you to run the ex-post check on the quality of your keywords. plot_alpha(out) plot_pi(out) ######################################### ### Adding Covariate(s) ######################################### # in the dfm there is a variable named "sentiment" either positive or negative (for a positive or negative review of the movie) head(docvars(dfmt)) str(dfmt@docvars) table(dfmt@docvars$sentiment) # IMPORTANT: Since the keyATM does not take missing values, you must remove any missing values from covariates before extracting # them. Moreover, I also strongly recommend to extract covariates after preprocessing texts (i.e., after you have created your dfm) # because preprocessing steps will remove documents without any terms, which may result in any discrepancies between documents and covariates. # Therefore, the recommended procedure is (1) removing missing values from covariates, (2) preprocessing texts (including the process to discard documents # that do not contain any words), and (3) extracting covariates vars_selected <- as.data.frame(dfmt@docvars$sentiment) # let's extract in a data frame the "sentiment" covariate str(vars_selected) names(vars_selected )[1] <- "sentiment" # let's rename it str(vars_selected) # Note: given that we want also to run an analysis on Topic-word distributions, we have to use the "keep" argument in the keyATM() function # to store Z and S (see below). Otherwise, we could have dropped the last line of the command w/o affecting any of the estimations of the main model system.time(out <- keyATM(docs = keyATM_docs, no_keyword_topics = 1, keywords = keywords, model = "covariates", model_settings = list(covariates_data = vars_selected, covariates_formula = ~ sentiment), options = list(seed = 123), keep = c("Z", "S") )) # keyATM automatically standardizes non-factor covariates (i.e., each covariates to have zero mean and a standard deviation one) # while it does not standardize factor covariate (default) as it happens in our case. # The standardize option in model_settings argument of the keyATM() function takes one of "all", "none", or "non-factor" (default) # "all" standardizes all covariates (except the intercept), "none" does not standardize any covariates, # and "non-factor" standardizes non-factor covariates (the default) # You can glance covariate information with the covariates_info() and the covariates_get() function will return covariates # used in the fitted output. In this case, nothing exciting...still this step is very important cause we realize how keyATM has # named and codified our covariate (a name that we will use below) # our original covariate is a factor named "sentiment" (1=neg; 2=pos) str(vars_selected$sentiment) table(vars_selected$sentiment) # within keyATM such covariate is called "sentimentpos" (i.e., sentiment + pos, i.e., the largest of the two sentiment values) # and it is turned into a dummy: 0=neg; 1=pos covariates_info(out) # first 500 equations (positive sentiment) head(covariates_get(out)) # last 500 equations (negative sentiment) tail(covariates_get(out)) # The covariate keyATM can characterize the relations between covariates and document-topic distributions. # We employ in this respect the by_strata_DocTopic() function. # Substantially, it does the same thing that the "estimateEffect" function (do you remember?) does with STM. # In the by_strata_DocTopic() function, we have to specify the variable we focus on in the "by_var" argument # and label each value in the variable (in an ascending order). # In this case, the sentimentpos equals 0 indicates negative reviews and equals 1 for positive reviews. strata_topic <- by_strata_DocTopic(out, by_var = "sentimentpos", labels = c("Negative", "Positive")) # What have we obtained? str(strata_topic) # Why 150,000 observations of 7 variables? # Cause you get 150 marginal (posterior) mean of document-topic distribution conditioned on the covariates # (in our specific case: sentimentpos) times the number of texts in our corpus (i.e., 1,000 documents). # Why a (posterior) mean distribution for each document? Cause keyATM adopts a Bayesian (not a frequentist) inference! # And the Bayesian inference returns the distribution of possible effect values (the posterior - rather than a single # value as it happens with the frequentist approach, i.e., say the coefficient with its standard error of an OLS)! # This is for example the distribution of the marginal (posterior) mean of the thetas for the "people" class when sentiment=neg plot(strata_topic$theta[[1]][1]) # This is the distribution of the marginal (posterior) mean of the thetas for "people" class when sentiment=pos plot(strata_topic$theta[[2]][1]) # Let's now compute the the median value of such distribution of thetas, as well as its 95% credibile interval. # Note: when you write "1", the apply function works over rows; "2" over columns. # credibile interval for "people" conditioned on the sentimentpost=neg apply(strata_topic$theta[[1]][1] , 2, quantile, c(0.025, 0.5, 0.975)) # credibile interval for "people" conditioned on the sentimentpost=pos apply(strata_topic$theta[[2]][1] , 2, quantile, c(0.025, 0.5, 0.975)) # Why credible and not confidence-intervals? Cause (once again!) we are here in the realm of the Bayesian (not frequentist) inference! # The interpretation of the frequentist confidence interval is the following one: we can be XX% (90%, 95%, 99%) confident that the true (unknown) estimate # would lie within the lower and upper limits of the CI, based on hypothesized repeats of the experiment. And for saying that, we take advantage of the # estimated coefficient together with its standard error. # Credible intervals also describe and summarise the uncertainty related to the unknown parameters you estimate, but using a probability distribution. # As the Bayesian inference returns the distribution of possible effect values (the posterior), # the credible interval is just the range containing a particular percentage of probable values. # The Bayesian framework allows us to say: given the observed data, the effect has XX% probability of falling within this range # Let's plot now the results for each of our class fig_doctopic <- plot(strata_topic, var_name = "sentimentpos", show_topic = c(1:6)) # The figure indicates that crime movies are more likely to receive a negative review contrary to people movies fig_doctopic # We can also change the credible intervals: below we are using a 99% credible interval for example fig_doctopic <- plot(strata_topic, var_name = "sentimentpos", show_topic = c(2), ci=.99) fig_doctopic # But is the difference between the thetas for positive and negative reviews as we noticed above also a significant one? # Let's estimate a 95% credible interval of the difference in the posterior mean of the thetas for each value of our covariate! # Remember: the 95% credible interval is simply the central portion of the posterior distribution (in our case the difference between # positive and negative thetas) that contains 95% of the values. theta_diff_quantile <- apply(strata_topic$theta[[2]]- strata_topic$theta[[1]] , 2, quantile, c(0.025, 0.5, 0.975)) # we can see that there is a statistical significant difference for people movies (at least with a 95% credible interval), # that is, people movies tend to get with a positive rather than a negative review; the opposite happens for crime ones theta_diff_quantile[,1:6] # let's plot it lower <- theta_diff_quantile[1,1:6] mean <- theta_diff_quantile[2,1:6] higher <- theta_diff_quantile[3,1:6] res <- as.data.frame(cbind(lower, mean, higher)) str(res) res$topic <- as.factor(row.names(res)) library(ggplot2) ggplot(res, aes(topic, mean)) + geom_point() + geom_linerange(aes(ymin = lower, ymax = higher)) + geom_hline(yintercept=0, linetype="dashed", color = "red") + theme_bw() + ylab("Mean of the difference with 95% credible interval") + coord_flip() # and if you want to estimate a 90% credibile interval? theta_diff_quantile2 <- apply(strata_topic$theta[[2]]- strata_topic$theta[[1]], 2, quantile, c(0.05, 0.5, 0.95)) # no any relevant differences with the 95% c.i. in this example theta_diff_quantile2[,1:6] ############################################### ##### Topic-word distributions ############################################### # Although the covariate keyATM does not directly model topic-word distributions (i.e., topical content) contrary to STM, # the model can examine how topic-word distributions change across different values of document-level covariates # (i.e., basically it reports which are the words most highly associated with a given topic, according to the values # of your covariates. In our case: which are the words most highly associate with for example the "space" topic # when you have a positive review, and when you have a negative review) # We can pass the output with our stored values (obtained via the "keep" command above) to the by_strata_TopicWord() function. # The example below demonstrates top words associated to each topic for positive and negative rewiews using the Sentiment covariate. # N.B. always pass to by_strata_TopicWord your covariate as a vector strata_tw <- by_strata_TopicWord(out, keyATM_docs, by = as.vector(vars_selected$sentiment)) # We check the result top_words(strata_tw, n = 10) # with keyATM you can also run a semi-supervised dynamic topic analysis (i.e., where the relationship between a given word and a given topic can # change over time). We do not discuss about it here. If you are interested, drop me an email!