rm(list = ls(all = TRUE))
getwd()
setwd("C:/Users/luigi/Dropbox/TOPIC MODEL/")
getwd()

#####################################
# newsmap
#####################################
library(newsmap)
library(quanteda)

# Step 1: let's begin with 6 texts
text_en <- c(text1 = "This is an article about Italy",
             text2 = "Milan is a beautiful Italian city",
             text3 = "I miss the sea of Spain",
             text4 = "Granada is an ancient Spanish city",
             text5 = "We liked food more in Siena than in Milan",
             text6 = "In Granada you can feel history all around you")
corp <- corpus(text_en)
tok <- tokens(corp)
feat_dfm_en <- dfm(tok)
feat_dfm_en

# Step 2: let's apply a dictionary of seed words
# let's use the list of seed words about countries that comes with newsmap - note that it includes word stems/wildcards!
data_dictionary_newsmap_en
names(data_dictionary_newsmap_en)
names(data_dictionary_newsmap_en[["EUROPE"]])
names(data_dictionary_newsmap_en[["EUROPE"]][["SOUTH"]])
print(data_dictionary_newsmap_en[["EUROPE"]][["SOUTH"]], max_nkey = 15)

label <- dfm_lookup(feat_dfm_en, dictionary = data_dictionary_newsmap_en)
label[1:6, 190:205]
# i.e., the keywords of the "IT" topic of our dictionary appear twice in the corpus, and the same holds for "ES",
# while 2 texts contain no seed words at all

# Step 3: let's train the model
model_en <- textmodel_newsmap(feat_dfm_en, label)
# now all the words of our texts, including those not among the seed words, get a score!
# take a look at the scores for Milan and Granada (not originally included in the seed dictionary!)
model_en$model

# let's predict the labels - we also predicted (correctly) both text5 and text6 (i.e., the texts that did not
# contain any seed words). How was that possible?
predict(model_en)

# Step 4: we can now apply the trained model to other new texts
text_en2 <- c(text7 = "Make India great again",
              text8 = "I love the sea!")
corp2 <- tokens(text_en2)
feat_dfm_en2 <- dfm(corp2, tolower = TRUE)
# can you explain why you get these results?
predict(model_en, newdata = feat_dfm_en2)
# we cannot classify text7 because it does not contain any of the words seen in the texts considered during
# our training stage. On the other side, text8 has been classified as "Spain" thanks to the word "sea"
model_en$model

########################################
### another example with movie reviews
########################################

# Step 1: let's begin with the texts
data("data_corpus_moviereviews", package = "quanteda.textmodels")
corp <- tail(data_corpus_moviereviews, 500)  # keeping the last 500 texts
as.character(corp)[2]
ndoc(corp)

tok <- tokens(corp, remove_numbers = TRUE)
tok <- tokens_remove(tok, stopwords("en"))
dfmt <- dfm(tok)
# min_nchar: the minimum length in characters for features (shorter ones are dropped)
dfmt <- dfm_remove(dfmt, min_nchar = 2)
dfmt <- dfm_trim(dfmt, min_termfreq = 0.90, termfreq_type = "quantile",
                 max_docfreq = 0.1, docfreq_type = "prop")
topfeatures(dfmt, 50)

# Step 2: let's apply a dictionary of seed words - remember my dictionary option
dict <- dictionary(list(people = c("family", "couple", "kid*", "child*", "parents", "husband", "wife"),
                        space = c("alien*", "planet", "space"),
                        monster = c("monster*", "ghost*", "zombie*", "fear"),
                        war = c("war", "soldier*", "tanks", "military"),
                        crime = c("crime*", "murder*", "killer*", "police")))
dict

# dfm_lookup
label <- dfm_lookup(dfmt, dictionary = dict)
label[2:3, 1:5]
# i.e., the keywords of the "crime" topic of our dictionary appear 2 times in the second review, etc.
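# A quick extra check (not part of the original script, just an illustration): how many of these
# 500 reviews contain at least one seed word from any topic, and how many seed-word hits each
# topic collects overall. Documents with zero hits will be labeled by newsmap only through the
# words they share with the labeled documents.
table(rowSums(label) > 0)
colSums(label)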
# Step 3: let's train the model
model_en <- textmodel_newsmap(dfmt, label)
# now all the words of our texts, including those not among the seed words, get a score!
# take a look at words such as "dating" and "tarantino"
str(model_en$model)
model_en$model[c("people", "space", "monster", "war", "crime"), c("dating", "tarantino")]

# let's see the top ten features associated with each of the 5 topics we defined above
for (i in 1:5) {
  print(dict[i, ])
  print(sort(model_en$model[i, ], decreasing = TRUE)[1:10])
}

# let's predict the labels
predict(model_en)
prop.table(table(predict(model_en)))

# let's plot it
newsmap_pred <- as.data.frame(prop.table(table(predict(model_en))))
str(newsmap_pred)
names(newsmap_pred)[1] <- "Topics"
str(newsmap_pred)

library(ggplot2)
ggplot(newsmap_pred, aes(x = reorder(Topics, -Freq), y = Freq)) +
  geom_bar(stat = "identity") +
  xlab("Topics in the training-set (via Newsmap)")

# let's save the predicted topic proportions
newsmap_pred <- prop.table(table(predict(model_en)))
newsmap_pred

# Step 4: let's apply the trained model to another set of texts
# (note that the last 250 reviews are a subset of the 500 used above, so they are not truly new documents)
corp2 <- tail(data_corpus_moviereviews, 250)  # keeping the last 250 texts
ndoc(corp2)
tok2 <- tokens(corp2, remove_numbers = TRUE)
tok2 <- tokens_remove(tok2, stopwords("en"))
dfmt2 <- dfm(tok2)
# min_nchar: the minimum length in characters for features (shorter ones are dropped)
dfmt2 <- dfm_remove(dfmt2, min_nchar = 2)
dfmt2 <- dfm_trim(dfmt2, min_termfreq = 0.90, termfreq_type = "quantile",
                  max_docfreq = 0.1, docfreq_type = "prop")
predict(model_en, newdata = dfmt2)
prop.table(table(predict(model_en)))
prop.table(table(predict(model_en, newdata = dfmt2)))
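# As an extra illustration (not in the original script), we can put the predicted topic shares for
# the training set and for the new set side by side in a single bar plot. The object names below
# (pred_train, pred_new, pred_both) are just my own choices.
pred_train <- as.data.frame(prop.table(table(predict(model_en))))
pred_new <- as.data.frame(prop.table(table(predict(model_en, newdata = dfmt2))))
pred_train$set <- "Training texts"
pred_new$set <- "New texts"
pred_both <- rbind(pred_train, pred_new)
names(pred_both)[1] <- "Topics"
ggplot(pred_both, aes(x = Topics, y = Freq, fill = set)) +
  geom_bar(stat = "identity", position = "dodge") +
  xlab("Predicted topics (via Newsmap)")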
#####################################
# keyATM
#####################################
library(keyATM)
library(quanteda)
library(magrittr)

data("data_corpus_moviereviews", package = "quanteda.textmodels")
corp <- tail(data_corpus_moviereviews, 500) + head(data_corpus_moviereviews, 500)  # keeping the last and the first 500 texts
ndoc(corp)

# just an example here: I could also have removed stopwords, turned everything into lower case, etc.
# however, be careful with the stemming option! Read below
tok4 <- tokens(corp, remove_numbers = TRUE)
dfmt <- dfm(tok4)
# min_nchar: the minimum length in characters for features (shorter ones are dropped)
dfmt <- dfm_remove(dfmt, stopwords("en"), min_nchar = 2)
# I keep only words at or above the 90th percentile of term frequency and occurring in at most 10% of documents
# (i.e., very frequent but document-specific words)
dfmt <- dfm_trim(dfmt, min_termfreq = 0.90, termfreq_type = "quantile",
                 max_docfreq = 0.1, docfreq_type = "prop")
topfeatures(dfmt, 20)  # 20 top words

# It is highly suggested to remove documents that do not contain any terms (as we did for topic models and STM)
# before using the keyATM_read() function
# no problems here
table(ntoken(dfmt) > 0)
# if you do have problems, you can use dfm_subset as usual: dfm_subset(dfmt, ntoken(dfmt) > 0)

# the keyATM_read() function reads your data for keyATM
keyATM_docs <- keyATM_read(texts = dfmt)
summary(keyATM_docs)

# let's identify our keywords
# very IMPORTANT: contrary to when you build a dictionary from within quanteda, wildcards are NOT recognized here!
# Therefore, when you define your keywords for keyATM, either 1) you do not use stemming before creating your dfm,
# and then you must include in your keywords both the singular and the plural (kid and kids);
# or 2) you do use stemming, but then your keywords must be the stemmed features
# (for example, "govern" for "government", etc.)

# Let's follow the first route here (given that we have not stemmed our features)
keywords <- list(people = c("family", "couple", "kid", "kids", "children", "parents", "husband", "wife"),
                 space = c("alien", "aliens", "planet", "space"),
                 monster = c("monster", "monsters", "ghost", "ghosts", "zombie", "fear"),
                 war = c("war", "soldier", "soldiers", "tanks", "military"),
                 crime = c("crime", "crimes", "murder", "murders", "killer", "killers", "police"))
keywords

# Keywords should appear a reasonable number of times (typically more than 0.1% of the corpus) in the documents.
# In our example some keywords do not satisfy this benchmark (and the "monster" topic overall appears to have problems)
key_viz <- visualize_keywords(docs = keyATM_docs, keywords = keywords)
key_viz
values_fig(key_viz)
# Proportion is defined as the number of times a keyword occurs in the corpus divided by the total length of the documents.
# This measures the frequency of the keyword in the corpus.

# The prune argument of the visualize_keywords() function is TRUE by default. It drops keywords that do not appear
# in the corpus and raises a warning if there are any.
keywords_2 <- list(crime = c("crime", "murder", "killer", "trump"))
key_viz2 <- visualize_keywords(docs = keyATM_docs, keywords = keywords_2)
key_viz2
values_fig(key_viz2)

# If all keywords assigned to a topic are pruned, it raises an error.
keywords_3 <- list(crime = c("trump", "biden"))
key_viz3 <- visualize_keywords(docs = keyATM_docs, keywords = keywords_3)

# Fitting the model
# We pass the output of the keyATM_read() function and the keywords to the keyATM() function.
# Additionally, we need to specify the number of topics without keywords (the no_keyword_topics argument)
# and the model. Since this example does not use covariates or time stamps, "base" is the appropriate model.
# To guarantee replicability, let's set the random seed in the options argument
system.time(out <- keyATM(docs = keyATM_docs,       # text input
                          no_keyword_topics = 1,    # number of topics without keywords; here 1, but you can choose any number,
                                                    # including deciding NOT to include any no-keyword topic
                          keywords = keywords,      # keywords
                          model = "base",           # select the model
                          options = list(seed = 123)))

# There are two main quantities of interest in topic models (and you know that!):
# First, the "topic-word distribution" represents the relative frequency of words for each topic, characterizing the topic content.
# The top_words() function returns a table of top words for each estimated topic. Keywords assigned to a keyword topic are
# suffixed with a check mark. Keywords from another keyword topic are labeled with the topic id of that category.
top_words(out, 10)
# what does the symbol in brackets after some words stand for? It is the check mark "\u2713"
# check here: https://cran.r-project.org/web/packages/utf8/vignettes/utf8.html
# In the table above, "children" and "husband" are keywords of the "people" topic, for example.
# There are no keywords under the "monster" topic. Why? Because they were quite rare in the corpus to begin with
# (see key_viz above). In this case, we should probably go back to the "monster" topic and improve the quality
# of our keywords (one possible way to do that is sketched right below).
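# Purely as an illustration of how one might revise the "monster" keywords and re-check their
# coverage before refitting: the added words below are my own guesses, not tested choices, and
# visualize_keywords() will simply prune (with a warning) any of them that never occur in the corpus.
keywords_alt <- keywords
keywords_alt$monster <- c("monster", "monsters", "horror", "scary", "ghost", "fear")
key_viz_alt <- visualize_keywords(docs = keyATM_docs, keywords = keywords_alt)
key_viz_alt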
# There are two main quantities of interest in topic models (and you know that!):
# Second, the "document-topic distribution" represents topic prevalence.
# To explore documents that are highly associated with each topic, the top_docs() function returns a table of document
# indexes in which a topic has a high proportion.
# The table below indicates, for example, that document 447 in the corpus has the highest proportion
# of the "space" topic among all documents.
top_docs(out)
as.character(corp)[447]

# You may want to obtain the entire document-topic distribution and topic-word distribution.
# The output of the keyATM() function contains both quantities.
out$theta                   # document-topic distribution
apply(out$theta, 2, mean)   # mean distribution of topics
out$phi                     # topic-word distribution

###########################################
###########################################
# some diagnostics
###########################################
###########################################

# keyATM provides other functions to diagnose and explore the fitted model. First, it is important to check the model fit.
# If the model is working as expected, we should observe an increasing trend for the log-likelihood and a decreasing trend
# for the perplexity (remember?). Also, the fluctuation of these values gets smaller as the number of iterations increases.
# The plot_modelfit() function visualizes the within-sample log-likelihood and perplexity
plot_modelfit(out)

# You can use the value of the perplexity to compare different models (for example with different numbers of no-keyword topics)
# to understand which one fits your corpus better (see the sketch after this diagnostics block)
# which is the minimum value of the perplexity of our current model?
min(out$model_fit$Perplexity)

# Furthermore, keyATM can visualize alpha, the prior of the document-topic distribution. The values of these parameters should
# stabilize over time. The same holds for pi, the probability that each topic uses the keyword topic-word distribution
# (as you can see, we have a bit of a problem with the "monster" class)
plot_alpha(out)
plot_pi(out)
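# As mentioned above, the perplexity can be used to compare specifications. Below is a minimal,
# purely illustrative sketch: the candidate values and object names are my own choices, and each
# refit takes as long as the original model, so this loop can be slow.
candidate_k <- c(0, 1, 3, 5)  # different numbers of no-keyword topics to try
perp <- sapply(candidate_k, function(k) {
  fit <- keyATM(docs = keyATM_docs, no_keyword_topics = k,
                keywords = keywords, model = "base",
                options = list(seed = 123))
  min(fit$model_fit$Perplexity)
})
data.frame(no_keyword_topics = candidate_k, min_perplexity = perp)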
############################################################
# We do not discuss this in class, but if you are interested, take a look!
# We can also estimate the average topic coherence and exclusivity to compare different models, as we did for topic models and STM.
# Of course, we compute the average only over the keyword topics (the ones we are mainly interested in;
# also because otherwise, by definition, the average topic coherence is more likely to decrease
# as you increase the number of no-keyword topics...)
############################################################

# Let's first estimate the average topic coherence
words <- top_words(out, 10)
words
# let's get rid of the unicode check marks
topic <- c(1:length(words))
topic
for (i in topic) {
  words[, i] <- gsub("\u2713", "", words[, i])
  words[, i] <- gsub("[[:punct:]]", "", words[, i])
  words[, i] <- gsub(" ", "", words[, i])
}
words

# let's estimate the coherence for each topic
# the coherence function below needs a dtm similar to the one we employ with the topicmodels package
library(topicmodels)
dtm <- convert(dfmt, to = "topicmodels")

# let's create an empty data frame that we will fill later on
results <- data.frame()
results

# The coherence statistic can be computed with the helper used inside topicdoc's topic_coherence() function.
# Note the ":::" used to access internal functions of a package
library(topicdoc)
for (i in topic) {
  coherence <- topicdoc:::coherence(dtm, words[, i], 1)
  results <- rbind(results, cbind(coherence))
}
results
# average level of coherence
mean(results[, 1][1:5])

# Let's now estimate the average topic exclusivity (following the same procedure employed in STM)
out$phi
str(out$phi)
tt <- t(out$phi)
str(tt)
apply(tt, 2, which.max)
rownames(tt)[c(3, 1137, 983, 334, 72, 78)]
top_words(out, 2)

w <- 0.7  # weight to apply (this is the default value in STM)
M <- 10   # let's focus just on the top 10 words
tbeta <- t(out$phi)
s <- rowSums(tbeta)
mat <- tbeta / s
ex <- apply(mat, 2, rank) / nrow(mat)
fr <- apply(tbeta, 2, rank) / nrow(mat)
frex <- 1 / (w / ex + (1 - w) / fr)
index <- apply(tbeta, 2, order, decreasing = TRUE)[1:M, ]
exclus <- vector(length = ncol(tbeta))
for (i in 1:ncol(frex)) {
  exclus[i] <- sum(frex[index[, i], i])
}
exclus
# average level of exclusivity
mean(exclus[1:5])

#########################################
### Adding Covariates
#########################################

# in the dfm there is a docvar named "sentiment", either positive or negative (for a positive or negative review of the movie)
str(dfmt)
table(dfmt@docvars$sentiment)

# IMPORTANT: since keyATM does not accept missing values, you must remove any missing values from the covariates before
# extracting them. Moreover, I also strongly recommend extracting the covariates after preprocessing the texts (i.e., after
# you have created your dfm), because the preprocessing steps may remove documents without any terms, which could result in
# discrepancies between documents and covariates.
# Therefore, the recommended procedure is: (1) remove missing values from the covariates, (2) preprocess the texts
# (including discarding documents that do not contain any words), and (3) extract the covariates
vars_selected <- as.data.frame(dfmt@docvars$sentiment)  # let's extract the "sentiment" covariate into a data frame
names(vars_selected)[1] <- "sentiment"                  # let's rename it
str(vars_selected)

# Note: given that we also want to run an analysis on topic-word distributions, we have to use the "keep" argument in the
# keyATM() function to store Z and S (see below). Otherwise, we could have dropped the last line of the command without
# affecting any of the estimates of the main model
system.time(out <- keyATM(docs = keyATM_docs,
                          no_keyword_topics = 1,
                          keywords = keywords,
                          model = "covariates",
                          model_settings = list(covariates_data = vars_selected,
                                                covariates_formula = ~ sentiment),
                          options = list(seed = 123),
                          keep = c("Z", "S")))

# keyATM automatically standardizes non-factor covariates (i.e., each covariate is rescaled to have mean zero and standard
# deviation one), while by default it does not standardize factor covariates, as happens in our case.
# The standardize option in the model_settings argument of the keyATM() function takes one of "all", "none", or "non-factor" (default):
# "all" standardizes all covariates (except the intercept), "none" does not standardize any covariate,
# and "non-factor" standardizes non-factor covariates only (the default).
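# Just to illustrate the syntax of the standardize option (the object name out_nostd is my own;
# with a single factor covariate, "none" gives the same result as the "non-factor" default,
# so this refit is optional and only shows where the argument goes):
system.time(out_nostd <- keyATM(docs = keyATM_docs,
                                no_keyword_topics = 1,
                                keywords = keywords,
                                model = "covariates",
                                model_settings = list(covariates_data = vars_selected,
                                                      covariates_formula = ~ sentiment,
                                                      standardize = "none"),
                                options = list(seed = 123)))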
# You can glance at the covariate information with the covariates_info() function, while the covariates_get() function
# returns the covariates used in the fitted output. In this case, nothing exciting... still, this step is very important
# because we learn how keyATM has named our covariate (a name that we will use below), here "sentimentpos"
# (because sentiment gets a value of 1 if positive and 0 otherwise)
covariates_info(out)
head(covariates_get(out))
tail(covariates_get(out))

# The covariate keyATM can characterize the relationship between covariates and document-topic distributions.
# We can obtain the marginal posterior mean of the document-topic distribution conditioned on the covariates
# (in our specific case the binary variable sentimentpos, which indicates the sentiment of the review).
# We employ the by_strata_DocTopic() function for this purpose.
# Substantially, it does the same thing that the estimateEffect() function (do you remember?) does with STM.
# In the by_strata_DocTopic() function, we have to specify the variable we focus on in the by_var argument
# and label each value of the variable (in ascending order).
# In this case, sentimentpos equal to 0 indicates a negative review and equal to 1 a positive review.
# IMPORTANT: if you have a Mac, avoid adding "mc.cores = 1"
strata_topic <- by_strata_DocTopic(out, by_var = "sentimentpos",
                                   labels = c("Negative", "Positive"),
                                   mc.cores = 1)

# We can visualize the results with the plot() function. The figure shows the marginal posterior means of the
# document-topic distributions and their 90% credible intervals for each value of the covariate.
# The figure indicates that crime movies are more likely to receive a negative review, contrary to people movies
fig_doctopic <- plot(strata_topic, var_name = "sentimentpos", show_topic = c(1:6))
fig_doctopic

# we can also change the credible intervals: below we use a 99% credible interval, for example
fig_doctopic <- plot(strata_topic, var_name = "sentimentpos", show_topic = c(2), ci = .99)
fig_doctopic

# Using the output of keyATM, we can calculate 95% (or any other) credible intervals for the differences in the means
# of the document-topic distributions.
str(strata_topic)
theta1 <- strata_topic$theta[[1]]  # negative reviews
str(theta1)
theta2 <- strata_topic$theta[[2]]  # positive reviews
str(theta2)
theta_diff <- theta2[, c(1:6)] - theta1[, c(1:6)]  # positive minus negative reviews
# we can see that there is a statistically significant difference for people movies (at least with a 95% credible interval),
# that is, people movies tend to get a positive rather than a negative review with higher probability;
# this is not the case, for example, for crime movies
theta_diff_quantile <- apply(theta_diff, 2, quantile, c(0.025, 0.5, 0.975))
theta_diff_quantile

###############################################
##### Topic-word distributions
###############################################

# Although the covariate keyATM does not directly model topic-word distributions (i.e., topical content), contrary to STM,
# the model can examine how topic-word distributions change across different values of document-level covariates.
# We pass the output with our stored values (obtained via the "keep" argument above) to the by_strata_TopicWord() function.
# The example below shows the top words associated with each topic for positive and negative reviews, using the sentiment covariate.
strata_tw <- by_strata_TopicWord(out, keyATM_docs,
                                 by = as.vector(vars_selected$sentiment))
# We check the result
top_words(strata_tw, n = 10)

# with keyATM you can also run a semi-supervised dynamic topic analysis (i.e., one where the relationship between a given
# word and a given topic can change over time). We do not discuss it here
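# Going back to the theta_diff object computed above, a quick visual check (purely illustrative,
# not part of the original analysis): the posterior distribution of the positive-minus-negative
# difference for the first keyword topic ("people", given the order of the keyword list).
# Values concentrated above zero are what drives the credible-interval result discussed above.
hist(theta_diff[, 1], breaks = 30,
     main = "Posterior difference for the people topic (positive - negative)",
     xlab = "Difference in topic proportion")
abline(v = 0, lty = 2)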