rm(list=ls(all=TRUE))
setwd("C:/Users/luigi/Dropbox/TOPIC MODEL/FOR NEXT YEAR 2020/Newsmap")
getwd()

library(quanteda)
library(readtext)
library(ggplot2)
library(ldatuning)
library(topicmodels)
library(lubridate)
library(topicdoc)
library(cowplot)
library(dplyr)
library(tidytext)

#########################################
########## Topic model (LDA)
#########################################

# This data frame contains 1,959 Guardian news articles published in 2016
myText <- read.csv("guardian.csv", stringsAsFactors=FALSE)
str(myText)

# Let's create a new document-level variable "text2" storing the text of each document. It will be useful later on
myText$text2 <- myText$text

news_corp <- corpus(myText)
head(summary(news_corp))

tok2 <- tokens(news_corp, remove_punct = TRUE, remove_numbers=TRUE, remove_symbols = TRUE,
               split_hyphens = TRUE, remove_separators = TRUE)
tok2 <- tokens_remove(tok2, stopwords("en"))
tok2 <- tokens_wordstem(tok2)
news_dfm <- dfm(tok2)

# let's remove the references to time
news_dfm <- dfm_remove(news_dfm, c('*-time', '*-timeUpdated', 'GMT', 'BST'))

# We keep only features in the top 5% by term frequency (min_termfreq = 0.95 with termfreq_type = "quantile")
# that appear in less than 10% of the documents (max_docfreq = 0.1 with docfreq_type = "prop"),
# using dfm_trim() to focus on common BUT distinguishing features
news_dfm <- dfm_trim(news_dfm, min_termfreq = 0.95, termfreq_type = "quantile",
                     max_docfreq = 0.1, docfreq_type = "prop")
news_dfm[1:2, 1:5]

# note that the docvars include "text" as well as "text2"
str(docvars(news_dfm))

# 20 top words
topfeatures(news_dfm, 20)

# But wait a minute... Why is "trump" not in the list of the top features? After all, everyone was discussing Trump back in 2016!
# ...but that's precisely the reason! It was so popular that it does not satisfy the trim rule defined above
# (it is so popular that it appears in a large share of the articles, i.e., more than max_docfreq = 0.1).
# Indeed, let's look for "trump" in a non-trimmed dfm
news_dfm2 <- dfm(tok2)
topfeatures(news_dfm2, 20)

# Let's also keep only documents with a number of tokens > 0 [given that after the reduction above,
# some articles could actually be empty - 8 here]
news_dfm[ntoken(news_dfm) == 0,]
# or alternatively:
table(ntoken(news_dfm) == 0)
news_dfm <- news_dfm[ntoken(news_dfm) > 0,]
# or alternatively, using the dfm_subset command that we already saw: dfm_subset(news_dfm, ntoken(news_dfm) > 0)
news_dfm[ntoken(news_dfm) == 0,]

# quanteda does not implement its own topic models, but you can easily access LDA() from the topicmodels package
# through convert(). k = 5 specifies the number of topics to be discovered in this first example.
dtm <- convert(news_dfm, to = "topicmodels")

# let's estimate a model with k = 5
# Being a model with a random start, it is always a good idea to declare the seed to be able to replicate our findings!
# Moreover, LDA assumes default values for alpha and delta, the two tuning parameters (or hyperparameters) of a topic model
# (in topicmodels, delta defaults to 0.1 and, for Gibbs sampling, alpha defaults to 50/k):
# alpha is the Dirichlet parameter governing the distribution of topics over documents: a low value of alpha will assign fewer topics to each document,
# whereas a high value of alpha will have the opposite effect;
# delta is the Dirichlet parameter governing the distribution of words over topics: a low value of delta will use fewer words to model a topic,
# whereas a high value will use more words, thus making topics more similar to one another.
# You can eventually improve your LDA results by tuning these two hyperparameters.
# Having said that, the improvement is usually not a dramatic one.
# To tune alpha, add it to the control list, e.g.: control = list(alpha = 0.2)
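# [Illustrative sketch, not part of the original walkthrough] This is what a call with
# user-chosen hyperparameters could look like; alpha = 0.2 and delta = 0.05 are purely
# illustrative values, not recommendations
set.seed(123)
lda_tuned <- LDA(dtm, method = "Gibbs", k = 5,
                 control = list(alpha = 0.2, delta = 0.05, verbose = 50L))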
set.seed(123)
system.time(lda <- LDA(dtm, method= "Gibbs", k = 5, control=list(verbose=50L)))
# around 20 seconds on my laptop
# alternative way to define the seed:
# system.time(lda <- LDA(dtm, method= "Gibbs", k = 5, control = list(seed = 123)))

# We can use get_terms to get the top n terms from the topic model (i.e., the terms with the highest beta for each topic)
# and get_topics to extract the k most likely topics for each document. This will help us interpret the results of the model.

# Which are the most likely terms for each topic?
# Either we use get_terms...
termsList <- get_terms(lda, 10)

# 10 terms for topic 1: perhaps US politics?
termsList[,1]
# 10 terms for topic 2: perhaps ISIS and Islamic terrorism?
termsList[,2]
# and so on...
termsList[,3]
termsList[,4]
# topic 5 seems to mix several different themes... perhaps different topics are conflated in it?
# That would be a clear sign that k = 5 is not a great choice...
termsList[,5]

# ...or we can extract the most important terms directly from the model using terms()
terms(lda, 15)

# Let's plot these words
# let's convert the object containing the results of our LDA into a particular kind of data frame (called a tibble)
lda_topics <- tidy(lda, matrix = "beta")
str(lda_topics)
# then let's convert the tibble into a more common data frame (not actually needed, just to make you more comfortable)
lda_topics <- as.data.frame(lda_topics)
str(lda_topics)

top_terms <- group_by(lda_topics, topic)
str(top_terms)
# let's keep only the top 10 betas for each of the 5 topics
top_terms <- top_n(top_terms, 10, beta)
top_terms <- ungroup(top_terms)
top_terms <- arrange(top_terms, topic, -beta)
str(top_terms)
table(top_terms$topic)

top_terms <- mutate(top_terms, topic = factor(topic),
                    term = reorder_within(term, beta, topic))
str(top_terms)

# here on the horizontal axis you have the betas for each single word
ggplot(top_terms, aes(term, beta, fill = topic)) +
  geom_bar(alpha = 0.8, stat = "identity", show.legend = FALSE) +
  scale_x_reordered() +
  facet_wrap(facets = vars(topic), scales = "free", ncol = 2) +
  coord_flip()

# Which is the most likely topic for each document?
# Either we use get_topics...
topics <- get_topics(lda, 1)
head(topics, 10)
# ...or we can obtain the most likely topics using topics()
head(topics(lda))

# let's save this info as a new document-level variable
str(docvars(news_dfm))
docvars(news_dfm, 'pred_topic') <- topics(lda)
str(docvars(news_dfm))
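# [Small illustrative addition, not part of the original script] A quick way to see how the
# articles are distributed across the five predicted topics, in counts and in proportions:
table(docvars(news_dfm, "pred_topic"))
prop.table(table(docvars(news_dfm, "pred_topic")))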
# Let us take a closer look at some of these topics. To further help us interpret the output of our topic model,
# we can take a random sample of documents highly associated with each topic and read them

# Topic 1
paste(termsList[,1], collapse=", ")
set.seed(123)
sample(news_dfm@docvars$text2[news_dfm@docvars$pred_topic==1], 2)
strwrap(sample(news_dfm@docvars$text2[news_dfm@docvars$pred_topic==1], 2))
# comment: perhaps the first one is not so clearly associated with US politics after all...

# Topic 2
paste(termsList[,2], collapse=", ")
set.seed(123)
strwrap(sample(news_dfm@docvars$text2[news_dfm@docvars$pred_topic==2], 2))

# Topic 3
paste(termsList[,3], collapse=", ")
set.seed(123)
strwrap(sample(news_dfm@docvars$text2[news_dfm@docvars$pred_topic==3], 2))

# Looking at the evolution of certain topics over time can be interesting
# Let us look for example at Topic 2, which appears to be somehow related to ISIS
head(docvars(news_dfm))
str(news_dfm@docvars$date)

# let's use the substr function to extract the temporal period we are interested in from the date string
# Remember:
# the first argument in substr is the string,
# the second argument is the start position of the substring,
# the third argument is the end position of the substring
news_dfm@docvars$month <- substr(news_dfm@docvars$date, 1, 7)  # extract year and month (first 7 characters)
str(docvars(news_dfm))
table(news_dfm@docvars$month)

# let's compute the relative frequency of articles discussing ISIS as their MAIN topic
# over all the articles published in each month of 2016
tab <- table(news_dfm@docvars$month[news_dfm@docvars$pred_topic==2])/table(news_dfm@docvars$month)
tab
# three peaks: January, May and December
plot(tab)

# But we can actually do better than this. LDA is a mixed-membership model,
# which means that for each document it actually computes a distribution over topics.
# In other words, each document is considered to be about a mixture of topics, as we have already discussed!
# This information is included in the gamma matrix of the LDA object (theta in the notation we used in the slides).
# For example, news 1 is 26% about topic 4, while news 3 is 39% about topic 4
round(lda@gamma[1,], 2)
round(lda@gamma[3,], 2)

# let's read the article with the largest gamma (= theta) for topic 1
max(lda@gamma[,1])
which.max(lda@gamma[,1])
# As you can read, this article is mainly about American politics, but also about ISIS
strwrap(news_dfm@docvars$text2[1805])
round(lda@gamma[1805,], 2)

# So we can take the information in the gamma matrix and aggregate it to compute the average probability
# that an article published in a given month is about a particular topic. Let us once again choose Topic 2

# add the probability to the dfm
news_dfm@docvars$topic2 <- lda@gamma[,2]
str(docvars(news_dfm))

# now aggregate at the month level
agg <- aggregate(news_dfm@docvars$topic2, by=list(month=news_dfm@docvars$month), FUN=mean)
str(agg)
agg$date <- as.Date(paste(agg$month, "-01", sep=""))
str(agg)

# Let's plot it! Once again three peaks: January, May and December
ggplot(agg, aes(date, x)) +
  geom_point() +
  geom_line() +
  labs(title = "Estimated proportion of articles discussing ISIS",
       x = "Month",
       y = "Avg. prob. of article about topic 2 (ISIS)") +
  theme_light()
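# [Illustrative extension, not part of the original script] The same idea can be applied to all
# five topics at once: average the full gamma matrix by month. After as.data.frame(), the topic
# columns simply keep the default names V1...V5
agg_all <- aggregate(as.data.frame(lda@gamma),
                     by = list(month = news_dfm@docvars$month), FUN = mean)
str(agg_all)
# e.g., the monthly average probability of topic 2 is now in the column named "V2"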
##################################
### Identifying the optimal number of topics given your corpus
### REMEMBER: this is the very first thing you HAVE to do before running a topic model!
##################################

##################################
### FIRST ALTERNATIVE - this is the same alternative that we will also employ in the
### next lecture, when we discuss STM (structural topic models)
##################################

# Let's focus on the topic coherence and exclusivity of our model with k = 5.
# You can clearly see that topic 1 (the topic we clearly identified earlier as discussing US politics) presents
# the highest values of both coherence and exclusivity, while topic 5 (the topic we were not able to clearly identify earlier)
# presents quite low values of both statistics (i.e., a topic that is difficult to interpret...)
topic_diagnostics(lda, dtm)
topic_coherence(lda, dtm)
topic_exclusivity(lda)

# Both coherence and exclusivity are by default computed on the top 10 words of each topic,
# but you can change that via "top_n_tokens = XXX" in the functions above.
# With 10 words, the highest possible value for exclusivity is 10, while the highest possible value for coherence
# is always 0 (it is a log-likelihood-based measure, so values are negative).
# If we focus on the top 15 words of each topic instead, the highest possible value for exclusivity becomes 15
topic_exclusivity(lda, top_n_tokens=15)
topic_coherence(lda, dtm, top_n_tokens=15)

# Avg. value of both coherence and exclusivity when k = 5
mean(topic_coherence(lda, dtm))
mean(topic_exclusivity(lda))

# now let's have k vary between 4 and 25, each time storing the corresponding avg. values of both coherence and exclusivity
top <- c(4:25)
top

# let's create an empty data frame that we will fill later on
results <- data.frame(first=vector(), second=vector(), third=vector())
results

# we add iter=100, where iter is the number of Gibbs iterations, to make LDA faster; the default value is 2,000
# [so this is not an advisable strategy for a serious project!]
system.time(
  for (i in top) {
    set.seed(123)
    lda <- LDA(dtm, method= "Gibbs", k = i, control=list(verbose=50L, iter=100))
    topic <- i
    coherence <- mean(topic_coherence(lda, dtm))
    exclusivity <- mean(topic_exclusivity(lda))
    results <- rbind(results, cbind(topic, coherence, exclusivity))
  }
)
results
str(results)

# You should focus on those models that lie on the semantic coherence-exclusivity "frontier", that
# is, where no model strictly dominates another in terms of semantic coherence and
# exclusivity (i.e., models with average scores towards the upper right side of the plot).
# The figure below identifies k = 11 and k = 15 as models that satisfy the
# frontier criterion, i.e., models with desirable properties in both dimensions.
# These two values could be a good starting point. But remember: always validate your results by understanding the topics!
ggplot(results, aes(x=coherence, y=exclusivity)) +
  geom_point() +
  geom_text(label=results$topic, vjust=-1) +
  ylab(label="Exclusivity") +
  xlab("Semantic Coherence") +
  theme_light()

#####################################
### SECOND ALTERNATIVE
#####################################

# An alternative metric for evaluating topic models is the log-likelihood, as well as the "held-out likelihood" / "perplexity".
# Remember: the higher the log-likelihood (the lower the perplexity), the better the model.
# For computing the perplexity, let's extract 80% of the observations and consider them as our training set
# [why 80% and not 60% or 70%? It would actually be more robust to use a cross-validation approach
# (a minimal sketch follows right after the split below). More on this later on!]
nrow(dtm)
set.seed(123)
index <- sample(1:nrow(dtm), nrow(dtm)*0.8)
length(index)
train_data <- dtm[index,]
test_data <- dtm[-index,]
nrow(train_data)
nrow(test_data)
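# [Hypothetical sketch, not part of the original script] What such a cross-validation could look
# like: each document serves exactly once as held-out data. The choice of 5 folds and of k = 10
# topics here is purely illustrative (and, with 100 Gibbs iterations per fold, still slow)
set.seed(123)
folds <- sample(rep(1:5, length.out = nrow(dtm)))
cv_perp <- sapply(1:5, function(f) {
  fit <- LDA(dtm[which(folds != f), ], k = 10, method = "Gibbs",
             control = list(seed = 123, iter = 100))
  perplexity(fit, newdata = dtm[which(folds == f), ])
})
cv_perp
mean(cv_perp)  # average held-out perplexity across the 5 folds for k = 10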
# now let k vary between 4 and 50
topics <- seq(4, 50, by=1)
topics

# let's create an empty data frame that we will fill later on
results_df <- data.frame(topic=vector(), perplexity=vector(), log=vector())
str(results_df)

# (note: since i starts at 4, the first three rows of results_df remain NA; ggplot will simply drop them)
system.time(
  for (i in topics){
    set.seed(123)
    fitted <- LDA(train_data, k = i, method = "Gibbs", control=list(verbose=50L, iter=100))
    results_df[i,1] <- (i)
    results_df[i,2] <- perplexity(fitted, newdata = test_data)
    results_df[i,3] <- logLik(fitted)
  })

perp <- ggplot(results_df, aes(x= as.numeric(row.names(results_df)), y=perplexity)) +
  geom_line(color="red", size=2) +
  ggtitle("Perplexity of held-out data") +
  labs(x = "Candidate number of topics", y = "Perplexity")

log <- ggplot(results_df, aes(x= as.numeric(row.names(results_df)), y=log)) +
  geom_line(size=2) +
  ggtitle("Log-Likelihood") +
  labs(x = "Candidate number of topics", y = "Log-Likelihood")

plot_grid(perp, log)

# At around 30 to 40 topics, the decrease in perplexity obtained by adding further topics becomes notably smaller.
# This is one way to choose the number of topics, similar to the interpretation of the elbow in the scree plot of a factor analysis.

##################################
### THIRD ALTERNATIVE
##################################

# Across-Topic Divergence (Deveaud et al. 2014) - usually one of the tests most often used in the literature.
# You retain the k that maximizes such divergence
system.time(tune <- FindTopicsNumber(dtm, metrics = c("Deveaud2014"),
                                     topics = seq(4, 50, by=1),
                                     control = list(seed = 123, iter=100)))
tune
max(tune$Deveaud2014)
# the observation with the max Deveaud2014 value is row 27, which corresponds to k = 24
which.max(tune$Deveaud2014)

ggplot(tune, aes(x = topics, y = Deveaud2014)) +
  geom_point() +
  geom_line()

# Note that the Across-Topic Divergence method generally gives results similar to finding the k that maximizes "exclusivity" in the plot above
ggplot(results, aes(x=coherence, y=exclusivity)) +
  geom_point() +
  geom_text(label=results$topic, vjust=-1) +
  ylab(label="Exclusivity") +
  xlab("Semantic Coherence") +
  theme_light()
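# [Illustrative follow-up, not part of the original script] Whichever criterion you rely on,
# the final step is always the same: refit the model at the suggested k and read the topics.
# Here k = 24 (the value suggested by the Deveaud2014 criterion above) is used purely as an example
set.seed(123)
lda_best <- LDA(dtm, method = "Gibbs", k = 24, control = list(verbose = 50L, iter = 100))
terms(lda_best, 10)  # do the 24 topics make substantive sense? That is the real validation test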