rm(list=ls(all=TRUE))
setwd("C:/Users/luigi/Dropbox/TOPIC MODEL/FOR NEXT YEAR 2020/Newsmap")
getwd()

library(quanteda)
library(readtext)
library(ggplot2)
library(ldatuning)
library(topicmodels)
library(lubridate)
library(topicdoc)
library(cowplot)

#########################################
########## Topic model (LDA)
#########################################

# This data frame contains 1,959 Guardian news articles published in 2016
myText <- read.csv("guardian.csv", stringsAsFactors=FALSE)
str(myText)

# create a new variable "text2" (a copy of the raw text); it will be useful later on
myText$text2 <- myText$text

news_corp <- corpus(myText)
head(summary(news_corp))

tok2 <- tokens(news_corp, remove_punct = TRUE, remove_numbers = TRUE, remove_symbols = TRUE,
               split_hyphens = TRUE, remove_separators = TRUE)
tok2 <- tokens_remove(tok2, stopwords("en"))
tok2 <- tokens_wordstem(tok2)
news_dfm <- dfm(tok2)

# let's remove the references to time
news_dfm <- dfm_remove(news_dfm, c('*-time', '*-timeUpdated', 'GMT', 'BST'))

# We keep only the top 5% most frequent features (min_termfreq = 0.95, termfreq_type = "quantile")
# that appear in less than 10% of the documents (max_docfreq = 0.1, docfreq_type = "prop"),
# using dfm_trim() to focus on common but distinguishing features.
news_dfm <- dfm_trim(news_dfm, min_termfreq = 0.95, termfreq_type = "quantile",
                     max_docfreq = 0.1, docfreq_type = "prop")
news_dfm[1:2, 1:5]

# top 20 words
topfeatures(news_dfm, 20)

# Let's also keep only documents with a number of tokens > 0 (after the reduction above,
# some articles may actually be empty - 8 here)
news_dfm[ntoken(news_dfm) == 0,]
news_dfm <- news_dfm[ntoken(news_dfm) > 0,]

# And how many times does "trump" now appear in each news article?
dict <- dictionary(list(trump = c("trump")))
trump <- dfm_lookup(news_dfm, dict)
# "trump" now appears in 0 articles! Why? Because it does not satisfy the trim rule applied above
# (it is so popular that it appears in more than 10% of the documents, i.e., above max_docfreq = 0.1)
table(trump@x)

# and indeed, let's look for "trump" in a non-trimmed dfm
news_dfm2 <- dfm(tok2)
trump <- dfm_lookup(news_dfm2, dict)
table(trump@x)
# the word "trump" appears in 249 articles of our non-trimmed dfm!
length(trump@x)

# among the docvars there is also the raw text (text2) - it will be helpful later on!
str(news_dfm@docvars)

# quanteda does not implement its own topic models, but you can easily access LDA() from the
# topicmodels package through convert(). k = 5 specifies the number of topics to be discovered.
dtm <- convert(news_dfm, to = "topicmodels")

# let's estimate a model with k = 5
# being a probabilistic model, it is always a good idea to set the seed!
set.seed(123)
system.time(lda <- LDA(dtm, method = "Gibbs", k = 5))
# around 20 seconds on my laptop
# alternative way to set the seed:
# system.time(lda <- LDA(dtm, method = "Gibbs", k = 5, control = list(seed = 123)))

# We can use get_terms() to extract the top n terms from the topic model (i.e., the terms with the
# highest beta for each topic) and get_topics() to identify the most likely topic for each document.
# This will help us interpret the results of the model.

# Which are the most likely terms for each topic?
terms <- get_terms(lda, 10)
# 10 terms for topic 1
terms[,1]
# 10 terms for topic 2, and so on
terms[,2]
terms[,3]
terms[,4]
terms[,5]
# or we can directly extract the most important terms from the model using terms()
terms(lda, 15)
# what meaning would you assign to each topic?

# Which is the most likely topic for each document?
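# Optional aside (not part of the original script): the full posterior distributions behind
# get_terms()/get_topics() can also be inspected via posterior() from topicmodels
# (the object name "post" below is just a placeholder).
post <- posterior(lda)
str(post$topics)                        # documents x k matrix of topic probabilities (rows sum to 1)
head(apply(post$topics, 1, which.max))  # most likely topic per document (same as get_topics() below)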
topics <- get_topics(lda, 1)
head(topics, 10)
# or you can obtain the most likely topics using topics() and save them as a document-level variable
head(topics(lda))
docvars(news_dfm, 'pred_topic') <- topics(lda)
str(news_dfm)

# Let's take a closer look at some of these topics. To help us interpret the output, we can look
# at the words associated with each topic and take a random sample of documents highly associated
# with each topic.

# Topic 1
paste(terms[,1], collapse=", ")
set.seed(123)
sample(news_dfm@docvars$text2[news_dfm@docvars$pred_topic==1], 2)
# same as above
set.seed(123)
sample(news_dfm@docvars$text2[topics==1], 2)

# Topic 2
paste(terms[,2], collapse=", ")
set.seed(123)
sample(news_dfm@docvars$text2[news_dfm@docvars$pred_topic==2], 2)

# Topic 3
paste(terms[,3], collapse=", ")
set.seed(123)
sample(news_dfm@docvars$text2[news_dfm@docvars$pred_topic==3], 2)

# Looking at the evolution of certain topics over time can be interesting.
# Let's look, for example, at Topic 2, which appears to be somehow related to ISIS
str(news_dfm@docvars$date)
news_dfm@docvars$month <- substr(news_dfm@docvars$date, 1, 7)  # extract year and month (first 7 characters)
table(news_dfm@docvars$month)

# frequency table of the articles with ISIS as their MAIN topic, per month in 2016
tab <- table(news_dfm@docvars$month[news_dfm@docvars$pred_topic==2])
tab
plot(tab)

# But we can actually do better than this. LDA is a probabilistic model,
# which means that for each document it actually computes a distribution over topics.
# In other words, each document is considered to be about a mixture of topics, as we have already discussed!
# This information is included in the matrix gamma of the LDA object (theta in the notation used in the slides).
# For example, news 1 is 26% about topic 4, while news 3 is 39% about topic 4
round(lda@gamma[1,], 2)
round(lda@gamma[3,], 2)

# So we can take the information in that matrix and aggregate it to compute the average probability
# that a news article published in a given month is about a particular topic.
# Let's once again choose Topic 2.
# add the topic-2 probability as a document-level variable
news_dfm@docvars$topic2 <- lda@gamma[,2]
str(news_dfm)

# now aggregate at the month level
agg <- aggregate(news_dfm@docvars$topic2, by=list(month=news_dfm@docvars$month), FUN=mean)
str(agg)
agg$date <- as.Date(paste(agg$month, "-01", sep=""))
str(agg)

# and plot it
plot(agg$date, agg$x, type="l", xlab="Month", ylab="Avg. prob. of article about topic 2 (ISIS)",
     main="Estimated proportion of articles discussing ISIS")
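# A ggplot2 version of the same monthly trend (a sketch using the agg object created above;
# not part of the original script):
ggplot(agg, aes(x = date, y = x)) +
  geom_line() +
  labs(x = "Month", y = "Avg. prob. of article about topic 2 (ISIS)",
       title = "Estimated proportion of articles discussing ISIS")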
##################################
# Identifying the optimal number of topics (between, for example, 4 and 15)
##################################

##################################
### FIRST ALTERNATIVE - the same approach we will also employ in the next lecture,
### when we discuss STM (structural topic models)
##################################

# let's focus on the topic coherence and exclusivity of our model with k = 5
topic_diagnostics(lda, dtm)
topic_coherence(lda, dtm)
topic_exclusivity(lda)
# both coherence and exclusivity are by default computed on the top 10 words of each topic,
# but you can change that via "top_n_tokens = XXX" in the functions above
# with 10 words, the highest possible value for exclusivity is 10, and for coherence it is 0 (being a log-likelihood)
mean(topic_coherence(lda, dtm))
mean(topic_exclusivity(lda))

# now let's let k vary between 4 and 15
top <- c(4:15)
top

# let's create an empty data frame that we will fill later on
results <- data.frame(topic=numeric(), coherence=numeric(), exclusivity=numeric())
results

# we add iter=100, where iter is the number of Gibbs iterations, to make LDA faster; the default
# value is 2000 (so this is not an advisable strategy for a serious project!)
system.time(
  for (i in top) {
    set.seed(123)
    lda <- LDA(dtm, method = "Gibbs", k = i, control = list(verbose = 50L, iter = 100))
    topic <- i
    coherence <- mean(topic_coherence(lda, dtm))
    exclusivity <- mean(topic_exclusivity(lda))
    results <- rbind(results, cbind(topic, coherence, exclusivity))
  }
)
results
str(results)

# You should focus on those models that lie on the semantic coherence-exclusivity 'frontier', that
# is, where no model strictly dominates another in terms of semantic coherence and
# exclusivity (i.e., models with average scores towards the upper right side of the plot).
# The figure below identifies the models with k=11 or k=15 as the ones that satisfy the
# 'frontier' criterion, i.e., models with desirable properties in both dimensions.
# These two values could be a good starting point. But remember: always validate your results by understanding the topics!
plot(results$coherence, results$exclusivity, main="Coherence-exclusivity frontier",
     xlab="Semantic Coherence", ylab="Exclusivity", pch=19)
text(results$coherence, results$exclusivity, labels=results$topic, cex=1, pos=4)

#####################################
### SECOND ALTERNATIVE - OPTIONAL
#####################################

# An alternative metric for evaluating topic models is the 'held-out likelihood', also referred to as 'perplexity'.
# The perplexity metric is calculated by splitting your original corpus into two parts - a training set and a test set.
# The basic idea is to hold out some fraction of the words in a set of documents, train the model, and use the
# document-level latent variables to evaluate the probability of the held-out portion (i.e., the previously unseen documents).
# In other words: for each document, show the model a fraction of the words and use the learned topic-word matrix to
# predict the distribution Pr[z = i | doc], i.e., the probability that a word in that document belongs to topic i.
# The lower the perplexity, the better the model.
# Similar to the idea of cross-validation (which we will discuss at length later on), where some of the data is removed
# from estimation and later used for validation, the held-out likelihood helps the user assess the model's prediction performance.
# In other words, the perplexity metric assesses a topic model's ability to predict a test set after having been trained on a training set.
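# For reference (a sketch, not part of the original script): perplexity is typically defined as
#   perplexity = exp( - log-likelihood of the held-out words / number of held-out tokens )
# e.g., an average log-probability of -7 per held-out token gives exp(7) ~ 1097, while a model
# averaging -6.5 per token gives exp(6.5) ~ 665 and therefore predicts the held-out text better.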
# Note, however, that perplexity is not strongly correlated with human judgment:
# Chang et al. have shown that, surprisingly, predictive likelihood (or equivalently, perplexity) and human
# judgment are often not correlated, and even sometimes slightly anti-correlated.
# Chang, Jonathan, Jordan Boyd-Graber, Sean Gerrish, Chong Wang, and David M. Blei. 2009.
# "Reading Tea Leaves: How Humans Interpret Topic Models." NIPS.
# Therefore, it is better to focus on coherence and exclusivity, as above!

# let's extract 80% of the observations in our dtm and treat them as our training set
# (why 80% and not 75% or 70%? Actually, it would be more elegant to use a full cross-validation
# approach... what is that? More on this later on!)
nrow(dtm)
set.seed(123)
index <- sample(1:nrow(dtm), nrow(dtm)*0.8)
length(index)
train_data <- dtm[index,]
test_data <- dtm[-index,]
nrow(train_data)
nrow(test_data)

# now let's let k vary between 4 and 15
topics <- seq(4, 15, by=1)
topics

# let's create an empty data frame in which we will store, for each value of k,
# the perplexity computed on the test set (the lower the perplexity, the better)
perplexity_df <- data.frame(topic=numeric(), test_perplexity=numeric())
str(perplexity_df)

set.seed(12345)
system.time(
  for (i in topics) {
    fitted <- LDA(train_data, k = i, method = "Gibbs", control = list(verbose = 50L, iter = 100))
    perplexity_df[i,1] <- i
    perplexity_df[i,2] <- perplexity(fitted, newdata = test_data)
    # a low perplexity indicates that the probability distribution is good at predicting the sample
  }
)
perplexity_df
perplexity_df <- na.omit(perplexity_df)
perplexity_df

ggplot(perplexity_df, aes(x = topic, y = test_perplexity)) +
  geom_line(color="red", size=2) +
  ggtitle("Perplexity of held-out data") +
  labs(x = "Candidate number of topics", y = "Perplexity")

##################################
### THIRD ALTERNATIVE - optional
##################################
system.time(tune <- FindTopicsNumber(dtm,
                                     metrics = c("Griffiths2004", "CaoJuan2009", "Arun2010", "Deveaud2014"),
                                     topics = seq(4, 15, by=1),
                                     control = list(seed = 123, iter=100)))
# Minimize:
# Density (Cao Juan et al. 2009)
# Within-topic divergence (Arun et al. 2010)
# Maximize:
# Across-topic divergence (Deveaud et al. 2014)
# Log-likelihood (Griffiths and Steyvers 2004) (collapsed Gibbs sampler only)
tune
FindTopicsNumber_plot(tune)
# here k = 9 is probably the best solution

##################################
### let's replicate our analysis using the Guardian's 2013 articles
##################################
myText2 <- read.csv("guardian2013.csv", stringsAsFactors=FALSE)
str(myText2)
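# A minimal sketch (not part of the original script) of how the preprocessing steps used for the 2016
# data could be repeated on the 2013 articles, assuming guardian2013.csv has the same structure
# (a "text" column plus metadata); object names below (news_corp2, tok3, news_dfm_2013, dtm_2013) are placeholders.
myText2$text2 <- myText2$text
news_corp2 <- corpus(myText2)
tok3 <- tokens(news_corp2, remove_punct = TRUE, remove_numbers = TRUE, remove_symbols = TRUE,
               split_hyphens = TRUE, remove_separators = TRUE)
tok3 <- tokens_remove(tok3, stopwords("en"))
tok3 <- tokens_wordstem(tok3)
news_dfm_2013 <- dfm(tok3)
news_dfm_2013 <- dfm_remove(news_dfm_2013, c('*-time', '*-timeUpdated', 'GMT', 'BST'))
news_dfm_2013 <- dfm_trim(news_dfm_2013, min_termfreq = 0.95, termfreq_type = "quantile",
                          max_docfreq = 0.1, docfreq_type = "prop")
news_dfm_2013 <- news_dfm_2013[ntoken(news_dfm_2013) > 0,]
dtm_2013 <- convert(news_dfm_2013, to = "topicmodels")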