rm(list=ls(all=TRUE))
setwd("C:/Users/luigi/Dropbox/TOPIC MODEL/FOR NEXT YEAR 2020/Newsmap")
getwd()
library(quanteda)
library(readtext)
library(ggplot2)
library(ldatuning)
library(topicmodels)
library(lubridate)
library(topicdoc)
library(cowplot)
library(tidytext)
library(dplyr)
#########################################
########## Topic-model (LDA)
#########################################
# This data frame contains 1,959 Guardian news articles published in 2016
myText <- read.csv("guardian.csv", stringsAsFactors=FALSE)
str(myText)
# create a new variable "text2"; it will be useful later on
myText$text2 <- myText$text
news_corp <- corpus(myText)
head(summary(news_corp))
tok2 <- tokens(news_corp, remove_punct = TRUE, remove_numbers=TRUE, remove_symbols = TRUE, split_hyphens = TRUE, remove_separators = TRUE)
tok2 <- tokens_remove(tok2, stopwords("en"))
tok2 <- tokens_wordstem (tok2)
news_dfm <- dfm(tok2)
# let's remove the reference to time
news_dfm <- dfm_remove(news_dfm, c('*-time', '*-timeUpdated', 'GMT', 'BST'))
# We keep only the top 5% most frequent features (min_termfreq = 0.95) that appear in less than 10% of all
# the documents (max_docfreq = 0.1) using dfm_trim(), to focus on common BUT distinguishing features.
news_dfm <- dfm_trim(news_dfm, min_termfreq = 0.95, termfreq_type = "quantile",
max_docfreq = 0.1, docfreq_type = "prop")
news_dfm[1:2, 1:5]
# among the docvars there is "text" as well as "text2" - this will be helpful later on!
str(news_dfm@docvars)
# 20 top words
topfeatures(news_dfm, 20)
# But wait a minute... Why is "Trump" not in the list of the top features? After all, everyone was discussing Trump back in 2016!
# ...but that's precisely the reason! It was so popular that it does not satisfy the trim rule defined above (it is so popular
# that it appears in a large share of the articles, i.e., more than max_docfreq = 0.1)
# And indeed, let's look for "trump" in an untrimmed dfm
news_dfm2 <- dfm(tok2)
topfeatures(news_dfm2, 20)
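# As a quick check of the claim above, we can compute the share of documents in which the (stemmed)
# feature "trump" appears in the untrimmed dfm: if it is larger than max_docfreq = 0.1, dfm_trim()
# indeed drops it [a minimal sketch, assuming the stemmed feature is exactly "trump"]
docfreq(news_dfm2)["trump"] / ndoc(news_dfm2)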
# Let's also keep only documents with a number of tokens > 0 [given that after the above trimming,
# some articles could actually be empty - 8 here]
news_dfm[ntoken(news_dfm) == 0,]
news_dfm <- news_dfm[ntoken(news_dfm) > 0,]
news_dfm[ntoken(news_dfm) == 0,]
# quanteda does not implement its own topic models, but you can easily access LDA() from the topicmodels package
# after converting the dfm through convert(). k = 5 specifies the number of topics to be discovered in this first example.
dtm <- convert(news_dfm, to = "topicmodels")
# let's identify k=5
# being a probabilistic model, it is always a good idea to declare the seed!
set.seed(123)
system.time(lda <- LDA(dtm, method= "Gibbs", k = 5)) # around 20 seconds on my laptop
# alternative way to define the seed:
# system.time(lda <- LDA(dtm, method= "Gibbs", k = 5, control = list(seed = 123)))
# We can use get_terms() to extract the top n terms from the topic model (i.e., the terms with the highest beta associated with each topic)
# and get_topics() to predict the most likely topic for each document. This will help us interpret the results of the model.
# Which are the most likely terms for each topic?
# Either we use get_terms...
termsList <- get_terms(lda, 10)
# 10 terms for topic 1: perhaps US politics?
termsList[,1]
# 10 terms for topic 2: perhaps IS and Islamic terrorism?
termsList[,2]
# and so on...
termsList[,3]
# 10 terms for topic 4
termsList[,4]
# topic 5 seems to mix several different subjects... perhaps different topics are conflated in it?
# That would be a clear sign that k=5 is not a great choice...
termsList[,5]
#...or we can directly extract the most important terms from the model using terms().
terms(lda, 15)
# Let's plot these words
# let's convert the object containing the results of our lda into a particular data frame (called a tibble)
lda_topics <- tidy(lda, matrix = "beta")
str(lda_topics)
# then let's convert the tibble into a more familiar data frame (not strictly needed, just to make you more comfortable)
lda_topics <- as.data.frame(lda_topics)
str(lda_topics)
top_terms <- group_by(lda_topics, topic)
str(top_terms)
# let's keep only the first top 10 betas for each of the 5 topics
top_terms <- top_n(top_terms, 10, beta)
top_terms <- ungroup(top_terms)
top_terms <- arrange(top_terms , topic, -beta)
str(top_terms)
table(top_terms$topic)
top_terms <- mutate(top_terms, topic = factor(topic),
term = reorder_within(term, beta, topic))
str(top_terms)
ggplot(top_terms, aes(term, beta, fill = topic)) +
geom_bar(alpha = 0.8, stat = "identity", show.legend = FALSE) +
scale_x_reordered() +
facet_wrap(facets = vars(topic), scales = "free", ncol = 2) +
coord_flip()
# which is the most likely topic for each document?
# Either we use get_topics...
topics <- get_topics(lda, 1)
head(topics, 10)
# or you can obtain the most likely topics using topics()
head(topics(lda))
# let's save this info as a new document-level variable
docvars(news_dfm, 'pred_topic') <- topics(lda)
str(news_dfm@docvars)
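# As a quick overview of the model's output, we can also count how many articles are assigned to
# each of the 5 topics as their most likely topic [just a descriptive check]
table(docvars(news_dfm, 'pred_topic'))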
# Let’s take a closer look at some of these topics. To further help us interpret the output of our topic model,
# we can take a random sample of documents highly associated with each topic and read them
# Topic 1
paste(termsList[,1], collapse=", ")
set.seed(123)
sample(news_dfm@docvars$text2[news_dfm@docvars$pred_topic==1], 2)
# comment: perhaps the first one is not so clearly associated with US politics after all...
# Topic 2
paste(termsList[,2], collapse=", ")
set.seed(123)
sample(news_dfm@docvars$text2[news_dfm@docvars$pred_topic==2], 2)
# Topic 3
paste(termsList[,3], collapse=", ")
set.seed(123)
sample(news_dfm@docvars$text2[news_dfm@docvars$pred_topic==3], 2)
# Looking at the evolution of certain topics over time can be interesting
# Let’s look, for example, at Topic 2, which appears to be somehow related to ISIS
str(news_dfm@docvars$date)
news_dfm@docvars$month <- substr(news_dfm@docvars$date, 1, 7) # extract year and month (first 7 characters)
str(news_dfm)
table(news_dfm@docvars$month )
# frequency table with articles about Isis as their MAIN topic, per month in 2016
tab <- table(news_dfm@docvars$month [news_dfm@docvars$pred_topic==2])
tab
plot(tab)
# But we can actually do better than this. LDA is a probabilistic model,
# which means that for each document, it actually computes a distribution over topics.
# In other words, each document is considered to be about a mixture of topics as we have already discussed!
# This information is included in the matrix gamma in the LDA object (theta in the notation we used for the slides)
# For example, news 1 is 26% about topic 4, while news 3 is 39% about topic 4
round(lda@gamma[1,], 2)
round(lda@gamma[3,], 2)
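# Sanity check on gamma: each row is a probability distribution over the k=5 topics, so it should sum to 1,
# and the column with the highest gamma should (barring ties) coincide with the most likely topic
# returned by topics() [a minimal sketch]
round(rowSums(lda@gamma)[1:5], 2)
all(apply(lda@gamma, 1, which.max) == topics(lda))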
# So we can actually take the information in the matrix and aggregate it to compute the average probability that a news article
# published in a given month is about a particular topic. Let’s once again choose Topic 2
# add probability to dfm
news_dfm@docvars$topic2 <- lda@gamma[,2]
str(news_dfm)
# now aggregate at the month level
agg <- aggregate(news_dfm@docvars$topic2, by=list(month=news_dfm@docvars$month), FUN=mean)
str(agg)
agg$date <- as.Date(paste(agg$month,"-01",sep=""))
str(agg)
# and plot it
plot(agg$date , agg$x, type="l", xlab="Month", ylab="Avg. prob. of article about topic 2 (ISIS)",
main="Estimated proportion of articles discussing about ISIS")
##################################
# identifying the optimal number of topics (between, for example, 4 and 25)
# - REMEMBER: this is the very first thing you HAVE to do before running an STM!
##################################
##################################
### FIRST ALTERNATIVE - this is the same approach we will also employ in the
### next lecture, when we discuss STM (structural topic models)
##################################
# let's focus on the topic coherence and exclusivity of our model with k=5
# You can clearly see that topic 1 (the topic we clearly identified earlier as discussing US politics) presents
# the highest values of both coherence and exclusivity, while topic 5 (the topic we were not able to clearly identify earlier)
# presents quite low values of both statistics (i.e., a topic that is difficult to interpret...)
topic_diagnostics(lda, dtm)
topic_coherence(lda, dtm)
topic_exclusivity(lda)
# both coherence and exclusivity are by default computed on the first 10 words of each topic,
# but you can change this via "top_n_tokens = XXX" in the functions above.
# With 10 words, the highest possible value for exclusivity is 10, while for coherence the upper bound is always 0 (being a log-likelihood).
# But if we focus on the first 15 words of each topic, the highest possible value for exclusivity becomes 15
topic_exclusivity(lda, top_n_tokens=15)
topic_coherence(lda, dtm, top_n_tokens=15)
# Avg. value of both coherence and exclusivity when k=5
mean(topic_coherence(lda, dtm))
mean(topic_exclusivity(lda))
# now let's vary k between 4 and 25 and each time store the corresponding avg. values of both coherence and exclusivity
top <- c(4:25)
top
# let's create an empty data frame that we will fill later on
results <- data.frame(first=vector(), second=vector(), third=vector())
results
# we add iter=100, where iter is the number of Gibbs iterations, to make LDA faster; the default value is 2,000
# [so this is not an advisable strategy for a serious project!]
system.time(
for (i in top)
{
set.seed(123)
lda <- LDA(dtm, method= "Gibbs", k = i, control=list(verbose=50L, iter=100))
topic <- i
coherence <- mean(topic_coherence(lda, dtm))
exclusivity <- mean(topic_exclusivity(lda))
results <- rbind(results , cbind(topic, coherence, exclusivity ))
}
)
results
str(results)
# You should focus on those models that lie on the semantic coherence-exclusivity ‘frontier’, that
# is, where no model strictly dominates another in terms of semantic coherence and
# exclusivity (i.e., models with average scores towards the upper right side of the plot).
# The figure below identifies model k=11 or k=15 as models that satisfy the
# ‘frontier’ criterion, i.e., models with desirable properties in both dimensions.
# These two values could be a good starting point. But remember: always validate your results by understanding the topics!
plot(results$coherence, results$exclusivity, main="Coherence vs. Exclusivity",
     xlab="Semantic Coherence", ylab="Exclusivity", pch=19)
text(results$coherence, results$exclusivity, labels=results$topic, cex= 1, pos=4)
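# A rough programmatic counterpart to the visual 'frontier' inspection above: flag the models for which
# no other model has BOTH higher coherence AND higher exclusivity (i.e., the Pareto non-dominated models)
# [a minimal sketch; reading the plot and, above all, the topics themselves remains the recommended approach]
on_frontier <- sapply(seq_len(nrow(results)), function(i) {
  !any(results$coherence > results$coherence[i] & results$exclusivity > results$exclusivity[i])
})
results[on_frontier, ]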
#####################################
### SECOND ALTERNATIVE
#####################################
# An alternative metric for evaluating topic models is the "held-out likelihood" and/or the "perplexity".
# Both metrics are calculated by splitting your original corpus into two parts – a training set and a test set.
# Similar to the idea of cross-validation (we'll discuss it at length later on when dealing with ML), these two metrics help
# to assess the model's predictive performance.
# More specifically, both metrics assess a topic model's ability to predict the words that appear in new documents (i.e., in the test set)
# after having been trained on a training set (and using for that the topic-terms matrix as learned in the training-stage).
# That is, do the words in the test set appear together as they would according to the topic-terms matrix probabilities as learned in the
# training-stage?
# The lower the perplexity (or the higher the held-out likelihood, it being a log-likelihood!), the better the model.
# As an alternative, instead of splitting your corpus into two parts (the training and the test-set), you can also
# hold out some fraction of the words in a set of documents of your corpus.
# let's extract 80% of the observations in our DfM and consider them as our training set
# [why 80% and not 75% or 70%? Actually, it would be more elegant to use a full cross-validation approach... More on this later on!]
nrow(dtm )
set.seed(123)
index <- sample(1:nrow(dtm), nrow(dtm )*0.8 )
length(index)
train_data <- dtm[index,]
test_data <- dtm[-index,]
nrow(train_data)
nrow(test_data)
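# Before looping over many values of k, here is the logic for a single candidate (k = 5): fit the model
# on the training set only, then ask how well it predicts the held-out test documents
# [illustrative only; "lda_train" is just a temporary name, and the loop below repeats this for k = 4, ..., 50]
set.seed(123)
lda_train <- LDA(train_data, method = "Gibbs", k = 5, control = list(iter = 100))
perplexity(lda_train, newdata = test_data)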
# now let's vary k between 4 and 50
topics <- seq(4, 50, by=1)
topics
# let's create an empty data frame that we will fill later on
results_df<- data.frame(topic=vector(), perplexity=vector(), log=vector())
str(results_df)
set.seed(123)
system.time(
for (i in topics){
fitted <- LDA(train_data, k = i, method = "Gibbs", control=list(verbose=50L, iter=100))
results_df[i,1] <- (i)
results_df[i,2] <- perplexity(fitted, newdata = test_data)
results_df[i,3] <- logLik(fitted)
})
results_df
perp <- ggplot(results_df, aes(x= as.numeric(row.names(results_df)), y=perplexity)) + geom_line(color="red", size=2) + ggtitle("Perplexity of held-out data") +
labs(x = "Candidate number of topics", y = "Perplexity")
log <- ggplot(results_df, aes(x= as.numeric(row.names(results_df)), y=log)) + geom_line(size=2) + ggtitle("Log-Likelihood") +
labs(x = "Candidate number of topics", y = "Log-Likelihood")
plot_grid(perp , log)
# At around 30 to 40 topics, the decrease in perplexity for additional topics becomes notably smaller. This is one way to identify the right
# number of topics, similar to the interpretation of the elbow in the scree plot of a factor analysis.
# Note, however, that perplexity is not always strongly correlated with human judgment.
# Chang et al. have shown that, surprisingly, predictive likelihood (or equivalently, perplexity) and human judgment are often not correlated, and even sometimes slightly anti-correlated.
# Chang, Jonathan, Jordan Boyd-Graber, Sean Gerrish, Chong Wang and David M. Blei. 2009. Reading Tea Leaves: How Humans Interpret Topic Models. NIPS.
# Therefore, it is better to focus on coherence and exclusivity, as above!
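# A quick, rough way to see where the marginal gain in perplexity flattens (the 'elbow'): look at the
# first differences of the perplexity curve; smaller (absolute) drops mean a flatter curve
# [a minimal sketch; visual inspection of the plot above remains the main tool]
perp_clean <- na.omit(results_df)
round(diff(perp_clean$perplexity), 1)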
##################################
### THIRD ALTERNATIVE
##################################
# Maximize (the higher, the better):
# the Across-Topic Divergence metric (Deveaud et al. 2014) - usually the metric most often used in the literature
system.time(tune <- FindTopicsNumber(dtm, metrics = c("Deveaud2014"),
topics = seq(4, 50, by=1),
control = list(seed = 123, iter=100)))
tune
max(tune$Deveaud2014)
# observation with the max Deveaud2014 value: row 27, which corresponds to k=24 (FindTopicsNumber() lists the candidate values of k in decreasing order)
which.max(tune$Deveaud2014)
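# Safer than counting rows by hand: pull the value of k that maximizes the metric directly
tune$topics[which.max(tune$Deveaud2014)]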
ggplot(tune, aes(x = topics, y = Deveaud2014)) + geom_point() + geom_line()