rm(list=ls(all=TRUE))
setwd("C:/Users/luigi/Dropbox/TOPIC MODEL/FOR NEXT YEAR 2020/Newsmap")
getwd()
library(quanteda)
library(readtext)
library(ggplot2)
library(ldatuning)
library(topicmodels)
library(lubridate)
library(topicdoc)
library(cowplot)
library(dplyr)
library(tidytext)
#########################################
########## Topic-model (LDA)
#########################################
# guardian.csv contains 1,959 Guardian news articles published in 2016
myText <- read.csv("guardian.csv", stringsAsFactors=FALSE)
str(myText)
# create a new variable "text2"; it will be useful later on
myText$text2 <- myText$text
news_corp <- corpus(myText)
head(summary(news_corp))
tok2 <- tokens(news_corp, remove_punct = TRUE, remove_numbers=TRUE, remove_symbols = TRUE, split_hyphens = TRUE, remove_separators = TRUE)
tok2 <- tokens_remove(tok2, stopwords("en"))
tok2 <- tokens_wordstem(tok2)
news_dfm <- dfm(tok2)
# let's remove the reference to time
news_dfm <- dfm_remove(news_dfm, c('*-time', '*-timeUpdated', 'GMT', 'BST'))
# We keep only the top 5% most frequent features (min_termfreq = 0.95, interpreted as a quantile) that appear in less than 10%
# of all documents (max_docfreq = 0.1), using dfm_trim() to focus on common BUT distinguishing features.
news_dfm <- dfm_trim(news_dfm, min_termfreq = 0.95, termfreq_type = "quantile",
max_docfreq = 0.1, docfreq_type = "prop")
news_dfm[1:2, 1:5]
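# A quick sketch (not part of the original workflow) of what the quantile rule means in practice:
# with termfreq_type = "quantile", the threshold should correspond (approximately) to the 95th
# percentile of all feature frequencies in an untrimmed dfm
freq_all <- colSums(dfm(tok2)) # total frequency of each feature, before trimming
quantile(freq_all, 0.95) # the implied min_termfreq threshold
mean(freq_all >= quantile(freq_all, 0.95)) # share of features retained by the termfreq rule alone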
# among the docvars there are "text" as well as "text2" - this will be helpful later on!
str(docvars(news_dfm))
# 20 top words
topfeatures(news_dfm, 20)
# But wait a minute... why is "trump" not in the list of topfeatures? After all, everyone was discussing Trump back in 2016!
# ...but that's precisely the reason! It was so popular that it does not satisfy the trim rule decided above (it is so popular
# that it appears in a large share of articles, i.e., in more than max_docfreq = 0.1 of them)
# And indeed, let's look for "trump" in a non-trimmed dfm
news_dfm2 <- dfm(tok2)
topfeatures(news_dfm2, 20)
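# We can also verify the explanation directly (a quick extra check): the share of documents
# containing "trump" should exceed the max_docfreq = 0.1 cap used in dfm_trim() above
docfreq(news_dfm2)["trump"] / ndoc(news_dfm2)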
# Let's also keep only documents with a number of tokens > 0 [after the trimming above,
# some articles may have become empty - 8 here]
news_dfm[ntoken(news_dfm) == 0,]
# or alternatively:
table(ntoken(news_dfm) == 0)
news_dfm <- news_dfm[ntoken(news_dfm) > 0,]
# or alternatively, using the dfm_subset command that we already saw: dfm_subset(news_dfm, ntoken(news_dfm) > 0)
news_dfm[ntoken(news_dfm) == 0,]
# quanteda does not implement its own topic models, but you can easily access LDA() from the topicmodels package
# through convert(). k = 5 specifies the number of topics to be discovered in this first example.
dtm <- convert(news_dfm, to = "topicmodels")
# let's estimate a first model with k=5
# since the model has a random start, it is always a good idea to set the seed in order to replicate our findings!
set.seed(123)
system.time(lda <- LDA(dtm, method= "Gibbs", k = 5, control=list(verbose=50L))) # around 20 seconds on my laptop
# alternative way to define the seed:
# system.time(lda <- LDA(dtm, method= "Gibbs", k = 5, control = list(seed = 123)))
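# other Gibbs control options you may want to tune (the values below are purely illustrative,
# not a recommendation: burnin discards the first iterations, thin keeps every n-th draw):
# system.time(lda <- LDA(dtm, method= "Gibbs", k = 5,
#                        control = list(seed = 123, burnin = 1000, iter = 2000, thin = 100)))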
# We can use get_terms to get the top n terms from the topic model (i.e., the terms with the highest beta for each topic)
# and get_topics to get the most likely topic for each document. This will help us interpret the results of the model.
# Which are the most likely terms for each topic?
# Either we use get_terms...
termsList <- get_terms(lda, 10)
# 10 terms for topic 1: perhaps US politics?
termsList[,1]
# 10 terms for topic 2: perhaps IS and Islamic terrorism?
termsList[,2]
# and so on...
termsList[,3]
termsList[,4]
# topic 5 seems to mix several different subjects... perhaps different topics are conflated in it?
# That would be a clear sign that k=5 is not a great choice...
termsList[,5]
# ...or we can extract the most important terms directly from the model using terms().
terms(lda, 15)
# Let's plot these words
# let's convert the object containing the results of our lda into a particular data frame (called a tibble)
lda_topics <- tidy(lda, matrix = "beta")
str(lda_topics)
# then let's convert the tibble into a more common data frame (not actually needed, just to make you more comfortable)
lda_topics <- as.data.frame(lda_topics)
str(lda_topics)
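# a quick sanity check (a sketch, not strictly needed): within each topic the betas form
# a probability distribution over the vocabulary, so they should sum to 1
aggregate(beta ~ topic, data = lda_topics, FUN = sum)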
top_terms <- group_by(lda_topics, topic)
str(top_terms)
# let's keep only the top 10 betas for each of the 5 topics
top_terms <- top_n(top_terms, 10, beta)
top_terms <- ungroup(top_terms)
top_terms <- arrange(top_terms , topic, -beta)
str(top_terms)
table(top_terms$topic)
top_terms <- mutate(top_terms, topic = factor(topic),
term = reorder_within(term, beta, topic))
str(top_terms)
# here on the horizontal axis you have the betas for each single word
ggplot(top_terms, aes(term, beta, fill = topic)) +
geom_bar(alpha = 0.8, stat = "identity", show.legend = FALSE) +
scale_x_reordered() +
facet_wrap(facets = vars(topic), scales = "free", ncol = 2) +
coord_flip()
# which is the most likely topic for each document?
# Either we use get_topics...
topics <- get_topics(lda, 1)
head(topics, 10)
# or you can obtain the most likely topics using topics()
head(topics(lda))
# let's save this info as a new document-level variable
str(docvars(news_dfm))
docvars(news_dfm, 'pred_topic') <- topics(lda)
str(docvars(news_dfm))
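# a quick look at the topic sizes (how many articles have each topic as their most likely one)
table(docvars(news_dfm, "pred_topic"))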
# Let’s take a closer look at some of these topics. To further help us interpret the output of our topic model,
# we can take a random sample of documents highly associated with each topic and read them
# Topic 1
paste(termsList[,1], collapse=", ")
set.seed(123)
# draw the sample only once: a second call to sample() after a single set.seed() would return a different draw
strwrap(sample(docvars(news_dfm, "text2")[docvars(news_dfm, "pred_topic") == 1], 2))
# comment: perhaps the first one is not so clearly associated with US politics after all...
# Topic 2
paste(termsList[,2], collapse=", ")
set.seed(123)
strwrap(sample(docvars(news_dfm, "text2")[docvars(news_dfm, "pred_topic") == 2], 2))
# Topic 3
paste(termsList[,3], collapse=", ")
set.seed(123)
strwrap(sample(docvars(news_dfm, "text2")[docvars(news_dfm, "pred_topic") == 3], 2))
# Looking at the evolution of certain topics over time can be interesting
# Let’s look for example at Topic 2, which appears to be somehow related to ISIS
str(docvars(news_dfm, "date"))
docvars(news_dfm, "month") <- substr(docvars(news_dfm, "date"), 1, 7) # extract year and month (first 7 characters)
str(docvars(news_dfm))
table(docvars(news_dfm, "month"))
# let's compute the share of the articles published each month in 2016 that have ISIS as their MAIN predicted topic
tab <- table(docvars(news_dfm, "month")[docvars(news_dfm, "pred_topic") == 2]) / table(docvars(news_dfm, "month"))
tab
# three peaks: January, May and December
plot(tab)
# But we can actually do better than this. LDA is a mixed membership model,
# which means that for each document, it actually computes a distribution over topics.
# In other words, each document is considered to be about a mixture of topics as we have already discussed!
# This information is included in the matrix gamma in the LDA object (theta in the notation we used for the slides)
# For example, news 1 is 26% about topic 4, while news 3 is 39% about topic 4
round(lda@gamma[1,], 2)
round(lda@gamma[3,], 2)
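# since gamma stores a full topic distribution per document, each row should sum to 1 (quick check)
rowSums(lda@gamma)[1:5]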
# let's read the article with the largest gamma(=theta) for topic 1
max(lda@gamma[,1])
which.max(lda@gamma[,1])
# As you can read, this article mainly discusses "American politics", but also ISIS
strwrap(docvars(news_dfm, "text2")[1805])
round(lda@gamma[1805,], 2)
# So we can take the information in this matrix and aggregate it to compute the average probability that an article
# published in a given month is about a particular topic. Let’s once again choose Topic 2
# add the probability of topic 2 as a new document-level variable
docvars(news_dfm, "topic2") <- lda@gamma[, 2]
str(docvars(news_dfm))
# now aggregate at the month level
agg <- aggregate(docvars(news_dfm, "topic2"), by = list(month = docvars(news_dfm, "month")), FUN = mean)
str(agg)
agg$date <- as.Date(paste(agg$month,"-01",sep=""))
str(agg)
# Let's plot it! Once again three peaks: January, May and December
ggplot(agg, aes(date, x)) +
geom_point() + geom_line() + labs(title = "Estimated proportion of articles discussing ISIS",
x = "Month",
y = "Avg. prob. of article about topic 2 (ISIS)") +
theme_light()
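# the same monthly aggregation written with dplyr (already loaded above) - just a sketch
# for readers who prefer that style over aggregate()
agg2 <- summarise(group_by(docvars(news_dfm), month), topic2 = mean(topic2))
head(agg2)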
##################################
### identifying the optimal number of topics given your corpus
### REMEMBER: this is the very first thing you HAVE to do before running a Topic Model!
##################################
##################################
### FIRST ALTERNATIVE - this is the same approach that we will also employ in the
### next lecture, when we discuss STM (structural topic models)
##################################
# let's focus on the topic coherence and exclusivity of our model with k=5.
# You can clearly see that topic 1 (the topic we clearly identified earlier as discussing US politics) presents
# the highest values of both coherence and exclusivity, while topic 5 (the topic we were not able to clearly identify earlier)
# presents quite low values on both statistics (i.e., a topic that is difficult to interpret...)
topic_diagnostics(lda, dtm)
topic_coherence(lda, dtm)
topic_exclusivity(lda)
# both coherence and exclusivity are by default computed on the top 10 words of each topic,
# but you can change that via "top_n_tokens = XXX" in the functions above.
# With 10 words, the highest possible value for exclusivity is 10, while coherence is at most 0 (being a sum of log probabilities).
# If we focus instead on the top 15 words for each topic, the highest possible value for exclusivity becomes 15
topic_exclusivity(lda, top_n_tokens=15)
topic_coherence(lda, dtm, top_n_tokens=15)
# Avg. value of both coherence and exclusivity when k=5
mean(topic_coherence(lda, dtm))
mean(topic_exclusivity(lda))
# now let's vary k between 4 and 25 and each time store the corresponding avg. values of both coherence and exclusivity
top <- c(4:25)
top
# let's create an empty data frame that we will fill later on
# (its column names must match those produced by the rbind() inside the loop, otherwise rbind() fails)
results <- data.frame(topic = numeric(), coherence = numeric(), exclusivity = numeric())
results
# we add iter=100, where iter is the number of Gibbs iterations, to make LDA faster; the default value is 2,000
# [so this is not an advisable strategy for a serious project!]
system.time(
for (i in top)
{
set.seed(123)
lda <- LDA(dtm, method= "Gibbs", k = i, control=list(verbose=50L, iter=100))
topic <- i
coherence <- mean(topic_coherence(lda, dtm))
exclusivity <- mean(topic_exclusivity(lda))
results <- rbind(results, cbind(topic, coherence, exclusivity))
}
)
results
str(results)
# You should focus on those models that lie on the semantic coherence-exclusivity ‘frontier’, that
# is, where no model strictly dominates another in terms of semantic coherence and
# exclusivity (i.e., models with average scores towards the upper right side of the plot).
# The figure below identifies model k=11 or k=15 as models that satisfy the
# ‘frontier’ criterion, i.e., models with desirable properties in both dimensions.
# These two values could be a good starting point. But remember: always validate your results by understanding the topics!
ggplot(results, aes(x=coherence, y=exclusivity)) + geom_point() +
geom_text(label=results$topic, vjust=-1) +
ylab(label="Exclusivity ") + xlab("Semantic Coherence") +
theme_light()
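# a small sketch (not in the original workflow) to flag the 'frontier' models programmatically:
# a model is on the frontier if no other model is at least as good on both dimensions
# and strictly better on at least one of them (standard Pareto dominance)
on_frontier <- sapply(seq_len(nrow(results)), function(i) {
  !any(results$coherence >= results$coherence[i] &
         results$exclusivity >= results$exclusivity[i] &
         (results$coherence > results$coherence[i] |
            results$exclusivity > results$exclusivity[i]))
})
results$topic[on_frontier]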
#####################################
### SECOND ALTERNATIVE
#####################################
# An alternative metric for evaluating topic models is the "held-out likelihood" / "perplexity".
# Remember: the lower the perplexity, the better the model.
# Let's extract 80% of the observations and treat them as our training set
# [why 80% and not 60% or 70%? It would actually be more robust to use a cross-validation
# approach - a minimal sketch follows right below; more on this later on!]
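# a minimal sketch of what such a cross-validation could look like (5 folds and a single
# illustrative candidate k = 10 - both choices are arbitrary here; a real application
# would loop over many values of k):
# set.seed(123)
# folds <- sample(rep(1:5, length.out = nrow(dtm)))
# cv_perp <- sapply(1:5, function(f) {
#   fit <- LDA(dtm[folds != f, ], k = 10, method = "Gibbs", control = list(seed = 123, iter = 100))
#   perplexity(fit, newdata = dtm[folds == f, ])
# })
# mean(cv_perp)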
nrow(dtm)
set.seed(123)
index <- sample(1:nrow(dtm), floor(nrow(dtm) * 0.8))
length(index)
train_data <- dtm[index,]
test_data <- dtm[-index,]
nrow(train_data)
nrow(test_data)
# now let's vary k between 4 and 50
# (we store the candidate values under a new name, k_values, to avoid overwriting the topics object created earlier)
k_values <- seq(4, 50, by=1)
k_values
# let's create an empty data frame that we will fill later on.
# In this case, let's also save the log-likelihood of each of our estimated models on the training-set
results_df <- data.frame(topic = numeric(), perplexity = numeric(), log = numeric())
str(results_df)
system.time(
for (i in k_values){
  set.seed(123)
  fitted <- LDA(train_data, k = i, method = "Gibbs", control=list(verbose=50L, iter=100))
  # append one row per candidate k (indexing by i, as results_df[i,1] <- i, would leave empty NA rows for the rows below k = 4)
  results_df <- rbind(results_df, data.frame(topic = i,
                                             perplexity = perplexity(fitted, newdata = test_data),
                                             log = as.numeric(logLik(fitted))))
})
perp <- ggplot(results_df, aes(x = topic, y = perplexity)) + geom_line(color="red", size=2) + ggtitle("Perplexity of held-out data") +
  labs(x = "Candidate number of topics", y = "Perplexity")
loglik <- ggplot(results_df, aes(x = topic, y = log)) + geom_line(size=2) + ggtitle("Log-Likelihood") +
  labs(x = "Candidate number of topics", y = "Log-Likelihood")
plot_grid(perp, loglik)
# At around 30 to 40 topics, the decrease in perplexity from adding more topics becomes notably smaller. This is one way to choose
# the number of topics, similar to interpreting the elbow in the scree plot of a factor analysis.
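# a rough numeric companion to the visual 'elbow' (just a sketch): the improvement in
# perplexity per additional topic; the gains should flatten out in the 30-40 region
gains <- data.frame(k = results_df$topic[-1], gain = -diff(results_df$perplexity))
tail(gains, 20)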
##################################
### THIRD ALTERNATIVE
##################################
# Across-Topic Divergence (Deveaud et al. 2014) - one of the tests most often used in the literature
# You retain the k that maximizes this divergence
system.time(tune <- FindTopicsNumber(dtm, metrics = c("Deveaud2014"),
topics = seq(4, 50, by=1),
control = list(seed = 123, iter=100)))
tune
max(tune$Deveaud2014)
# the row with the max Deveaud2014 value is 27, which corresponds to k=24
# (FindTopicsNumber() lists the candidate models from the largest to the smallest k)
which.max(tune$Deveaud2014)
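# a safer way to read off the best k directly, without counting rows by hand
tune$topics[which.max(tune$Deveaud2014)]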
ggplot(tune, aes(x = topics, y = Deveaud2014)) + geom_point() + geom_line()
# Note that the Across-Topic Divergence method is very similar to finding the k that maximizes "exclusivity" in the plot above
ggplot(results, aes(x=coherence, y=exclusivity)) + geom_point() +
geom_text(label=results$topic, vjust=-1) +
ylab(label="Exclusivity ") + xlab("Semantic Coherence") +
theme_light()