rm(list=ls(all=TRUE))
getwd()
# with setwd() you change the current working directory of the R process
setwd("C:/Users/mw/Dropbox (VOICES)/TOPIC MODEL/FOR NEXT YEAR 2020/Newsmap")
getwd()
library(quanteda)
library(readtext)
library(ggplot2)
library(ldatuning)
library(topicmodels)
library(lubridate)
library(topicdoc)
#########################################
########## Topic-model (LDA)
#########################################
# This data frame contains 1,951 Guardian news articles published in 2016
myText <- read.csv("guardian.csv", stringsAsFactors=FALSE)
str(myText)
# creating a new variable "text2". Later it will be useful
myText$text2 <- myText$text
news_corp <- corpus(myText)
head(summary(news_corp))
# After removal of function words and punctuations in dfm(), we keep only the top 5% of the most frequent features
# (min_termfreq = 0.95) that appear less than 10% in all the documents (max_docfreq = 0.1) using dfm_trim()
# to focus on common but distinguishing features.
news_dfm <- dfm(news_corp, remove =(stopwords("english")), tolower = TRUE, stem = TRUE,
remove_punct = TRUE, remove_numbers=TRUE)
news_dfm <- dfm_remove(news_dfm, c('*-time', '*-timeUpdated', 'GMT', 'BST'))
news_dfm <- dfm_trim(news_dfm, min_termfreq = 0.95, termfreq_type = "quantile",
max_docfreq = 0.1, docfreq_type = "prop")
news_dfm[1:2, 1:5]
# 20 top words
topfeatures(news_dfm, 20)
# keeping only documents with number of tokens >0 [given that after the above reduction, some articles could actually be empty - 8 here]
news_dfm[ntoken(news_dfm) == 0,]
news_dfm <- news_dfm[ntoken(news_dfm) > 0,]
# as docvars there is the text as well (text2) - it will be helpful later on!
str(news_dfm)
# quanteda does not implement own topic models, but you can easily access to LDA() from the topicmodel package
# through convert(). k = 5 specifies the number of topics to be discovered.
dtm <- convert(news_dfm, to = "topicmodels")
# being a probabilist model, always a good idea to declare the seed! either before or in the function as seed=. Better before!
# let's identify k=5
set.seed(123)
system.time(lda <- LDA(dtm, method= "Gibbs", k = 5))
# alternative to define the seed:
# system.time(lda <- LDA(dtm, method= "Gibbs", k = 5, control = list(seed = 123)))
# We can use get_terms to the top n terms from the topic model (i.e., terms with the highest beta associated with each topic)
# and get_topics to predict the top k topic for each document. This will help us interpret the results of the model.
# Which the most likely terms for each topic?
terms <- get_terms(lda, 10)
# 10 terms for topic 1
terms[,1]
# 10 terms for topic 2, and so on
terms[,2]
terms[,3]
terms[,4]
terms[,5]
# or we can extract directly the most important terms from the model using terms().
terms(lda, 5)
# which meaning for each post according to you??
# which is the most likely topic for each document?
topics <- get_topics(lda, 1)
head(topics, 10)
# or simply like that
head(topics(lda))
# Let’s take a closer look at some of these topics. To help us interpret the output, we can look
# at the words associated with each topic and take a random sample of documents highly associated
# with each topic.
# Topic 1
paste(terms[,1], collapse=", ")
sample(news_dfm@docvars$text2[topics==1], 2)
# Topic 3
paste(terms[,3], collapse=", ")
sample(news_dfm@docvars$text2[topics==3], 2)
# you can then obtain the most likely topics using topics() and save them as a document-level variable.
head(topics(lda), 20)
docvars(news_dfm, 'pred_topic') <- topics(lda)
str(news_dfm)
# In the case of date with timestamps, looking at the evolution of certain topics over time
# can be interesting. Let’s look for example at Topic 5, which appears to be related to Isis
news_dfm@docvars$month <- substr(news_dfm@docvars$date, 1, 7) # extract year and month (first 7 characters)
table(news_dfm@docvars$month )
# frequency table with articles about Isis as their MAIN topic, per month in 2016
tab <- table(news_dfm@docvars$month [news_dfm@docvars$pred_topic==5])
plot(tab)
# But we can actually do better than this. LDA is a probabilistic model,
# which means that for each document, it actually computes a distribution over topics.
# In other words, each document is considered to be about a mixture of topics as we have already discussed!
# This information is included in the matrix gamma in the LDA object (theta in the notation we used for the slides)
# For example, news 1 is 22% about topic 1, while news 3 is 38% about topic 1
round(lda@gamma[1,], 2)
round(lda@gamma[3,], 2)
# So we can actually take the information in the matrix and aggregate it to compute the average probability that a news each month
# is about a particular topic. Let’s now choose once again Topic 5
# add probability to dfm
news_dfm@docvars$topic5 <- lda@gamma[,5]
str(news_dfm)
# now aggregate at the month level
agg <- aggregate(news_dfm@docvars$topic5, by=list(month=news_dfm@docvars$month), FUN=mean)
agg$date <- as.Date(paste(agg$month,"-01",sep=""))
str(agg)
# and plot it
plot(agg$date , agg$x, type="l", xlab="Month", ylab="Avg. prob. of article about topic 5",
main="Estimated proportion of articles discussing about ISIS")
##################################
# identifying the optimal number of topics between for example: 4 and 9
##################################
### FIRST ALTERNATIVE
# let's focus on topic coherence and exclusivity of our models with k=5
topic_diagnostics(lda, dtm)
topic_coherence(lda, dtm)
topic_exclusivity(lda)
mean(topic_coherence(lda, dtm))
mean(topic_exclusivity(lda))
# now let's make k changes between 4 and 9
top <- c(4:9)
top
# let's create an empty data frame that we will fill later on
results <- data.frame(first=vector(), second=vector(), third=vector())
results
# with add iter=100 where iter is the number of Gibbs iterations to make LDA faster; the default value is 2000 [so this is not an advisable strategy
# for a serious project!]
system.time(
for (i in top)
{
set.seed(123)
lda <- LDA(dtm, method= "Gibbs", k = (i), control=list(verbose=50L, iter=100))
topic <- (i)
coherence <- mean(topic_coherence(lda, dtm))
exclusivity <- mean(topic_exclusivity(lda))
results <- rbind(results , cbind(topic, coherence, exclusivity ))
}
)
results
str(results)
# k=9 seems to be the best choice
plot(results$coherence, results$exclusivity, main="Scatterplot Example",
xlab="Semantic Coherence", ylab="Exclusivity ", pch=19)
text(results$coherence, results$exclusivity, labels=results$topic, cex= 1, pos=4)
### SECOND ALTERNATIVE [KEEPING IT???]
system.time(tune <- FindTopicsNumber(dtm, metrics = c("Griffiths2004", "CaoJuan2009", "Arun2010", "Deveaud2014"),
topics = seq(4, 9, by=1),
control = list(seed = 123, iter=100)))
# Minimization: CaoJuan2009 and Arun2010 metrics
# Maximization: Deveaud2014 and Griffiths2004 metrics
tune
FindTopicsNumber_plot(tune) # here 9 probably the best solution
### THIRD ALTERNATIVE
# Perplexity: the most common way to evaluate a probabilistic model is to measure the log-likelihood of a held-out test set;
# Perplexity is a measurement of how well a probability distribution or probability model predicts a sample.
# The lower the value of perplexity, the best is!
# Specifically, if you’ve estimated a model on one set of data, the held-out likelihood is the likelihood (or log-likelihood)
# of the model, evaluated at the parameters you’ve estimated, on the data which you did not use in your initial estimation.
## Using perplexity for held-out set
# let's extract 80% of the observations in our dtm [why 80% and not 75% or 70%? actually it would be more elegant to use a cross-validation
# approach...what is that? More on this later on!]
nrow(dtm )
set.seed(123)
index <- sample(1:nrow(dtm), nrow(dtm )*0.8 )
length(index)
train_data <- dtm[index,]
test_data <- dtm[-index,]
nrow(train_data)
nrow(test_data)
# now let's make k changes between 4 and 9
topics <- seq(4, 9, by=1)
topics
# let's create an empty data frame that we will fill later on
perplexity_df <- data.frame(test_perplexity=numeric())
str(perplexity_df)
set.seed(12345)
system.time(
for (i in topics){
fitted <- LDA(train_data, k = i, method = "Gibbs", control=list(verbose=50L, iter=100))
perplexity_df[i,1] <- perplexity(fitted, newdata = test_data)
})
perplexity_df
# plotting the perplexity across different values of k - the lower the value, the better is
ggplot(perplexity_df, aes(x= as.numeric(row.names(perplexity_df)), y=test_perplexity)) + geom_line() + ggtitle("Perplexity of hold out data") +
scale_x_continuous("Number of topics",
breaks=seq(4, 9, by=1)) + xlim(4,9) + labs(x = "Candidate number of topics", y = "Perplexity when fitting the trained model to the hold-out set")
#################################### let's replicate our analysis but using the 2013 Guardian's articles
myText2 <- read.csv("guardian2013.csv", stringsAsFactors=FALSE)
str(myText2)