rm(list=ls(all=TRUE))
getwd()
# with setwd() you change the current working directory of the R process
setwd("C:/Users/mw/Dropbox (VOICES)/TOPIC MODEL")
getwd()
library(quanteda)
library(readtext)
library(ggplot2)
library(stm)
library(igraph)
library(wordcloud)
library(dplyr)
# We will now use a dataset that contains the lead paragraph of around 5,000 articles about the economy published
# in the New York Times between 1980 and 2014.
nyt <- readtext("nyt.csv", text_field = "lead_paragraph")
str(nyt)
nyt$datetime <- as.Date(nyt$datetime, "%m/%d/%Y") # let's convert the string of the date in a date
str(nyt)
# let's create a new variable according to the party of the president in charge during the temporal period considered
year <- as.numeric(substr(nyt$datetime, 1, 4)) # let's extract the year [i.e., the four first numbers that appear]
table(year)
repub <- ifelse(year %in% c(1981:1992, 2000:2008), 1, 0) # 1 for repub; 0 for dems
table(repub)
nyt$repub <- repub
nyt$year <- year
str(nyt)
# let's extract a random sample of 1,000 texts (to make things faster in the Lab!)
set.seed(123)
nyt2 <- sample_n(nyt, 1000)
str(nyt2)
myCorpus <- corpus(nyt2)
summary(myCorpus)
myDfm <- dfm(myCorpus, remove = stopwords("english"), tolower = TRUE, stem = TRUE,
remove_punct = TRUE, remove_numbers=TRUE)
# Some form of trim is HIGHLY suggested when you have many documents (like in this case: 1K) and you want to run a STM.
# Let's keep only words occurring in at least 2 documents
myDfm.trim <-dfm_trim(myDfm, min_docfreq = 2, verbose=TRUE)
length(myDfm@Dimnames$features) # 6553 features
length(myDfm.trim@Dimnames$features) # 3332 features
# Convert the dfm from Quanteda to STM: this is a crucial step!!!
# When coverting the dfm, you have also to list the document variables that are present in the corpus!
# in our case: datetime, repub, year
DfmStm <- convert(myDfm, to = "stm", docvars = docvars(myCorpus))
head(docvars(myCorpus))
str(DfmStm)
################################################################
# STM run
################################################################
# 1) we want to extract 15 topics (i.e., K=15);
# 2) we want to control for the following covariates in affecting topic prevalence: repub and year;
# We could have ran a model with covariates affecting topical content as well (see below for an example).
# While including more covariates in topic prevalence will rarely affect the speed of the model,
# including additional levels of the content covariates (topical content) can make the model much slower to converge.
# This is due to the model operating in the much higher dimensional space of words in dictionary
# (which tend to be in the thousands) as opposed to topics. That's why I suggest you to run first a model w/o covariates
# for topical content
# 3) it is highly suggested to employ the "spectral" intitialization based on the method of moments, which is deterministic
# and globally consistent under reasonable conditions. This means that no matter of the seed that is set, the SAME results will be generated.
# As an alternative you could employ a LDA initialization. In this, case, however, the answers the estimation procedure comes up with may depend
# on starting values of the parameters (e.g., the distribution over words for a particular topic).
# So remember always to define a set seed to replicate your analysis in this latter case (as we did with Topic Models)!
# Note that by writing "s(year)" we use a spline functions for non-linear transformations of the time-variable.
# if you want to add just a linear relationship between "year" and topics, just write "year"
system.time(stmFitted <- stm(DfmStm $documents, DfmStm $vocab, K = 15, max.em.its = 75,
prevalence = ~ repub + s(year), data = DfmStm $meta, init.type = "Spectral")) # around 45 seconds in my laptop
################################################################
# Interpreting the STM by plotting and inspecting results
################################################################
# Summary visualization: understanding the extracted topics through words and example documents
# First: which words/topic association?
labelTopics(stmFitted, n=7) # 7 features for each topic (including FREX words!)
labelTopics(stmFitted, n=7, topics=1) # the same just for topic 1
plot(stmFitted, type = "labels", labeltype = c("frex"), n=5) # plot just frex words
plot(stmFitted, type = "summary", labeltype = c("frex"), n=5) # topic 5 is the most frequent one
# topic meaning: according to frex words, topic 5 seems to be related to the trend in the economy; topic 10 to inflation;
# topic 9 about politics; etc.
plot(stmFitted, type = "hist", labeltype = c("frex")) # just frex words. Here topic 5 more "evenly" distributed across documents than
# for example topic 11 for example
# Let's read the documents most associated with each topic
# In particular, let's identify the most representative documents for a particular topic.
# We can also use this in order to get a better sense of the content of actual documents with a high topical content.
docs <- nyt2$text
str(docs)
# Let's focus on topic 5 for example...
# save only the first 200 words from each document and display them. Otherwise you cannot read the texts properly
docs2 <- substr(docs, start = 1, stop = 200)
thought5 <- findThoughts(stmFitted, texts=docs2, topics=5, n=3)$docs[[1]]
par(mfrow = c(1, 2),mar = c(.5, .5, 1, .5))
plotQuote(thought5, width = 30, main = "Topic 5")
# Let's focus on topic 9
thought9 <- findThoughts(stmFitted, texts=docs2, topics=9, n=3)$docs[[1]]
par(mfrow = c(1, 2),mar = c(.5, .5, 1, .5))
plotQuote(thought9, width = 30, main = "Topic 9")
# Let's plot Topic 5 and 9 together
par(mfrow = c(1, 2),mar = c(.5, .5, 1, .5))
plotQuote(thought5, width = 30, main = "Topic 5 - Economy")
plotQuote(thought9, width = 30, main = "Topic 9 - Politics")
# which are the the most likely topics across our documents?
apply(stmFitted$theta,1,which.max)
table(apply(stmFitted$theta,1,which.max) )
# you can save them as a document-level variable.
docvars(myDfm , 'STMtopic') <- apply(stmFitted$theta,1,which.max)
str(myDfm)
head(docvars(myDfm))
# or you can also save them back in the original csv
nyt2$topic <- apply(stmFitted$theta,1,which.max)
str(nyt2)
# Topic 5 - 5 random documents associated to it
set.seed(123)
sample(nyt2$text[nyt2$topic==5], 5)
# But we already know that STM actually computes a distribution over topics.
# In other words, each document is considered to be about a mixture of topics as we have already discussed!
# This information is included in the matrix thera in the STM object
# For example, news 1 is 43% about topic 7 and 22% about topic 4 for example
round(stmFitted$theta[1,], 2)
################################################################
# Checking for topic correlation
################################################################
# Uses a topic correlation graph estimated by topicCorr and the igraph package to plot a network where nodes are topics
# and edges indicate a positive correlation. If no edges = only negative correlations!
mod.out.corr <- topicCorr(stmFitted)
plot(mod.out.corr) # in our case topics 8 and 15 are positively correlated
plot(stmFitted, type = "summary", labeltype = c("frex"), n=5) # i.e. when an article discusses about Reagan there is a positive
# probability of also discussing about public budget
mod.out.corr$cor
#########################################
# Estimating topic prevalence using as IV: repub and year
#########################################
set.seed(123)
prep <- estimateEffect(1:15 ~ repub + s(year), stmFitted, meta = DfmStm $meta)
set.seed(123)
summary(prep)
# for example there is a tendency to talk less about topic 10 under a Democratic Presidency
# Method used for plotting topic prevalence:
# a) "continuous" estimates how topic proportions vary over the support of a continuous covariate.
# b) "pointestimate" estimates mean topic proportions for each value of the categorical covariate
# c) "difference" estimates the mean difference in topic proportions for two different values of the covariate (cov.value1 and cov.value2 must be specified)
# Let's start with year (i.e., a continuum variable):
# let's plot it! as you can see topic 10 prevalence (inflation) is stable but at the end it declines becoming
# statistically not different from 0
plot(prep, "year", method = "continuous", topics = 10,
model = stmFitted, printlegend = FALSE, xaxt = "n", xlab = "Year")
seq <- seq(from = as.numeric("1980"), to = as.numeric("2014"))
axis(1, at = seq)
title("Topic 10 - Inflation")
abline(h=0, col="blue")
# Let's now move to repub (i.e., a categorical variable) for topic 10 (inflation) and topic 14 (stock-market)
plot(stmFitted, type = "summary", labeltype = c("frex"), n=5) # i.e. when an article discusses about Reagan there is a positive
# the salience to topic 14 seems to be the same under different partisan presidency. While there is a tendency to discuss more
# about topic 10 under a democratic presidency
plot(prep, "repub", model=stmFitted, method="pointestimate", topic=c(10, 14))
# let's see if the difference between the two coefficients is statistically different from 0
# always remember that you plot "cof.value1-coef.value2". In this case it implies "coeff. of republican-coeff. of democratic"
# given that republic is a 0/1 variable and cov.value1 = "1" (i.e., Republican) and cov.value2 = "0" (i.e., Democratic)
plot(prep, covariate = "repub", topics = c(10, 14),
model = stmFitted, method = "difference",
cov.value1 = "1", cov.value2 = "0", xlim = c(-1, 1),
xlab = "More Democratic... More Repubblican",
main = "Effect of Democratic vs. Repubblican Presidency",
labeltype = "custom",
custom.labels = c('Topic 10 - Inflation', 'Topic 14 - Stock-Market'))
#########################################
# Estimating topical content using as IV: repub
#########################################
# Let's run a topical content analysis by including as a covariate repub.
# As currently implemented the variable used for topical content must be a single variable which defines a discrete partition of the dataset
# (each document is in one and only one group - so only categorical variables can be employed)
# You will need around 4 mins to run it. So let's call the RData where the model is already fitted
# system.time(stmFitted_topical <- stm(DfmStm $documents, DfmStm $vocab, K = 15, max.em.its = 75,
# prevalence = ~ repub + s(year), , content = ~ repub, data = DfmStm $meta, init.type = "Spectral")) # around 222 seconds [4 mins]
load("C:\\Users\\mw\\Dropbox (VOICES)\\TOPIC MODEL\\Lecture 5\\Material 2020\\STM.RData")
ls()
# This figure shows how the nyt articles talk about topic 10 (inflation) under a democratic or repubblican presidency
plot(stmFitted_topical, type = "perspectives", topics = 10)
# This function can also be used to plot the contrast in words across two topics
# This plot calculates the difference in probability of a word for the two topics, normalized by the maximum
# difference in probability of any word between the two topics.
plot(stmFitted_topical, type = "perspectives", topics = c(10, 14))
#########################################
# Model selection
#########################################
# Model search across numbers of topics STM assumes a fixed user-specified number of topics.
# There is not a right answer to the number of topics that are appropriate for a given corpus, but
# the function searchK uses a data-driven approach to selecting the number of topics (similar to FindTopicsNumber
# for LDA but with a different algorithm).
# The function will perform several automated tests to help choose the number of topics.
# For example, one could estimate a STM model for 3, 4 and 5 topics (just as an example!)
# and compare the results along each of the criteria
# But REMEMBER: nothing replaces your interpretation!!!!
######## ADD always set.seed here: Due to the need to calculate the heldout-likelihood N documents have proportion
####### of the documents heldout at random (see more below)
set.seed(02138)
K <-c(3,4,5)
# you can also relax the number of maximum required iterations if you want, but that would require you (much) more
# times especially with large datasets
system.time(storage <- searchK(DfmStm $documents, DfmStm $vocab, K, max.em.its = 500,
prevalence = ~ repub + s(year), data = DfmStm $meta, init.type = "Spectral")) # around 2 minutes
# important: whenever the documents you want to analyze are pretty long, whenever you employ the searchK
# command, it is always advisable also adding this line: N = floor(0.2 * length(Dfmstm2$documents)) -
# where Dfmstm2$documents are the documents that you want to analyze. See below for an example!
# Do not run in the Lab:
# storage2 <- searchK(DfmStm $documents, DfmStm $vocab, K, data = DfmStm $meta, max.em.its = 500,
# N = floor(0.2 * length(DfmStm $documents)), prevalence = ~ repub + s(year), init.type = "Spectral")
# str(storage2)
#####################################################
# The semantic coherence-exclusivity ‘frontier’
#####################################################
# You should focus on those models that lie on the semantic coherence-exclusivity ‘frontier’, that
# is, where no model strictly dominates another in terms of semantic coherence and
# exclusivity (i.e., models with average scores towards the upper right side of the plot).
# The figure below identifies model 4 as the one that better satisfies the
# ‘frontier’ criterion, i.e., a model with desirable properties in both dimensions
plot(storage$results$semcoh, storage$results$exclus,
xlab= "Semantic coherence",
ylab= "Exclusivity",
col= "blue", pch = 19, cex = 1, lty = "solid", lwd = 2)
text(storage$results$semcoh, storage$results$exclus, labels=storage$results$K, cex= 1, pos=4)
#####################################################
# The held-out likelihood
#####################################################
# The document-completion held-out likelihood method: it is the estimation of the probability of words appearing
# within a document when those words have been removed from the document in the estimation step.
# The basic idea is to hold out some fraction of the words in a set of documents,
# train the model and use the document-level latent variables to evaluate the probability of the heldout portion.
# Similar to cross-validation, when some of the data is removed from estimation and then later used for
# validation, the held-out likelihood helps the user assess the model’s prediction performance.
# The higher the likelihood, the better!
# So compare this graph and the previous one to arrive to a conclusion about the number of k to select
plot(storage$results$K, storage$results$heldout,
xlab= "Number of Topics",
ylab= "held-out likelihood",
col= "blue", pch = 19, cex = 1, lty = "solid", lwd = 2, xaxt="n")
text(storage$results$K, storage$results$heldout, labels=storage$results$K, cex= 1, pos=4)
xtick<-seq(3, 5, by=1)
axis(side=1, at=xtick, labels = FALSE)
text(x=xtick, par("usr")[3],
labels = xtick, pos = 1, xpd = TRUE)
# When init.type="Spectral" and K=0 the number of topics is set using the algorithm in Lee and Mimno (2014).
# This method does not estimate the "true" number of topics and does not necessarily have any particular statistical
# properties for consistently estimating the number of topics. It can however provide a useful starting point.
# The added randomness from the projection means that the algorithm is not deterministic
# as with the standard "Spectral" initialization type. Running it with a different seed can result in not only different
# results but a different number of topics.
# Do not run in the Lab!!!
# library(geometry)
# library(Rtsne)
# library(rsvd)
# set.seed(02138)
# system.time(total <- stm(DfmStm $documents, DfmStm $vocab, K=0, max.em.its = 500,
# prevalence = ~ Party +Year, data = DfmStm $meta, init.type = "Spectral"))
# labelTopics(total)
#########################################
##### graphic extensions about STM. There are several of them!
##### Take a look at library(stmCorrViz), library(LDAvis) and library(stminsights) among the others
#########################################