rm(list=ls(all=TRUE)) getwd() # with setwd() you change the current working directory of the R process # setwd("C:/Users/mw/Dropbox (VOICES)/TOPIC MODEL") setwd("C:/Users/luigi/Dropbox/TOPIC MODEL") getwd() library(quanteda) library(readtext) library(ggplot2) library(stm) library(igraph) library(wordcloud) library(dplyr) # We will now use a dataset that contains the lead paragraph of around 5,000 articles about the economy published # in the New York Times between 1980 and 2014. nyt <- readtext("nyt.csv", text_field = "lead_paragraph") str(nyt) nyt$datetime <- as.Date(nyt$datetime, "%m/%d/%Y") # let's convert the string of the date in a date str(nyt) # let's create a new variable according to the party of the president in charge during the temporal period considered year <- as.numeric(substr(nyt$datetime, 1, 4)) # let's extract the year [i.e., the four first numbers that appear] table(year) repub <- ifelse(year %in% c(1981:1992, 2000:2008), 1, 0) # 1 for repub; 0 for dems table(repub) nyt$repub <- repub nyt$year <- year str(nyt) # let's extract a random sample of 1,000 texts (to make things faster in the Lab!) set.seed(123) nyt2 <- sample_n(nyt, 1000) str(nyt2) myCorpus <- corpus(nyt2) head(summary(myCorpus)) myDfm <- dfm(myCorpus, remove = stopwords("english"), tolower = TRUE, stem = TRUE, remove_punct = TRUE, remove_numbers=TRUE) # Some form of trim is usually suggested when you have many documents and you want to run a STM. # For example, we can decide to keep only words occurring in at least 2 documents myDfm.trim <-dfm_trim(myDfm, min_docfreq = 2, verbose=TRUE) length(myDfm@Dimnames$features) # 6553 features length(myDfm.trim@Dimnames$features) # 3332 features # In our case, given the relative low number of documents and features, we are going to proceed with our # original dfm (i.e., myDfm) # Convert the dfm from Quanteda to STM: this is a crucial step!!! # When coverting the dfm, you have also to list the document variables that are present in the corpus! # in our case: datetime, repub, year DfmStm <- convert(myDfm, to = "stm", docvars = docvars(myCorpus)) head(docvars(myCorpus)) str(DfmStm) ################################################################ # STM run ################################################################ # 1) we want to extract 15 topics (i.e., K=15); # 2) we want to control for the following covariates in affecting topic prevalence: repub and year; # We could have ran a model with covariates affecting topical content as well (see below for an example). # While including more covariates in topic prevalence will rarely affect the speed of the model, # including additional levels of the content covariates (topical content) can make the model much slower to converge. # This is due to the model operating in the much higher dimensional space of words in dictionary # (which tend to be in the thousands) as opposed to topics. That's why I suggest you to run first a model w/o covariates # for topical content. REMEMBER however: the results you get when you add variables for topic prevalence but no # variables fro topical content will be (slighly or more) different to the ones when you add variables for both # topic prevalence and topical content together # 3) it is highly suggested to employ the "spectral" intitialization based on the method of moments, which is deterministic # and globally consistent under reasonable conditions. This means that no matter of the seed that is set, the SAME results will be generated. # As an alternative you could employ a LDA initialization. In this, case, however, the answers the estimation procedure comes up with may depend # on starting values of the parameters (e.g., the distribution over words for a particular topic). # So remember always to define a set seed to replicate your analysis in this latter case (as we did with Topic Models)! # Note that by writing "s(year)" we use a spline functions for non-linear transformations of the time-variable. # if you want to add just a linear relationship between "year" and topics, just write "year" # You can also relax the number of maximum required iterations if you want, but that would require you (much) more # time especially with large datasets. However 75 is a very small number! We apply that here just to save time! system.time(stmFitted <- stm(DfmStm $documents, DfmStm $vocab, K = 15, max.em.its = 75, prevalence = ~ repub + s(year), data = DfmStm $meta, init.type = "Spectral")) # around 26 seconds in my laptop ############################################################################################################## # NB: running a model with or without covariates for topic prevalance produces of course different results; # the same is true if you run a model with covariates just for the topic prevalance part, or with both the # topic prevalance part as well as the topical content part ############################################################################################################## ################################################################ # Interpreting the STM by plotting and inspecting results ################################################################ # Summary visualization: understanding the extracted topics through words and example documents # First: which words/topic association? labelTopics(stmFitted, n=7) # 7 features for each topic (including FREX words!) # The frequency/exclusivity (FREX) scoring summarizes words according to their probability of appearance under a topic and the exclusivity # to that topic. These words provide more semantically intuitive representations of each topic labelTopics(stmFitted, n=7, topics=1) # the same just for topic 1 plot(stmFitted, type = "labels", labeltype = c("frex"), n=5) # plot just frex words plot(stmFitted, type = "summary", labeltype = c("frex"), n=5) # topic 5 is the most frequent one # topic meaning: according to frex words, topic 5 seems to be related to the trend in the economy; topic 10 to inflation; # topic 9 about politics; etc. plot(stmFitted, type = "hist", labeltype = c("frex")) # Here topic 5 appears as more "evenly" distributed across documents than # for example topic 11 for example # Let's read the documents most associated with each topic # In particular, let's identify the most representative documents for a particular topic. # We can also use this in order to get a better sense of the content of actual documents with a high topical content. docs <- nyt2$text str(docs) # Let's focus on topic 5 for example... # save only the first 200 words from each document and display them. Otherwise you cannot read the texts properly docs2 <- substr(docs, start = 1, stop = 200) thought5 <- findThoughts(stmFitted, texts=docs2, topics=5, n=3)$docs[[1]] par(mfrow = c(1, 2),mar = c(.5, .5, 1, .5)) plotQuote(thought5, width = 30, main = "Topic 5") # Let's focus on topic 9 thought9 <- findThoughts(stmFitted, texts=docs2, topics=9, n=3)$docs[[1]] par(mfrow = c(1, 2),mar = c(.5, .5, 1, .5)) plotQuote(thought9, width = 30, main = "Topic 9") # Let's plot Topic 5 and 9 together par(mfrow = c(1, 2),mar = c(.5, .5, 1, .5)) plotQuote(thought5, width = 30, main = "Topic 5 - Economy") plotQuote(thought9, width = 30, main = "Topic 9 - Politics") # which are the the most likely topics across our documents? apply(stmFitted$theta,1,which.max) table(apply(stmFitted$theta,1,which.max) ) # you can save them as a document-level variable. docvars(myDfm , 'STMtopic') <- apply(stmFitted$theta,1,which.max) str(myDfm) head(docvars(myDfm)) # or you can also save them back in the original dataframe nyt2$topic <- apply(stmFitted$theta,1,which.max) str(nyt2) # Topic 5 - 5 random documents associated to it set.seed(123) sample(nyt2$text[nyt2$topic==5], 5) # But we already know that STM actually computes a distribution over topics. # In other words, each document is considered to be about a mixture of topics as we have already discussed! # This information is included in the matrix thera in the STM object # For example, news 1 is 43% about topic 7 and 28% about topic 4 for example round(stmFitted$theta[1,], 2) ################################################################ # Checking for topic correlation ################################################################ # Uses a topic correlation graph estimated by topicCorr and the igraph package to plot a network where nodes are topics # and edges indicate a positive correlation. If no edges = only negative correlations! mod.out.corr <- topicCorr(stmFitted) plot(mod.out.corr) # in our case topics 8 and 15 are positively correlated plot(stmFitted, type = "summary", labeltype = c("frex"), n=5) # i.e. when an article discusses about Reagan there is a positive # probability of also discussing about public budget mod.out.corr$cor ######################################### # Estimating topic prevalence using as IV: repub and year ######################################### set.seed(123) prep <- estimateEffect(1:15 ~ repub + s(year), stmFitted, meta = DfmStm $meta) set.seed(123) summary(prep) # for example there is a tendency to talk less about topic 10 under a Republican Presidency # Method used for plotting topic prevalence: # a) "continuous" estimates how topic proportions vary over the support of a continuous covariate. # b) "pointestimate" estimates mean topic proportions for each value of the categorical covariate # c) "difference" estimates the mean difference in topic proportions for two different values of the covariate (cov.value1 and cov.value2 must be specified) # Let's start with year (i.e., a continuum variable): # let's plot it! as you can see topic 10 prevalence (inflation) is stable but at the end it declines becoming plot(prep, "year", method = "continuous", topics = 10, model = stmFitted, printlegend = FALSE, xaxt = "n", xlab = "Year") seq <- seq(from = as.numeric("1980"), to = as.numeric("2014")) axis(1, at = seq) title("Topic 10 - Inflation") abline(h=0, col="blue") # the opposite happens with topics 9 (politics) plot(prep, "year", method = "continuous", topics = 9, model = stmFitted, printlegend = FALSE, xaxt = "n", xlab = "Year") seq <- seq(from = as.numeric("1980"), to = as.numeric("2014")) axis(1, at = seq) title("Topic 9 - Politics") abline(h=0, col="blue") # Let's now move to repub (i.e., a categorical variable) for topic 10 (inflation) and topic 14 (stock-market) plot(stmFitted, type = "summary", labeltype = c("frex"), n=5) # the salience to topic 14 seems to be the same under different partisan presidency. While there is a tendency to discuss more # about topic 10 under a democratic presidency plot(prep, "repub", model=stmFitted, method="pointestimate", topic=c(10, 14)) # let's see if the difference between the two coefficients is statistically different from 0. # Always remember that you plot "coef.value1-coef.value2". In this case (see below the script) # it implies "coeff. of republican-coeff. of democratic" given that republic is a 0/1 variable and cov.value1 = "1" (i.e., Republican) # and cov.value2 = "0" (i.e., Democratic) plot(prep, covariate = "repub", topics = c(10, 14), model = stmFitted, method = "difference", cov.value1 = "1", cov.value2 = "0", xlim = c(-1, 1), xlab = "More Democratic... More Repubblican", main = "Effect of Democratic vs. Repubblican Presidency", labeltype = "custom", custom.labels = c('Topic 10 - Inflation', 'Topic 14 - Stock-Market')) ######################################### # Estimating topical content using as IV: repub ######################################### # Let's run a topical content analysis by including as a covariate repub. # As currently implemented the variable used for topical content must be a single variable which defines a discrete partition of the dataset # (each document is in one and only one group - so only categorical variables can be employed) # You will need around 4 mins to run it. So let's call the RData where the model is already fitted # system.time(stmFitted_topical <- stm(DfmStm $documents, DfmStm $vocab, K = 15, max.em.its = 75, # prevalence = ~ repub + s(year), content = ~ repub, data = DfmStm $meta, init.type = "Spectral")) # around 222 seconds [4 mins] # load("C:\\Users\\mw\\Dropbox (VOICES)\\TOPIC MODEL\\Lecture 5\\Material 2020\\STM.RData") load("C:\\Users\\luigi\\Dropbox\\TOPIC MODEL\\Lecture 5\\Material 2020\\STM.RData") ls() # You see that the results you get with and w/o topical content are somehow different table(apply(stmFitted$theta,1,which.max) ) table(apply(stmFitted_topical$theta,1,which.max) ) apply(stmFitted$theta,2,mean) # mean distribution of topics apply(stmFitted_topical$theta,2,mean) # mean distribution of topics # high - but not perfect - correlation cor(apply(stmFitted$theta,2,mean), apply(stmFitted_topical$theta,2,mean)) # This figure shows how the nyt articles talk about topic 10 (inflation) under a democratic or repubblican presidency plot(stmFitted_topical, type = "perspectives", topics = 9) # This function can also be used to plot the contrast in words across two topics # This plot calculates the difference in probability of a word for the two topics, normalized by the maximum # difference in probability of any word between the two topics. plot(stmFitted_topical, type = "perspectives", topics = c(10, 14)) ######################################### # Model selection ######################################### # Model search across numbers of topics STM assumes a fixed user-specified number of topics. # There is not a right answer to the number of topics that are appropriate for a given corpus, but # the function searchK uses a data-driven approach to selecting the number of topics (similar to FindTopicsNumber # for LDA but with a different algorithm). # The function will perform several automated tests to help choose the number of topics. # For example, one could estimate a STM model for 3, 4 and 5 topics (just as an example!) # and compare the results along each of the criteria # But REMEMBER: nothing replaces your interpretation!!!! ######## ADD always set.seed here: Due to the need to calculate the heldout-likelihood N documents have proportion ####### of the documents heldout at random (see more below) set.seed(02138) K <-c(3,4,5) system.time(storage <- searchK(DfmStm $documents, DfmStm $vocab, K, max.em.its = 75, prevalence = ~ repub + s(year), data = DfmStm $meta, init.type = "Spectral")) # around 1 minute # important: whenever the documents you want to analyze are pretty long, whenever you employ the searchK # command, it could be advisable also adding this line: N = floor(0.2 * length(Dfmstm2$documents)) - # where Dfmstm2$documents are the documents that you want to analyze. See below for an example # storage2 <- searchK(DfmStm $documents, DfmStm $vocab, K, data = DfmStm $meta, max.em.its = 500, # N = floor(0.2 * length(DfmStm $documents)), prevalence = ~ repub + s(year), init.type = "Spectral") # str(storage2) ##################################################### # The semantic coherence-exclusivity ‘frontier’ ##################################################### # You should focus on those models that lie on the semantic coherence-exclusivity ‘frontier’, that # is, where no model strictly dominates another in terms of semantic coherence and # exclusivity (i.e., models with average scores towards the upper right side of the plot). # The figure below identifies model 4 as the one that better satisfies the # ‘frontier’ criterion, i.e., a model with desirable properties in both dimensions plot(storage$results$semcoh, storage$results$exclus, xlab= "Semantic coherence", ylab= "Exclusivity", col= "blue", pch = 19, cex = 1, lty = "solid", lwd = 2) text(storage$results$semcoh, storage$results$exclus, labels=storage$results$K, cex= 1, pos=4) ##################################################### # The held-out likelihood - OPTIONAL ##################################################### # The document-completion held-out likelihood method: it is the estimation of the probability of words appearing # within a document when those words have been removed from the document in the estimation step. # The basic idea is the following one: 1) let's hold out some fraction of the words in a set of documents; # 2) let's train the model and then 3) let's use the document-level latent variables to evaluate the probability of the heldout portion. # Similar to cross-validation, when some of the data is removed from estimation and then later used for # validation, the held-out likelihood helps the user assess the model’s prediction performance. # The higher the likelihood, the better! # So compare this graph and the previous one to arrive to a conclusion about the number of k to select plot(storage$results$K, storage$results$heldout, xlab= "Number of Topics", ylab= "held-out likelihood", col= "blue", pch = 19, cex = 1, lty = "solid", lwd = 2, xaxt="n") text(storage$results$K, storage$results$heldout, labels=storage$results$K, cex= 1, pos=4) xtick<-seq(3, 5, by=1) axis(side=1, at=xtick, labels = FALSE) text(x=xtick, par("usr")[3], labels = xtick, pos = 1, xpd = TRUE) # When init.type="Spectral" and K=0 the number of topics is set using the algorithm in Lee and Mimno (2014). # This method does not estimate the "true" number of topics and does not necessarily have any particular statistical # properties for consistently estimating the number of topics. It can however provide a useful starting point. # The added randomness from the projection means that the algorithm is not deterministic # as with the standard "Spectral" initialization type. Running it with a different seed can result in not only different # results but a different number of topics. # Do not run in the Lab!!! # library(geometry) # library(Rtsne) # library(rsvd) # set.seed(123) # system.time(total <- stm(DfmStm $documents, DfmStm $vocab, K=0, max.em.its = 500, # prevalence = ~ Party +Year, data = DfmStm $meta, init.type = "Spectral")) # labelTopics(total) ######################################### ##### graphic extensions about STM. There are several of them! ##### Take a look at library(stmCorrViz) and library(stminsights) among the others ######################################### # Let's see two of them: # First: LDAvis # for more info about LDAvis: https://nlp.stanford.edu/events/illvi2014/papers/sievert-illvi2014.pdf library(LDAvis) library(servr) toLDAvis(mod=stmFitted, docs=DfmStm $documents) # Distance between the topics is an approximation of semantic relationship between the topics # The topic which shares common words will be overlapping (closer in distance) in comparison # to the non-overlapping topic # On selecting a topic (clicking on a topic bubble), top 30 words (with the red-shaded area) are shown # Finally, decreasing the lambda parameter increases the weight of the ratio of the frequency of word given a specific topic. # Moving the slider allows to adjust the rank of terms based on much discriminatory (or "relevant") are for the # specific topic: try to move lambda from 1 to 0 and check the results! # Second: stmBrowser library(stmBrowser) str(DfmStm) str(nyt2) DfmStm $meta$docs <- nyt2$text str(DfmStm $meta) stmBrowser(stmFitted, data= DfmStm $meta, c("repub", "year"), text="docs", labeltype = "frex") # you have an error! Why? cause in the original nyt2$text you have some characters that are not valid UTF-8 # This creates probelm when transforming the object to JSON (using the rjson package). # Therefore: let's use iconv nyt2$text2 <- iconv(nyt2$text, "UTF-8", "UTF-8",sub='') str(nyt2) DfmStm $meta$docs <- nyt2$text2 str(DfmStm $meta) stmBrowser(stmFitted, data= DfmStm $meta, c("repub", "datetime"), text="docs", labeltype = "frex") # as an example: select in the HTML file you produced "X-axis=Topic 10", "Y-axis=datetime", "Radius=none", "Color=repub"