rm(list=ls(all=TRUE))
setwd("C:/Users/luigi/Dropbox/TOPIC MODEL")
getwd()

library(quanteda)
library(readtext)
library(ggplot2)
library(stm)
library(igraph)
library(dplyr)

################################################################
# Let's estimate a STM
################################################################

# We will now use a dataset that contains the lead paragraph of around 5,000 articles about the economy
# published in the New York Times between 1980 and 2014.

nyt <- readtext("nyt.csv", text_field = "lead_paragraph")
str(nyt)
nyt$datetime <- as.Date(nyt$datetime, "%m/%d/%Y") # let's convert the date string into a Date object
str(nyt)

# The original nyt$text contains some characters that are not valid UTF-8.
# Therefore, let's use iconv:
nyt$text <- iconv(nyt$text, "", "UTF-8")

# Let's create a new variable according to the party of the president in charge during the period considered
year <- as.numeric(substr(nyt$datetime, 1, 4)) # let's extract the year (i.e., the first four characters of the date)
table(year)
repub <- ifelse(year %in% c(1981:1992, 2000:2008), 1, 0) # 1 for Republicans; 0 for Democrats
table(repub)
nyt$repub <- repub
nyt$year <- year
str(nyt)

# Let's extract a random sample of 1,000 texts (to make the computation of the STM faster in the Lab!)
set.seed(123)
nyt2 <- sample_n(nyt, 1000)
str(nyt2)

# Let's create a new document-level variable "text2" storing the text of each document. It will be useful later:
# among other things, it allows you to use the findThoughts command below even if, after trimming the dfm,
# you have deleted some rows.
nyt2$text2 <- nyt2$text

myCorpus <- corpus(nyt2)
head(summary(myCorpus))

tok2 <- tokens(myCorpus, remove_punct = TRUE, remove_numbers = TRUE,
               remove_symbols = TRUE, split_hyphens = TRUE, remove_separators = TRUE)
tok2 <- tokens_remove(tok2, stopwords("en"))
tok2 <- tokens_wordstem(tok2)
myDfm <- dfm(tok2)

# Some form of trimming is usually suggested when you have many documents and you want to run a STM
# (exactly as with any topic model). In our case, we are going to proceed with our original dfm (i.e., myDfm).
# REMEMBER: if you trim the dfm, be sure that none of the documents (i.e., rows) in the dfm contains only 0s!
myDfm[ntoken(myDfm) == 0, ]
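# A hedged sketch (NOT run here, since we keep the full myDfm): if you did decide to trim, one possible recipe
# is to keep only features appearing in a minimum share of documents (the 2% threshold below is purely
# illustrative), drop any document left with 0 tokens, and then use the trimmed dfm (instead of myDfm)
# in the conversion step right below.
# myDfmTrim <- dfm_trim(myDfm, min_docfreq = 0.02, docfreq_type = "prop") # illustrative threshold
# myDfmTrim <- dfm_subset(myDfmTrim, ntoken(myDfmTrim) > 0)               # drop now-empty documents
# DfmStmTrim <- convert(myDfmTrim, to = "stm", docvars = docvars(myDfmTrim))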
# Convert the dfm from quanteda to STM: this is a crucial step.
# When converting the dfm, you also have to identify the document variables that are present in the corpus.
# In our case: datetime, repub, year
head(docvars(myCorpus))
head(docvars(myDfm))
DfmStm <- convert(myDfm, to = "stm", docvars = docvars(myDfm))
str(DfmStm)
str(DfmStm$meta)

################################################################
# STM run
################################################################

# 1) We want to extract 15 topics (i.e., K=15) - but remember: this is just a toy example!
# As always, we should first investigate which would be a "good" number of topics to extract.
# 2) We want to control for the following covariates affecting topic prevalence: repub and year.
# We therefore need to list such variables in the command below via "prevalence = ~ repub + s(year)".
# Note that by writing "s(year)" we use a spline function for non-linear transformations of the time variable.
# If you want just a linear relationship between "year" and topics, write "prevalence = ~ repub + year".
# We could also have run a model with covariates affecting topical content (see below for an example).
# While including more covariates in topic prevalence will rarely affect the speed of the model,
# including additional levels of the content covariates (topical content) can make the model much slower to converge.
# This is because the model then operates in the much higher-dimensional space of words in the vocabulary
# (which tend to be in the thousands) as opposed to topics.
# 3) Below we employ the "Spectral" initialization, which is deterministic and globally consistent under
# reasonable conditions. This means that, no matter which seed is set, the SAME results will be generated.
# As an alternative we could employ an LDA initialization. In this case, however, the results you get out of
# the estimation procedure may depend on the starting values of the parameters (e.g., the distribution over
# words for a particular topic) - as is always the case with LDA (remember?).
# So, in this latter case, always remember to set a seed to replicate your analysis.
# You can also relax the maximum number of iterations if you want, but that will take (much) more time,
# especially with large datasets. Note that 75 is a very small number! We use it here just to save time;
# the default value is 500.

str(DfmStm)
system.time(stmFitted <- stm(DfmStm$documents, DfmStm$vocab, K = 15, max.em.its = 75,
                             prevalence = ~ repub + s(year), data = DfmStm$meta,
                             init.type = "Spectral")) # around 36 seconds on my laptop

##############################################################################################################
# NB: running a model with or without covariates for topic prevalence produces, of course, different results;
# the same is true if you run a model with covariates just for the topic prevalence part, or for both the
# topic prevalence part and the topical content part.
##############################################################################################################

################################################################
# Interpreting the STM by plotting and inspecting results
################################################################

# Summary visualization: understanding the extracted topics through words and example documents

# First: which words/topic association?
labelTopics(stmFitted, n = 7) # 7 features for each topic

# The frequency/exclusivity (FREX) scoring summarizes words according to their probability of appearance under
# a topic and their exclusivity to that topic.
# As a result, FREX words quite often provide more semantically intuitive representations of a given topic.
# In more detail: the FREX score of a word is computed as the harmonic mean of the word's rank in terms of
# exclusivity and frequency.
# [Harmonic mean? The harmonic mean is the reciprocal of the arithmetic mean of the reciprocals of the given
# set of observations. Say you have 2, 3 and 4: the harmonic mean is 3/(1/2+1/3+1/4).]
# The harmonic mean is attractive here because it does not allow a high rank along one dimension to compensate
# for a lower rank along the other (as happens with an arithmetic mean). Thus words with a high FREX score must
# be high along both dimensions: exclusivity AND frequency.
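# Quick numerical check of the harmonic-mean example above (plain base R, nothing stm-specific):
# for the values 2, 3 and 4 the harmonic mean is pulled towards the smallest value, unlike the arithmetic mean.
3 / (1/2 + 1/3 + 1/4) # harmonic mean, ~ 2.77
mean(c(2, 3, 4))      # arithmetic mean, = 3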
labelTopics(stmFitted, n = 7, topics = 1) # the same, just for topic 1

plot(stmFitted, type = "labels", labeltype = c("frex"), n = 5)  # plot just the FREX words
plot(stmFitted, type = "summary", labeltype = c("frex"), n = 5) # topic 6 is the most frequent one
# Topic meaning: according to the FREX words, topic 6 seems to be related to the trend in the economy;
# topic 13 to politics; topic 10 to inflation; etc.
plot(stmFitted, type = "hist", labeltype = c("frex"))
# Here topic 6 appears more "evenly" distributed across documents than, for example, topic 10.

# Let's read the documents most associated with each topic.
# In particular, let's identify the most representative documents for a particular topic.
# We can also use this to get a better sense of the content of actual documents with a high topical content.
# Let's focus on topic 6, for example: let's identify the 3 texts with the highest theta for that topic
# and read the first 200 characters of each of them.
str(DfmStm$meta)
text <- DfmStm$meta$text2
str(text)
# let's keep only the first 200 characters of each article
text2 <- substr(text, start = 1, stop = 200)
str(text2)
thought6 <- findThoughts(stmFitted, texts = text2, topics = 6, n = 3)
str(thought6)
plotQuote(thought6$docs[[1]], width = 30, main = "Topic 6 - Economy")

# Let's focus on topic 13 and do the same
thought13 <- findThoughts(stmFitted, texts = text2, topics = 13, n = 3)
plotQuote(thought13$docs[[1]], width = 30, main = "Topic 13 - Politics")

# Let's plot topics 6 and 13 together
par(mfrow = c(1, 2), mar = c(.5, .5, 1, .5))
plotQuote(thought6$docs[[1]], width = 30, main = "Topic 6 - Economy")
plotQuote(thought13$docs[[1]], width = 30, main = "Topic 13 - Politics")

# or, alternatively:
thought6_13 <- findThoughts(stmFitted, texts = text2, topics = c(6, 13), n = 3)
str(thought6_13)
par(mfrow = c(1, 2), mar = c(.5, .5, 1, .5))
plotQuote(thought6_13$docs[[1]], width = 30, main = "Topic 6 - Economy")
plotQuote(thought6_13$docs[[2]], width = 30, main = "Topic 13 - Politics")

# You can also condition the query on other topics
thought6ALT <- findThoughts(stmFitted, texts = text2, topics = 6, n = 3, where = Topic13 > .2)
par(mfrow = c(1, 2), mar = c(.5, .5, 1, .5))
plotQuote(thought6$docs[[1]], width = 30, main = "Topic 6 - Economy")
plotQuote(thought6ALT$docs[[1]], width = 30, main = "Topic 6 - Economy (with extra condition)")

# Which is the most likely topic for each of our documents?
# Note: when you write "1", the apply function works over rows; "2" over columns. Here we want the highest
# theta over rows (i.e., the highest theta for each single document).
apply(stmFitted$theta, 1, which.max)
table(apply(stmFitted$theta, 1, which.max))
# You can then save this information as a document-level variable.
docvars(myDfm, 'STMtopic') <- apply(stmFitted$theta, 1, which.max)
head(docvars(myDfm))

# But we already know that STM actually computes a distribution over topics.
# In other words, each document is considered to be about a mixture of topics, as we have already discussed!
# This information is included in the theta matrix of the STM object.
# For example, document 1 is 44% about topic 9 and 34% about topic 7.
round(stmFitted$theta[1,], 2)
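# A hedged sketch (not part of the original script): rather than looking only at the single most likely topic,
# you can recover the two most prominent topics and their proportions for each document directly from theta.
# The object name "top2" is just illustrative.
top2 <- t(apply(stmFitted$theta, 1, function(p) {
  o <- order(p, decreasing = TRUE)[1:2]          # indices of the two largest thetas
  c(topic1 = o[1], prop1 = round(p[o[1]], 2),
    topic2 = o[2], prop2 = round(p[o[2]], 2))
}))
head(top2)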
################################################################
# Checking for topic correlation
################################################################

# topicCorr() estimates a topic correlation graph, which is then plotted via the igraph package as a network
# where nodes are topics and edges indicate a positive correlation. If there are no edges, there are only
# negative correlations!
mod.out.corr <- topicCorr(stmFitted)
set.seed(123)
plot(mod.out.corr)
# In our case topics 14 and 15 are positively correlated,
# i.e., when an article discusses the stock exchange (topic 14) there is a positive
# probability of it also discussing banks (topic 15)
plot(stmFitted, type = "summary", labeltype = c("frex"), n = 5)

#########################################
# Estimating topic prevalence using as IVs: repub and year
#########################################

# The estimateEffect function essentially performs a regression where topic proportions are the outcome
# variable, incorporating measurement uncertainty from the STM model. The method is stochastic, so if you
# want to replicate exactly the same results below, always set a seed first!
# The function can handle factors and numeric variables. Dates should be converted to numeric variables
# before analysis.
set.seed(123)
prep <- estimateEffect(1:15 ~ repub + s(year), stmFitted, meta = DfmStm$meta)
set.seed(123)
summary(prep)
# For example, there is a tendency to talk more about topic 15 under a Republican presidency.

# Methods used for plotting topic prevalence:
# a) "continuous" estimates how topic proportions vary over the support of a continuous covariate;
# b) "pointestimate" estimates mean topic proportions for each value of a categorical covariate;
# c) "difference" estimates the mean difference in topic proportions for two different values of the covariate
#    (cov.value1 and cov.value2 must be specified).

# Let's start with year (i.e., a continuous variable) and plot it!
# As you can see, topic 13 prevalence (politics) is stable but increases at the end (after the 2008 economic crisis)
plot(prep, "year", method = "continuous", topics = 13, model = stmFitted, printlegend = FALSE,
     xaxt = "n", xlab = "Year")
yrs <- seq(from = 1980, to = 2014)
axis(1, at = yrs)
title("Topic 13 - Politics")
abline(h = 0, col = "blue")

# The opposite happens with topic 10 (inflation) - although the decrease does not appear to be significant
plot(prep, "year", method = "continuous", topics = 10, model = stmFitted, printlegend = FALSE,
     xaxt = "n", xlab = "Year")
axis(1, at = yrs)
title("Topic 10 - Inflation")
abline(h = 0, col = "blue")

# Let's now move to repub (i.e., a categorical variable) for topic 10 (inflation) and topic 15 (bank system)
plot(stmFitted, type = "summary", labeltype = c("frex"), n = 5)
# The salience of topic 10 seems to be higher under a Democratic presidency (repub=0). The opposite happens
# for topic 15.
plot(prep, "repub", model = stmFitted, method = "pointestimate", topics = c(10, 15))

# Let's see whether the difference between the two coefficients is statistically different from 0.
# Always remember that you plot "cov.value1 - cov.value2". In this case (see the script below)
# it implies "coeff. of Republican - coeff. of Democratic", given that repub is a 0/1 variable
# and we define below cov.value1 = "1" (i.e., Republican) and cov.value2 = "0" (i.e., Democratic).
plot(prep, covariate = "repub", topics = c(10, 15), model = stmFitted, method = "difference",
     cov.value1 = "1", cov.value2 = "0",
     xlim = c(-1, 1), xlab = "More Democratic... More Republican",
     main = "Effect of Democratic vs. Republican Presidency",
     labeltype = "custom", custom.labels = c("Topic 10 - Inflation", "Topic 15 - Bank System"))
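# A hedged sketch: if you want the underlying regression table for a single topic programmatically (rather
# than reading the full printout of summary(prep) above), summary() of an estimateEffect object stores one
# coefficient table per topic in its "tables" element. The object name "prep_sum" is just illustrative.
prep_sum <- summary(prep)
prep_sum$tables[[15]] # coefficients for topic 15 (repub and the year spline terms), discussed above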
# And what if we plot all the topics together?
topic.labels <- paste("Topic", 1:15)
topic.labels
plot(prep, covariate = "repub", topics = c(1:15), model = stmFitted, method = "difference",
     cov.value1 = "1", cov.value2 = "0",
     xlim = c(-1, 1), xlab = "More Democratic... More Republican",
     main = "Effect of Democratic vs. Republican Presidency",
     labeltype = "custom", custom.labels = topic.labels)

#########################################
# Estimating topical content using as IV: repub - CHECK THIS PART!
#########################################

# Let's run a topical content analysis by including repub as a covariate.
# As currently implemented, the variable used for topical content must be a single variable which defines a
# discrete partition of the dataset (each document is in one and only one group - so only categorical
# variables can be employed).
# You would need around 4 minutes to run it, so let's load the rds where the model has already been fitted.
# system.time(stmFitted_topical <- stm(DfmStm$documents, DfmStm$vocab, K = 15, max.em.its = 75,
#                                      prevalence = ~ repub + s(year), content = ~ repub,
#                                      data = DfmStm$meta, init.type = "Spectral")) # around 258 seconds [4 mins]
# saveRDS(stmFitted_topical, file = "stmFitted_topicalALT2.rds")
stmFitted_topical <- readRDS("stmFitted_topicalALT2.rds")

# You can see that the results you get with and without the topical content covariate are somewhat different
table(apply(stmFitted$theta, 1, which.max))
table(apply(stmFitted_topical$theta, 1, which.max))
apply(stmFitted$theta, 2, mean)         # mean distribution of topics
apply(stmFitted_topical$theta, 2, mean) # mean distribution of topics
# high - but not perfect - correlation
cor(apply(stmFitted$theta, 2, mean), apply(stmFitted_topical$theta, 2, mean))

labelTopics(stmFitted_topical, n = 7) # 7 features for each topic
plot(stmFitted_topical, type = "summary", n = 5) # topic 6 is the most frequent one

# The next figures show how the NYT articles talk about topic 10 (inflation) and topic 13 (politics) under a
# Democratic or a Republican presidency.
# In the plot, words are sized proportionally to their use within the plotted topic-covariate combinations
# and oriented along the X-axis according to how much they favor one of the two configurations.
# Set a seed if you want to get always the same scatter of words in the graph.
set.seed(1235)
plot(stmFitted_topical, type = "perspectives", topics = 10)
# Intuition: when the NYT discusses inflation under a Democratic president, the issue is more linked to market
# forces; when it discusses the same topic under a GOP president, the issue is more linked to the banking system.
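# A hedged side note: for models estimated with a content covariate, the stm package also provides
# sageLabels(), which prints word labels broken down by covariate level (a text-based alternative to the
# perspectives plot; default settings are assumed here).
print(sageLabels(stmFitted_topical, n = 5))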
# Where does the result in the perspectives figure above come from? From comparing the difference in the betas
# attached to topic 10 in those documents wherein repub=1 vs. those documents wherein repub=0.

# Let's do the same thing for topic 13 (politics)
set.seed(123)
plot(stmFitted_topical, type = "perspectives", topics = 13)
# Intuition: when the NYT discusses politics under a Democratic president, the issue is more politicized;
# when it discusses the same topic under a GOP president, the issue is more linked to policies.

# This function can also be used to plot the contrast in words across two topics.
# This plot calculates the difference in probability of a word for the two selected topics, normalized by the
# maximum difference in probability of any word between the two topics (here topics 13 and 10).
plot(stmFitted_topical, type = "perspectives", topics = c(13, 10))

#########################################
# Model selection - REMEMBER: this is the very first thing you HAVE to do before running a STM!
#########################################

# Model search across numbers of topics: STM assumes a fixed, user-specified number of topics.
# There is no right answer to the number of topics that is appropriate for a given corpus, but
# the searchK function uses a data-driven approach to selecting the number of topics.
# The function performs several automated tests to help choose the number of topics.
# For example, one could estimate a STM model for 3, 4 and 5 topics (just as an example!)
# and compare the results along each of the criteria.
# But REMEMBER: nothing replaces your interpretation!
# Always add a set.seed here, even though you employ a Spectral initialization, because in this case there is
# a probabilistic part of the analysis. Why? Because the function also calculates the held-out likelihood,
# and therefore a proportion of the documents is held out at random.
# FINALLY: always apply to searchK the same model you want to use later on, i.e., if you want to use a model
# with a given "prevalence" formula, you have to specify that "prevalence" also under searchK.
set.seed(02138)
K <- c(3:7)
system.time(storage <- searchK(DfmStm$documents, DfmStm$vocab, K, max.em.its = 75,
                               prevalence = ~ repub + s(year), data = DfmStm$meta,
                               init.type = "Spectral")) # around 1:30 minutes

#####################################################
# The semantic coherence-exclusivity frontier
#####################################################

# You should focus, as we did in the topic-model class, on those models that lie on the semantic
# coherence-exclusivity "frontier", that is, where no model strictly dominates another in terms of semantic
# coherence and exclusivity (i.e., models with average scores towards the upper-right side of the plot).
# The figure below identifies the model with 5 topics as the one that best satisfies the "frontier" criterion,
# i.e., a model with desirable properties in both dimensions - IF we focus on K=(3:7), of course.
plot(storage$results$semcoh, storage$results$exclus, xlab = "Semantic coherence", ylab = "Exclusivity",
     col = "blue", pch = 19, cex = 1, lty = "solid", lwd = 2)
text(storage$results$semcoh, storage$results$exclus, labels = storage$results$K, cex = 1, pos = 4)

# or, if you want to plot the results via ggplot:
results <- data.frame(Coherence = unlist(storage$results$semcoh),
                      Exclusivity = unlist(storage$results$exclus),
                      K = unlist(storage$results$K))
results
ggplot(results, aes(x = Coherence, y = Exclusivity)) +
  geom_point() +
  geom_text(label = results$K, vjust = -1) +
  ylab(label = "Exclusivity") +
  xlab("Semantic Coherence") +
  theme_light()
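# Besides semantic coherence and exclusivity, searchK also stores, for each K, the held-out likelihood, the
# residuals and the approximate lower bound. A minimal sketch using the package's default diagnostic plot
# (it assumes the "storage" object created by the searchK call above):
plot(storage)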
# When init.type = "Spectral" and K = 0, the number of topics is set using the algorithm in Lee and Mimno (2014).
# This method does not estimate the "true" number of topics and does not necessarily have any particular
# statistical properties for consistently estimating the number of topics. It can, however, provide a useful
# starting point.
# Do not run in the Lab!!!
# library(geometry)
# library(Rtsne)
# library(rsvd)
# set.seed(123)
# system.time(total <- stm(DfmStm$documents, DfmStm$vocab, K = 0, max.em.its = 75,
#                          prevalence = ~ repub + s(year), data = DfmStm$meta, init.type = "Spectral"))
# labelTopics(total)
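# A hedged follow-up (kept commented out, like the K=0 model itself): if you do estimate the K=0 model outside
# the Lab, a quick way to see how many topics the Lee and Mimno (2014) initialization actually selected is to
# count the columns of theta (this assumes the fitted object is called "total", as in the commented code above).
# ncol(total$theta)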