rm(list = ls(all = TRUE))
getwd()
# with setwd() you change the current working directory of the R process
setwd("C:/Users/mw/Dropbox (Personal)/TOPIC MODEL")
getwd()

library(quanteda)
library(readtext)
library(stm)
library(ggplot2)

################################################################
# Example with U.S. Presidential Inaugural Speeches since the 1970s
################################################################

summary(data_corpus_inaugural)

# We can add some document-level variables - what quanteda calls docvars - to this corpus.
meta.pr <- read.csv("C:/Users/mw/Dropbox (VOICES)/TOPIC MODEL/List President USA party.csv", header = TRUE)
str(meta.pr)
levels(meta.pr$Party)
table(meta.pr$Party)

# Suggestion: if you want to include in a corpus a factor variable that you are going to use as a
# covariate in an STM, make sure that all the levels of that categorical variable will actually be
# used in the analysis. This is NOT what happens in our case: by focusing only on speeches delivered
# after 1970 in the example below, we will have 0 observations for the levels Democratic-Republican,
# Federalist, Unaffiliated, and Whig! So let's keep only the levels for Democratic and Republican!
levels(meta.pr$Party) <- list(Democratic = "Dem", Republican = "Rep")
levels(meta.pr$Party)
table(meta.pr$Party)

docvars(data_corpus_inaugural, "Party") <- meta.pr$Party
summary(data_corpus_inaugural)

# inspect the document-level variables (a very important option, as we will see...)
head(docvars(data_corpus_inaugural))

myCorpus <- corpus_subset(data_corpus_inaugural, Year > 1970)
summary(myCorpus)

myDfm <- dfm(myCorpus, remove = stopwords("english"), tolower = TRUE, stem = TRUE,
             remove_punct = TRUE, remove_numbers = TRUE)
topfeatures(myDfm, 20)  # 20 top words

# If you want, you can decide to keep only words occurring >= 10 times and in >= 2 documents.
# This is highly recommended when you have many documents and you want to run an STM.
myDfm.trim <- dfm_trim(myDfm, min_termfreq = 10, min_docfreq = 2)

# Or just drop the words that occur either too frequently (in more than 99% of documents)
# or too infrequently (in less than 1% of documents)
myDfm.trim2 <- dfm_trim(myDfm, max_docfreq = 0.99, min_docfreq = 0.01)
topfeatures(myDfm.trim2, 20)  # 20 top words

# Convert the dfm from quanteda to STM: this is a crucial step!!!
# Note that you also have to list the document variables that are present in the corpus.
# In this case we keep all the words for simplicity (no trimming!)
Dfmstm2 <- convert(myDfm, to = "stm", docvars = docvars(myCorpus))

################################################################
# STM run
################################################################
# 1) we want to extract 5 topics;
# 2) we want to control for the following covariates affecting topic prevalence: Party and Year.
#    We could also have run, at the same time, a model with covariates affecting topical content
#    (see below for an example). While including more covariates in topic prevalence will rarely
#    affect the speed of the model, including additional levels of the content covariates (topical
#    content) can make the model much slower to converge. This is because the model operates in the
#    much higher-dimensional space of words in the dictionary (which tend to number in the thousands)
#    as opposed to topics. That's why I suggest you first run a model without covariates for
#    topical content;
# 3) you can also run a model without covariates. In this case you have an unstructured topic model,
#    i.e., you fit a Correlated Topic Model (see below for an example);
# 4) it is highly recommended to employ the "Spectral" initialization, based on the method of moments,
#    which is deterministic and globally consistent under reasonable conditions. This means that no
#    matter which seed is set, the SAME results will be generated. As an alternative you could employ
#    an LDA initialization. In that case, however, the answers the estimation procedure comes up with
#    may depend on the starting values of the parameters (e.g., the distribution over words for a
#    particular topic). So in that latter case, always remember to set a seed to make your analysis
#    replicable (see the sketch right below)!
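# Do not run in the Lab - a minimal sketch of the LDA initialization mentioned in point 4
# (the seed value 12345 is arbitrary and purely illustrative; it only matters for replicability):
# stmFitted_lda <- stm(Dfmstm2$documents, Dfmstm2$vocab, K = 5, max.em.its = 75,
#                      prevalence = ~ Party + Year, data = Dfmstm2$meta,
#                      init.type = "LDA", seed = 12345)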
str(Dfmstm2)
stmFitted <- stm(Dfmstm2$documents, Dfmstm2$vocab, K = 5, max.em.its = 75,
                 prevalence = ~ Party + Year, data = Dfmstm2$meta, init.type = "Spectral")

################################################################
# Interpreting the STM by plotting and inspecting results
################################################################

# Summary visualization: understanding the extracted topics through words and example documents.
# First: which words are associated with each topic?
labelTopics(stmFitted)
plot(stmFitted)
# note that these graphs also show you the frequency distribution of topics across the 12 documents
plot(stmFitted, type = "summary", labeltype = c("frex"))
plot(stmFitted, type = "hist", labeltype = c("frex"))
plot(stmFitted, type = "labels", labeltype = c("frex"))
cloud(stmFitted, topic = 3)

# Let's read the documents most associated with each topic
docs <- texts(myCorpus)
str(docs)  # you cannot read it properly!
thought <- findThoughts(stmFitted, texts = docs, topics = 2, n = 3)$docs[[1]]
par(mfrow = c(1, 2), mar = c(.5, .5, 1, .5))
plotQuote(thought, width = 30, main = "Topic 2")

# keep only the first 200 characters from each document and display them!
docs2 <- substr(docs, start = 1, stop = 200)
thought2 <- findThoughts(stmFitted, texts = docs2, topics = 2, n = 3)$docs[[1]]
par(mfrow = c(1, 2), mar = c(.5, .5, 1, .5))
plotQuote(thought2, width = 30, main = "Topic 2")
thought5 <- findThoughts(stmFitted, texts = docs2, topics = 5, n = 3)$docs[[1]]
par(mfrow = c(1, 2), mar = c(.5, .5, 1, .5))
plotQuote(thought5, width = 30, main = "Topic 5")
par(mfrow = c(1, 2), mar = c(.5, .5, 1, .5))
plotQuote(thought2, width = 30, main = "Topic 2")
plotQuote(thought5, width = 30, main = "Topic 5")

# but to whom do those speeches belong?
str(stmFitted$theta)
theta <- as.data.frame(stmFitted$theta)
str(theta)
combine <- paste(docvars(myCorpus, "President"), docvars(myCorpus, "Year"), sep = " ")
combine
rownames(theta) <- combine
combine

# which is the document with the highest value of Topic 2?
rownames(theta)[which.max(theta$V2)]
# which are the documents with the highest value for each topic?
rownames(theta)[apply(theta, 2, which.max)]
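# A quick complementary check (a minimal sketch using base R on the objects created above):
# average topic proportions by party, computed from the theta matrix and the model's metadata.
aggregate(theta, by = list(Party = Dfmstm2$meta$Party), FUN = mean)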
# and the 2nd and 3rd largest ones?
maxn <- function(n) function(x) order(x, decreasing = TRUE)[n]
rownames(theta)[apply(theta, 2, maxn(1))]
rownames(theta)[apply(theta, 2, maxn(2))]
rownames(theta)[apply(theta, 2, maxn(3))]

# I want to read the Obama speech of 2009 to better understand the content of Topic 4
Obama <- texts(myCorpus)[10]
strwrap(Obama[[1]])

################################################################
# Additional tools for interpretation and visualization
################################################################

# Use the topic correlation graph estimated by topicCorr and the igraph package to plot a network
# where nodes are topics and edges indicate a positive correlation.
# If there are no edges, there are only negative correlations!
library(igraph)
mod.out.corr <- topicCorr(stmFitted)
plot(mod.out.corr)
mod.out.corr$cor

library(corrplot)
corrplot(mod.out.corr$cor, method = "circle", type = "lower")
corrplot(mod.out.corr$cor, method = "circle", type = "upper")
corrplot(mod.out.corr$cor, method = "pie")
corrplot(mod.out.corr$cor, method = "color")
corrplot(mod.out.corr$cor, method = "number")

#########################################
# Estimating topic prevalence
#########################################

prep <- estimateEffect(1:5 ~ Party + Year, stmFitted, meta = Dfmstm2$meta, uncertainty = "Global")

# Methods used for plotting topic prevalence:
# a) "continuous" estimates how topic proportions vary over the support of a continuous covariate;
# b) "pointestimate" estimates mean topic proportions for each value of a categorical covariate;
# c) "difference" estimates the mean difference in topic proportions for two different values of the
#    covariate (cov.value1 and cov.value2 must be specified).
plot(prep, "Year", method = "continuous", topics = 5, model = stmFitted,
     printlegend = FALSE, xaxt = "n", xlab = "Year")
yearseq <- seq(from = 1970, to = 2017)  # renamed: do not mask base::seq()
axis(1, at = yearseq)
title("Topic 5")
abline(h = 0, col = "blue")

plot(prep, "Party", model = stmFitted, method = "pointestimate")

plot(prep, covariate = "Party", topics = c(1, 2, 3, 4, 5), model = stmFitted,
     method = "difference", cov.value1 = "Democratic", cov.value2 = "Republican",
     xlim = c(-1, 1), xlab = "More Republican... More Democratic",
     main = "Effect of Democratic vs. Republican", labeltype = "custom",
     custom.labels = c("Topic 1", "Topic 2", "Topic 3", "Topic 4", "Topic 5"))

# Plotting covariate interactions
stmFitted_interaction <- stm(Dfmstm2$documents, Dfmstm2$vocab, K = 5, max.em.its = 75,
                             prevalence = ~ Party * Year, data = Dfmstm2$meta,
                             init.type = "Spectral")
prep <- estimateEffect(c(5) ~ Party * Year, stmFitted_interaction,
                       meta = Dfmstm2$meta, uncertainty = "Global")
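# The covariate effects estimated by estimateEffect can also be inspected numerically:
# summary() on an estimateEffect object prints regression-style tables
# (here for the interaction model just fitted).
summary(prep)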
# This plots the interaction between Year and Party (Democratic versus Republican). Topic 5
# prevalence is plotted as a linear function of Year, holding Party at either Democratic or
# Republican. Were other variables included in the model, they would be held at their sample medians.
# As you can see, the effect of Year on Topic 5 prevalence holds ONLY for Republican Presidents.
plot(prep, covariate = "Year", model = stmFitted_interaction, method = "continuous",
     xlab = "Year", moderator = "Party", moderator.value = "Democratic",
     linecol = "blue", ylim = c(-1, 1), printlegend = FALSE)
plot(prep, covariate = "Year", model = stmFitted_interaction, method = "continuous",
     xlab = "Year", moderator = "Party", moderator.value = "Republican",
     linecol = "red", add = TRUE, printlegend = FALSE)
legend(2000, -0.7, c("Democratic", "Republican"), lwd = 2, col = c("blue", "red"))
abline(h = 0, col = "black")

#########################################
# Estimating topical content
#########################################

# Let's run a topical content analysis by including Party as a covariate.
# As currently implemented, the variable used for topical content must be a single variable that
# defines a discrete partition of the dataset (each document is in one and only one group).
stmFitted_content <- stm(Dfmstm2$documents, Dfmstm2$vocab, K = 5, max.em.its = 75,
                         prevalence = ~ Party + Year, content = ~ Party,
                         data = Dfmstm2$meta, init.type = "Spectral")

# This figure shows how Democratic and Republican Presidents talk about Topic 5 differently
plot(stmFitted_content, type = "perspectives", topics = 5)

# This function can also be used to plot the contrast in words across two topics.
# The plot calculates the difference in probability of a word for the two topics, normalized by the
# maximum difference in probability of any word between the two topics.
plot(stmFitted, type = "perspectives", topics = c(2, 5))
plot(stmFitted, type = "perspectives", labeltype = c("frex"), topics = c(2, 5))

#########################################
# Model selection
#########################################

# Model search across numbers of topics: STM assumes a fixed, user-specified number of topics.
# There is no right answer to the number of topics that is appropriate for a given corpus, but
# the function searchK uses a data-driven approach to selecting the number of topics.
# The function performs several automated tests to help choose the number of topics.
# For example, one could estimate an STM model for 3, 4, and 5 topics and compare the results
# along each of the criteria. But REMEMBER: nothing replaces your own interpretation!!!!
str(Dfmstm2)
str(Dfmstm2$vocab)
str(Dfmstm2$documents)
K <- c(3, 4, 5)
# You can also relax the maximum number of required iterations if you want, but that would require
# (much) more time, especially with large datasets.
storage <- searchK(Dfmstm2$documents, Dfmstm2$vocab, K, max.em.its = 500,
                   prevalence = ~ Party + Year, data = Dfmstm2$meta, init.type = "Spectral")
str(storage)

# Important: whenever the documents you want to analyze are pretty long, and whenever you employ the
# searchK command, it is advisable to also add this argument: N = floor(0.2 * length(Dfmstm2$documents)),
# where Dfmstm2$documents are the documents that you want to analyze. See below for an example!
# Do not run in the Lab:
# storage2 <- searchK(Dfmstm2$documents, Dfmstm2$vocab, K, data = Dfmstm2$meta, max.em.its = 500,
#                     N = floor(0.2 * length(Dfmstm2$documents)), prevalence = ~ Party + Year,
#                     init.type = "Spectral")
# str(storage2)
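# stm also ships a plot method for searchK objects, displaying the held-out likelihood, residuals,
# semantic coherence, and lower bound across the candidate values of K - a quick complementary
# diagnostic to the manual frontier plot below.
plot(storage)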
# You should focus on those models that lie on the semantic coherence-exclusivity 'frontier', that
# is, where no model strictly dominates another in terms of semantic coherence and exclusivity
# (i.e., models with average scores towards the upper-right side of the plot).
# The figure below identifies the 4-topic model as the one that best satisfies the 'frontier'
# criterion, i.e., a model with desirable properties in both dimensions.
plot(storage$results$semcoh, storage$results$exclus, xlab = "Semantic coherence",
     ylab = "Exclusivity", col = "blue", pch = 19, cex = 1, lty = "solid", lwd = 2)
text(storage$results$semcoh, storage$results$exclus, labels = storage$results$K, cex = 1, pos = 4)

# When init.type = "Spectral" and K = 0, the number of topics is set using the algorithm in
# Lee and Mimno (2014). This method does not estimate the "true" number of topics and does not
# necessarily have any particular statistical properties for consistently estimating the number of
# topics. It can, however, provide a useful starting point. The added randomness from the projection
# means that the algorithm is not deterministic, unlike the standard "Spectral" initialization.
# Running it with a different seed can result not only in different results but in a different
# number of topics.
# Do not run in the Lab:
# total <- stm(Dfmstm2$documents, Dfmstm2$vocab, K = 0, max.em.its = 500,
#              prevalence = ~ Party + Year, data = Dfmstm2$meta, init.type = "Spectral",
#              seed = 8458159)
# labelTopics(total)

# In addition to these functions, one can also explore whether there are words that are extremely
# highly associated with a single topic via the checkBeta function. The output gives the user lists
# of which topics have problems, which words in which topics have problems, as well as a count of
# the total problems in topics and the total number of problem words.
# In our case, we do not have this kind of problem.
check <- checkBeta(stmFitted, tolerance = 0.01)
str(check)

#########################################
########## Topic model (LDA without correlations)
#########################################

library(quanteda.corpora)
library(lubridate)
library(topicmodels)

news_corp <- download('data_corpus_guardian')
# This corpus contains 6,000 Guardian news articles from 2012 to 2016.
# We select only news stories published in 2016 using corpus_subset().
news_corp <- corpus_subset(news_corp, year(docvars(news_corp, 'date')) >= 2016)
ndoc(news_corp)

# Further, after removing function words and punctuation in dfm(), we use dfm_trim() to keep only
# features in the top 5% by term frequency (min_termfreq = 0.95, quantile) that appear in less than
# 10% of all documents (max_docfreq = 0.1, proportion), in order to focus on common but
# distinguishing features.
news_dfm <- dfm(news_corp, remove = stopwords("english"), tolower = TRUE, stem = TRUE,
                remove_punct = TRUE, remove_numbers = TRUE)
news_dfm <- dfm_remove(news_dfm, c('*-time', '*-timeUpdated', 'GMT', 'BST'))
news_dfm <- dfm_trim(news_dfm, min_termfreq = 0.95, termfreq_type = "quantile",
                     max_docfreq = 0.1, docfreq_type = "prop")
news_dfm[1:2, 1:5]

# 20 top words
topfeatures(news_dfm, 20)

# keep only documents with a number of tokens > 0
news_dfm <- news_dfm[ntoken(news_dfm) > 0, ]

# quanteda does not implement its own topic models, but you can easily access LDA() from the
# topicmodels package through convert(). k = 5 specifies the number of topics to be discovered.
dtm <- convert(news_dfm, to = "topicmodels")
lda <- LDA(dtm, k = 5)

# You can extract the most important terms from the model using terms().
terms(lda, 5)

# You can then obtain the most likely topics using topics() and save them as a document-level
# variable.
docvars(news_dfm, 'topic') <- topics(lda)
head(topics(lda), 20)
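# Besides the single most likely topic from topics(), the topicmodels package can return the full
# document-topic probability matrix via posterior() - roughly the LDA analogue of STM's theta
# (a minimal sketch on the lda object fitted above):
lda_theta <- posterior(lda)$topics  # one row per document, one column per topic
head(lda_theta)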
# Compare the results with STM: which of the two makes more sense to you?
# Have them do this exercise at home!
Dfmstm_guardian <- convert(news_dfm, to = "stm")
stmFitted_guardian <- stm(Dfmstm_guardian$documents, Dfmstm_guardian$vocab, K = 5,
                          max.em.its = 75, init.type = "Spectral")
labelTopics(stmFitted_guardian)
terms(lda, 5)

#########################################
##### Graphic extensions
#########################################

# Generate an STM correlation tree.
# This function generates an interactive, full-model HTML visualization of topic hierarchies
# (using hierarchical clustering) for a fitted STM model. The visualization highlights the
# correlations among topics and can be used to view the model at differing levels of complexity.
# The visualization takes the form of an indented tree. The leaves of the tree correspond to topics,
# and the leaf nodes are grouped in topic clusters. This allows the model to be visualized at
# differing levels of aggregation.
library(stmCorrViz)
stmCorrViz(stmFitted, "corrviz.html")
labelTopics(stmFitted)

# to show also the example documents for each topic
stmCorrViz(stmFitted, "corrviz.html", documents_raw = texts(myCorpus),
           documents_matrix = Dfmstm2$documents)
# with the topics that you get when K = 0
# stmCorrViz(total, "corrviz.html", documents_raw = texts(myCorpus),
#            documents_matrix = Dfmstm2$documents)

# Better to call this library after an STM with only topic prevalence (and not topical content)
library(stmBrowser)
Dfmstm2$meta$docs <- texts(myCorpus)
str(Dfmstm2$meta)
stmBrowser(stmFitted, data = Dfmstm2$meta, covariates = c("Year", "Party"),
           text = "docs", labeltype = "frex")
# In the HTML file you produced, select "X-axis = Topic 4", "Y-axis = Year", "Radius = none",
# "Color = party"; by doing that you can clearly see that Topic 4 is more popular, on average,
# among Democratic Presidents [exactly as found in the STM analysis above!].
# If you select "Year" on the Y-axis, however, you only see some numbers, because "Year" in the
# metadata is a numeric, not a date. Let's then create a date out of the numeric Year variable
# (and call it "Year2"), and use it as well!
Dfmstm2$meta$Year2 <- Dfmstm2$meta$Year
Dfmstm2$meta$Year2 <- as.character(Dfmstm2$meta$Year2)
str(Dfmstm2$meta)
paste(Dfmstm2$meta$Year2, "-01", "-01", sep = "")
Dfmstm2$meta$Year2 <- as.Date(paste(Dfmstm2$meta$Year2, "-01", "-01", sep = ""))
str(Dfmstm2$meta)
# now if you select "Year2" on the Y-axis you see the year, not numbers!
stmBrowser(stmFitted, data = Dfmstm2$meta, covariates = c("Year2", "Year", "Party"),
           text = "docs", labeltype = "frex")

# A third possible graphic visualization is via LDAvis.
# To learn more about this last visualization, take a look at:
# https://rdrr.io/cran/stm/man/toLDAvis.html
library(LDAvis)
library(servr)
toLDAvis(mod = stmFitted, docs = Dfmstm2$documents)
# Topics are displayed according to their frequency, so the numbering in LDAvis may not match the
# STM numbering: for instance, our Topic 4 is Topic 5 in toLDAvis.
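# A minimal sketch, in case you want to save the LDAvis output to a folder instead of only opening
# it in the browser (the directory name "ldavis_out" is arbitrary):
# toLDAvis(mod = stmFitted, docs = Dfmstm2$documents, out.dir = "ldavis_out", open.browser = FALSE)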