# NOTE(review): wiping the workspace and switching to a hard-coded,
# machine-specific working directory makes this script non-portable.
# Both calls are kept so the script's existing side effects are unchanged.
rm(list = ls(all.names = TRUE))
getwd()
setwd("C:/Users/luigi/Dropbox/TOPIC MODEL")
getwd()

library(readtext)
library(quanteda)
library(quanteda.textstats)

#########################################################################
# Extra: Plotting a dendrogram on relative dissimilarities between texts
#########################################################################

# Keep only the inaugural speeches delivered after 1980.
corpus_Pres <- corpus_subset(data_corpus_inaugural, Year > 1980)

# Tokenise, dropping punctuation, numbers, symbols and separators, and
# splitting hyphenated words into their parts.
tok2 <- tokens(
  corpus_Pres,
  remove_punct = TRUE,
  remove_numbers = TRUE,
  remove_symbols = TRUE,
  split_hyphens = TRUE,
  remove_separators = TRUE
)

# Here we estimate a dfm w/o stemming and w/o removing stopwords, just as an
# example.
presDfm <- dfm(tok2)

# dfm_trim (do you remember?) returns a document-feature matrix reduced in
# size based on document and term frequency. In the following example a word
# must appear at least 5 times overall and, moreover, in at least 3 documents.
presDfm2 <- dfm_trim(presDfm, min_termfreq = 5, min_docfreq = 3)

# dfm_weight returns a document-feature matrix with the feature frequencies
# weighted according to one of several common methods (here: relative
# frequencies). You could run the analysis either on the weighted or on the
# unweighted dfm; here we use the weighted one.
# The trimmed dfm (presDfm2) is the input: previously the untrimmed presDfm
# was weighted, which silently discarded the trimming step above.
normDtm <- dfm_weight(presDfm2, scheme = "prop")

# Estimate the Euclidean distances between documents on the normalized dfm.
presDistMat <- textstat_dist(normDtm, margin = "documents",
                             method = "euclidean")

# Convert to a "dist" object, which is the input hclust() operates on.
presDistMat <- as.dist(presDistMat)
presDistMat

# Let's perform a hierarchical agglomerative cluster analysis.
# The algorithm works as follows:
# 1) Put each document in its own cluster.
# 2) Identify the closest two clusters and combine them into one cluster.
# 3) Repeat the above step until all the documents are in a single cluster.
# Agglomerative hierarchical clustering of the document distance object.
presCluster <- hclust(d = presDistMat)

# Use the document names as the leaf labels of the tree.
presCluster[["labels"]] <- docnames(presDfm)

# Draw the dendrogram; the x-axis label and the subtitle are left blank.
plot(
  presCluster,
  main = "Euclidean Distance on Normalized Token Frequency",
  xlab = "",
  sub = ""
)