# NOTE: this script reads and writes no files, so it neither clears the
# workspace nor changes the working directory. If later steps need a specific
# directory, set it explicitly, e.g.:
# setwd("path/to/your/working/directory")
getwd()

library(readtext)
library(quanteda)
# NOTE: from quanteda 3.0 onwards textstat_dist() is provided by the separate
# 'quanteda.textstats' package; load that package too if the call below is
# not found.

#########################################################################
# Plotting a dendrogram of relative dissimilarities between texts
#########################################################################

# Build a document-feature matrix from the inaugural speeches given after
# 1980. Punctuation is dropped at the tokenisation step.
presToks <- tokens(
  corpus_subset(data_corpus_inaugural, Year > 1980),
  remove_punct = TRUE
)
presDfm <- dfm(presToks)

# dfm_trim() returns a document-feature matrix reduced in size based on term
# and document frequency. Here a feature is kept only if it occurs at least
# 5 times in total across the corpus (min_termfreq) AND appears in at least
# 3 different documents (min_docfreq).
presDfm2 <- dfm_trim(presDfm, min_termfreq = 5, min_docfreq = 3)

# dfm_weight() re-weights the feature counts; scheme = "prop" converts them
# to within-document relative frequencies so long and short speeches are
# comparable.
normDtm <- dfm_weight(presDfm2, scheme = "prop")

# Pairwise distances between documents on the normalised dfm
# (Euclidean is the default method).
presDistMat <- textstat_dist(normDtm)

# Hierarchical clustering of the distances. hclust() requires a 'dist'
# object, so coerce explicitly; this is harmless when the result already is
# one.
presCluster <- hclust(as.dist(presDistMat))

# Label the leaves with the document names.
presCluster$labels <- docnames(normDtm)

# Plot the dendrogram.
plot(
  presCluster,
  xlab = "",
  sub = "",
  main = "Euclidean Distance on Normalized Token Frequency"
)

# Show the distances themselves (reuse the stored result instead of
# recomputing it).
print(presDistMat)