# Start from an empty workspace so that objects left over from an earlier
# session cannot leak into this analysis. `all.names = TRUE` (spelled out
# rather than partially matched as `all`) also removes objects whose names
# begin with a dot.
rm(list = ls(all.names = TRUE))
getwd()

# Replace the placeholder below with the path to your own working directory.
work_dir <- "write here your working directory!!!"
if (dir.exists(work_dir)) {
  setwd(work_dir)
} else {
  # The untouched placeholder is not a real directory; calling setwd() on it
  # raises an error and stops a sourced script, so keep the current directory
  # and report that instead.
  message("Working directory left unchanged: '", work_dir,
          "' does not exist.")
}
getwd()

library(readtext)
library(quanteda)

#########################################################################
# Plotting a dendrogram of relative dissimilarities between texts
#########################################################################

# Build a document-feature matrix (dfm) from the inaugural addresses delivered
# after 1980. The corpus is tokenised first: punctuation removal is an option
# of tokens(), and calling dfm() directly on a corpus is deprecated in current
# quanteda releases.
presToks <- tokens(corpus_subset(data_corpus_inaugural, Year > 1980),
                   remove_punct = TRUE)
presDfm <- dfm(presToks)

# dfm_trim() returns a dfm reduced in size based on term and document
# frequency. Here a feature is kept only if it occurs at least 5 times in
# total across all documents (min_termfreq) and appears in at least 3
# different documents (min_docfreq).
presDfm2 <- dfm_trim(presDfm, min_termfreq = 5, min_docfreq = 3)

# dfm_weight() returns a dfm with the feature frequencies re-weighted;
# scheme = "prop" converts counts into within-document proportions
# (relative frequencies).
normDtm <- dfm_weight(presDfm2, scheme = "prop")

# Pairwise distances between documents on the normalised dfm. The method is
# spelled out so that it visibly agrees with the plot title below.
# NOTE(review): in quanteda >= 3.0 textstat_dist() is provided by the
# quanteda.textstats package -- load it as well if the function is not found.
presDistMat <- textstat_dist(normDtm, method = "euclidean")

# Hierarchical clustering of the distances. hclust() requires a base "dist"
# object, so coerce explicitly; as.dist() leaves an object that is already a
# "dist" unchanged.
presCluster <- hclust(as.dist(presDistMat))

# Label the leaves with the document names of the matrix that was clustered.
presCluster$labels <- docnames(normDtm)

# Plot the dendrogram.
plot(presCluster, xlab = "", sub = "",
     main = "Euclidean Distance on Normalized Token Frequency")

# Show the distance matrix itself (reusing the result computed above instead
# of calculating it a second time).
presDistMat