# Start from an empty workspace.
# NOTE(review): this removes every object in the global environment, hidden
# ones included (all.names = TRUE); do not source this script from a session
# whose objects you want to keep.
rm(list = ls(all.names = TRUE))
getwd()

# Folder used as the working directory for this material. The path is
# machine-specific, so the directory is only changed when it actually exists;
# otherwise a warning is raised instead of aborting the whole script.
project_dir <- "C:/Users/luigi/Dropbox/TOPIC MODEL"
if (dir.exists(project_dir)) {
  setwd(project_dir)
} else {
  warning(
    "Working directory not changed; folder not found: ", project_dir,
    call. = FALSE
  )
}
getwd()

library(readtext)
library(quanteda)
library(quanteda.textstats)

#########################################################################
# Extra: Plotting a dendrogram of relative dissimilarities between texts
#########################################################################

# Restrict the inaugural corpus to the documents whose Year is after 1980.
corpus_Pres <- corpus_subset(data_corpus_inaugural, Year > 1980)

# Tokenise the subset: punctuation, numbers, symbols and separators are
# removed, and hyphenated words are split into their components.
tok2 <- tokens(
  corpus_Pres,
  remove_punct = TRUE,
  remove_symbols = TRUE,
  remove_numbers = TRUE,
  remove_separators = TRUE,
  split_hyphens = TRUE
)

# Purely as an example, the dfm is built with no stemming and with stopwords
# left in.
presDfm <- dfm(tok2)

# dfm_trim() (do you remember?) returns a document-feature matrix reduced in
# size based on term and document frequency. In the following example a word
# is kept only if it appears at least 5 times overall and, moreover, in at
# least 3 documents.
presDfm2 <- dfm_trim(presDfm, min_termfreq = 5, min_docfreq = 3)

# dfm_weight() returns a document-feature matrix with the feature frequencies
# weighted according to one of several common methods (here scheme = "prop",
# i.e. relative frequencies within each document). You could run the analysis
# on either the weighted or the unweighted counts; here we use the weighted
# ones.
# The weighting is applied to the trimmed dfm (presDfm2) so that the trimming
# step above actually feeds into the distances computed from normDtm.
normDtm <- dfm_weight(presDfm2, scheme = "prop")

# Euclidean distances between documents, computed on the normalised dfm and
# converted straight into a "dist" object, which is the input hclust() needs.
presDistMat <- as.dist(
  textstat_dist(normDtm, method = "euclidean", margin = "documents")
)
presDistMat

# Hierarchical agglomerative clustering of the distance object:
#   1) every document starts in a cluster of its own;
#   2) the two closest clusters are merged into one;
#   3) step 2 is repeated until a single cluster contains all documents.
# "complete" is hclust()'s default linkage, spelled out here for clarity.
presCluster <- hclust(presDistMat, method = "complete")

# Use the document names as leaf labels.
presCluster$labels <- docnames(presDfm)

# Draw the dendrogram.
plot(
  presCluster,
  main = "Euclidean Distance on Normalized Token Frequency",
  sub = "",
  xlab = ""
)