# Clear the workspace and point the session at the project folder.
rm(list = ls(all = TRUE))
getwd()
setwd("C:/Users/luigi/Dropbox/TOPIC MODEL")
getwd()

library(readtext)
library(quanteda)
library(quanteda.textstats)

#########################################################################
# Extra: Plotting a dendrogram on relative dissimilarities between texts
#########################################################################

# Keep only the inaugural addresses delivered after 1980.
corpus_Pres <- corpus_subset(data_corpus_inaugural, Year > 1980)

# Tokenize, stripping punctuation, numbers, symbols, hyphens and separators.
tok2 <- tokens(
  corpus_Pres,
  remove_punct = TRUE,
  remove_numbers = TRUE,
  remove_symbols = TRUE,
  split_hyphens = TRUE,
  remove_separators = TRUE
)

# Drop English stopwords, then stem the remaining tokens.
tok2 <- tokens_remove(tok2, stopwords("en"))
tok2 <- tokens_wordstem(tok2)

# Document-feature matrix built from the cleaned (stopword-free, stemmed) tokens.
presDfm <- dfm(tok2)

# dfm_trim() (do you remember?) returns a document-by-feature matrix reduced in
# size based on document and term frequency. Here a word must appear at least
# 5 times overall AND in at least 3 documents.
# NOTE(review): presDfm2 is computed for illustration only — the analysis below
# deliberately continues from the untrimmed presDfm.
presDfm2 <- dfm_trim(presDfm, min_termfreq = 5, min_docfreq = 3)

# dfm_weight() (do you remember?) returns a document-by-feature matrix with the
# feature frequencies weighted by one of several common schemes (here:
# relative frequencies, scheme = "prop"). You could run the analysis either on
# the trimmed or on the original dfm; we use the weighted original dfm here.
normDtm <- dfm_weight(presDfm, scheme = "prop")

# Euclidean distances between documents, computed on the normalized dfm.
presDistMat <- textstat_dist(normDtm, margin = "documents", method = "euclidean")
# with as.dist you save your results as hclust needs them to operate!
# hclust() requires a "dist" object, so convert the textstat_dist result first.
presDistMat <- as.dist(presDistMat)
presDistMat

# Clustering rearranges observations into homogeneous subgroups according to
# some notion of distance among the data. Given a dissimilarity measure d,
# algorithms proceed either by grouping (agglomerative methods — our case) or
# by splitting (dissociative methods) the whole set of data. When this
# procedure is sequential, the method is called hierarchical.
#
# An agglomerative hierarchical method works like this: a first group is formed
# by taking the closest units in the data; each subsequent aggregation either
# forms a new group of two units, attaches a unit to the closest existing group
# (according to d), or merges two distinct groups.
#
# Our hierarchical agglomerative cluster analysis therefore proceeds as:
#   1) put each document in its own cluster;
#   2) find the two closest clusters and merge them into one;
#   3) repeat step 2 until all documents sit in a single cluster.

# Hierarchically cluster the distance object.
presCluster <- hclust(presDistMat)

# Label the leaves with the document names from the dfm.
presCluster$labels <- docnames(presDfm)

# Plot the dendrogram.
# The height of the vertical lines and the range of the dissimilarity axis give
# visual clues about the strength of the clustering: long vertical lines mean
# more distinct separation between groups, short vertical bars mean the
# observations are all close to each other. Long lines at the TOP of the
# dendrogram indicate well-separated groups; shorter lines indicate groups
# that are not as distinct.
plot(presCluster, xlab = "", sub = "", main = "Euclidean Distance on Normalized Token Frequency")