# Clear the workspace and point the session at the project folder.
rm(list = ls(all = TRUE))
getwd()
setwd("C:/Users/luigi/Dropbox/TOPIC MODEL")
getwd()

library(readtext)
library(quanteda)
library(quanteda.textstats)

#########################################################################
# Extra: Plotting a dendrogram on relative dissimilarities between texts
#########################################################################

# Keep only the inaugural addresses delivered after 1980.
corpus_Pres <- corpus_subset(data_corpus_inaugural, Year > 1980)

# Tokenize, stripping punctuation, numbers, symbols, hyphens and separators.
tok2 <- tokens(
  corpus_Pres,
  remove_punct = TRUE,
  remove_numbers = TRUE,
  remove_symbols = TRUE,
  split_hyphens = TRUE,
  remove_separators = TRUE
)

# Drop English stopwords, then stem the remaining tokens.
tok2 <- tokens_remove(tok2, stopwords("en"))
tok2 <- tokens_wordstem(tok2)

# Document-feature matrix built from the cleaned (stopword-free, stemmed) tokens.
presDfm <- dfm(tok2)

# dfm_trim() (do you remember?) returns a document-by-feature matrix reduced in
# size based on document and term frequency. Here a word must appear at least
# 5 times overall AND in at least 3 documents.
# NOTE(review): presDfm2 is computed for illustration only — the analysis below
# deliberately continues from the untrimmed presDfm.
presDfm2 <- dfm_trim(presDfm, min_termfreq = 5, min_docfreq = 3)

# dfm_weight() (do you remember?) returns a document-by-feature matrix with the
# feature frequencies weighted by one of several common schemes (here:
# relative frequencies, scheme = "prop"). You could run the analysis either on
# the trimmed or on the original dfm; we use the weighted original dfm here.
normDtm <- dfm_weight(presDfm, scheme = "prop")

# Euclidean distances between documents, computed on the normalized dfm.
presDistMat <- textstat_dist(normDtm, margin = "documents", method = "euclidean")
# with as.dist you save your results as hclust needs them to operate!
# hclust() requires a "dist" object, so convert the textstat_dist result first.
presDistMat <- as.dist(presDistMat)
presDistMat

# Clustering rearranges observations into homogeneous subgroups according to
# some notion of distance among the data. Given a dissimilarity measure d,
# algorithms proceed either by grouping (agglomerative methods — our case) or
# by splitting (dissociative methods) the whole set of data. When this
# procedure is sequential, the method is called hierarchical.
#
# An agglomerative hierarchical method works like this: a first group is formed
# by taking the closest units in the data; each subsequent aggregation either
# forms a new group of two units, attaches a unit to the closest existing group
# (according to d), or merges two distinct groups.
#
# Our hierarchical agglomerative cluster analysis therefore proceeds as:
#   1) put each document in its own cluster;
#   2) find the two closest clusters and merge them into one;
#   3) repeat step 2 until all documents sit in a single cluster.

# Hierarchically cluster the distance object.
presCluster <- hclust(presDistMat)

# Label the leaves with the document names from the dfm.
presCluster$labels <- docnames(presDfm)

# Plot the dendrogram.
# The height of the vertical lines and the range of the dissimilarity axis give
# visual clues about the strength of the clustering: long vertical lines mean
# more distinct separation between groups, short vertical bars mean the
# observations are all close to each other. Long lines at the TOP of the
# dendrogram indicate well-separated groups; shorter lines indicate groups
# that are not as distinct.
plot(presCluster, xlab = "", sub = "", main = "Euclidean Distance on Normalized Token Frequency")