rm(list=ls(all=TRUE))

# set your working directory (i.e., where you have saved the datasets, etc.)
setwd("C:/Users/luigi/Dropbox/TOPIC MODEL")
getwd()

library(readtext)
library(quanteda)
library(ggplot2)
library(quanteda.textstats)
library(quanteda.textplots)
library(SnowballC)
library(corrplot)
library(DT)

# General preamble: in my scripts I prefer to avoid the pipe operator (%>%), just for the sake of clarity.
# The pipe operator - as you should know - makes coding easier, i.e., "a %>% mean" = "mean(a)":
# everything to the left of the pipe is passed as the first argument to the function after the pipe
# (a short piped example is sketched at the end of this first corpus-to-tokens walk-through below).

#########################################################################
#########################################################################
# Creating and Working with a Corpus
#########################################################################
#########################################################################

# There are several different ways to create a corpus in Quanteda. Let's look at 2 of them: single vs. multiple files

#########################################################################
# FIRST: you already have a matrix file with one text per row (such as .csv or .xls) - i.e., you have a pre-formatted file
#########################################################################

# This dataset is a sample of 100 tweets from the Boston area discussing food.
# Data have been collected through the Twitter API, also specifying the language and origin of the tweets.
# We will discuss how to retrieve this type of data in the next weeks.

x <- read.csv("boston.csv", stringsAsFactors=FALSE)
str(x)

# Although you can generate a corpus directly after reading a .csv file via the function "read.csv", I suggest you always read a .csv file
# via the function "readtext". In this case, however, always remember to identify the name of the column in the dataset that includes
# the texts (in this case "text"). If your dataset has a column named "text", you can omit this argument.
# You will notice that a new column "doc_id" (containing an index of the texts) has been automatically created.
# The function "readtext" is pretty flexible, and it allows you to read several different types of files (.csv, .html, .txt, .xls, .pdf, .doc):
# check "?readtext"
myText2 <- readtext("boston.csv", text_field = "text")
str(myText2)

# You create your corpus via the function "corpus"
myCorpus2 <- corpus(myText2)

# Jargon: types = number of unique terms; tokens = number of words
head(summary(myCorpus2))

# number of documents in the corpus
ndoc(myCorpus2)
print(myCorpus2)

# print the first text
as.character(myCorpus2)[1]
# same thing but w/o interruption of the text
strwrap(as.character(myCorpus2)[1])
# print the first 3 texts
as.character(myCorpus2)[1:3]
strwrap(as.character(myCorpus2)[1:3])

# Let's move from the corpus to the document-feature matrix!
# In quanteda, we first tokenize the texts via the function "tokens", then we use the function "dfm" to produce such
# a matrix, where documents are in rows and features (aka: words) are columns
tokens(myCorpus2)
tok2 <- tokens(myCorpus2)
tok2
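# As mentioned in the preamble, the same readtext -> corpus -> tokens steps can also be chained with a pipe.
# A minimal sketch (assuming R >= 4.1 for the native |> pipe; with magrittr loaded you could use %>% instead);
# it should give you the same tokens object as the stepwise code above:
tok2_piped <- readtext("boston.csv", text_field = "text") |> corpus() |> tokens()
tok2_piped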
# Note that we can also tokenize the column containing the texts from our original dataframe directly.
# However, in this latter case we won't save the document-level variables.
# Compare the two objects below:
tok_txt <- tokens(myText2$text)
str(tok_txt)
str(tok2)

# when you create a dfm, by default "tolower=TRUE", i.e., we convert all features to lowercase
myDfm <- dfm(tok2)

# we can get the number of documents and features that build our DfM via ndoc() and nfeat()
ndoc(myDfm)
nfeat(myDfm)

# we can also obtain the names of documents and features via docnames() and featnames()
head(docnames(myDfm), 20)
head(featnames(myDfm), 20)

# Let's see the first five documents and the first 10 words of our dfm
myDfm[1:5, 1:10]

# Let's print the texts of the first two documents
strwrap(as.character(myCorpus2)[1:2])

# to find the document-level variables attached to each document and stored in our dfm
str(myDfm@docvars)
head(docvars(myDfm))

# 20 top features in the dfm
topfeatures(myDfm, 20)

# let's improve the dfm!

# FIRST: let's remove numbers, separators, etc. Note that I decided to remove the URLs as well
tok2_clean <- tokens(myCorpus2, remove_punct = TRUE, remove_numbers=TRUE, remove_symbols = TRUE,
                     split_hyphens = TRUE, remove_separators = TRUE, remove_url=TRUE)

# SECOND: let's remove the stopwords
head(stopwords("english"), 20)
head(stopwords("russian"), 10)
head(stopwords("italian"), 10)

# the source "marimo" is a better option for some languages (in particular Asian languages: Japanese, Chinese, Korean;
# but also Arabic and Hebrew). See: https://github.com/koheiw/marimo
stopwords("en")
stopwords("en", source = "marimo")
# If you are interested in dealing with Japanese/Chinese texts, write me an email

# The stopwords options available in Quanteda (based on the Snowball stopwords list: see http://snowball.tartarus.org/)
# work for all the main European languages (ftp://cran.r-project.org/pub/R/web/packages/stopwords/stopwords.pdf).
getStemLanguages()
# For other languages things are a bit more complex. For Arabic, for example, a good source is also the stemming package arabicStemR

tok2_clean <- tokens_remove(tok2_clean, stopwords("english"))

# alternatively: you can first identify the list of tokens you want to delete and then pass it to tokens_remove.
# let's do that with the stopwords
x <- stopwords("en")
x
tok2_clean <- tokens_remove(tok2_clean, x)

# the nice thing here is that you can play with the list of stopwords, by deleting some elements, such as "her"
x <- x[x != "her"]
x
# or adding new elements to the list of stopwords, such as "ten"
x <- append(x, "ten")
x

# THIRD: let's stem the words
tok2_clean <- tokens_wordstem(tok2_clean, language = "english")

# Let's now re-create the dfm
myDfm2 <- dfm(tok2_clean)
topfeatures(myDfm, 50)
topfeatures(myDfm2, 20)
# the features "for", "with" and "the" have disappeared! while #smoked is now #smoke

# still some symbols to remove! for example the words starting with "00" (they are unicode characters such as "\U00BD")
tok2_clean <- tokens_remove(tok2_clean, c("rt", "00*", "ed", "u"))
myDfm2 <- dfm(tok2_clean)
topfeatures(myDfm2, 20) # 20 top features [better!]
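# Side note: patterns like "00*" above use quanteda's default "glob" matching, where "*" matches any sequence of characters.
# The same selection can also be written as a regular expression via valuetype = "regex". A minimal sketch
# (the pattern "^00" is just an illustration; applied to the already-cleaned tokens it should remove nothing new):
tok2_clean_regex <- tokens_remove(tok2_clean, "^00", valuetype = "regex")
nfeat(dfm(tok2_clean_regex)) == nfeat(myDfm2) # expected TRUE, since the "00*" tokens are already gone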
# alternatively, and as above: you can first identify the list of tokens you want to delete and then pass it to tokens_remove
x <- c("rt", "00*", "ed", "u")
str(x)
tok2_clean <- tokens_remove(tok2_clean, x)

# You can save (and then eventually plot) the frequency of the top features in a dfm (in this case the top-20 features)
features_dfm <- textstat_frequency(myDfm2, n = 20)
features_dfm

ggplot(features_dfm, aes(x = reorder(feature, frequency), y = frequency)) +
  geom_point() +
  coord_flip() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

# We can also create a dfm keeping only some specific words, such as, for example, only the hashtags
# in the tweets, using dfm_select with pattern = "#*"
dfm_hashtag <- dfm_select(myDfm2, pattern = c("#*"))
topfeatures(myDfm2, 20)      # 20 top features
topfeatures(dfm_hashtag, 20) # 20 top features

# You can also decide to exclude some features. For example, let's exclude all the hashtags
dfm_NOhashtag <- dfm_remove(myDfm2, pattern = c("#*"))
topfeatures(myDfm2, 20)        # 20 top features
topfeatures(dfm_hashtag, 20)   # 20 top features
topfeatures(dfm_NOhashtag, 20) # 20 top features

# Following the same logic, you can also remove stopwords with dfm_remove, after you have created a dfm
dfm_remove(myDfm, pattern = stopwords("en"))

# trimming the dfm
myDfm[1:10, 1:10]
# keep only words occurring >= 10 times and in >= 2 documents
dfm_trim(myDfm, min_termfreq = 10, min_docfreq = 2)
# keep only words occurring <= 10 times and in <= 2 documents
dfm_trim(myDfm, max_termfreq = 10, max_docfreq = 2)
# keep only words occurring in at least 40% of the documents (note: docfreq_type = "prop" makes min_docfreq a proportion)
dfm_trim(myDfm, min_docfreq = 0.4, docfreq_type = "prop")
# keep only words occurring at least once in all the 100 documents of my corpus
dfm_trim(myDfm, min_termfreq = 1, min_docfreq = 100)

#########################################################################
# SECOND: you have saved in a directory a set of files (one for each document) in a given format (.txt, .doc, .pdf) - i.e., you have multiple text files
#########################################################################

# SOURCE: http://www.presidency.ucsb.edu/inaugurals.php
# our txt files are included in the folder called "Inaugural Speeches" in our working directory
myText <- readtext("Inaugural Speeches/*.txt")
str(myText)

# we can actually extract three pieces of info from each file name (Name, Surname, Year: i.e., "George_Washington_1789.txt")
# Note: to use the docvarsfrom = "filenames" option, the file names should be consistent, i.e., in the example below ALL the txt titles
# should follow the same ordering (Name, Surname, Year) with the same separator (i.e., "_")
# It is always better to save your .txt files using UTF-8 encoding before reading them into R, especially for texts written in English
# (or Japanese/Chinese, for example). For other languages UTF-8 is generally fine as well, although another encoding can sometimes be a better choice.
myText <- readtext("Inaugural Speeches/*.txt",
                   docvarsfrom = "filenames",
                   dvsep = "_",
                   docvarnames = c("Name", "Surname", "Year"))
str(myText)
print(myText)

testCorpus <- corpus(myText)
summary(testCorpus)
print(testCorpus)

# inspect the document-level variables (this is a very important option, as we will see later on...)
head(docvars(testCorpus))

# if we want to extract individual elements of the document variables, we can specify the field
docvars(testCorpus, field = "Year")
# alternatively:
testCorpus$Year

# docvars() also allows you to create or update the document-level variables
ndoc(testCorpus)
testCorpus$index <- c(1:5)
head(docvars(testCorpus))

# Let's now tokenize the texts
tok4 <- tokens(testCorpus, remove_punct = TRUE, remove_numbers=TRUE, remove_symbols = TRUE,
               split_hyphens = TRUE, remove_separators = TRUE)
tok4 <- tokens_remove(tok4, stopwords("en"))
tok4 <- tokens_wordstem(tok4)
myDfm <- dfm(tok4)
topfeatures(myDfm, 20) # 20 top words

# Note that you can also extract the top features according to some document-level variable; for example, let's do it
# with respect to the Surname of the President (i.e., we have two speeches per President in our dfm)
topfeatures(myDfm, 5, groups=Surname) # 5 top words for each President
# we can also use other document-level variables, such as the Year in which a speech was given
topfeatures(myDfm, 5, groups=Year) # 5 top words for each Year

#########################################################################
# Playing with the corpus
#########################################################################

# let's load a corpus already available in Quanteda: the corpus of all the US Presidents' inaugural speeches.
# To summarize the texts from a corpus, we can call the summary() method defined for a corpus
summary(data_corpus_inaugural)

# inspect the document-level variables
head(docvars(data_corpus_inaugural))

# let's print the speech by the first US President (i.e., George Washington)
strwrap(as.character(data_corpus_inaugural)[1])

# we can subset a corpus according to some document-level variable value via the corpus_subset command.
# For example, let's extract only the Trump speech (by taking advantage of the document-level variable "President" present in our corpus)
# Note: you can also subset a tokens object according to some document-level variable via the tokens_subset command
trump <- corpus_subset(data_corpus_inaugural, President == "Trump")
summary(trump)
strwrap(as.character(trump[[1]]))

# Let's extract the first five inaugural speeches
mycorpus1 <- corpus_subset(data_corpus_inaugural, Year < 1806)
summary(mycorpus1)
# We could have done the same by selecting the index of the documents in our corpus
mycorpus1_alt <- corpus(data_corpus_inaugural[1:5])
summary(mycorpus1_alt)

# Let's extract the last six inaugural speeches (i.e., those after 2000)
mycorpus2 <- corpus_subset(data_corpus_inaugural, Year > 2000)
summary(mycorpus2)

# We can add two corpora together
mycorpus3 <- mycorpus1 + mycorpus2
summary(mycorpus3)

# We can also subset a corpus according to more than one condition: note the use of the logical operator "&"
summary(corpus_subset(data_corpus_inaugural, Year > 1990 & Party == "Republican"))

# the function corpus_reshape() allows you to change the unit of texts between documents, paragraphs and sentences
# (a short sketch on paragraphs follows the sentence-level example below)
summary(data_corpus_inaugural)
ndoc(data_corpus_inaugural)
print(data_corpus_inaugural)
corp_sent <- corpus_reshape(data_corpus_inaugural, to = "sentences")
ndoc(corp_sent)
print(corp_sent)

# If you apply corpus_subset() to corp_sent, you can, for example, keep only long sentences (50 or more words)
corp_sent_long <- corpus_subset(corp_sent, ntoken(corp_sent) >= 50)
ndoc(corp_sent_long)
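# A minimal sketch of the same reshaping at the paragraph level (to = "paragraphs" is the third unit
# accepted by corpus_reshape(); the object name corp_para is just my own choice for illustration):
corp_para <- corpus_reshape(data_corpus_inaugural, to = "paragraphs")
ndoc(corp_para)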
# Restore the original documents
corp_sent2 <- corpus_reshape(corp_sent, to = "documents")
ndoc(corp_sent2)
print(corp_sent2)

#########################################################################
# Let's explore some statistical summary methods
#########################################################################

#########################################################################
# Statistical summaries (1): Lexical dispersion plot (Positional Analysis: i.e., analyses that retain the original text sequence)
#########################################################################

# The kwic function (keywords-in-context) performs a search for a word in a corpus and allows us to view the contexts in which it occurs
# NOTE: by working on a tokens object, we retain the original text sequence!
options(width = 200)
kwic(tokens(data_corpus_inaugural), "terror")

# if you want to display the search results in a fancier way:
x <- kwic(tokens(data_corpus_inaugural), "terror")
datatable(x, caption="Keywords in context", rownames=FALSE,
          options = list(scrollX = TRUE, pageLength = 10, lengthMenu = c(5, 10, 15, 20)))

# with the window argument, we can specify the number of words to be displayed around the keyword (here = 1)
kwic(tokens(data_corpus_inaugural), "terror", window = 1)

# also match the words starting with "terror", including "terrorism"
kwic(tokens(data_corpus_inaugural), "terror*")

# same result as above via the "regex" specification.
# A regex, or Regular Expression, is a sequence of characters that forms a search pattern
kwic(tokens(data_corpus_inaugural), "terror", valuetype = "regex")

# Note that by default kwic() is word-based. If you'd like to look up a multiword combination, use phrase()
kwic(tokens(data_corpus_inaugural), phrase("by terror"))

# alternative way to view your result
x <- kwic(tokens(data_corpus_inaugural), phrase("by terror"))
View(x)

# We can plot a kwic object via a lexical dispersion plot.
# A lexical dispersion plot allows you to detect both the relative frequency of a word across documents
# and the "timing" of that word within a given text
textplot_xray(kwic(tokens(data_corpus_inaugural[40:59]), "american"))

textplot_xray(
  kwic(tokens(data_corpus_inaugural[40:59]), "american"),
  kwic(tokens(data_corpus_inaugural[40:59]), "people"),
  kwic(tokens(data_corpus_inaugural[40:59]), "communis*")
)

# If you are only plotting a single document, but with multiple keywords, then the keywords are displayed
# one below the other rather than side-by-side.
textplot_xray(
  kwic(tokens(corpus_subset(data_corpus_inaugural, Year > 2015 & Year < 2020)), "america"),
  kwic(tokens(corpus_subset(data_corpus_inaugural, Year > 2015 & Year < 2020)), "people"),
  kwic(tokens(corpus_subset(data_corpus_inaugural, Year > 2015 & Year < 2020)), "chief")
)

# You might also have noticed that the x-axis scale is the absolute token index for single texts
# and the relative token index when multiple texts are being compared.
# If you prefer, you can specify that you want an absolute scale
textplot_xray(
  kwic(tokens(data_corpus_inaugural[40:59]), "american"),
  kwic(tokens(data_corpus_inaugural[40:59]), "people"),
  kwic(tokens(data_corpus_inaugural[40:59]), "communis*"),
  scale = 'absolute'
)

# The object returned is a ggplot object, which can be modified using ggplot
plot <- textplot_xray(
  kwic(tokens(data_corpus_inaugural[40:59]), "american"),
  kwic(tokens(data_corpus_inaugural[40:59]), "people"),
  kwic(tokens(data_corpus_inaugural[40:59]), "communist"))
plot + aes(color = keyword) + scale_color_manual(values = c('red', 'blue', "green"))

#########################################################################
# Statistical summaries (2): Plotting wordclouds (Non-positional Analysis: i.e., analyses that DO NOT retain the original text sequence - bag of words)
#########################################################################

# One of the simplest statistical summary methods you can apply to a DfM is a tag cloud.
# A tag cloud is a visual representation of text data, in which tags are single words whose frequency is shown with different font sizes (and/or colors)
myCorpus <- corpus_subset(data_corpus_inaugural, Year > 1990)
summary(myCorpus)
tok2 <- tokens(myCorpus, remove_punct = TRUE, remove_numbers=TRUE, remove_symbols = TRUE,
               split_hyphens = TRUE, remove_separators = TRUE)
tok2 <- tokens_remove(tok2, stopwords("en"))
myDfm <- dfm(tok2)

# if you set a seed, you will get the same plot each time
set.seed(123)
textplot_wordcloud(myDfm, min_count = 6, rotation = .25, color = RColorBrewer::brewer.pal(8,"Dark2"))
textplot_wordcloud(myDfm, min_count = 10, color = c('red', 'pink', 'green', 'purple', 'orange', 'blue'))

# You can also plot a "comparison cloud", but this can only be done with fewer than eight documents.
# Let's plot, for example, a "comparison cloud" between Biden, Trump and Obama
corp2 <- corpus_subset(data_corpus_inaugural, President %in% c("Biden", "Trump", "Obama"))
summary(corp2)
# alternatively you could write:
corp3 <- corpus_subset(data_corpus_inaugural, President== "Biden" | President== "Trump" | President== "Obama")
summary(corp3)

tok2 <- tokens(corp2, remove_punct = TRUE, remove_numbers=TRUE, remove_symbols = TRUE,
               split_hyphens = TRUE, remove_separators = TRUE)
tok2 <- tokens_remove(tok2, stopwords("en"))
myDfm <- dfm(tok2)

# let's group the speeches made by the same President (such as the two speeches made by Obama) into one single dfm row
# using the function "dfm_group"
myDfm2 <- dfm_group(myDfm, groups = President)
ndoc(myDfm)  # 4 documents before grouping
ndoc(myDfm2) # 3 documents after grouping (the two speeches by Obama are compressed into 1)

set.seed(123)
textplot_wordcloud(myDfm2, comparison = TRUE, min_count = 5, color = c("blue", "green", "red"))
# how is it that the feature "america" is shown just for Trump? After all, "america" is employed in the other speeches as well!
kwic(tok2, "america", window = 1)
x <- kwic(tok2, "america", window = 1)
table(x$docname)
# Frequency of the feature "america" by President:
# Obama (both speeches): 14
# Trump: 18
# Biden: 18

# however, take a look at the different lengths of the speeches (once we keep only the features that appear at least 5 times,
# as in the plot above):
myDfm3 <- dfm_trim(myDfm2, min_termfreq = 5)
ntoken(myDfm3)

# A "comparison cloud" works as follows: for each feature,
# 1) it computes its relative frequency in each document
# 2) in the comparison wordcloud plot, it assigns the feature only to the document with the maximum value in 1)
# In our case:
table(x$docname)
ntoken(myDfm3)
# 1) relative frequency of "america": Biden = 18/541; Obama = 14/864; Trump = 18/321
# 2) as a result, "america" will be assigned to Trump (the highest relative frequency)

# what about the feature "us"?
kwic(tok2, "us", window = 1)
x <- kwic(tok2, "us", window = 1)
table(x$docname)
ntoken(myDfm3)
# 1) relative frequency of "us": Biden = 27/541; Obama = 44/864; Trump = 2/321
# 2) as a result, "us" will be assigned to Obama

set.seed(123)
textplot_wordcloud(myDfm2, comparison = TRUE, min_count = 5, color = c("blue", "green", "red"))

# we could also have used another document-level variable, such as Party, in our previous subsample of 3 Presidential speeches
myDfm2 <- dfm_group(myDfm, groups = Party)
set.seed(123)
textplot_wordcloud(dfm_trim(myDfm2, min_termfreq = 5, verbose = FALSE), comparison = TRUE, color = c("blue", "red"))

#########################################################################
# Statistical summaries (3): Lexical diversity (Non-positional Analysis)
#########################################################################

# Other quantitative summary measures of documents are designed to characterize specific qualities of texts.
# Comparing the rates of types and tokens forms the foundation for measures of lexical diversity (the rate of vocabulary usage),
# with the most common such measure comparing the number of types to the number of tokens (the "type-token ratio").
# For example, it is argued that populist communication implies a simplified political discourse (lower diversity), in an attempt to reach the public more easily.
# The textstat_lexdiv() function calculates lexical diversity according to various measures, based on the number of unique types of tokens
# and the length of a document
inaug_tokens <- tokens(data_corpus_inaugural)
tok2 <- tokens_remove(inaug_tokens, stopwords("en"))
inaug_dfm <- dfm(tok2)

# Note that when you run textstat_lexdiv it automatically removes numbers, punctuation, etc. from the dfm [w/o the need for you
# to specify that when you create your dfm: see ?textstat_lexdiv]
lexdiv <- textstat_lexdiv(inaug_dfm)
str(lexdiv)

# TTR is estimated as V/N, where
# V (types = total number of unique terms); N (tokens = total number of words in the dfm)
# (a manual check of this ratio is sketched right after the plot below)
head(lexdiv, 5)
tail(lexdiv, 5)

# a decrease in the level of complexity over time? let's see
ggplot(data=lexdiv, aes(x=document, y=TTR, group=1)) +
  geom_line() +
  geom_point() +
  theme_minimal() +
  scale_x_discrete(breaks=c("1789-Washington","1933-Roosevelt","1961-Kennedy", "2017-Trump")) +
  geom_smooth(method = "lm")
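# A minimal sketch of the TTR computation done by hand, via quanteda's ntype() (number of unique types per document)
# and ntoken() (number of tokens per document). Note that the values will not exactly match the TTR column of
# textstat_lexdiv() here, since - as noted above - textstat_lexdiv() first strips numbers, punctuation, etc. from the dfm,
# while ntype()/ntoken() work on inaug_dfm as it is.
ttr_manual <- ntype(inaug_dfm) / ntoken(inaug_dfm)
head(ttr_manual, 5)
head(lexdiv, 5)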
#########################################################################
# Statistical summaries (4): Comparing words associated with a target group vs. a reference group (Non-positional Analysis)
#########################################################################

# More sophisticated methods compare the differential occurrences of words across texts or partitions of a corpus, using statistical
# association measures, to identify the words that belong, for example, to different sub-groups of texts, such as those predominantly
# associated with male- versus female-authored documents, etc.
# In this respect, we can for example employ a chi2 test. A chi-squared test is used to determine whether there is a statistically significant
# difference between the expected and the observed frequencies in one or more categories of a contingency table (in our case,
# the frequencies of words in two different sets of texts). To understand a bit more about the chi2 test,
# please take a look at the Lab 1 EXTRA A pdf file on the course home-page
# (a small manual chi2 check is also sketched at the end of this section).

# The textstat_keyness command allows you to compare precisely the differential association of keywords in a target and a reference group.
# For example, let's try to understand whether there are some specific words associated with Trump vs. the other (more mainstream?) GOP Presidents
pres_GOP <- corpus_subset(data_corpus_inaugural, Year > 1946 & Party== "Republican")
summary(pres_GOP)

# let's create a document-level variable equal to "Trump" for President Trump and "Other GOP" otherwise
pres_GOP$Trump <- ifelse(pres_GOP$President=="Trump", "Trump", "Other GOP")
docvars(pres_GOP)

tok2 <- tokens(pres_GOP, remove_punct = TRUE, remove_numbers=TRUE, remove_symbols = TRUE,
               split_hyphens = TRUE, remove_separators = TRUE)
tok2 <- tokens_remove(tok2, stopwords("en"))
tok2 <- tokens_wordstem(tok2)
pres_GOP_dfm <- dfm(tok2)

# Let's group the documents according to the variable Trump
myDfm <- dfm_group(pres_GOP_dfm, groups = Trump)

result_keyness <- textstat_keyness(myDfm, target = "Trump")
textplot_keyness(result_keyness)

# Note, however, that if you look at the p-values, not all of them are below 0.05
head(result_keyness, 10)
tail(result_keyness, 10)

# Therefore, let's keep only the significant features
str(result_keyness)
result_keyness2 <- result_keyness[which(result_keyness$p<=0.05), ]
str(result_keyness2)

# let's add back the attributes of the data (i.e., "Trump", "Other GOP")
attr(result_keyness2, 'groups') <- c("Trump", "Other GOP")
str(result_keyness2)

# very interesting results (check "us" and "freedom" vs. "protect", "jobs" and "american")
textplot_keyness(result_keyness2)

# another example: Republican vs. Democratic presidents
myCorpus <- corpus_subset(data_corpus_inaugural, Year > 1945)
summary(myCorpus)
tok2 <- tokens(myCorpus, remove_punct = TRUE, remove_numbers=TRUE, remove_symbols = TRUE,
               split_hyphens = TRUE, remove_separators = TRUE)
tok2 <- tokens_remove(tok2, stopwords("en"))
myDfm <- dfm(tok2)
myDfm <- dfm_group(myDfm, groups = Party)
str(myDfm@docvars)

# Let's calculate keyness, setting Republican Presidents' speeches as the target group and Democratic Presidents' speeches as the reference group
result_keyness <- textstat_keyness(myDfm, target = "Republican")
result_keyness2 <- result_keyness[which(result_keyness$p<=0.05), ]
str(result_keyness2)
attr(result_keyness2, 'groups') <- c("Gop", "Dem")
textplot_keyness(result_keyness2)
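# A small manual check of the chi2 logic behind textstat_keyness, for a single feature.
# The feature "freedom" is picked here purely for illustration (any feature present in the grouped dfm works):
# we build the 2x2 contingency table of (this feature vs. all other features) x (Democratic vs. Republican)
# and run a standard chi-squared test on it; the statistic and p-value should be close to those reported
# by textstat_keyness for that feature (the continuity correction applied may differ slightly).
counts <- as.matrix(myDfm)   # grouped dfm: rows = "Democratic", "Republican"
feat <- "freedom"
tab <- rbind(feature_count  = counts[, feat],
             other_features = rowSums(counts) - counts[, feat])
tab
chisq.test(tab)
result_keyness[result_keyness$feature == feat, ]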
#########################################################################
# Statistical summaries (5): Cosine similarities (Non-positional Analysis)
#########################################################################

# "Cosine similarity" is an intuitive measure of semantic similarity.
# More in detail, the cosine similarity between two texts ranges between 0 and 1, where 0 is reached when two texts are completely
# different and 1 is reached when two texts have identical feature proportions (please take a look at the Lab 1 EXTRA B pdf file on the
# course home-page for more detailed info on how to compute the similarity between documents from a dfm)

# create a dfm from the inaugural addresses from Reagan onwards
myCorpus <- corpus_subset(data_corpus_inaugural, Year > 1980)
summary(myCorpus)
tok2 <- tokens(myCorpus, remove_punct = TRUE, remove_numbers=TRUE, remove_symbols = TRUE,
               split_hyphens = TRUE, remove_separators = TRUE)
tok2 <- tokens_remove(tok2, stopwords("en"))
tok2 <- tokens_wordstem(tok2)
presDfm <- dfm(tok2)
presDfm

# compute some document similarities
Simil <- textstat_simil(presDfm, margin = "documents", method = "cosine")
Simil

# Let's plot it!
Simil2 <- as.matrix(Simil)
str(Simil2)
corrplot(Simil2, method = 'number')
corrplot(Simil2, method = 'color')
corrplot(Simil2, method = 'shade', type = 'lower')

# for specific comparisons: here the two speeches by Obama (see also the manual computation sketched right below)
obamaSimil <- textstat_simil(presDfm, presDfm[c("2009-Obama", "2013-Obama"), ],
                             margin = "documents", method = "cosine")
obamaSimil
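# A minimal sketch of what method = "cosine" computes, done by hand for the two Obama speeches:
# the dot product of the two document vectors divided by the product of their Euclidean norms
# (the object names m, a, b are just my own choices for illustration).
m <- as.matrix(presDfm)
a <- m["2009-Obama", ]
b <- m["2013-Obama", ]
sum(a * b) / (sqrt(sum(a^2)) * sqrt(sum(b^2))) # should match the 2009-Obama vs. 2013-Obama value in obamaSimil (and in Simil)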