rm(list=ls(all=TRUE))

# set your working directory (i.e., where you have saved the datasets, etc.)
setwd("C:/Users/luigi/Dropbox/TOPIC MODEL")
getwd()

library(readtext)
library(quanteda)
library(ggplot2)
library(quanteda.textstats)
library(quanteda.textplots)
library(SnowballC)
library(corrplot)
library(DT)

# General preamble: in my scripts I prefer to avoid the pipe operator (%>%), just for the sake of clarity.
# The pipe operator - as you should know - makes coding easier, i.e., "a %>% mean" = "mean(a)":
# everything to the left of the pipe is passed as the first argument to the function after the pipe
# (a short piped example is sketched at the end of this first corpus-to-tokens walk-through below).

#########################################################################
#########################################################################
# Creating and Working with a Corpus
#########################################################################
#########################################################################

# There are several different ways to create a corpus in Quanteda. Let's look at 2 of them: single vs. multiple files

#########################################################################
# FIRST: you already have a matrix file with one text per row (such as .csv or .xls) - i.e., you have a pre-formatted file
#########################################################################

# This dataset is a sample of 100 tweets from the Boston area discussing food.
# Data have been collected through the Twitter API, also specifying the language and origin of the tweets.
# We will discuss how to retrieve this type of data in the next weeks.

x <- read.csv("boston.csv", stringsAsFactors=FALSE)
str(x)

# Although you can generate a corpus directly after reading a .csv file via the function "read.csv", I suggest you always read a .csv file
# via the function "readtext". In this case, however, always remember to identify the name of the column in the dataset that includes
# the texts (in this case "text"). If your dataset has a column named "text", you can omit this argument.
# You will notice that a new column "doc_id" (containing an index of the texts) has been automatically created.
# The function "readtext" is pretty flexible, and it allows you to read several different types of files (.csv, .html, .txt, .xls, .pdf, .doc):
# check "?readtext"
myText2 <- readtext("boston.csv", text_field = "text")
str(myText2)

# You create your corpus via the function "corpus"
myCorpus2 <- corpus(myText2)

# Jargon: types = number of unique terms; tokens = number of words
head(summary(myCorpus2))

# number of documents in the corpus
ndoc(myCorpus2)
print(myCorpus2)

# print the first text
as.character(myCorpus2)[1]
# same thing but w/o interruption of the text
strwrap(as.character(myCorpus2)[1])
# print the first 3 texts
as.character(myCorpus2)[1:3]
strwrap(as.character(myCorpus2)[1:3])

# Let's move from the corpus to the document-feature matrix!
# In quanteda, we first tokenize the texts via the function "tokens", then we use the function "dfm" to produce such
# a matrix, where documents are in rows and features (aka: words) are columns
tokens(myCorpus2)
tok2 <- tokens(myCorpus2)
tok2
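# As mentioned in the preamble, the same readtext -> corpus -> tokens steps can also be chained with a pipe.
# A minimal sketch (assuming R >= 4.1 for the native |> pipe; with magrittr loaded you could use %>% instead);
# it should give you the same tokens object as the stepwise code above:
tok2_piped <- readtext("boston.csv", text_field = "text") |> corpus() |> tokens()
tok2_piped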
# Note that we can also tokenize the column containing the texts from our original dataframe directly.
# However, in this latter case we won't save the document-level variables.
# Compare the two objects below:
tok_txt <- tokens(myText2$text)
str(tok_txt)
str(tok2)

# when you create a dfm, by default "tolower=TRUE", i.e., we convert all features to lowercase
myDfm <- dfm(tok2)

# we can get the number of documents and features that build our DfM via ndoc() and nfeat()
ndoc(myDfm)
nfeat(myDfm)

# we can also obtain the names of documents and features via docnames() and featnames()
head(docnames(myDfm), 20)
head(featnames(myDfm), 20)

# Let's see the first five documents and the first 10 words of our dfm
myDfm[1:5, 1:10]

# Let's print the texts of the first two documents
strwrap(as.character(myCorpus2)[1:2])

# to find the document-level variables attached to each document and stored in our dfm
str(myDfm@docvars)
head(docvars(myDfm))

# 20 top features in the dfm
topfeatures(myDfm, 20)

# let's improve the dfm!

# FIRST: let's remove numbers, separators, etc. Note that I decided to remove the URLs as well
tok2_clean <- tokens(myCorpus2, remove_punct = TRUE, remove_numbers=TRUE, remove_symbols = TRUE,
                     split_hyphens = TRUE, remove_separators = TRUE, remove_url=TRUE)

# SECOND: let's remove the stopwords
head(stopwords("english"), 20)
head(stopwords("russian"), 10)
head(stopwords("italian"), 10)

# the source "marimo" is a better option for some languages (in particular Asian languages: Japanese, Chinese, Korean;
# but also Arabic and Hebrew). See: https://github.com/koheiw/marimo
stopwords("en")
stopwords("en", source = "marimo")
# If you are interested in dealing with Japanese/Chinese texts, write me an email

# The stopwords options available in Quanteda (based on the Snowball stopwords list: see http://snowball.tartarus.org/)
# work for all the main European languages (ftp://cran.r-project.org/pub/R/web/packages/stopwords/stopwords.pdf).
getStemLanguages()
# For other languages things are a bit more complex. For Arabic, for example, a good source is also the stemming package arabicStemR

tok2_clean <- tokens_remove(tok2_clean, stopwords("english"))

# alternatively: you can first identify the list of tokens you want to delete and then pass it to tokens_remove.
# let's do that with the stopwords
x <- stopwords("en")
x
tok2_clean <- tokens_remove(tok2_clean, x)

# the nice thing here is that you can play with the list of stopwords, by deleting some elements, such as "her"
x <- x[x != "her"]
x
# or adding new elements to the list of stopwords, such as "ten"
x <- append(x, "ten")
x

# THIRD: let's stem the words
tok2_clean <- tokens_wordstem(tok2_clean, language = "english")

# Let's now re-create the dfm
myDfm2 <- dfm(tok2_clean)
topfeatures(myDfm, 50)
topfeatures(myDfm2, 20)
# the features "for", "with" and "the" have disappeared! while #smoked is now #smoke

# still some symbols to remove! for example the words starting with "00" (they are unicode characters such as "\U00BD")
tok2_clean <- tokens_remove(tok2_clean, c("rt", "00*", "ed", "u"))
myDfm2 <- dfm(tok2_clean)
topfeatures(myDfm2, 20) # 20 top features [better!]
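# Side note: patterns like "00*" above use quanteda's default "glob" matching, where "*" matches any sequence of characters.
# The same selection can also be written as a regular expression via valuetype = "regex". A minimal sketch
# (the pattern "^00" is just an illustration; applied to the already-cleaned tokens it should remove nothing new):
tok2_clean_regex <- tokens_remove(tok2_clean, "^00", valuetype = "regex")
nfeat(dfm(tok2_clean_regex)) == nfeat(myDfm2) # expected TRUE, since the "00*" tokens are already gone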
# alternatively, and as above: you can first identify the list of tokens you want to delete and then pass it to tokens_remove
x <- c("rt", "00*", "ed", "u")
str(x)
tok2_clean <- tokens_remove(tok2_clean, x)

# You can save (and then eventually plot) the frequency of the top features in a dfm (in this case the top-20 features)
features_dfm <- textstat_frequency(myDfm2, n = 20)
features_dfm

ggplot(features_dfm, aes(x = reorder(feature, frequency), y = frequency)) +
  geom_point() +
  coord_flip() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

# We can also create a dfm keeping only some specific words, such as, for example, only the hashtags
# in the tweets, using dfm_select with pattern = "#*"
dfm_hashtag <- dfm_select(myDfm2, pattern = c("#*"))
topfeatures(myDfm2, 20)      # 20 top features
topfeatures(dfm_hashtag, 20) # 20 top features

# You can also decide to exclude some features. For example, let's exclude all the hashtags
dfm_NOhashtag <- dfm_remove(myDfm2, pattern = c("#*"))
topfeatures(myDfm2, 20)        # 20 top features
topfeatures(dfm_hashtag, 20)   # 20 top features
topfeatures(dfm_NOhashtag, 20) # 20 top features

# Following the same logic, you can also remove stopwords with dfm_remove, after you have created a dfm
dfm_remove(myDfm, pattern = stopwords("en"))

# trimming the dfm
myDfm[1:10, 1:10]
# keep only words occurring >= 10 times and in >= 2 documents
dfm_trim(myDfm, min_termfreq = 10, min_docfreq = 2)
# keep only words occurring <= 10 times and in <= 2 documents
dfm_trim(myDfm, max_termfreq = 10, max_docfreq = 2)
# keep only words occurring in at least 40% of the documents (note: docfreq_type = "prop" makes min_docfreq a proportion)
dfm_trim(myDfm, min_docfreq = 0.4, docfreq_type = "prop")
# keep only words occurring at least once in all the 100 documents of my corpus
dfm_trim(myDfm, min_termfreq = 1, min_docfreq = 100)

#########################################################################
# SECOND: you have saved in a directory a set of files (one for each document) in a given format (.txt, .doc, .pdf) - i.e., you have multiple text files
#########################################################################

# SOURCE: http://www.presidency.ucsb.edu/inaugurals.php
# our txt files are included in the folder called "Inaugural Speeches" in our working directory
myText <- readtext("Inaugural Speeches/*.txt")
str(myText)

# we can actually extract three pieces of info from each file name (Name, Surname, Year: i.e., "George_Washington_1789.txt")
# Note: to use the docvarsfrom = "filenames" option, the file names should be consistent, i.e., in the example below ALL the txt titles
# should follow the same ordering (Name, Surname, Year) with the same separator (i.e., "_")
# It is always better to save your .txt files using UTF-8 encoding before reading them into R, especially for texts written in English
# (or Japanese/Chinese, for example). For other languages UTF-8 is generally fine as well, although another encoding can sometimes be a better choice.
myText <- readtext("Inaugural Speeches/*.txt",
                   docvarsfrom = "filenames",
                   dvsep = "_",
                   docvarnames = c("Name", "Surname", "Year"))
str(myText)
print(myText)

testCorpus <- corpus(myText)
summary(testCorpus)
print(testCorpus)

# inspect the document-level variables (this is a very important option, as we will see later on...)
head(docvars(testCorpus))

# if we want to extract individual elements of the document variables, we can specify the field
docvars(testCorpus, field = "Year")
# alternatively:
testCorpus$Year

# docvars() also allows you to create or update the document-level variables
ndoc(testCorpus)
testCorpus$index <- c(1:5)
head(docvars(testCorpus))

# Let's now tokenize the texts
tok4 <- tokens(testCorpus, remove_punct = TRUE, remove_numbers=TRUE, remove_symbols = TRUE,
               split_hyphens = TRUE, remove_separators = TRUE)
tok4 <- tokens_remove(tok4, stopwords("en"))
tok4 <- tokens_wordstem(tok4)
myDfm <- dfm(tok4)
topfeatures(myDfm, 20) # 20 top words

# Note that you can also extract the top features according to some document-level variable; for example, let's do it
# with respect to the Surname of the President (i.e., we have two speeches per President in our dfm)
topfeatures(myDfm, 5, groups=Surname) # 5 top words for each President
# we can also use other document-level variables, such as the Year in which a speech was given
topfeatures(myDfm, 5, groups=Year) # 5 top words for each Year

#########################################################################
# Playing with the corpus
#########################################################################

# let's load a corpus already available in Quanteda: the corpus of all the US Presidents' inaugural speeches.
# To summarize the texts from a corpus, we can call the summary() method defined for a corpus
summary(data_corpus_inaugural)

# inspect the document-level variables
head(docvars(data_corpus_inaugural))

# let's print the speech by the first US President (i.e., George Washington)
strwrap(as.character(data_corpus_inaugural)[1])

# we can subset a corpus according to some document-level variable value via the corpus_subset command.
# For example, let's extract only the Trump speech (by taking advantage of the document-level variable "President" present in our corpus)
# Note: you can also subset a tokens object according to some document-level variable via the tokens_subset command
trump <- corpus_subset(data_corpus_inaugural, President == "Trump")
summary(trump)
strwrap(as.character(trump[[1]]))

# Let's extract the first five inaugural speeches
mycorpus1 <- corpus_subset(data_corpus_inaugural, Year < 1806)
summary(mycorpus1)
# We could have done the same by selecting the index of the documents in our corpus
mycorpus1_alt <- corpus(data_corpus_inaugural[1:5])
summary(mycorpus1_alt)

# Let's extract the last six inaugural speeches (i.e., those after 2000)
mycorpus2 <- corpus_subset(data_corpus_inaugural, Year > 2000)
summary(mycorpus2)

# We can add two corpora together
mycorpus3 <- mycorpus1 + mycorpus2
summary(mycorpus3)

# We can also subset a corpus according to more than one condition: note the use of the logical operator "&"
summary(corpus_subset(data_corpus_inaugural, Year > 1990 & Party == "Republican"))

# the function corpus_reshape() allows you to change the unit of texts between documents, paragraphs and sentences
# (a short sketch on paragraphs follows the sentence-level example below)
summary(data_corpus_inaugural)
ndoc(data_corpus_inaugural)
print(data_corpus_inaugural)
corp_sent <- corpus_reshape(data_corpus_inaugural, to = "sentences")
ndoc(corp_sent)
print(corp_sent)

# If you apply corpus_subset() to corp_sent, you can, for example, keep only long sentences (50 or more words)
corp_sent_long <- corpus_subset(corp_sent, ntoken(corp_sent) >= 50)
ndoc(corp_sent_long)
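# A minimal sketch of the same reshaping at the paragraph level (to = "paragraphs" is the third unit
# accepted by corpus_reshape(); the object name corp_para is just my own choice for illustration):
corp_para <- corpus_reshape(data_corpus_inaugural, to = "paragraphs")
ndoc(corp_para)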
# Restore the original documents
corp_sent2 <- corpus_reshape(corp_sent, to = "documents")
ndoc(corp_sent2)
print(corp_sent2)

#########################################################################
# Let's explore some statistical summary methods
#########################################################################

#########################################################################
# Statistical summaries (1): Lexical dispersion plot (Positional Analysis: i.e., analyses that retain the original text sequence)
#########################################################################

# The kwic function (keywords-in-context) performs a search for a word in a corpus and allows us to view the contexts in which it occurs
# NOTE: by working on a tokens object, we retain the original text sequence!
options(width = 200)
kwic(tokens(data_corpus_inaugural), "terror")

# if you want to display the search results in a fancier way:
x <- kwic(tokens(data_corpus_inaugural), "terror")
datatable(x, caption="Keywords in context", rownames=FALSE,
          options = list(scrollX = TRUE, pageLength = 10, lengthMenu = c(5, 10, 15, 20)))

# with the window argument, we can specify the number of words to be displayed around the keyword (here = 1)
kwic(tokens(data_corpus_inaugural), "terror", window = 1)

# also match the words starting with "terror", including "terrorism"
kwic(tokens(data_corpus_inaugural), "terror*")

# same result as above via the "regex" specification.
# A regex, or Regular Expression, is a sequence of characters that forms a search pattern
kwic(tokens(data_corpus_inaugural), "terror", valuetype = "regex")

# Note that by default kwic() is word-based. If you'd like to look up a multiword combination, use phrase()
kwic(tokens(data_corpus_inaugural), phrase("by terror"))

# alternative way to view your result
x <- kwic(tokens(data_corpus_inaugural), phrase("by terror"))
View(x)

# We can plot a kwic object via a lexical dispersion plot.
# A lexical dispersion plot allows you to detect both the relative frequency of a word across documents
# and the "timing" of that word within a given text
textplot_xray(kwic(tokens(data_corpus_inaugural[40:59]), "american"))

textplot_xray(
  kwic(tokens(data_corpus_inaugural[40:59]), "american"),
  kwic(tokens(data_corpus_inaugural[40:59]), "people"),
  kwic(tokens(data_corpus_inaugural[40:59]), "communis*")
)

# If you are only plotting a single document, but with multiple keywords, then the keywords are displayed
# one below the other rather than side-by-side.
textplot_xray(
  kwic(tokens(corpus_subset(data_corpus_inaugural, Year > 2015 & Year < 2020)), "america"),
  kwic(tokens(corpus_subset(data_corpus_inaugural, Year > 2015 & Year < 2020)), "people"),
  kwic(tokens(corpus_subset(data_corpus_inaugural, Year > 2015 & Year < 2020)), "chief")
)

# You might also have noticed that the x-axis scale is the absolute token index for single texts
# and the relative token index when multiple texts are being compared.
# If you prefer, you can specify that you want an absolute scale
textplot_xray(
  kwic(tokens(data_corpus_inaugural[40:59]), "american"),
  kwic(tokens(data_corpus_inaugural[40:59]), "people"),
  kwic(tokens(data_corpus_inaugural[40:59]), "communis*"),
  scale = 'absolute'
)

# The object returned is a ggplot object, which can be modified using ggplot
plot <- textplot_xray(
  kwic(tokens(data_corpus_inaugural[40:59]), "american"),
  kwic(tokens(data_corpus_inaugural[40:59]), "people"),
  kwic(tokens(data_corpus_inaugural[40:59]), "communist"))
plot + aes(color = keyword) + scale_color_manual(values = c('red', 'blue', "green"))

#########################################################################
# Statistical summaries (2): Plotting wordclouds (Non-positional Analysis: i.e., analyses that DO NOT retain the original text sequence - bag of words)
#########################################################################

# One of the simplest statistical summary methods you can apply to a DfM is a tag cloud.
# A tag cloud is a visual representation of text data, in which tags are single words whose frequency is shown with different font sizes (and/or colors)
myCorpus <- corpus_subset(data_corpus_inaugural, Year > 1990)
summary(myCorpus)
tok2 <- tokens(myCorpus, remove_punct = TRUE, remove_numbers=TRUE, remove_symbols = TRUE,
               split_hyphens = TRUE, remove_separators = TRUE)
tok2 <- tokens_remove(tok2, stopwords("en"))
myDfm <- dfm(tok2)

# if you set a seed, you will get the same plot each time
set.seed(123)
textplot_wordcloud(myDfm, min_count = 6, rotation = .25, color = RColorBrewer::brewer.pal(8,"Dark2"))
textplot_wordcloud(myDfm, min_count = 10, color = c('red', 'pink', 'green', 'purple', 'orange', 'blue'))

# You can also plot a "comparison cloud", but this can only be done with fewer than eight documents.
# Let's plot, for example, a "comparison cloud" between Biden, Trump and Obama
corp2 <- corpus_subset(data_corpus_inaugural, President %in% c("Biden", "Trump", "Obama"))
summary(corp2)
# alternatively you could write:
corp3 <- corpus_subset(data_corpus_inaugural, President== "Biden" | President== "Trump" | President== "Obama")
summary(corp3)

tok2 <- tokens(corp2, remove_punct = TRUE, remove_numbers=TRUE, remove_symbols = TRUE,
               split_hyphens = TRUE, remove_separators = TRUE)
tok2 <- tokens_remove(tok2, stopwords("en"))
myDfm <- dfm(tok2)

# let's group the speeches made by the same President (such as the two speeches made by Obama) into one single dfm row
# using the function "dfm_group"
myDfm2 <- dfm_group(myDfm, groups = President)
ndoc(myDfm)  # 4 documents before grouping
ndoc(myDfm2) # 3 documents after grouping (the two speeches by Obama are compressed into 1)

set.seed(123)
textplot_wordcloud(myDfm2, comparison = TRUE, min_count = 5, color = c("blue", "green", "red"))
# how is it that the feature "america" is shown just for Trump? After all, "america" is employed in the other speeches as well!
kwic(tok2, "america", window = 1)
x <- kwic(tok2, "america", window = 1)
table(x$docname)
# Frequency of the feature "america" by President:
# Obama (both speeches): 14
# Trump: 18
# Biden: 18

# however, take a look at the different lengths of the speeches (once we keep only the features that appear at least 5 times,
# as in the plot above):
myDfm3 <- dfm_trim(myDfm2, min_termfreq = 5)
ntoken(myDfm3)

# A "comparison cloud" works as follows: for each feature,
# 1) it computes its relative frequency in each document
# 2) in the comparison wordcloud plot, it assigns the feature only to the document with the maximum value in 1)
# In our case:
table(x$docname)
ntoken(myDfm3)
# 1) relative frequency of "america": Biden = 18/541; Obama = 14/864; Trump = 18/321
# 2) as a result, "america" will be assigned to Trump (the highest relative frequency)

# what about the feature "us"?
kwic(tok2, "us", window = 1)
x <- kwic(tok2, "us", window = 1)
table(x$docname)
ntoken(myDfm3)
# 1) relative frequency of "us": Biden = 27/541; Obama = 44/864; Trump = 2/321
# 2) as a result, "us" will be assigned to Obama

set.seed(123)
textplot_wordcloud(myDfm2, comparison = TRUE, min_count = 5, color = c("blue", "green", "red"))

# we could also have used another document-level variable, such as Party, in our previous subsample of 3 Presidential speeches
myDfm2 <- dfm_group(myDfm, groups = Party)
set.seed(123)
textplot_wordcloud(dfm_trim(myDfm2, min_termfreq = 5, verbose = FALSE), comparison = TRUE, color = c("blue", "red"))

#########################################################################
# Statistical summaries (3): Lexical diversity (Non-positional Analysis)
#########################################################################

# Other quantitative summary measures of documents are designed to characterize specific qualities of texts.
# Comparing the rates of types and tokens forms the foundation for measures of lexical diversity (the rate of vocabulary usage),
# with the most common such measure comparing the number of types to the number of tokens (the "type-token ratio").
# For example, it is argued that populist communication implies a simplified political discourse (lower diversity), in an attempt to reach the public more easily.
# The textstat_lexdiv() function calculates lexical diversity according to various measures, based on the number of unique types of tokens
# and the length of a document
inaug_tokens <- tokens(data_corpus_inaugural)
tok2 <- tokens_remove(inaug_tokens, stopwords("en"))
inaug_dfm <- dfm(tok2)

# Note that when you run textstat_lexdiv it automatically removes numbers, punctuation, etc. from the dfm [w/o the need for you
# to specify that when you create your dfm: see ?textstat_lexdiv]
lexdiv <- textstat_lexdiv(inaug_dfm)
str(lexdiv)

# TTR is estimated as V/N, where
# V (types = total number of unique terms); N (tokens = total number of words in the dfm)
# (a manual check of this ratio is sketched right after the plot below)
head(lexdiv, 5)
tail(lexdiv, 5)

# a decrease in the level of complexity over time? let's see
ggplot(data=lexdiv, aes(x=document, y=TTR, group=1)) +
  geom_line() +
  geom_point() +
  theme_minimal() +
  scale_x_discrete(breaks=c("1789-Washington","1933-Roosevelt","1961-Kennedy", "2017-Trump")) +
  geom_smooth(method = "lm")
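# A minimal sketch of the TTR computation done by hand, via quanteda's ntype() (number of unique types per document)
# and ntoken() (number of tokens per document). Note that the values will not exactly match the TTR column of
# textstat_lexdiv() here, since - as noted above - textstat_lexdiv() first strips numbers, punctuation, etc. from the dfm,
# while ntype()/ntoken() work on inaug_dfm as it is.
ttr_manual <- ntype(inaug_dfm) / ntoken(inaug_dfm)
head(ttr_manual, 5)
head(lexdiv, 5)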
#########################################################################
# Statistical summaries (4): Comparing words associated with a target group vs. a reference group (Non-positional Analysis)
#########################################################################

# More sophisticated methods compare the differential occurrences of words across texts or partitions of a corpus, using statistical
# association measures, to identify the words that belong, for example, to different sub-groups of texts, such as those predominantly
# associated with male- versus female-authored documents, etc.
# In this respect, we can for example employ a chi2 test. A chi-squared test is used to determine whether there is a statistically significant
# difference between the expected and the observed frequencies in one or more categories of a contingency table (in our case,
# the frequencies of words in two different sets of texts). To understand a bit more about the chi2 test,
# please take a look at the Lab 1 EXTRA A pdf file on the course home-page
# (a small manual chi2 check is also sketched at the end of this section).

# The textstat_keyness command allows you to compare precisely the differential association of keywords in a target and a reference group.
# For example, let's try to understand whether there are some specific words associated with Trump vs. the other (more mainstream?) GOP Presidents
pres_GOP <- corpus_subset(data_corpus_inaugural, Year > 1946 & Party== "Republican")
summary(pres_GOP)

# let's create a document-level variable equal to "Trump" for President Trump and "Other GOP" otherwise
pres_GOP$Trump <- ifelse(pres_GOP$President=="Trump", "Trump", "Other GOP")
docvars(pres_GOP)

tok2 <- tokens(pres_GOP, remove_punct = TRUE, remove_numbers=TRUE, remove_symbols = TRUE,
               split_hyphens = TRUE, remove_separators = TRUE)
tok2 <- tokens_remove(tok2, stopwords("en"))
tok2 <- tokens_wordstem(tok2)
pres_GOP_dfm <- dfm(tok2)

# Let's group the documents according to the variable Trump
myDfm <- dfm_group(pres_GOP_dfm, groups = Trump)

result_keyness <- textstat_keyness(myDfm, target = "Trump")
textplot_keyness(result_keyness)

# Note, however, that if you look at the p-values, not all of them are below 0.05
head(result_keyness, 10)
tail(result_keyness, 10)

# Therefore, let's keep only the significant features
str(result_keyness)
result_keyness2 <- result_keyness[which(result_keyness$p<=0.05), ]
str(result_keyness2)

# let's add back the attributes of the data (i.e., "Trump", "Other GOP")
attr(result_keyness2, 'groups') <- c("Trump", "Other GOP")
str(result_keyness2)

# very interesting results (check "us" and "freedom" vs. "protect", "jobs" and "american")
textplot_keyness(result_keyness2)

# another example: Republican vs. Democratic presidents
myCorpus <- corpus_subset(data_corpus_inaugural, Year > 1945)
summary(myCorpus)
tok2 <- tokens(myCorpus, remove_punct = TRUE, remove_numbers=TRUE, remove_symbols = TRUE,
               split_hyphens = TRUE, remove_separators = TRUE)
tok2 <- tokens_remove(tok2, stopwords("en"))
myDfm <- dfm(tok2)
myDfm <- dfm_group(myDfm, groups = Party)
str(myDfm@docvars)

# Let's calculate keyness, setting Republican Presidents' speeches as the target group and Democratic Presidents' speeches as the reference group
result_keyness <- textstat_keyness(myDfm, target = "Republican")
result_keyness2 <- result_keyness[which(result_keyness$p<=0.05), ]
str(result_keyness2)
attr(result_keyness2, 'groups') <- c("Gop", "Dem")
textplot_keyness(result_keyness2)
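# A small manual check of the chi2 logic behind textstat_keyness, for a single feature.
# The feature "freedom" is picked here purely for illustration (any feature present in the grouped dfm works):
# we build the 2x2 contingency table of (this feature vs. all other features) x (Democratic vs. Republican)
# and run a standard chi-squared test on it; the statistic and p-value should be close to those reported
# by textstat_keyness for that feature (the continuity correction applied may differ slightly).
counts <- as.matrix(myDfm)   # grouped dfm: rows = "Democratic", "Republican"
feat <- "freedom"
tab <- rbind(feature_count  = counts[, feat],
             other_features = rowSums(counts) - counts[, feat])
tab
chisq.test(tab)
result_keyness[result_keyness$feature == feat, ]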
#########################################################################
# Statistical summaries (5): Cosine similarities (Non-positional Analysis)
#########################################################################

# "Cosine similarity" is an intuitive measure of semantic similarity.
# More in detail, the cosine similarity between two texts ranges between 0 and 1, where 0 is reached when two texts are completely
# different and 1 is reached when two texts have identical feature proportions (please take a look at the Lab 1 EXTRA B pdf file on the
# course home-page for more detailed info on how to compute the similarity between documents from a dfm)

# create a dfm from the inaugural addresses from Reagan onwards
myCorpus <- corpus_subset(data_corpus_inaugural, Year > 1980)
summary(myCorpus)
tok2 <- tokens(myCorpus, remove_punct = TRUE, remove_numbers=TRUE, remove_symbols = TRUE,
               split_hyphens = TRUE, remove_separators = TRUE)
tok2 <- tokens_remove(tok2, stopwords("en"))
tok2 <- tokens_wordstem(tok2)
presDfm <- dfm(tok2)
presDfm

# compute some document similarities
Simil <- textstat_simil(presDfm, margin = "documents", method = "cosine")
Simil

# Let's plot it!
Simil2 <- as.matrix(Simil)
str(Simil2)
corrplot(Simil2, method = 'number')
corrplot(Simil2, method = 'color')
corrplot(Simil2, method = 'shade', type = 'lower')

# for specific comparisons: here the two speeches by Obama (see also the manual computation sketched right below)
obamaSimil <- textstat_simil(presDfm, presDfm[c("2009-Obama", "2013-Obama"), ],
                             margin = "documents", method = "cosine")
obamaSimil
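# A minimal sketch of what method = "cosine" computes, done by hand for the two Obama speeches:
# the dot product of the two document vectors divided by the product of their Euclidean norms
# (the object names m, a, b are just my own choices for illustration).
m <- as.matrix(presDfm)
a <- m["2009-Obama", ]
b <- m["2013-Obama", ]
sum(a * b) / (sqrt(sum(a^2)) * sqrt(sum(b^2))) # should match the 2009-Obama vs. 2013-Obama value in obamaSimil (and in Simil)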