rm(list=ls(all=TRUE))

getwd()
# set your working directory (i.e., where you have saved the datasets, etc.)
setwd("C:/Users/luigi/Dropbox/TOPIC MODEL")
getwd()

library(readtext)
library(quanteda)
library(ggplot2)
library(quanteda.textstats)
library(quanteda.textplots)
library(SnowballC)
library(corrplot)

# With quanteda_options() you can get or set global options affecting functions across quanteda.
# One very useful feature is changing the number of threads used by parallelised functions.
# For instance, quanteda_options("threads" = 10) will use ten threads, which massively
# reduces the time needed to execute the parallelised functions.

#########################################################################
#########################################################################
# Creating and Working with a Corpus
#########################################################################
#########################################################################

#########################################################################
# There are several different ways to create a corpus in quanteda. Let's look at 3 of them
# FIRST: you already have a .csv file with one text per row
#########################################################################

# This dataset is a sample of 100 tweets from the Boston area discussing food.
# The data were collected through the Twitter API, also specifying the language and origin of the tweets.
# We will discuss how to retrieve this type of data in the next weeks

x <- read.csv("boston.csv", stringsAsFactors = FALSE)
str(x)

# You read the dataset via readtext by identifying the name of the column in the dataset that contains the texts (in this case "text")
myText2 <- readtext("boston.csv", text_field = "text")
str(myText2)

# You create your corpus
myCorpus2 <- corpus(myText2)
# Jargon: types = number of unique terms; tokens = number of words
head(summary(myCorpus2))

# number of documents in the corpus
ndoc(myCorpus2)

# Show the first text
as.character(myCorpus2)[1]
strwrap(as.character(myCorpus2)[1])
# Show the first 3 texts
as.character(myCorpus2)[1:3]
strwrap(as.character(myCorpus2)[1:3])

# Let's move from the corpus to the document-feature (term) matrix!
# In quanteda, we first tokenize the texts via tokens(), then we use the dfm() function to produce such a matrix,
# where documents are in rows and "features" (aka: words) are in columns
tokens(myCorpus2)
tok2 <- tokens(myCorpus2)
str(tok2)

# when you create a dfm, by default "tolower=TRUE", i.e., we convert all features to lowercase
myDfm <- dfm(tok2)

# Let's see the first five documents and the first 10 words of our dfm
myDfm[1:5, 1:10]
# Let's see the texts of the first two documents
as.character(myCorpus2)[1:2]

# 20 top features in the dfm
topfeatures(myDfm, 20)

# let's clean the dfm!
# FIRST: let's remove numbers, separators, etc.
tok2_clean <- tokens(myCorpus2, remove_punct = TRUE, remove_numbers = TRUE, remove_symbols = TRUE,
                     split_hyphens = TRUE, remove_separators = TRUE)

# SECOND: let's remove the stopwords
head(stopwords("english"), 20)
head(stopwords("russian"), 10)
head(stopwords("italian"), 10)
# the source "marimo" is a better option for some languages (in particular Asian languages: Japanese,
# Chinese, Korean; but also Arabic and Hebrew).
# See: https://github.com/koheiw/marimo
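# A quick optional check: the stopwords package that quanteda relies on can list
# all the stopword sources and languages it ships with, which is handy before
# choosing a source such as "snowball" or "marimo"
stopwords::stopwords_getsources()
stopwords::stopwords_getlanguages(source = "snowball")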
stopwords("en")
stopwords("en", source = "marimo")

# The stopwords options available in quanteda (based on the Snowball stopwords list: see http://snowball.tartarus.org/)
# work for all the main European languages (ftp://cran.r-project.org/pub/R/web/packages/stopwords/stopwords.pdf).
getStemLanguages()
# For other languages things are a bit more complex. For Arabic, for example, a good source is also the stemming package 'arabicStemR'

tok2_clean_stop <- tokens_remove(tok2_clean, stopwords("english"))

# THIRD: let's stem the words
tok2_clean_stop_stem <- tokens_wordstem(tok2_clean_stop, language = "english")

# Let's now re-create the dfm
myDfm2 <- dfm(tok2_clean_stop_stem)
topfeatures(myDfm, 50)
topfeatures(myDfm2, 20)
# the features "for", "with" and "the" have disappeared! while #smoked is now #smoke

# still some symbols to remove! for example the words starting with "00" (they are Unicode characters, e.g. "\U00BD")
tok2_clean_stop_stem2 <- tokens_remove(tok2_clean_stop_stem, c("rt", "00*", "ed", "u"))
myDfm2 <- dfm(tok2_clean_stop_stem2)
topfeatures(myDfm2, 20) # 20 top features [better!]

# We can also create a dfm keeping only some specific words, such as, for example, only the hashtags
# in the tweets, by using dfm_select() with pattern = "#*"
dfm_hashtag <- dfm_select(myDfm, pattern = "#*")

topfeatures(myDfm, 20) # 20 top features
topfeatures(dfm_hashtag, 20) # 20 top features

# You can plot the frequency of the top features in a dfm using textstat_frequency()
features_dfm <- textstat_frequency(dfm_hashtag, n = 20)
features_dfm
str(features_dfm)

# Sort by reverse frequency order
features_dfm$feature <- with(features_dfm, reorder(feature, -frequency))

ggplot(features_dfm, aes(x = feature, y = frequency)) +
  geom_point() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

# Note that you can also remove stopwords with dfm_remove(), after you have created a dfm
dfm_remove(myDfm, pattern = stopwords("en"))

# You can also decide to exclude some features. For example, let's exclude all the hashtags
dfm_NOhashtag <- dfm_remove(myDfm, pattern = "#*")

topfeatures(myDfm, 20) # 20 top features
topfeatures(dfm_hashtag, 20) # 20 top features
topfeatures(dfm_NOhashtag, 20) # 20 top features

# trimming the dfm
myDfm[1:10, 1:10]
# keep only words occurring >= 10 times and in >= 2 documents
dfm_trim(myDfm, min_termfreq = 10, min_docfreq = 2)
# keep only words occurring >= 20 times and in at least 40% of the documents
dfm_trim(myDfm, min_termfreq = 20, min_docfreq = 0.4)
# keep only words occurring <= 10 times and in <= 2 documents
dfm_trim(myDfm, max_termfreq = 10, max_docfreq = 2)
# keep only words with a relative frequency of at least 5 per 1,000 tokens and occurring in at least 40% of the documents
dfm_trim(myDfm, min_docfreq = 0.4, min_termfreq = 0.005, termfreq_type = "prop")
# keep only words occurring in all the 100 documents of my corpus
dfm_trim(myDfm, min_termfreq = 1, min_docfreq = 100)

# weighting a dfm according to the relative term frequency, i.e.,
# normalizing a dfm by considering the proportions of the feature counts within each document
myDfm_weight <- dfm_weight(myDfm, scheme = "prop")
# compare the two matrices below (the first one: unweighted; the second one: weighted)
myDfm[1:5, 1:5]
myDfm_weight[1:5, 1:5]

# weighting a dfm by tf-idf
# remember: tf-idf adds a weight that approaches zero as the number of documents in which a term appears
# (in any frequency) approaches the number of documents in the collection. And indeed here #dinner is close to 0!
myDfm_tf <- dfm_tfidf(myDfm)
myDfm[1:5, 1:5]
myDfm_tf[1:5, 1:5]
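# A tiny toy example to make the tf-idf weighting concrete, using dfm_tfidf()'s
# defaults (tf = raw count; idf = log10(number of documents / document frequency)):
# "apple" appears in both toy documents, so its idf (and hence its tf-idf) is 0,
# while "pear" and "banana" appear in one document each: 1 * log10(2/1) = 0.30
toy <- dfm(tokens(c(d1 = "apple apple pear", d2 = "apple banana")))
dfm_tfidf(toy)
as.matrix(dfm_tfidf(toy))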
# By collocation analysis, we can identify contiguous collocations of words (this will become very important when we discuss
# word embeddings!)
# In this example, we want to identify which pairs of words (size = 2) appear together most often, keeping only collocations occurring at least 10 times
# You can apply it either to a corpus or to a tokens object. Of course the results in the two cases will differ if,
# as in our case, we have already cleaned the texts when creating our tokens object
textstat_collocations(myCorpus2, size = 2, min_count = 10)
textstat_collocations(tok2_clean_stop_stem2, size = 2, min_count = 10)

#########################################################################
# SECOND: you write your own corpus
#########################################################################

testText <- "The quick brown fox named Seamus jumps over the dog also named Seamus, with the newspaper from a boy named Seamus, in his mouth"
testText
testCorpus <- corpus(testText)

tok3 <- tokens(testCorpus, remove_punct = TRUE, remove_numbers = TRUE, remove_symbols = TRUE,
               split_hyphens = TRUE, remove_separators = TRUE)
tok3 <- tokens_remove(tok3, stopwords("en"))
tok3 <- tokens_wordstem(tok3)
myDfm <- dfm(tok3)
topfeatures(myDfm, 10) # 10 top features

#########################################################################
# THIRD: you have saved in a directory a set of files (one for each document)
#########################################################################

# SOURCE: http://www.presidency.ucsb.edu/inaugurals.php

# it is always better to save txt files in the UTF-8 format. If you have saved them in some other format (ISO, etc.),
# it's a good idea to specify it via the encoding argument
# Note that readtext also reads other file formats: check "?readtext"
# our txt files are included in the folder called "Inaugural Speeches" in our working directory
myText <- readtext("Inaugural Speeches/*.txt", encoding = "UTF-8")
str(myText)

# we can actually extract three pieces of info from each file name (Name, Surname, Year: e.g., "George_Washington_1789.txt")
# Note: to use the docvarsfrom = "filenames" option, the file names should be consistent, i.e., in the example below ALL the txt files
# should follow the same ordering (Name, Surname, Year) with the same separator (i.e., "_")
myText <- readtext("Inaugural Speeches/*.txt", docvarsfrom = "filenames", dvsep = "_",
                   docvarnames = c("Name", "Surname", "Year"), encoding = "UTF-8")
str(myText)

testCorpus <- corpus(myText)
summary(testCorpus)
# inspect the document-level variables (this is a very important option, as we will see later on...)
head(docvars(testCorpus))

tok4 <- tokens(testCorpus, remove_punct = TRUE, remove_numbers = TRUE, remove_symbols = TRUE,
               split_hyphens = TRUE, remove_separators = TRUE)
tok4 <- tokens_remove(tok4, stopwords("en"))
tok4 <- tokens_wordstem(tok4)
myDfm <- dfm(tok4)
topfeatures(myDfm, 20) # 20 top words
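# A small illustrative extra (assuming the folder contains files named like
# "George_Washington_1789.txt", as in the example above): the document-level
# variables read from the file names can be used right away, e.g. to subset the
# corpus or to group the dfm
summary(corpus_subset(testCorpus, Surname == "Washington"))
topfeatures(dfm_group(myDfm, groups = Surname), 10)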
#########################################################################
# Playing with the corpus
#########################################################################

# let's load a corpus already included in quanteda: the corpus of all the US Presidents' Inaugural Speeches
# To summarize the texts of a corpus, we can call the summary() method defined for a corpus.
summary(data_corpus_inaugural)
# inspect the document-level variables
head(docvars(data_corpus_inaugural))

as.character(data_corpus_inaugural)[1]
as.character(data_corpus_inaugural)[2]

# to see the entire Trump speech
trump <- corpus_subset(data_corpus_inaugural, President == "Trump")
strwrap(as.character(trump[[1]]))

# Adding two corpora together
# First five inaug. speeches
mycorpus1 <- corpus(data_corpus_inaugural[1:5])
# Six recent inaug. speeches (documents 53 to 58)
mycorpus2 <- corpus(data_corpus_inaugural[53:58])
mycorpus3 <- mycorpus1 + mycorpus2
summary(mycorpus3)

# subsetting a corpus
summary(corpus_subset(data_corpus_inaugural, Year > 1990))
summary(corpus_subset(data_corpus_inaugural, President == "Adams"))
summary(corpus_subset(data_corpus_inaugural, Year > 1990 & Party == "Republican"))

#########################################################################
# Statistical summaries (1): Lexical dispersion plot (Positional Analysis)
#########################################################################

# The kwic function (keywords-in-context) performs a search for a word in a corpus and allows us to view the contexts in which it occurs
# NOTE: by working on a tokens object, we retain the original text sequence!
options(width = 200)
kwic(tokens(data_corpus_inaugural), "terror")
# also the words starting with "terror", including "terrorism"
kwic(tokens(data_corpus_inaugural), "terror", valuetype = "regex")
# same result as above
kwic(tokens(data_corpus_inaugural), "terror*")
kwic(tokens(data_corpus_inaugural), "communist*")

# Note that by default, kwic() is word-based. If you want to look up a multi-word combination, use phrase()
kwic(tokens(data_corpus_inaugural), phrase("by terror"))

# We can plot a kwic object via a lexical dispersion plot
textplot_xray(kwic(tokens(data_corpus_inaugural[40:59]), "american"))

textplot_xray(
  kwic(tokens(data_corpus_inaugural[40:59]), "american"),
  kwic(tokens(data_corpus_inaugural[40:59]), "people"),
  kwic(tokens(data_corpus_inaugural[40:59]), "communist")
)

# If you're only plotting a single document, but with multiple keywords, then the keywords are displayed
# one below the other rather than side-by-side.
textplot_xray(
  kwic(tokens(corpus_subset(data_corpus_inaugural, Year > 2015 & Year < 2020)), "america"),
  kwic(tokens(corpus_subset(data_corpus_inaugural, Year > 2015 & Year < 2020)), "people"),
  kwic(tokens(corpus_subset(data_corpus_inaugural, Year > 2015 & Year < 2020)), "chief")
)

# You might also have noticed that the x-axis scale is the absolute token index for single texts
# and the relative token index when multiple texts are being compared.
# If you prefer, you can specify that you want an absolute scale
textplot_xray(
  kwic(tokens(data_corpus_inaugural[40:59]), "american"),
  kwic(tokens(data_corpus_inaugural[40:59]), "people"),
  kwic(tokens(data_corpus_inaugural[40:59]), "communist"),
  scale = "absolute"
)

# The object returned is a ggplot object, which can be modified using ggplot
plot <- textplot_xray(
  kwic(tokens(data_corpus_inaugural[40:59]), "american"),
  kwic(tokens(data_corpus_inaugural[40:59]), "people"),
  kwic(tokens(data_corpus_inaugural[40:59]), "communist"))
plot + aes(color = keyword) + scale_color_manual(values = c("red", "blue", "green"))
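# Since textplot_xray() returns a ggplot object, we can keep customizing it with
# standard ggplot2 layers and save it to disk with ggsave(); the title and file
# name below are just illustrative examples
plot2 <- plot +
  aes(color = keyword) +
  scale_color_manual(values = c("red", "blue", "green")) +
  ggtitle("Lexical dispersion of selected keywords (inaugural speeches 40-59)")
ggsave("xray_keywords.pdf", plot2, width = 10, height = 6)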
#########################################################################
# Statistical summaries (2): Plotting the wordclouds (Non-positional Analysis)
#########################################################################

# One of the simplest statistical summary methods you can apply to a DfM is a tag cloud.
# A tag cloud is a visual representation of text data, in which tags are single words whose frequency is shown with different font sizes (and/or colors)
myCorpus <- corpus_subset(data_corpus_inaugural, Year > 1990)
summary(myCorpus)

tok2 <- tokens(myCorpus, remove_punct = TRUE, remove_numbers = TRUE, remove_symbols = TRUE,
               split_hyphens = TRUE, remove_separators = TRUE)
tok2 <- tokens_remove(tok2, stopwords("en"))
tok2 <- tokens_wordstem(tok2)
myDfm <- dfm(tok2)

# if you set a seed, you will always get the same plot
set.seed(123)
textplot_wordcloud(myDfm, min_count = 6, rotation = 0.25,
                   color = RColorBrewer::brewer.pal(8, "Dark2"))
textplot_wordcloud(myDfm, min_count = 10,
                   color = c("red", "pink", "green", "purple", "orange", "blue"))

# You can also plot a "comparison cloud", but this can only be done with fewer than eight documents:
corp2 <- corpus_subset(data_corpus_inaugural,
                       President %in% c("Washington", "Jefferson", "Madison"))
summary(corp2)

tok2 <- tokens(corp2, remove_punct = TRUE, remove_numbers = TRUE, remove_symbols = TRUE,
               split_hyphens = TRUE, remove_separators = TRUE)
tok2 <- tokens_remove(tok2, stopwords("en"))
tok2 <- tokens_wordstem(tok2)
myDfm <- dfm(tok2)

# let's group the speeches made by the same President (such as the two speeches made by Washington) into a single dfm row;
# the command we can use is dfm_group()
myDfm2 <- dfm_group(myDfm, groups = President)

str(myDfm)
ndoc(myDfm)  # 6 documents
str(myDfm2)
ndoc(myDfm2) # 3 documents!

set.seed(123)
textplot_wordcloud(myDfm2, comparison = TRUE)

# let's plot the trimmed dfm
set.seed(123)
textplot_wordcloud(dfm_trim(myDfm2, min_termfreq = 5, verbose = FALSE), comparison = TRUE)

# Exercise: let's plot a "comparison cloud" between Biden, Trump and Obama
corp2 <- corpus_subset(data_corpus_inaugural, President %in% c("Biden", "Trump", "Obama"))

tok2 <- tokens(corp2, remove_punct = TRUE, remove_numbers = TRUE, remove_symbols = TRUE,
               split_hyphens = TRUE, remove_separators = TRUE)
tok2 <- tokens_remove(tok2, stopwords("en"))
tok2 <- tokens_wordstem(tok2)
myDfm <- dfm(tok2)
myDfm2 <- dfm_group(myDfm, groups = President)

set.seed(123)
textplot_wordcloud(dfm_trim(myDfm2, min_termfreq = 5, verbose = FALSE), comparison = TRUE)
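# A possible variation of the exercise above: dfm_group() accepts any document-level
# variable, so we could also compare word usage by Party rather than by President
# (this corpus subset contains only Democratic and Republican speeches, i.e. two groups)
myDfm_party <- dfm_group(myDfm, groups = Party)
set.seed(123)
textplot_wordcloud(dfm_trim(myDfm_party, min_termfreq = 5, verbose = FALSE), comparison = TRUE)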
#########################################################################
# Statistical summaries (3): Frequency plots comparison (Non-positional Analysis)
#########################################################################

# If you want to compare the frequency of a single term across different texts,
# you can also use textstat_frequency(), group the frequency by speech, and extract the term.
pres_corpus <- corpus_subset(data_corpus_inaugural, President %in% c("Obama", "Trump", "Biden"))
summary(pres_corpus)

tok2 <- tokens(pres_corpus, remove_punct = TRUE, remove_numbers = TRUE, remove_symbols = TRUE,
               split_hyphens = TRUE, remove_separators = TRUE)
tok2 <- tokens_remove(tok2, stopwords("en"))
pres_dfm <- dfm(tok2)
str(pres_dfm)
docvars(pres_dfm)

# Let's first create a new string variable combining the name of the President and the Year
docvars(pres_dfm, "Year_str") <- paste0(docvars(pres_dfm, "President"), " ",
                                        as.character(docvars(pres_dfm, "Year")))
docvars(pres_dfm)

freq_grouped <- textstat_frequency(pres_dfm, groups = Year_str)
str(freq_grouped)

# Filter the term "american"
freq_american <- subset(freq_grouped, freq_grouped$feature %in% "american")

ggplot(freq_american, aes(x = group, y = frequency)) +
  geom_point() +
  scale_y_continuous(limits = c(0, 14), breaks = c(seq(0, 14, 2))) +
  xlab(NULL) +
  ylab("Frequency of word - american -") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

# textstat_frequency() also allows us to plot the most frequent words by group
# Calculate frequency by Presidential speech - first 10 words for each speech
freq_weight <- textstat_frequency(pres_dfm, n = 10, groups = Year_str)

ggplot(data = freq_weight, aes(x = nrow(freq_weight):1, y = frequency)) +
  geom_point() +
  facet_wrap(~ group, scales = "free") +
  coord_flip() +
  scale_x_continuous(breaks = nrow(freq_weight):1, labels = freq_weight$feature) +
  labs(x = NULL, y = "Frequency")

#########################################################################
# Statistical summaries (4): Comparing words associated with a target group vs. reference group (Non-positional Analysis)
#########################################################################

# More sophisticated methods compare the differential occurrences of words across texts or partitions of a corpus, using statistical association measures,
# to identify the words that belong primarily to sub-groups, such as those predominantly associated with male- versus female-authored documents, or Democratic versus Republican speeches
# In particular, if you want to compare the differential associations of keywords in a target and a reference group,
# you can calculate "keyness", which is based on the textstat_keyness() command.
# In this example, we compare the inaugural speech by Donald Trump with the speech by Joe Biden
pres_corpus <- corpus_subset(data_corpus_inaugural, President %in% c("Trump", "Biden"))
summary(pres_corpus)

# Create a dfm of the two speeches
tok2 <- tokens(pres_corpus, remove_punct = TRUE, remove_numbers = TRUE, remove_symbols = TRUE,
               split_hyphens = TRUE, remove_separators = TRUE)
tok2 <- tokens_remove(tok2, stopwords("en"))
pres_dfm <- dfm(tok2)
docnames(pres_dfm)

# Calculate keyness and set Trump as the target group
result_keyness <- textstat_keyness(pres_dfm, target = "2017-Trump")
# if you get a negative value, it means that Biden uses that word relatively more than Trump (i.e., the target group), and vice versa

# Plot estimated word keyness
textplot_keyness(result_keyness)

# what is chi2? The chi-squared test is used to determine whether there is a statistically significant difference between
# the expected frequencies and the observed frequencies in one or more categories of a contingency table (in our case
# we are talking about the frequencies of words in two different sets of texts)
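# Note that chi2 is just the default: textstat_keyness() also implements other
# association measures, e.g. the log-likelihood ratio ("lik") or pointwise mutual
# information ("pmi"). A quick sketch with the likelihood ratio (rankings are
# usually similar, but not identical, to chi2):
result_keyness_lr <- textstat_keyness(pres_dfm, target = "2017-Trump", measure = "lik")
head(result_keyness_lr, 10)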
# Plot without the reference text (in this case Biden)
textplot_keyness(result_keyness, show_reference = FALSE)

head(result_keyness, 10)
tail(result_keyness, 10)

#########################################################################
# Statistical summaries (5): Lexical diversity (Non-positional Analysis)
#########################################################################

# Other quantitative summary measures of documents are designed to characterize specific qualities of texts
# Comparing the rates of types and tokens forms the foundation for measures of lexical diversity (the rate of vocabulary usage), with the most common such measure
# comparing the number of types to the number of tokens (the "type-token ratio")
# For example, it is argued that populist communication implies a simplified political discourse (lower diversity), in an attempt to reach the public more easily
# The textstat_lexdiv() command calculates precisely such lexical diversity measures, based on the number of unique types of tokens
# and the length of a document. It is useful for analysing speakers' or writers' linguistic skill, or the complexity
# of ideas expressed in documents.
inaug_tokens <- tokens(data_corpus_inaugural)
tok2 <- tokens_remove(inaug_tokens, stopwords("en"))
inaug_dfm <- dfm(tok2)

lexdiv <- textstat_lexdiv(inaug_dfm)
str(lexdiv)

ggplot(data = lexdiv, aes(x = document, y = TTR, group = 1)) +
  geom_line() +
  geom_point() +
  theme_minimal() +
  scale_x_discrete(breaks = c("1789-Washington", "1933-Roosevelt", "1961-Kennedy", "2017-Trump"))

# TTR is estimated as V/N, where
# V (types = total number of unique terms); N (tokens = total number of words in the dfm)
head(lexdiv, 5)
tail(lexdiv, 5)

# when you run textstat_lexdiv() it automatically removes numbers, punctuation, symbols, etc. [without the need for you
# to specify that when you create your dfm: see ?textstat_lexdiv]
# that's why the ratio between Types and Tokens that you get from the corpus summary gives you a different value:
summary(data_corpus_inaugural)
# for Washington 1789: 625/1537
head(lexdiv, 1)
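# A quick sanity check (sketch): TTR is just V/N, so we can also compute it by hand
# from the dfm with ntype() (unique features per document) and ntoken() (total counts).
# The values are close to, but not identical with, textstat_lexdiv(), because the
# latter drops numbers, punctuation, etc. by default
ttr_manual <- ntype(inaug_dfm) / ntoken(inaug_dfm)
head(ttr_manual, 3)
head(lexdiv, 3)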
#########################################################################
# Statistical summaries (6): Cosine similarities (Non-positional Analysis)
#########################################################################

# cosine similarity is an intuitive measure of semantic distance that has been increasingly used in a number of fields in the social sciences.
# By representing texts as vectors in a Cartesian space, cosine similarity estimates the difference between two texts based on their vectors
# of word occurrences. The resulting similarity between two texts ranges between 0 and 1, where 0 is reached when two texts are completely
# different and 1 is reached when two texts have identical feature proportions

# create a dfm from the inaugural addresses from Reagan onwards
myCorpus <- corpus_subset(data_corpus_inaugural, Year > 1980)
# alternative way to get the speeches after 1980 [note the use of the logical operator "&"]
# myCorpus <- corpus_subset(data_corpus_inaugural, Year > 1980 & Year < 2021)
summary(myCorpus)

tok2 <- tokens(myCorpus, remove_punct = TRUE, remove_numbers = TRUE, remove_symbols = TRUE,
               split_hyphens = TRUE, remove_separators = TRUE)
tok2 <- tokens_remove(tok2, stopwords("en"))
tok2 <- tokens_wordstem(tok2)
presDfm <- dfm(tok2)
presDfm

# compute some document similarities
Simil <- textstat_simil(presDfm, margin = "documents", method = "cosine")
Simil

# Let's plot it!
Simil2 <- as.matrix(Simil)
str(Simil2)
corrplot(Simil2, method = 'number')
corrplot(Simil2, method = 'color')
corrplot(Simil2, method = 'shade', type = 'lower')

# for specific comparisons: here the two speeches by Obama
obamaSimil <- textstat_simil(presDfm, presDfm[c("2009-Obama", "2013-Obama"), ],
                             margin = "documents", method = "cosine")
obamaSimil

# compute some term similarities
tstat3 <- textstat_simil(presDfm, presDfm[, c("fair", "health", "terror")],
                         method = "cosine", margin = "features")
head(as.matrix(tstat3), 10)
as.list(tstat3, n = 6)
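# A final sketch: cosine is only one of the available methods. textstat_simil() also
# implements e.g. "jaccard" or "correlation", and textstat_dist() returns distance
# measures such as the (default) Euclidean distance between documents
textstat_simil(presDfm, margin = "documents", method = "jaccard")
textstat_dist(presDfm, margin = "documents", method = "euclidean")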