rm(list = ls(all = TRUE))

getwd()
# Set your working directory (i.e., where you have saved the datasets, etc.)
# In my case my working directory is C:/Users/luigi/Dropbox/TOPIC MODEL
setwd("C:/Users/luigi/Dropbox/TOPIC MODEL")
getwd()

library(readtext)
library(quanteda)
library(ggplot2)
library(quanteda.textstats)

print(sessionInfo())
print(version)

#########################################################################
#########################################################################
# Creating and Working with a Corpus
#########################################################################
#########################################################################

#########################################################################
# There are several different ways to create a corpus in quanteda. Let's look at three of them.
# FIRST: you already have a .csv file with one text per row
#########################################################################

# This dataset is a sample of 100 tweets from the Boston area discussing food.
# The data were collected through the Twitter API, also specifying the language and origin of the tweets.
# We will discuss how to retrieve this type of data in the coming weeks.
x <- read.csv("boston.csv", stringsAsFactors = FALSE)
str(x)

# Read the dataset via readtext, identifying the column of the dataset that contains the texts (in this case "text")
myText2 <- readtext("boston.csv", text_field = "text")
str(myText2)

# Create your corpus
myCorpus2 <- corpus(myText2)

# Jargon: types = number of unique terms; tokens = total number of words
head(summary(myCorpus2))

# Another useful command to summarize a corpus
head(textstat_summary(myCorpus2))

# Number of documents in the corpus
ndoc(myCorpus2)

# Show the first text
texts(myCorpus2)[1]

# Other ways to recover information about your corpus
print(myCorpus2, max_ndoc = 2, max_nchar = 250)
print(tokens(myCorpus2), max_ndoc = 3, max_ntok = 10)

# Let's move from the corpus to the document-feature (term) matrix!
# In quanteda, we use the dfm() function to produce such a matrix, where documents are rows
# and "features" (i.e., words) are columns
dfm(myCorpus2)
myDfm <- dfm(myCorpus2)

# Let's see the first two documents and the first 5 words of our dfm
myDfm[1:2, 1:5]
# Let's see the texts of the first two documents
texts(myCorpus2)[1:2]

# 20 top features in the dfm
topfeatures(myDfm, 20)

# There are a lot of symbols: let's remove them! And let's also use the "stopwords" and "stem" options
head(stopwords("english"), 20)
head(stopwords("russian"), 10)
head(stopwords("italian"), 10)

# The source "marimo" is a better option for some languages (in particular Asian languages such as
# Japanese, Chinese and Korean, but also Arabic and Hebrew). See: https://github.com/koheiw/marimo
stopwords("en")
stopwords("en", source = "marimo")

# The stopword lists available in quanteda (based on the Snowball stopwords list: see http://snowball.tartarus.org/)
# work for all the main European languages (ftp://cran.r-project.org/pub/R/web/packages/stopwords/stopwords.pdf).
# For other languages things are a bit more complex.
# For Arabic, for example, a good additional resource is the stemming package 'arabicStemR'.
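# A small sketch (hedged): the "stopwords" package behind quanteda also offers alternative sources,
# e.g. "stopwords-iso", which covers many more languages than Snowball; and you can always combine
# a built-in list with your own domain-specific terms. Here "rt" and "amp" are just illustrative
# Twitter noise words, not part of any official list.
head(stopwords("it", source = "stopwords-iso"), 10)   # Italian list from an alternative source
my_stopwords <- c(stopwords("english"), "rt", "amp")  # extend the English Snowball list
tail(my_stopwords, 5)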
myDfm <- dfm(myCorpus2, remove = stopwords("english"), tolower = TRUE, stem = TRUE,
             remove_punct = TRUE, remove_numbers = TRUE)
topfeatures(myDfm, 20)  # 20 top words [a bit better]

# Suppose I want to remove "+", "<", ">", "rt" and all the tokens that begin with "00" (unicode symbols)
myDfm <- dfm(myCorpus2, remove = c(stopwords("english"), "+", "<", ">", "rt", "00*"),
             tolower = TRUE, stem = TRUE, remove_punct = TRUE, remove_numbers = TRUE)
topfeatures(myDfm, 20)  # 20 top words

# We can also create a dfm keeping only some specific features, for example only the hashtags
# in the tweets, by using select = "#*" when creating the dfm.
dfm_hashtag <- dfm(myCorpus2, select = "#*")
topfeatures(myDfm, 20)        # 20 top words of the full dfm (for comparison)
topfeatures(dfm_hashtag, 20)  # 20 top hashtags

# You can plot the frequency of the top features of a dfm using textstat_frequency()
features_dfm <- textstat_frequency(dfm_hashtag, n = 20)
features_dfm
str(features_dfm)

# Sort by reverse frequency order
features_dfm$feature <- with(features_dfm, reorder(feature, -frequency))

ggplot(features_dfm, aes(x = feature, y = frequency)) +
  geom_point() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

# Trimming the dfm
myDfm[1:10, 1:10]

# Keep only words occurring >= 10 times and in >= 2 documents
dfm_trim(myDfm, min_termfreq = 10, min_docfreq = 2)
# Keep only words occurring >= 20 times and in at least 40% of the documents
dfm_trim(myDfm, min_termfreq = 20, min_docfreq = 0.4, docfreq_type = "prop")
# Keep only words occurring <= 10 times and in <= 2 documents
dfm_trim(myDfm, max_termfreq = 10, max_docfreq = 2)
# Keep only words occurring at least 5 times in 1,000, and in at least 2 out of 5 (40%) of the documents
dfm_trim(myDfm, min_termfreq = 0.005, termfreq_type = "prop", min_docfreq = 0.4, docfreq_type = "prop")
# Keep only words occurring in all the 100 documents of my corpus
dfm_trim(myDfm, min_termfreq = 1, min_docfreq = 100)

# Weighting a dfm according to the relative term frequency, i.e.,
# normalizing the dfm by taking the proportions of the feature counts within each document
myDfm_weight <- dfm_weight(myDfm, scheme = "prop")
# Compare the two matrices below (the first one unweighted, the second one weighted)
myDfm[1:5, 1:5]
myDfm_weight[1:5, 1:5]
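# A quick sanity check (sketch): with scheme = "prop", each non-empty document's row of the
# weighted dfm should sum to 1, since the counts have been turned into within-document proportions.
head(rowSums(myDfm_weight))
head(rowSums(myDfm))  # for comparison: the raw token counts per document (after cleaning)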
# Weighting a dfm by tf-idf.
# Remember: tf-idf assigns a weight that approaches zero as the number of documents in which a term
# appears (in any frequency) approaches the number of documents in the collection. And indeed, here
# the weight of "#dinner" is close to 0!
myDfm_tf <- dfm_tfidf(myDfm)
myDfm[1:5, 1:5]
myDfm_tf[1:5, 1:5]

# With collocation analysis, we can identify contiguous collocations of words.
# In this example, we want to know which pairs of words (size = 2) appear together
# at least 10 times (min_count = 10) in the corpus.
textstat_collocations(myCorpus2, size = 2, min_count = 10)

#########################################################################
# SECOND: you write your own corpus
#########################################################################

testText <- "The quick brown fox named Seamus jumps over the lazy dog also named Seamus, with the newspaper from a boy named Seamus, in his mouth"
testText
testCorpus <- corpus(testText)
myDfm <- dfm(testCorpus, remove = stopwords("english"), tolower = TRUE, stem = TRUE,
             remove_punct = TRUE, remove_numbers = TRUE)
# Note: "also" is not in the default stopwords("english")
topfeatures(myDfm, 20)  # 20 top words

#########################################################################
# THIRD: you have saved in a directory a set of files (one for each document)
#########################################################################
# SOURCE: http://www.presidency.ucsb.edu/inaugurals.php

# It is always better to save txt files in the UTF-8 format. If you have saved them in some other
# format (ISO, etc.), specify it in the encoding argument.
# Note that readtext also reads other file formats: check "?readtext".
# Our txt files are in the folder called "Inaugural Speeches" inside our working directory.
# To use the docvarsfrom = "filenames" option, the file names must be consistent, i.e., in the example
# below ALL the txt file names must follow the same ordering: Name_Surname_Year.
myText <- readtext("Inaugural Speeches/*.txt",
                   docvarsfrom = "filenames",
                   dvsep = "_",
                   docvarnames = c("Name", "Surname", "Year"),
                   encoding = "UTF-8")
str(myText)
testCorpus <- corpus(myText)
summary(testCorpus)
myDfm <- dfm(testCorpus, remove = stopwords("english"), tolower = TRUE, stem = TRUE,
             remove_punct = TRUE, remove_numbers = TRUE)
topfeatures(myDfm, 20)  # 20 top words

#########################################################################
# Running example: playing with the corpus
#########################################################################

# Let's load a corpus already included in quanteda: the corpus of all the US Presidents' inaugural speeches.
# To summarize the texts of a corpus, we can call the summary() method defined for a corpus.
summary(data_corpus_inaugural)

# Inspect the document-level variables (this is a very important option, as we will see later on...)
head(docvars(data_corpus_inaugural))

# To extract texts from a corpus, we use an extractor, called texts()
texts(data_corpus_inaugural)[1]
texts(data_corpus_inaugural)[2]

# To see the entire Trump speech
trump <- corpus_subset(data_corpus_inaugural, President == "Trump")
texts(trump)[1]
strwrap(trump[[1]])
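# A small illustrative sketch building on docvars() and corpus_subset(): any document-level
# variable can be tabulated or used for subsetting (only functions already shown above are used).
table(docvars(data_corpus_inaugural, "Party"))
republicans <- corpus_subset(data_corpus_inaugural, Party == "Republican")
ndoc(republicans)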
# Adding two corpora together
# The first five inaugural speeches
mycorpus1 <- corpus(data_corpus_inaugural[1:5])
# The last six inaugural speeches
mycorpus2 <- corpus(data_corpus_inaugural[53:58])
mycorpus3 <- mycorpus1 + mycorpus2
summary(mycorpus3)

# Subsetting a corpus
summary(corpus_subset(data_corpus_inaugural, Year > 1990))
summary(corpus_subset(data_corpus_inaugural, President == "Adams"))
summary(corpus_subset(data_corpus_inaugural, Year > 1990 & Party == "Republican"))

#########################################################################
# Statistical summaries (1): Lexical dispersion plot
#########################################################################

# The kwic() function (keywords-in-context) performs a search for a word and allows us to view
# the contexts in which it occurs:
options(width = 200)
kwic(data_corpus_inaugural, "terror")
# Also the words containing "terror", including "terrorism"
kwic(data_corpus_inaugural, "terror", valuetype = "regex")
kwic(data_corpus_inaugural, "communist*")

# We can also plot a kwic object. This produces a lexical dispersion plot, which allows us to
# visualize the occurrences of particular terms throughout the text.
textplot_xray(kwic(data_corpus_inaugural[40:58], "american"))

textplot_xray(
  kwic(data_corpus_inaugural[40:58], "american"),
  kwic(data_corpus_inaugural[40:58], "people"),
  kwic(data_corpus_inaugural[40:58], "communist")
)

# If you are only plotting a single document, but with multiple keywords, then the keywords are
# displayed one below the other rather than side-by-side.
trump <- corpus_subset(data_corpus_inaugural, Year > 2015)
textplot_xray(
  kwic(trump, "america"),
  kwic(trump, "people"),
  kwic(trump, "chief")
)

# You might also have noticed that the x-axis scale is the absolute token index for single texts
# and the relative token index when multiple texts are being compared.
# If you prefer, you can specify an absolute scale:
textplot_xray(
  kwic(data_corpus_inaugural[40:58], "american"),
  kwic(data_corpus_inaugural[40:58], "people"),
  kwic(data_corpus_inaugural[40:58], "communist"),
  scale = "absolute"
)

# The object returned is a ggplot object, which can be modified using ggplot2
# (see the sketch at the end of the wordcloud section below).

#########################################################################
# Statistical summaries (2): Plotting the wordclouds
#########################################################################

myCorpus <- corpus_subset(data_corpus_inaugural, Year > 1990)
summary(myCorpus)

# Make a dfm
myDfm <- dfm(myCorpus, remove = stopwords("english"), tolower = TRUE, stem = TRUE,
             remove_punct = TRUE, remove_numbers = TRUE)

# By setting a seed, you make sure to always get the same plot
set.seed(100)
textplot_wordcloud(myDfm, min_count = 6, random_order = FALSE, rotation = 0.25,
                   color = RColorBrewer::brewer.pal(8, "Dark2"))
textplot_wordcloud(myDfm, min_count = 10,
                   color = c("red", "pink", "green", "purple", "orange", "blue"))

# Trump's speech
trump <- corpus_subset(data_corpus_inaugural, Year > 2015)
summary(trump)
trump2 <- dfm(trump, remove = stopwords("english"), remove_punct = TRUE)
set.seed(100)
textplot_wordcloud(trump2, min_count = 1, random_order = FALSE, rotation = 0.25,
                   color = RColorBrewer::brewer.pal(8, "Dark2"))

# You can also plot a "comparison cloud", but this can only be done with fewer than eight documents:
compDfm <- dfm(corpus_subset(data_corpus_inaugural, President %in% c("Washington", "Jefferson", "Madison")),
               groups = "President", remove = stopwords("english"), remove_punct = TRUE)
textplot_wordcloud(dfm_trim(compDfm, min_termfreq = 5, verbose = FALSE), comparison = TRUE)

# Plot a "comparison cloud" between Trump, Obama and Reagan
compDfm <- dfm(corpus_subset(data_corpus_inaugural, President %in% c("Trump", "Obama", "Reagan")),
               groups = "President", remove = stopwords("english"), remove_punct = TRUE)
textplot_wordcloud(dfm_trim(compDfm, min_termfreq = 5, verbose = FALSE), comparison = TRUE)
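# As noted above, textplot_xray() returns a ggplot object, so it can be restyled with ordinary
# ggplot2 layers. A minimal sketch (the theme and title below are just illustrative choices):
g <- textplot_xray(kwic(trump, "america"))
g + theme_minimal() + labs(title = "Lexical dispersion of 'america' in the 2017 inaugural speech")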
#########################################################################
# Statistical summaries (3): Lexical diversity
#########################################################################

# textstat_lexdiv() calculates lexical diversity in various measures based on the number of unique
# types of tokens and the length of a document. It is useful for analysing speakers' or writers'
# linguistic skill, or the complexity of the ideas expressed in documents.
inaug_dfm <- dfm(data_corpus_inaugural, remove = stopwords("en"))
lexdiv <- textstat_lexdiv(inaug_dfm)

# TTR (type-token ratio) is estimated as V/N, where
# V = types (the total number of unique terms) and N = tokens (the total number of words in the dfm)
head(lexdiv, 5)
tail(lexdiv, 5)

# When you run textstat_lexdiv(), it automatically removes numbers, punctuation, etc.
# [without the need for you to specify that when you create your dfm: see ?textstat_lexdiv].
# That's why the Types/Tokens ratio you get from the summary of the raw corpus gives a different value:
summary(data_corpus_inaugural)
# For Washington 1789: 625/1537
head(lexdiv, 1)

#########################################################################
# Statistical summaries (4): Comparing words associated with a target group vs. reference group
#########################################################################

# If you want to compare the differential associations of keywords in a target and a reference group,
# you can calculate "keyness" with textstat_keyness().
# In this example, we compare the inaugural speech by Donald Trump with the speeches by Barack Obama.
pres_corpus <- corpus_subset(data_corpus_inaugural, President %in% c("Obama", "Trump"))
summary(pres_corpus)

# Create a dfm (one row per speech)
pres_dfm <- dfm(pres_corpus, remove = stopwords("english"), remove_punct = TRUE)

# Calculate keyness, with the 2017 Trump speech as the target group
result_keyness <- textstat_keyness(pres_dfm, target = "2017-Trump")
# A negative value means that Obama (the reference group) uses that word relatively more than
# Trump (the target group), and vice versa.

# Plot estimated word keyness
textplot_keyness(result_keyness)

# What is chi2? The chi-squared test is used to determine whether there is a statistically significant
# difference between the expected and the observed frequencies in one or more categories of a
# contingency table (in our case, the frequencies of words in two different sets of texts).
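# To make the chi-squared idea concrete, here is a rough sketch for a single word ("america").
# This is only an approximation of what textstat_keyness() computes (which, for example, applies a
# continuity correction by default), so the numbers may differ slightly.
grouped <- dfm_group(pres_dfm, groups = "President")  # collapse the three speeches into Obama vs. Trump
tab <- as.matrix(grouped[, "america"])                # observed counts of "america" per president
tab <- cbind(tab, rowSums(grouped) - tab)             # counts of all the other words
colnames(tab) <- c("america", "other words")
tab
chisq.test(tab)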
# Plot the keyness without the reference text (in this case Obama)
textplot_keyness(result_keyness, show_reference = FALSE)

head(result_keyness, 10)
tail(result_keyness, 10)

# Compare Trump 2017 to the other post-war presidents
pwdfm <- dfm(corpus_subset(data_corpus_inaugural, Year > 1945))
head(textstat_keyness(pwdfm, target = "2017-Trump"), 10)
# Using the likelihood-ratio method
head(textstat_keyness(dfm_smooth(pwdfm), measure = "lr", target = "2017-Trump"), 10)

# Plot estimated word keyness
result_keyness3 <- textstat_keyness(pwdfm, target = "2017-Trump")
textplot_keyness(result_keyness3)

#########################################################################
# Statistical summaries (5): Frequency plots comparison
#########################################################################

# If you want to compare the frequency of a single term across different texts,
# you can use textstat_frequency(), group the frequencies by speech and extract the term.
str(pres_dfm)
docvars(pres_dfm)

# Let's first create a new string variable combining the name of the President and the Year
docvars(pres_dfm, "Year_str") <- paste0(docvars(pres_dfm, "President"), " ",
                                        as.character(docvars(pres_dfm, "Year")))
docvars(pres_dfm)
str(pres_dfm)

# Get the frequencies grouped by speech
freq_grouped <- textstat_frequency(pres_dfm, groups = "Year_str")
str(freq_grouped)
table(freq_grouped$group)

# Filter the term "american"
freq_american <- subset(freq_grouped, freq_grouped$feature %in% "american")

ggplot(freq_american, aes(x = group, y = frequency)) +
  geom_point() +
  scale_y_continuous(limits = c(0, 14), breaks = c(seq(0, 14, 2))) +
  xlab(NULL) +
  ylab("Frequency of the word 'american'") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

# textstat_frequency() also allows you to plot the most frequent words by group.
# Calculate the frequencies by presidential speech: the top 10 words for each speech
freq_weight <- textstat_frequency(pres_dfm, n = 10, groups = "Year")

ggplot(data = freq_weight, aes(x = nrow(freq_weight):1, y = frequency)) +
  geom_point() +
  facet_wrap(~ group, scales = "free") +
  coord_flip() +
  scale_x_continuous(breaks = nrow(freq_weight):1,
                     labels = freq_weight$feature) +
  labs(x = NULL, y = "Frequency")
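# A final sketch: since speeches differ in length, raw counts can be misleading. One option
# (following the pattern used in the quanteda documentation examples; treat this as a sketch rather
# than the canonical recipe) is to weight the dfm by relative term frequency first and then compute
# the grouped frequencies on the weighted dfm.
prop_dfm <- dfm_weight(pres_dfm, scheme = "prop")
freq_prop <- textstat_frequency(prop_dfm, n = 10, groups = "Year_str")
head(freq_prop, 10)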