rm(list=ls(all=TRUE))

# set your working directory (i.e., where you have saved the datasets, etc.)
setwd("C:/Users/luigi/Dropbox/TOPIC MODEL")
getwd()

library(readtext)
library(quanteda)
library(ggplot2)
library(quanteda.textstats)
library(quanteda.textplots)
library(SnowballC)
library(corrplot)

#########################################################################
# Creating and Working with a Corpus
#########################################################################

# There are several different ways to create a corpus in Quanteda. Let's look at two of them.

#########################################################################
# FIRST: you already have a matrix-like file with one text per row (such as .csv or .xls)
#########################################################################

# This dataset is a sample of 100 tweets from the Boston area discussing food.
# Data have been collected through the Twitter API, also specifying the language and origin of the tweets.
# We will discuss how to retrieve this type of data in the next weeks
x <- read.csv("boston.csv", stringsAsFactors=FALSE)
str(x)

# You read the dataset via readtext by identifying the name of the column that contains the texts (in this case "text")
myText2 <- readtext("boston.csv", text_field = "text")
str(myText2)

# You create your corpus
myCorpus2 <- corpus(myText2)
# Jargon: types = number of unique terms; tokens = number of words
head(summary(myCorpus2))

# number of documents in the corpus
ndoc(myCorpus2)

# Show the first text
as.character(myCorpus2)[1]
strwrap(as.character(myCorpus2)[1])
# Show the first 3 texts
as.character(myCorpus2)[1:3]
strwrap(as.character(myCorpus2)[1:3])

# Let's move from the corpus to the document-feature (term) matrix!
# In quanteda, we first tokenize the texts via tokens(), then we use the dfm() function to produce such a matrix,
# where documents are in rows and "features" (aka: words) are in columns
tokens(myCorpus2)
strwrap(as.character(myCorpus2)[3])
tok2 <- tokens(myCorpus2)
str(tok2)

# when you create a dfm, by default "tolower=TRUE", i.e., all features are converted to lowercase
myDfm <- dfm(tok2)
# Let's see the first five documents and the first 10 words of our dfm
myDfm[1:5, 1:10]
str(myDfm)
# Let's see the texts of the first two documents
as.character(myCorpus2)[1:2]

# 20 top features in the dfm
topfeatures(myDfm, 20)

# let's clean the dfm!
# FIRST: let's remove punctuation, numbers, symbols, separators, etc.
tok2_clean <- tokens(myCorpus2, remove_punct = TRUE, remove_numbers = TRUE,
                     remove_symbols = TRUE, split_hyphens = TRUE, remove_separators = TRUE)

# SECOND: let's remove the stopwords
head(stopwords("english"), 20)
head(stopwords("russian"), 10)
head(stopwords("italian"), 10)

# the source "marimo" is a better option for some languages (in particular Asian languages: Japanese, Chinese, Korean;
# but also Arabic and Hebrew). See: https://github.com/koheiw/marimo
stopwords("en")
stopwords("en", source = "marimo")

# The stopword options available in Quanteda (based on the Snowball stopwords list: see http://snowball.tartarus.org/)
# work for all the main European languages (ftp://cran.r-project.org/pub/R/web/packages/stopwords/stopwords.pdf).
# Similarly, the Snowball stemmer used by tokens_wordstem() covers the main European languages:
getStemLanguages()
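# As a quick sketch of the previous point: the same Snowball stemmer can be applied to any of the
# languages listed by getStemLanguages(). The Italian and English words below are just an illustration.
char_wordstem(c("andare", "andiamo", "andremo"), language = "italian")
char_wordstem(c("going", "goes", "gone"), language = "english")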
# For other languages, things are a bit more complex. For Arabic, for example, a good option is the stemming package 'arabicStemR'
tok2_clean_stop <- tokens_remove(tok2_clean, stopwords("english"))

# THIRD: let's stem the words
tok2_clean_stop_stem <- tokens_wordstem(tok2_clean_stop, language = "english")

# Let's now re-create the dfm
myDfm2 <- dfm(tok2_clean_stop_stem)
topfeatures(myDfm, 50)
topfeatures(myDfm2, 20)
# the features "for", "with" and "the" have disappeared! And #smoked is now #smoke
# still some symbols to remove! For example the tokens starting with "00" (they are Unicode characters, e.g. "\U00BD")
tok2_clean_stop_stem2 <- tokens_remove(tok2_clean_stop_stem, c("rt", "00*", "ed", "u", "lyft"))
myDfm2 <- dfm(tok2_clean_stop_stem2)
topfeatures(myDfm2, 20) # 20 top features [better!]

# We can also create a dfm keeping only some specific features, for example only the hashtags
# in the tweets, by using dfm_select() with pattern = "#*"
dfm_hashtag <- dfm_select(myDfm2, pattern = c("#*"))
topfeatures(myDfm2, 20)      # 20 top features
topfeatures(dfm_hashtag, 20) # 20 top features

# Note that you can also remove stopwords with dfm_remove, after you have created a dfm
dfm_remove(myDfm, pattern = stopwords("en"))

# You can also decide to exclude some features. For example, let's exclude all the hashtags
dfm_NOhashtag <- dfm_remove(myDfm2, pattern = c("#*"))
topfeatures(myDfm2, 20)        # 20 top features
topfeatures(dfm_hashtag, 20)   # 20 top features
topfeatures(dfm_NOhashtag, 20) # 20 top features

# trimming the dfm
myDfm[1:10, 1:10]
# keep only words occurring >= 10 times and in >= 2 documents
dfm_trim(myDfm, min_termfreq = 10, min_docfreq = 2)
# keep only words occurring >= 20 times and in at least 40% of the documents
dfm_trim(myDfm, min_termfreq = 20, min_docfreq = 0.4)
# keep only words occurring <= 10 times and in <= 2 documents
dfm_trim(myDfm, max_termfreq = 10, max_docfreq = 2)
# keep only words occurring 5 times in 1000, and in at least 2 of 5 (40%) of the documents
dfm_trim(myDfm, min_docfreq = 0.4, min_termfreq = 0.005, termfreq_type = "prop")
# keep only words occurring in all the 100 documents of my corpus
dfm_trim(myDfm, min_termfreq = 1, min_docfreq = 100)

# weighting a dfm according to the relative term frequency, i.e.,
# normalizing the dfm by considering the proportions of the feature counts within each document
myDfm_weight <- dfm_weight(myDfm, scheme = "prop")
# compare the two matrices below (the first one: unweighted; the second one: weighted)
myDfm[1:5, 1:5]
myDfm_weight[1:5, 1:5]

# weighting a dfm by tf-idf
# remember: tf-idf adds a weight that approaches zero as the number of documents in which a term appears
# (in any frequency) approaches the number of documents in the collection. And indeed here #dinner is close to 0!
myDfm_tf <- dfm_tfidf(myDfm)
myDfm[1:5, 1:5]
myDfm_tf[1:5, 1:5]
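# A quick manual check of the tf-idf weighting (just a sketch: by default dfm_tfidf() uses raw counts for tf
# and log10(number of documents / document frequency) for idf; see ?dfm_tfidf). The feature is picked
# automatically here rather than hard-coded.
feat <- names(topfeatures(myDfm, 1))                          # the most frequent feature in the dfm
docfreq(myDfm)[feat]                                          # in how many documents it appears
myDfm[1:5, feat] * log10(ndoc(myDfm) / docfreq(myDfm)[feat])  # should match the weighted values below
myDfm_tf[1:5, feat]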
#########################################################################
# SECOND: you have saved in a directory a set of files (one for each document) in a given format (.txt, .doc, .pdf)
#########################################################################

# SOURCE: http://www.presidency.ucsb.edu/inaugurals.php
# It is always better to save txt files in the UTF-8 format. If you have saved them in some other format (ISO, etc.),
# it's a good idea to specify it via the encoding argument
# Note that readtext also reads other file formats: check "?readtext"

# our txt files are included in the folder called "Inaugural Speeches" located in our working directory
myText <- readtext("Inaugural Speeches/*.txt", encoding = "UTF-8")
str(myText)

# we can actually extract three pieces of information from each file name (Name, Surname, Year: e.g., "George_Washington_1789.txt")
# Note: to use the docvarsfrom = "filenames" option, the file names should be consistent, i.e., in the example below ALL the txt files
# should follow the same ordering (Name, Surname, Year) with the same separator (i.e., "_")
myText <- readtext("Inaugural Speeches/*.txt",
                   docvarsfrom = "filenames", dvsep = "_",
                   docvarnames = c("Name", "Surname", "Year"),
                   encoding = "UTF-8")
str(myText)

testCorpus <- corpus(myText)
summary(testCorpus)
# inspect the document-level variables (this is a very important option, as we will see later on...)
head(docvars(testCorpus))

tok4 <- tokens(testCorpus, remove_punct = TRUE, remove_numbers = TRUE,
               remove_symbols = TRUE, split_hyphens = TRUE, remove_separators = TRUE)
tok4 <- tokens_remove(tok4, stopwords("en"))
tok4 <- tokens_wordstem(tok4)
myDfm <- dfm(tok4)
topfeatures(myDfm, 20) # 20 top words

#########################################################################
# Playing with the corpus
#########################################################################

# let's load a corpus already included in Quanteda: the corpus of all the US Presidents' Inaugural Speeches
# To summarize the texts from a corpus, we can call the summary() method defined for a corpus.
summary(data_corpus_inaugural)
# inspect the document-level variables
head(docvars(data_corpus_inaugural))

as.character(data_corpus_inaugural)[1]
as.character(data_corpus_inaugural)[2]

# to see the entire Trump speech
trump <- corpus_subset(data_corpus_inaugural, President == "Trump")
summary(trump)
strwrap(as.character(trump[[1]]))

# Adding two corpora together
# First five inaugural speeches
mycorpus1 <- corpus(data_corpus_inaugural[1:5])
# Six recent inaugural speeches
mycorpus2 <- corpus(data_corpus_inaugural[53:58])
mycorpus3 <- mycorpus1 + mycorpus2
summary(mycorpus3)

# subsetting a corpus
summary(corpus_subset(data_corpus_inaugural, Year > 1990))
summary(corpus_subset(data_corpus_inaugural, President == "Adams"))
summary(corpus_subset(data_corpus_inaugural, Year > 1990 & Party == "Republican"))
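# Document-level variables can also be created or modified directly with docvars()<- and then used for
# subsetting or (later on) grouping. A small sketch, relying only on the "Year" docvar seen above;
# the variable name "Period" is just an illustrative choice.
inaug <- data_corpus_inaugural
docvars(inaug, "Period") <- ifelse(docvars(inaug, "Year") < 1900, "pre-1900", "post-1900")
head(docvars(inaug))
summary(corpus_subset(inaug, Period == "post-1900"), 5)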
#########################################################################
# Let's explore some statistical summary methods
#########################################################################

# Statistical summary methods are essentially quantitative summaries of texts that describe their characteristics
# on some indicator, and they may (or may not) use statistical methods based on sampling theory for comparison

#########################################################################
# Statistical summaries (1): Lexical dispersion plot (Positional Analysis, i.e., analysis that retains the original text sequence)
#########################################################################

# The kwic function (keywords-in-context) performs a search for a word in a corpus and allows us to view the contexts in which it occurs
# NOTE: by working on a tokens object, we retain the original text sequence!
options(width = 200)
kwic(tokens(data_corpus_inaugural), "terror")
# also the words starting with "terror", including "terrorism"
kwic(tokens(data_corpus_inaugural), "terror*")
# same result as above
kwic(tokens(data_corpus_inaugural), "terror", valuetype = "regex")
kwic(tokens(data_corpus_inaugural), "communis*")
# Note that by default kwic() is word-based. If you would like to look up a multiword combination, use phrase()
kwic(tokens(data_corpus_inaugural), phrase("by terror"))

# We can plot a kwic object via a lexical dispersion plot.
# A lexical dispersion plot allows you to detect both the relative frequency of a given word across documents
# and the "timing" (position) of that word within each text
textplot_xray(kwic(tokens(data_corpus_inaugural[40:59]), "american"))

textplot_xray(
  kwic(tokens(data_corpus_inaugural[40:59]), "american"),
  kwic(tokens(data_corpus_inaugural[40:59]), "people"),
  kwic(tokens(data_corpus_inaugural[40:59]), "communis*")
)

# If you're only plotting a single document, but with multiple keywords, then the keywords are displayed
# one below the other rather than side-by-side.
textplot_xray(
  kwic(tokens(corpus_subset(data_corpus_inaugural, Year > 2015 & Year < 2020)), "america"),
  kwic(tokens(corpus_subset(data_corpus_inaugural, Year > 2015 & Year < 2020)), "people"),
  kwic(tokens(corpus_subset(data_corpus_inaugural, Year > 2015 & Year < 2020)), "chief")
)

# You might also have noticed that the x-axis scale is the absolute token index for single texts
# and the relative token index when multiple texts are being compared.
# If you prefer, you can specify that you want an absolute scale
textplot_xray(
  kwic(tokens(data_corpus_inaugural[40:59]), "american"),
  kwic(tokens(data_corpus_inaugural[40:59]), "people"),
  kwic(tokens(data_corpus_inaugural[40:59]), "communis*"),
  scale = "absolute"
)

# The object returned is a ggplot object, which can be modified using ggplot2
plot <- textplot_xray(
  kwic(tokens(data_corpus_inaugural[40:59]), "american"),
  kwic(tokens(data_corpus_inaugural[40:59]), "people"),
  kwic(tokens(data_corpus_inaugural[40:59]), "communist"))
plot + aes(color = keyword) + scale_color_manual(values = c("red", "blue", "green"))
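# Since textplot_xray() returns a regular ggplot object, it can also be themed further and saved to disk
# like any other ggplot (a small sketch; the file name and size below are arbitrary choices)
plot2 <- plot + aes(color = keyword) +
  scale_color_manual(values = c("red", "blue", "green")) +
  theme_minimal()
ggsave("xray_plot.pdf", plot2, width = 10, height = 6)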
#########################################################################
# Statistical summaries (2): Plotting wordclouds (Non-positional Analysis, i.e., analysis that does NOT retain the original text sequence - bag of words)
#########################################################################

# One of the simplest statistical summary methods you can apply to a dfm is a tag cloud.
# A tag cloud is a visual representation of text data, in which tags are single words whose frequency is shown with different font sizes (and/or colors)
myCorpus <- corpus_subset(data_corpus_inaugural, Year > 1990)
summary(myCorpus)
tok2 <- tokens(myCorpus, remove_punct = TRUE, remove_numbers = TRUE,
               remove_symbols = TRUE, split_hyphens = TRUE, remove_separators = TRUE)
tok2 <- tokens_remove(tok2, stopwords("en"))
tok2 <- tokens_wordstem(tok2)
myDfm <- dfm(tok2)

# if you define a seed, you always get the same plot
set.seed(123)
textplot_wordcloud(myDfm, min_count = 6, rotation = .25,
                   color = RColorBrewer::brewer.pal(8, "Dark2"))
textplot_wordcloud(myDfm, min_count = 10,
                   color = c("red", "pink", "green", "purple", "orange", "blue"))

# You can also plot a "comparison cloud", but this can only be done with fewer than eight documents:
corp2 <- corpus_subset(data_corpus_inaugural,
                       President %in% c("Washington", "Jefferson", "Madison"))
summary(corp2)
tok2 <- tokens(corp2, remove_punct = TRUE, remove_numbers = TRUE,
               remove_symbols = TRUE, split_hyphens = TRUE, remove_separators = TRUE)
tok2 <- tokens_remove(tok2, stopwords("en"))
tok2 <- tokens_wordstem(tok2)
myDfm <- dfm(tok2)

# let's group the speeches made by the same President (such as the two speeches made by Washington) into one single
# document of the dfm; the command we can use is dfm_group
myDfm2 <- dfm_group(myDfm, groups = President)
str(myDfm)
ndoc(myDfm)  # 6 documents
str(myDfm2)
ndoc(myDfm2) # 3 documents!

set.seed(123)
textplot_wordcloud(myDfm2, comparison = TRUE)
# let's plot the trimmed dfm
set.seed(123)
textplot_wordcloud(dfm_trim(myDfm2, min_termfreq = 5, verbose = FALSE), comparison = TRUE)

# Exercise: let's plot a "comparison cloud" between Biden, Trump and Obama
corp2 <- corpus_subset(data_corpus_inaugural, President %in% c("Biden", "Trump", "Obama"))
tok2 <- tokens(corp2, remove_punct = TRUE, remove_numbers = TRUE,
               remove_symbols = TRUE, split_hyphens = TRUE, remove_separators = TRUE)
tok2 <- tokens_remove(tok2, stopwords("en"))
tok2 <- tokens_wordstem(tok2)
myDfm <- dfm(tok2)
myDfm2 <- dfm_group(myDfm, groups = President)
set.seed(123)
textplot_wordcloud(dfm_trim(myDfm2, min_termfreq = 5, verbose = FALSE), comparison = TRUE)
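# A tabular alternative to the comparison cloud (a sketch): the top stems per president via textstat_frequency(),
# using the document names of the grouped dfm (here, one document per president) as the grouping variable
textstat_frequency(myDfm2, n = 5, groups = docnames(myDfm2))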
#########################################################################
# Statistical summaries (3): Comparing words associated with a target group vs. a reference group (Non-positional Analysis)
#########################################################################

# More sophisticated methods compare the differential occurrences of words across texts or partitions of a corpus, using statistical association measures,
# to identify the words that belong, for example, to different sub-groups of texts, such as those predominantly associated with male- versus female-authored documents.
# In this respect we can employ, for example, a chi-squared test. The chi-squared test is used to determine whether there is a statistically significant difference between
# the expected and the observed frequencies in one or more categories of a contingency table (in our case,
# the frequencies of words in two different sets of texts). At the 95% confidence level, the chi-squared value (with 1 degree of freedom)
# should be larger than 3.84 in absolute value (i.e., 1.96 squared).
# More in detail, if you want to compare the differential associations of keywords in a target and a reference group,
# you can calculate "keyness", which is based on the textstat_keyness command.

# In this example, we compare the inaugural speech by Donald Trump with the speech by Joe Biden
pres_corpus <- corpus_subset(data_corpus_inaugural, President %in% c("Trump", "Biden"))
summary(pres_corpus)

# Create a dfm (one document per speech)
tok2 <- tokens(pres_corpus, remove_punct = TRUE, remove_numbers = TRUE,
               remove_symbols = TRUE, split_hyphens = TRUE, remove_separators = TRUE)
tok2 <- tokens_remove(tok2, stopwords("en"))
docnames(pres_corpus)
pres_dfm <- dfm(tok2)

# Calculate keyness, with Trump as the target group
result_keyness <- textstat_keyness(pres_dfm, target = "2017-Trump")
# a negative value means that Biden (the reference group) uses that word relatively more than Trump (the target group), and vice versa

# Plot estimated word keyness
textplot_keyness(result_keyness)
# Plot without the reference text (in this case Biden)
textplot_keyness(result_keyness, show_reference = FALSE)

head(result_keyness, 10)
tail(result_keyness, 10)

# Your reference group can also include more than one text. For example:
pres_corpus <- corpus_subset(data_corpus_inaugural, President %in% c("Trump", "Biden", "Obama"))
tok2 <- tokens(pres_corpus, remove_punct = TRUE, remove_numbers = TRUE,
               remove_symbols = TRUE, split_hyphens = TRUE, remove_separators = TRUE)
tok2 <- tokens_remove(tok2, stopwords("en"))
pres_dfm <- dfm(tok2)
# You calculate keyness with Trump as the target group vs. the Obama and Biden speeches as the reference group
result_keyness <- textstat_keyness(pres_dfm, target = "2017-Trump")
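# To see what lies behind the chi-squared keyness statistic, here is a manual 2x2 contingency table for a
# single feature (just a sketch: "america" is an illustrative choice, and chisq.test() may differ slightly
# from textstat_keyness(), which applies its own continuity correction by default)
feat <- "america"
target <- "2017-Trump"
is_ref <- docnames(pres_dfm) != target
tab <- matrix(c(sum(pres_dfm[target, feat]), sum(pres_dfm[target, ]) - sum(pres_dfm[target, feat]),
                sum(pres_dfm[is_ref, feat]), sum(pres_dfm[is_ref, ]) - sum(pres_dfm[is_ref, feat])),
              nrow = 2, byrow = TRUE,
              dimnames = list(c("target", "reference"), c("feature", "other features")))
tab
chisq.test(tab)
result_keyness[result_keyness$feature == feat, ]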
#########################################################################
# Statistical summaries (4): Lexical diversity (Non-positional Analysis)
#########################################################################

# Other quantitative summary measures of documents are designed to characterize specific qualities of texts.
# Comparing the rates of types and tokens forms the foundation for measures of lexical diversity (the rate of vocabulary usage), with the most common such measure
# comparing the number of types to the number of tokens (the "type-token ratio").
# For example, it has been argued that populist communication implies a simplified political discourse (lower diversity), in an attempt to reach the public more easily.
# The textstat_lexdiv() command calculates precisely this: lexical diversity in various measures based on the number of unique types of tokens
# and the length of a document. It is useful for analysing speakers' or writers' linguistic skill, or the complexity
# of the ideas expressed in documents.
inaug_tokens <- tokens(data_corpus_inaugural)
tok2 <- tokens_remove(inaug_tokens, stopwords("en"))
inaug_dfm <- dfm(tok2)
lexdiv <- textstat_lexdiv(inaug_dfm)
str(lexdiv)

ggplot(data = lexdiv, aes(x = document, y = TTR, group = 1)) +
  geom_line() +
  geom_point() +
  theme_minimal() +
  scale_x_discrete(breaks = c("1789-Washington", "1933-Roosevelt", "1961-Kennedy", "2017-Trump"))

# a temporal decrease in the level of complexity? let's see
ggplot(data = lexdiv, aes(x = document, y = TTR, group = 1)) +
  geom_line() +
  geom_point() +
  theme_minimal() +
  scale_x_discrete(breaks = c("1789-Washington", "1933-Roosevelt", "1961-Kennedy", "2017-Trump")) +
  geom_smooth(method = "lm")

# TTR is estimated as V/N, where
# V = types (total number of unique terms) and N = tokens (total number of words in the dfm)
head(lexdiv, 5)
tail(lexdiv, 5)

# when you run textstat_lexdiv it automatically removes numbers, punctuation, etc. [without the need for you
# to specify that when you create your dfm: see ?textstat_lexdiv]
# that's why the ratio between Types and Tokens that you get from the corpus summary gives you a different value:
summary(data_corpus_inaugural)
# for Washington 1789: 625/1537
head(lexdiv, 1)

#########################################################################
# Statistical summaries (5): Cosine similarity (Non-positional Analysis)
#########################################################################

# Cosine similarity is an intuitive measure of semantic distance that has been increasingly used in a number of fields of the social sciences.
# By representing texts as vectors in a Cartesian space, cosine similarity estimates the difference between two texts based on their vectors
# of word occurrences. The resulting similarity ranges between 0 and 1, where 0 is reached when two texts are completely
# different and 1 is reached when two texts have identical feature proportions.

# create a dfm from the inaugural addresses from Reagan onwards
myCorpus <- corpus_subset(data_corpus_inaugural, Year > 1980)
# alternative way to get the speeches after 1980 [note the use of the logical operator "&"]
# myCorpus <- corpus_subset(data_corpus_inaugural, Year > 1980 & Year < 2021)
summary(myCorpus)
tok2 <- tokens(myCorpus, remove_punct = TRUE, remove_numbers = TRUE,
               remove_symbols = TRUE, split_hyphens = TRUE, remove_separators = TRUE)
tok2 <- tokens_remove(tok2, stopwords("en"))
tok2 <- tokens_wordstem(tok2)
presDfm <- dfm(tok2)
presDfm

# compute some document similarities
Simil <- textstat_simil(presDfm, margin = "documents", method = "cosine")
Simil

# Let's plot it!
Simil2 <- as.matrix(Simil)
str(Simil2)
corrplot(Simil2, method = 'number')
corrplot(Simil2, method = 'color')
corrplot(Simil2, method = 'shade', type = 'lower')

# for specific comparisons: here the two speeches by Obama
obamaSimil <- textstat_simil(presDfm, presDfm[c("2009-Obama", "2013-Obama"), ],
                             margin = "documents", method = "cosine")
obamaSimil

# compute some term similarities
tstat3 <- textstat_simil(presDfm, presDfm[, c("fair", "health", "terror")],
                         method = "cosine", margin = "features")
head(as.matrix(tstat3), 10)
as.list(tstat3, n = 6)
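# A manual check of what textstat_simil() computes (a sketch): the cosine similarity between two documents
# is the dot product of their feature-count vectors divided by the product of their norms.
v1 <- as.numeric(as.matrix(presDfm["2009-Obama", ]))
v2 <- as.numeric(as.matrix(presDfm["2013-Obama", ]))
sum(v1 * v2) / (sqrt(sum(v1^2)) * sqrt(sum(v2^2)))  # should match the value reported in obamaSimil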
#########################################################################
# TOKENIZING Japanese or Chinese (i.e., languages without white spaces)
#########################################################################

# When you want to display Japanese kanji on a non-Japanese laptop, write the following at the beginning of your session
# (this locale name works on Windows; on macOS/Linux the name differs, e.g. "ja_JP.UTF-8"):
Sys.setlocale("LC_CTYPE", "japanese")

# Suppose we want to analyze the text of the speeches made on 17 Nov 2017 and 20 Nov 2017 for the new Japanese parliamentary session
myText <- readtext("Diet speeches/*.txt", encoding = "UTF-8")
str(myText)

# tokens() can segment Japanese texts without additional tools, based on the rules defined in the ICU library,
# which is available via the stringi package (installed together with Quanteda).
# ICU detects the boundaries of Japanese words using a dictionary with frequency information
# (see http://source.icu-project.org/repos/icu/icu/tags/release-58-rc/source/data/brkitr/dictionaries/
# for the list of Asian languages covered, including Chinese)
icu_toks <- tokens(myText$text)
# this expression means: consider document 2, and report the first 40 tokens appearing in it
head(icu_toks[[2]], 40)

# Japanese stopwords
stopwords("ja", source = "stopwords-iso")
# A better alternative is using the source "marimo"
stopwords("ja", source = "marimo")
# About stopwords in different languages, take a look at: https://cran.r-project.org/web/packages/stopwords/README.html
# Alternatively, you can remove them yourself using the tokens_remove option discussed above

# tokenize the corpus and apply pre-processing
toks <- tokens(myText$text, remove_punct = TRUE, remove_numbers = TRUE, remove_separators = TRUE)
head(icu_toks[[1]], 40)
head(toks[[1]], 40)
toks <- tokens_remove(toks, stopwords("ja", source = "marimo"))
head(toks[[1]], 40)

# You can also decide to select only tokens written in Hiragana:
# \\p{script=Hira} matches any character belonging to the Hiragana script (a collection of characters, not a single literal one); + means one or more of them
# $ means "match the end of the string" (the position after the last character), so the token has to end in Hiragana
toksHira <- tokens_select(toks, pattern = "\\p{script=Hira}+$", valuetype = "regex")
head(toks[[1]], 40)
head(toksHira[[1]], 40)

# You can also select only Japanese words (Hiragana, Katakana, Kanji) - in this case you do not see any differences
toks2 <- tokens_select(toks, pattern = c("\\p{script=Hira}", "\\p{script=Kana}", "\\p{script=Hani}"),
                       valuetype = "regex")
head(toks[[1]], 100)
head(toks2[[1]], 100)

jap_dfm <- dfm(toks)
topfeatures(jap_dfm, 20)
jap_dfm[, 1:5]

# I want to name the texts properly
jap_dfm@docvars
jap_dfm@docvars[, 2]
jap_dfm@docvars[, 2] <- myText$doc_id
jap_dfm[, 1:5]
# I could also directly write the names I want to give to such texts (this is a hypothetical example with fictitious names!)
jap_dfm@docvars[, 2] <- c("A", "B", "C", "D", "E", "F", "G")
jap_dfm@docvars[, 2]
jap_dfm[, 1:5]

# If you want to perform a more accurate tokenization, you need to install a morphological analysis tool and call it from R.
# For example: RcppMeCab (Chinese, Japanese, and Korean): https://github.com/junhewk/RcppMeCab

#########################################################################
# Running example: How to deal with the Chinese language
#########################################################################
corp <- quanteda.corpora::download(url = "https://www.dropbox.com/s/37ojd5knz1qeyul/data_corpus_chinesegovreport.rds?dl=1")

# tokenize the corpus and apply pre-processing
ch_toks <- tokens(corp, remove_punct = TRUE, remove_numbers = TRUE, remove_separators = TRUE)
head(ch_toks[[1]], 40)
ch_toks <- tokens_remove(ch_toks, stopwords("zh_cn", source = "marimo"))
head(ch_toks[[1]], 40)

# Alternative Chinese stopword lists
stopwords("zh", source = "misc")
stopwords("zh", source = "stopwords-iso")

# You can keep only Chinese characters
ch_toks <- tokens_select(ch_toks, pattern = "\\p{script=Hani}+$", valuetype = "regex")
head(ch_toks[[1]], 40)

ch_dfm <- dfm(ch_toks)
topfeatures(ch_dfm)
ch_dfm[, 1:5]
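# As with the English examples above, the non-positional summaries can be applied to the Chinese dfm as well.
# A quick sketch: a word cloud of the most frequent terms (the frequency threshold is an arbitrary choice,
# and a CJK-capable font may be needed for the characters to display correctly)
set.seed(123)
textplot_wordcloud(ch_dfm, min_count = 50, max_words = 100)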