rm(list=ls(all=TRUE))

# set your working directory (i.e., where you have saved the datasets, etc.)
setwd("C:/Users/luigi/Dropbox/TOPIC MODEL")
getwd()

library(readtext)
library(quanteda)
library(ggplot2)
library(quanteda.textstats)
library(quanteda.textplots)
library(SnowballC)
library(corrplot)

# General preamble: in my scripts I prefer to avoid the pipe operator (%>%), just for the sake of clarity.
# The pipe operator - as you should know - makes coding more compact, e.g., "a %>% mean" is the same as "mean(a)":
# everything to the left of the pipe is passed as the first argument to the function after the pipe.

#########################################################################
#########################################################################
# Creating and Working with a Corpus
#########################################################################
#########################################################################

# There are several different ways to create a corpus in Quanteda. Let's look at 2 of them: single vs. multiple files

#########################################################################
# FIRST: you already have a matrix-like file with one text per row (such as .csv or .xls) - i.e., you already have a pre-formatted file
#########################################################################

# This dataset is a sample of 100 tweets from the Boston area discussing food.
# Data were collected through the Twitter API, also specifying the language and the origin of the tweets.
# We will discuss how to retrieve this type of data in the next weeks.
x <- read.csv("boston.csv", stringsAsFactors=FALSE)
str(x)

# Although you can generate a corpus directly after reading a .csv file via the function "read.csv", I suggest you always read a .csv file
# via the function "readtext". In this case, always remember to identify the name of the column in the dataset
# that contains the texts (in this case "text"). Notice that a new column "doc_id" (an index of the texts) has been created automatically.
# The function "readtext" is pretty flexible, and it allows you to read several different types of files (.csv, .html, .txt, .xls, .pdf, .doc):
# check "?readtext"
myText2 <- readtext("boston.csv", text_field = "text")
str(myText2)

# You create your corpus via the function "corpus"
myCorpus2 <- corpus(myText2)

# Jargon: types = number of unique terms; tokens = number of words
head(summary(myCorpus2))

# number of documents in the corpus
ndoc(myCorpus2)
print(myCorpus2)

# print the first text
as.character(myCorpus2)[1]
# same thing but w/o interruption of the text
strwrap(as.character(myCorpus2)[1])
# print the first 3 texts
as.character(myCorpus2)[1:3]
strwrap(as.character(myCorpus2)[1:3])

# Let's move from the corpus to the document-feature (term) matrix!
# In quanteda, we first tokenize the texts via the function "tokens", and then we use the function "dfm" to produce such
# a matrix, where documents are in rows and features (aka words) are in columns.
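# Side note (just a sketch): since the preamble mentions the pipe operator, this is what the same
# corpus -> tokens -> dfm step looks like written with the native pipe |> (requires R >= 4.1);
# the object name "dfm_piped" is only for illustration, and the rest of the script keeps the stepwise style.
dfm_piped <- myCorpus2 |> tokens() |> dfm()
dfm_piped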
tokens(myCorpus2)
tok2 <- tokens(myCorpus2)
tok2

# when you create a dfm, by default "tolower=TRUE", i.e., all features are converted to lowercase
myDfm <- dfm(tok2)

# we can get the number of documents and features that build our dfm with ndoc() and nfeat()
ndoc(myDfm)
nfeat(myDfm)
str(myDfm)
# same info about the number of documents and features here
str(myDfm@Dimnames)

# we can also obtain the names of documents and features via docnames() and featnames()
head(docnames(myDfm), 20)
head(featnames(myDfm), 20)

# Let's see the first five documents and the first 10 words of our dfm
myDfm[1:5, 1:10]
# Let's print the texts of the first two documents
strwrap(as.character(myCorpus2)[1:2])

# under docvars you can find the document-level variables
str(myDfm@docvars)

# 20 top features in the dfm
topfeatures(myDfm, 20)

# let's clean the dfm!
# FIRST: let's remove numbers, separators, etc. Note that I also decided to remove the URLs
tok2_clean <- tokens(myCorpus2, remove_punct = TRUE, remove_numbers = TRUE, remove_symbols = TRUE,
                     split_hyphens = TRUE, remove_separators = TRUE, remove_url = TRUE)

# SECOND: let's remove the stopwords
head(stopwords("english"), 20)
head(stopwords("russian"), 10)
head(stopwords("italian"), 10)
# the "marimo" source is a better option for some languages (in particular Asian languages: Japanese, Chinese, Korean;
# but also Arabic and Hebrew). See: https://github.com/koheiw/marimo
stopwords("en")
stopwords("en", source = "marimo")
# If you are interested in dealing with Japanese/Chinese texts, please take a look at the corresponding R script on the home page of the course under EXTRA for Lab Class 1.
# The stopword options available in Quanteda (based on the Snowball stopwords list: see http://snowball.tartarus.org/)
# work for all the main European languages (ftp://cran.r-project.org/pub/R/web/packages/stopwords/stopwords.pdf).
getStemLanguages()
# For other languages things are a bit more complex. For Arabic, for example, a good source is also the stemming package "arabicStemR".
tok2_clean <- tokens_remove(tok2_clean, stopwords("english"))

# THIRD: let's stem the words
tok2_clean <- tokens_wordstem(tok2_clean, language = "english")

# Let's now re-create the dfm
myDfm2 <- dfm(tok2_clean)
topfeatures(myDfm, 50)
topfeatures(myDfm2, 20)
# the features "for", "with" and "the" have disappeared! while #smoked is now #smoke

# still some symbols to remove! for example the tokens starting with "00" (they are unicode characters, such as "\U00BD")
tok2_clean <- tokens_remove(tok2_clean, c("rt", "00*", "ed", "u"))
myDfm2 <- dfm(tok2_clean)
topfeatures(myDfm2, 20) # 20 top features [better!]

# You can save (and then eventually plot) the frequency of the top features in a dfm (in this case the top-20 features)
features_dfm <- textstat_frequency(myDfm2, n = 20)
features_dfm

ggplot(features_dfm, aes(x = reorder(feature, frequency), y = frequency)) +
  geom_point() +
  coord_flip() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

# We can also create a dfm keeping only some specific features, such as, for example, only the hashtags
# in the tweets, using dfm_select() with pattern = "#*".
dfm_hashtag <- dfm_select(myDfm2, pattern = c("#*"))
topfeatures(myDfm2, 20)      # 20 top features
topfeatures(dfm_hashtag, 20) # 20 top features
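# A parallel sketch (same logic, different glob pattern): keep only the Twitter handles, i.e. the features starting with "@"
# (assuming the tweets contain any handles at all; the object name "dfm_mentions" is only for illustration)
dfm_mentions <- dfm_select(myDfm2, pattern = "@*")
topfeatures(dfm_mentions, 20)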
# You can also decide to exclude some features. For example, let's exclude all the hashtags
dfm_NOhashtag <- dfm_remove(myDfm2, pattern = c("#*"))
topfeatures(myDfm2, 20)        # 20 top features
topfeatures(dfm_hashtag, 20)   # 20 top features
topfeatures(dfm_NOhashtag, 20) # 20 top features

# Following the same logic, you can also remove stopwords with dfm_remove, after you have created a dfm
dfm_remove(myDfm, pattern = stopwords("en"))

# trimming the dfm
myDfm[1:10, 1:10]
# keep only words occurring >= 10 times and in >= 2 documents
dfm_trim(myDfm, min_termfreq = 10, min_docfreq = 2)
# keep only words occurring <= 10 times and in <= 2 documents
dfm_trim(myDfm, max_termfreq = 10, max_docfreq = 2)
# keep only words occurring in at least 40% of the documents (note the use of docfreq_type = "prop")
dfm_trim(myDfm, min_docfreq = 0.4, docfreq_type = "prop")
# keep only words occurring at least once in all the 100 documents of my corpus
dfm_trim(myDfm, min_termfreq = 1, min_docfreq = 100)

#########################################################################
# SECOND: you have saved in a directory a set of files (one for each document) in a given format (.txt, .doc, .pdf) - i.e., you have multiple text files
#########################################################################

# SOURCE: http://www.presidency.ucsb.edu/inaugurals.php
# our txt files are included in the folder called "Inaugural Speeches" inside our working directory
myText <- readtext("Inaugural Speeches/*.txt")
str(myText)

# we can actually extract three pieces of info from each file name (Name, Surname, Year: e.g., "George_Washington_1789.txt").
# Note: to use the docvarsfrom = "filenames" option, the file names must be consistent, i.e., in the example below, ALL the txt files' names
# must follow the same ordering (Name, Surname, Year) with the same separator (i.e., "_").
# It is always better to save your .txt files with UTF-8 encoding before reading them into R (this holds for English as well as, for example,
# Japanese/Chinese texts). For most languages UTF-8 is generally fine, but for some languages another encoding can be better; check online.
myText <- readtext("Inaugural Speeches/*.txt",
                   docvarsfrom = "filenames",
                   dvsep = "_",
                   docvarnames = c("Name", "Surname", "Year"))
str(myText)
print(myText)

testCorpus <- corpus(myText)
summary(testCorpus)
print(testCorpus)

# inspect the document-level variables (this is a very important option, as we will see later on...)
head(docvars(testCorpus))
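# Quick sanity check (just a sketch): tabulate one of the docvars created from the file names,
# e.g. how many speeches per Surname we have in the corpus
table(docvars(testCorpus, "Surname"))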
# if we want to extract individual elements of the document variables, we can specify the field
docvars(testCorpus, field = "Year")
# alternatively:
testCorpus$Year

# docvars() also allows you to create or update document variables
ndoc(testCorpus)
testCorpus$index <- c(1:5)
head(docvars(testCorpus))

# Let's now tokenize the texts
tok4 <- tokens(testCorpus, remove_punct = TRUE, remove_numbers = TRUE, remove_symbols = TRUE,
               split_hyphens = TRUE, remove_separators = TRUE)
tok4 <- tokens_remove(tok4, stopwords("en"))
tok4 <- tokens_wordstem(tok4)
myDfm <- dfm(tok4)
topfeatures(myDfm, 20) # 20 top words

# Note that you can also extract the top features according to some document-level variable; for example, let's do it
# with respect to the Surname of the President (i.e., we have two speeches per President in our dfm)
topfeatures(myDfm, 5, groups = Surname) # 5 top words for each President

#########################################################################
# Playing with the corpus
#########################################################################

# let's load a corpus already included in Quanteda: the corpus of all the US Presidents' Inaugural Speeches.
# To summarize the texts from a corpus, we can call the summary() method defined for a corpus.
summary(data_corpus_inaugural)

# inspect the document-level variables
head(docvars(data_corpus_inaugural))

# let's print the speech by the first US President (i.e., George Washington)
strwrap(as.character(data_corpus_inaugural)[1])

# we can subset a corpus according to some document-level variable value via the corpus_subset command.
# For example, let's extract only the Trump speech (by taking advantage of the document-level variable "President" present in our corpus).
# Note: you can also subset a tokens object according to some document-level variable via the tokens_subset command
trump <- corpus_subset(data_corpus_inaugural, President == "Trump")
summary(trump)
strwrap(as.character(trump[[1]]))

# Let's extract the first five inaugural speeches
mycorpus1 <- corpus_subset(data_corpus_inaugural, Year < 1806)
summary(mycorpus1)
# We could have done it also by selecting the index of the documents in our corpus
mycorpus1_alt <- corpus(data_corpus_inaugural[1:5])
summary(mycorpus1_alt)

# Let's extract the last six inaugural speeches (i.e., since 2000)
mycorpus2 <- corpus_subset(data_corpus_inaugural, Year > 2000)
summary(mycorpus2)

# We can also combine two corpora
mycorpus3 <- mycorpus1 + mycorpus2
summary(mycorpus3)

# We can also subset a corpus according to more than one condition: note the use of the logical operator "&"
summary(corpus_subset(data_corpus_inaugural, Year > 1990 & Party == "Republican"))

# the function corpus_reshape() allows you to change the unit of texts between documents, paragraphs and sentences
summary(data_corpus_inaugural)
ndoc(data_corpus_inaugural)
print(data_corpus_inaugural)
corp_sent <- corpus_reshape(data_corpus_inaugural, to = "sentences")
ndoc(corp_sent)
print(corp_sent)

# By applying corpus_subset() to corp_sent, you can keep only the long sentences (at least 50 tokens)
corp_sent_long <- corpus_subset(corp_sent, ntoken(corp_sent) >= 50)
ndoc(corp_sent_long)
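# Optional sketch: before picking a cutoff such as 50 tokens, it can help to look at the distribution
# of sentence lengths in the reshaped corpus (base R summary/hist, nothing quanteda-specific)
summary(ntoken(corp_sent))
hist(ntoken(corp_sent), breaks = 50, main = "Sentence length (tokens)", xlab = "tokens per sentence")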
# Restore the original documents
corp_sent2 <- corpus_reshape(corp_sent, to = "documents")
ndoc(corp_sent2)
print(corp_sent2)

#########################################################################
# Let's explore some statistical summary methods
#########################################################################

# Statistical summary methods are essentially quantitative summaries of texts that describe their characteristics on some indicator,
# and they may (or may not) use statistical methods based on sampling theory for comparison

#########################################################################
# Statistical summaries (1): Lexical dispersion plot (Positional Analysis, i.e., analysis that retains the original text sequence)
#########################################################################

# The kwic function (keywords-in-context) performs a search for a word in a corpus and allows us to view the contexts in which it occurs.
# NOTE: by working on a tokens object, we retain the original text sequence!
options(width = 200)
kwic(tokens(data_corpus_inaugural), "terror")

# with the window argument, we can specify the number of words to be displayed around the keyword (here = 1)
kwic(tokens(data_corpus_inaugural), "terror", window = 1)

# also the words starting with "terror", including "terrorism"
kwic(tokens(data_corpus_inaugural), "terror*")
# same result as above via the "regex" specification.
# A regex, or Regular Expression, is a sequence of characters that forms a search pattern
kwic(tokens(data_corpus_inaugural), "terror", valuetype = "regex")
kwic(tokens(data_corpus_inaugural), "communis*")

# Note that, by default, kwic() is word-based. If you want to look up a multiword combination, use phrase()
kwic(tokens(data_corpus_inaugural), phrase("by terror"))

# We can plot a kwic object via a lexical dispersion plot.
# A lexical dispersion plot allows you to see both the relative frequency of a word across documents and where (the "timing") that word occurs within a given text
textplot_xray(kwic(tokens(data_corpus_inaugural[40:59]), "american"))

textplot_xray(
  kwic(tokens(data_corpus_inaugural[40:59]), "american"),
  kwic(tokens(data_corpus_inaugural[40:59]), "people"),
  kwic(tokens(data_corpus_inaugural[40:59]), "communis*")
)

# If you're only plotting a single document, but with multiple keywords, then the keywords are displayed
# one below the other rather than side-by-side.
textplot_xray(
  kwic(tokens(corpus_subset(data_corpus_inaugural, Year > 2015 & Year < 2020)), "america"),
  kwic(tokens(corpus_subset(data_corpus_inaugural, Year > 2015 & Year < 2020)), "people"),
  kwic(tokens(corpus_subset(data_corpus_inaugural, Year > 2015 & Year < 2020)), "chief")
)
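# Side note (just a sketch): the kwic object behaves like a data frame, so you can also inspect or count the matches
# directly, e.g., to see which speeches use "terror*" most often (the object name "kw" is only for illustration)
kw <- kwic(tokens(data_corpus_inaugural), "terror*")
head(as.data.frame(kw))
sort(table(kw$docname), decreasing = TRUE)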
# You might also have noticed that the x-axis scale is the absolute token index for single texts,
# and the relative token index when multiple texts are being compared.
# If you prefer, you can specify that you want an absolute scale
textplot_xray(
  kwic(tokens(data_corpus_inaugural[40:59]), "american"),
  kwic(tokens(data_corpus_inaugural[40:59]), "people"),
  kwic(tokens(data_corpus_inaugural[40:59]), "communis*"),
  scale = 'absolute'
)

# The object returned is a ggplot object, which can be modified using ggplot commands
plot <- textplot_xray(
  kwic(tokens(data_corpus_inaugural[40:59]), "american"),
  kwic(tokens(data_corpus_inaugural[40:59]), "people"),
  kwic(tokens(data_corpus_inaugural[40:59]), "communist"))
plot + aes(color = keyword) + scale_color_manual(values = c('red', 'blue', 'green'))

#########################################################################
# Statistical summaries (2): Plotting wordclouds (Non-positional Analysis, i.e., analysis that does NOT retain the original text sequence - bag of words)
#########################################################################

# One of the simplest statistical summary methods you can apply to a DfM is a tag cloud.
# A tag cloud is a visual representation of text data, in which tags are single words whose frequency is shown with different font sizes (and/or colors)
myCorpus <- corpus_subset(data_corpus_inaugural, Year > 1990)
summary(myCorpus)
tok2 <- tokens(myCorpus, remove_punct = TRUE, remove_numbers = TRUE, remove_symbols = TRUE,
               split_hyphens = TRUE, remove_separators = TRUE)
tok2 <- tokens_remove(tok2, stopwords("en"))
myDfm <- dfm(tok2)

# if you set a seed, you get the same plot each time
set.seed(123)
textplot_wordcloud(myDfm, min_count = 6, rotation = .25,
                   color = RColorBrewer::brewer.pal(8, "Dark2"))
textplot_wordcloud(myDfm, min_count = 10,
                   color = c('red', 'pink', 'green', 'purple', 'orange', 'blue'))

# You can also plot a "comparison cloud", but this can only be done with fewer than eight documents:
corp2 <- corpus_subset(data_corpus_inaugural, President %in% c("Washington", "Jefferson", "Madison"))
summary(corp2)
tok2 <- tokens(corp2, remove_punct = TRUE, remove_numbers = TRUE, remove_symbols = TRUE,
               split_hyphens = TRUE, remove_separators = TRUE)
tok2 <- tokens_remove(tok2, stopwords("en"))
myDfm <- dfm(tok2)

# let's group the speeches made by the same President (such as the two speeches made by Washington) into one single dfm row using
# the function "dfm_group"
myDfm2 <- dfm_group(myDfm, groups = President)
str(myDfm)
length(myDfm@Dimnames$docs)  # 6 documents
str(myDfm2)
length(myDfm2@Dimnames$docs) # 3 documents!
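# Equivalent check (just a sketch): ndoc() and docnames() return the same information without reaching into the S4 slots
ndoc(myDfm2)
docnames(myDfm2)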
set.seed(123)
textplot_wordcloud(myDfm2, comparison = TRUE)

# let's plot the wordcloud with some trimming at once
set.seed(123)
textplot_wordcloud(dfm_trim(myDfm2, min_termfreq = 5, verbose = FALSE), comparison = TRUE)

# Let's plot a "comparison cloud" between Biden, Trump and Obama
corp2 <- corpus_subset(data_corpus_inaugural, President %in% c("Biden", "Trump", "Obama"))
tok2 <- tokens(corp2, remove_punct = TRUE, remove_numbers = TRUE, remove_symbols = TRUE,
               split_hyphens = TRUE, remove_separators = TRUE)
tok2 <- tokens_remove(tok2, stopwords("en"))
myDfm <- dfm(tok2)
myDfm2 <- dfm_group(myDfm, groups = President)
set.seed(123)
textplot_wordcloud(dfm_trim(myDfm2, min_termfreq = 5, verbose = FALSE), comparison = TRUE)

#########################################################################
# Statistical summaries (3): Lexical diversity (Non-positional Analysis)
#########################################################################

# Other quantitative summary measures of documents are designed to characterize specific qualities of texts.
# Comparing the rates of types and tokens forms the foundation of measures of lexical diversity (the rate of vocabulary usage); the most common such measure
# compares the number of types to the number of tokens (the "type-token ratio", TTR).
# For example, it has been argued that populist communication implies a simplified political discourse (lower diversity), in an attempt to reach the public more easily.
# The textstat_lexdiv() function calculates lexical diversity according to various measures based on the number of unique types of tokens
# and the length of a document
inaug_tokens <- tokens(data_corpus_inaugural)
tok2 <- tokens_remove(inaug_tokens, stopwords("en"))
inaug_dfm <- dfm(tok2)
lexdiv <- textstat_lexdiv(inaug_dfm)
str(lexdiv)

# a decrease in the level of lexical complexity over time? let's see
ggplot(data = lexdiv, aes(x = document, y = TTR, group = 1)) +
  geom_line() +
  geom_point() +
  theme_minimal() +
  scale_x_discrete(breaks = c("1789-Washington", "1933-Roosevelt", "1961-Kennedy", "2017-Trump")) +
  geom_smooth(method = "lm")

# TTR is estimated as V/N, where
# V = types (total number of unique terms) and N = tokens (total number of words in the dfm)
head(lexdiv, 5)
tail(lexdiv, 5)

# when you run textstat_lexdiv, it automatically removes numbers, punctuation, etc. [w/o the need for you
# to specify that when you create your dfm: see ?textstat_lexdiv].
# That's why the ratio between Types and Tokens you get from the corpus summary gives you a different value:
summary(data_corpus_inaugural)
# for Washington 1789: 625/1537
head(lexdiv, 1)

#########################################################################
# Statistical summaries (4): Comparing words associated with a target group vs. a reference group (Non-positional Analysis)
#########################################################################

# More sophisticated methods compare the differential occurrences of words across texts or partitions of a corpus, using statistical
# association measures, to identify the words that belong, for example, to different sub-groups of texts, such as those predominantly
# associated with male- versus female-authored documents.
# In this respect we can employ, for example, a chi2 test. The chi-squared test is used to determine whether there is a statistically significant
# difference between the expected and the observed frequencies in one or more categories of a contingency table (in our case,
# the frequencies of words in two different sets of texts).
# To understand a bit more about the chi2 test, take a look at the slides available on the home page of the course.
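# A toy illustration of the underlying test (just a sketch, with made-up counts): a 2x2 contingency table for a single word,
# counting that word vs. all other tokens in the target and in the reference group. textstat_keyness applies this kind of
# chi2 logic word by word (with its own correction settings).
toy <- matrix(c(30, 970,    # target group: occurrences of the word, all other tokens
                10, 990),   # reference group: occurrences of the word, all other tokens
              nrow = 2, byrow = TRUE,
              dimnames = list(c("target", "reference"), c("word", "other")))
chisq.test(toy, correct = FALSE)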
# The textstat_keyness command allows you precisely to compare the differential association of keywords in a target vs. a reference group.
# For example, let's try to understand whether there are some specific words associated with Trump vs. the more mainstream GOP Presidents.
# In this example, we compare the inaugural speech by Donald Trump with the speeches by the other post-war Republican Presidents
pres_GOP <- corpus_subset(data_corpus_inaugural, Year > 1946 & Party == "Republican")
summary(pres_GOP)

# let's create a document variable labelling the speech by President Trump vs. those by the other Republican Presidents
pres_GOP$Trump <- ifelse(pres_GOP$President == "Trump", "Trump", "Other GOP")
docvars(pres_GOP)

tok2 <- tokens(pres_GOP, remove_punct = TRUE, remove_numbers = TRUE, remove_symbols = TRUE,
               split_hyphens = TRUE, remove_separators = TRUE)
tok2 <- tokens_remove(tok2, stopwords("en"))
tok2 <- tokens_wordstem(tok2)
pres_GOP_dfm <- dfm(tok2)
myDfm <- dfm_group(pres_GOP_dfm, groups = Trump)
str(myDfm@docvars)

result_keyness <- textstat_keyness(myDfm, target = "Trump")
# note that the token "us" is not included as a stopword in Snowball (that's the reason why it is still in our dfm)
textplot_keyness(result_keyness)

# Note however that, if you look at the p-values, not all of them are lower than 0.05
head(result_keyness, 10)
tail(result_keyness, 10)

# Therefore, let's keep only the statistically significant features
str(result_keyness)
result_keyness2 <- result_keyness[which(result_keyness$p <= 0.05), ]
str(result_keyness2)
# let's add back the attributes of the data (i.e., "Trump" vs. "Other GOP"), which are dropped when subsetting
attr(result_keyness2, 'groups') <- c("Trump", "Other GOP")
str(result_keyness2)
# very interesting results (check "us" "freedom" vs. "protect" "jobs" "american")
textplot_keyness(result_keyness2)

# another example: Republican vs. Democratic presidents
myCorpus <- corpus_subset(data_corpus_inaugural, Year > 1945)
summary(myCorpus)
tok2 <- tokens(myCorpus, remove_punct = TRUE, remove_numbers = TRUE, remove_symbols = TRUE,
               split_hyphens = TRUE, remove_separators = TRUE)
tok2 <- tokens_remove(tok2, stopwords("en"))
myDfm <- dfm(tok2)
myDfm <- dfm_group(myDfm, groups = Party)
str(myDfm@docvars)

# We compute keyness with the Republican Presidents' speeches as the target group and the Democratic Presidents' speeches as the reference group
result_keyness <- textstat_keyness(myDfm, target = "Republican")
result_keyness2 <- result_keyness[which(result_keyness$p <= 0.05), ]
str(result_keyness2)
attr(result_keyness2, 'groups') <- c("Gop", "Dem")
textplot_keyness(result_keyness2)

#########################################################################
# Statistical summaries (5): Cosine similarity (Non-positional Analysis)
#########################################################################

# "Cosine similarity" is an intuitive measure of semantic distance.
# By representing texts as vectors in a Cartesian space via the DfM, cosine similarity estimates the differences between two texts based on their vectors
# of word occurrences. More in detail, the measure computes the cosine of the angle between each pair of vectors x and y (where x and y are two different texts).
# A cosine value of 0 means that the two vectors are at 90 degrees to each other (orthogonal) and have no match.
# The closer the cosine value is to 1, the smaller the angle and the greater the match between the vectors.
# As a result, the cosine similarity between two texts ranges between 0 and 1, where 0 is reached when two texts are completely
# different and 1 is reached when two texts have identical feature proportions (please take a look at the .pdf file on the course home page under EXTRA for Lab Class 1
# for more detailed info on how to compute the similarity between documents from a dfm).
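# A toy illustration (just a sketch, with made-up word counts): the cosine similarity between two tiny "documents"
# represented as vectors of counts over the same four features
x1 <- c(2, 0, 1, 3)   # word counts in document 1
x2 <- c(1, 1, 0, 3)   # word counts in document 2
sum(x1 * x2) / (sqrt(sum(x1^2)) * sqrt(sum(x2^2)))   # cos(angle) between the two vectors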
# create a dfm from the inaugural addresses from Reagan onwards
myCorpus <- corpus_subset(data_corpus_inaugural, Year > 1980)
summary(myCorpus)
tok2 <- tokens(myCorpus, remove_punct = TRUE, remove_numbers = TRUE, remove_symbols = TRUE,
               split_hyphens = TRUE, remove_separators = TRUE)
tok2 <- tokens_remove(tok2, stopwords("en"))
tok2 <- tokens_wordstem(tok2)
presDfm <- dfm(tok2)
presDfm

# compute some document similarities
Simil <- textstat_simil(presDfm, margin = "documents", method = "cosine")
Simil

# Let's plot it!
Simil2 <- as.matrix(Simil)
str(Simil2)
corrplot(Simil2, method = 'number')
corrplot(Simil2, method = 'color')
corrplot(Simil2, method = 'shade', type = 'lower')

# for specific comparisons: here the two speeches by Obama
obamaSimil <- textstat_simil(presDfm, presDfm[c("2009-Obama", "2013-Obama"), ],
                             margin = "documents", method = "cosine")
obamaSimil

# compute some term similarities (check that this syntax still matches the latest quanteda version!)
tstat3 <- textstat_simil(presDfm, presDfm[, c("fair", "health", "terror")],
                         method = "cosine", margin = "features")
head(as.matrix(tstat3), 10)
# let's list the 6 features with the highest cosine similarity to each of our three selected words
as.list(tstat3, n = 6)
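# A possible follow-up (just a sketch): turn the cosine similarities into distances and cluster the speeches
# hierarchically with base R (the object name "dist_mat" is only for illustration)
dist_mat <- as.dist(1 - as.matrix(Simil))   # cosine distance = 1 - cosine similarity
plot(hclust(dist_mat), main = "Inaugural speeches clustered by cosine distance")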