rm(list=ls(all=TRUE)) setwd("C:/Users/luigi/Dropbox/TOPIC MODEL") getwd() library(quanteda) library(readtext) library(ggplot2) library(syuzhet) library(reshape2) library(gridExtra) ############################### # dictionary that you can call from Quanteda ############################### # using the stemming procedure can be a problem in some cases (according to the dictionary employed - if it does or does not # contain stemming/wildcards such as "*" !!!) ################################# # 1) Create your own dictionary! ################################# # Let's create a dictionary with 3 entries. # We will use the same line of command employed with newsmap. Do you remember? myDict <- dictionary(list(terror = c("terror*", "threat"), economy = c("job*", "business*", "grow", "work"), pop= c("people", "washington"))) myDict # let's apply this dictionary to our usual US Presidential inaugural speeches corpus after 1991 recentCorpus <- corpus_subset(data_corpus_inaugural, Year > 1991) tok2 <- tokens(recentCorpus, remove_punct = TRUE, remove_numbers=TRUE, remove_symbols = TRUE, split_hyphens = TRUE, remove_separators = TRUE) tok2 <- tokens_remove(tok2, stopwords("en")) tok2 <- tokens_wordstem (tok2) Mydfm <- dfm(tok2) # the function to apply a dictionary to a DfM is dfm_lookup (once again, we already saw this function with newsmap!) byPresMat <- dfm_lookup(Mydfm , dictionary = myDict) byPresMat Dictionary <-convert(byPresMat, to="data.frame") head(Dictionary ) # you can also apply a dictionary to a tokenized object via the token_lookup option byPresMatToks <- tokens_lookup(tok2, myDict ) print(byPresMatToks) # the advantage of tokens_lookup is that allows you to include in a dictionary multi-word expressions # for example: head(kwic(tokens(data_corpus_inaugural), phrase(c ("United States", "New York")))) # let's define a dictionary with 2 multi-word expressions multiword_dict <- dictionary(list(country = "United States", city = "New York")) multiword_dict toks <- tokens(data_corpus_inaugural) Mydfm <- dfm(toks) byPresMat <- dfm_lookup(Mydfm , dictionary = multiword_dict) byPresMat Dictionary <-convert(byPresMat, to="data.frame") # all 0s! head(Dictionary ) # however with tokens_lookup... head(tokens_lookup(toks, dictionary = multiword_dict)) # so how to deal with multi-word expressions in a DfM? You could first use tokens_compound() to join elements of multi-word expressions # by underscore, so they become United_States and New_York. comp_toks <- tokens_compound(toks, pattern =phrase(c ("United States", "New York"))) head(tokens_select(comp_toks, pattern= c("United_States", "New_York"))) # and now let's define a new dictionary and let's create a new dfm multiword_dict2 <- dictionary(list(country = "United_States", city = "New_York")) multiword_dict2 Mydfm <- dfm(comp_toks) byPresMat <- dfm_lookup(Mydfm , dictionary = multiword_dict2) byPresMat Dictionary <-convert(byPresMat, to="data.frame") head(Dictionary) # same result as above via tokens_lookup head(tokens_lookup(toks, dictionary = multiword_dict)) tail(Dictionary) tail(tokens_lookup(toks, dictionary = multiword_dict)) ################################# # 2) Using an existing dictionary! # For example: Import the Laver-Garry dictionary from Provalis Research (see: https://www.jstor.org/stable/2669268) ################################# dictfile <- tempfile() download.file("https://provalisresearch.com/Download/LaverGarry.zip", dictfile, mode = "wb") unzip(dictfile, exdir = (td <- tempdir())) lgdict <- dictionary(file = paste(td, "LaverGarry.cat", sep = "/")) # here we are stemmed words/wildcards... lgdict # is it a dictionary that can be read by Quanteda? YES! is.dictionary(lgdict) recentCorpus <- corpus_subset(data_corpus_inaugural, Year > 1991) tok2 <- tokens(recentCorpus, remove_punct = TRUE, remove_numbers=TRUE, remove_symbols = TRUE, split_hyphens = TRUE, remove_separators = TRUE) tok2 <- tokens_remove(tok2, stopwords("en")) tok2 <- tokens_wordstem (tok2) Mydfm <- dfm(tok2) # let's focus on those US Presidential Speeches after 1991 (whose dfm we created above) and let's apply the dictionary to our corpus lg_dfm <- dfm_lookup(Mydfm , dictionary = lgdict) lg_dfm # Let's focus on the categories "More State" and "Less State" Dictionary <-convert(lg_dfm, to="data.frame") str(Dictionary ) colnames(Dictionary ) names(Dictionary )[6] <- "More_State" names(Dictionary )[8] <- "Less_State" colnames(Dictionary ) # and let's identify the more "pro-state" Presidents (according to his inaugural speech) Dictionary$pro_state <- Dictionary$More_State/(Dictionary$Less_State+Dictionary$More_State) ggplot(data=Dictionary , aes(x=doc_id, y=pro_state)) + geom_bar(stat="identity") + coord_flip() ################################# # 3) sentiment dictionaries ################################# # Quanteda has integrated a sentiment dictionary constructed by Young & Soroka (2012) stored in data_dictionary_LSD2015 # called Lexicoder. The dictionary contains thousands of positive and negative words or word stems. lengths(data_dictionary_LSD2015) head(data_dictionary_LSD2015) head(data_dictionary_LSD2015[1:2]) sentiment <- dfm_lookup(Mydfm , dictionary = data_dictionary_LSD2015[1:2]) sentiment Dictionary <-convert(sentiment , to="data.frame") str(Dictionary ) Dictionary$Sentiment <- Dictionary$posit/(Dictionary$negat+Dictionary$posit) str(Dictionary ) ggplot(data=Dictionary , aes(x=doc_id, y=Sentiment)) + geom_bar(stat="identity") + coord_flip() # are Republican presidents more "positive" than Democratic ones? ############################### # using other packages for dictionary analysis not included in Quanteda: using the syuzhet package ############################### # You can get access to different dictionaries # the dictionary "syuzhet" for example: here every word in the dictionary has a different (positive or negative) weight # (contrary to previous dictionaries) # Note: no wildcards are included! get_sentiment_dictionary(dictionary = "syuzhet", language = "english") # is it a dictionary that can be read directly by Quanteda? NO! is.dictionary(get_sentiment_dictionary(dictionary = "syuzhet", language = "english")) # So we cannot apply the syuzhet dictionary via dfm_lookup! # To apply such dictionary we need first to extract the texts from our corpus: recent_corpus <- corpus_subset(data_corpus_inaugural, Year > 1991) ndoc(recent_corpus) # let's extract the texts from our corpus text_corpus <- as.character(recent_corpus)[1:ndoc(recent_corpus)] str(text_corpus) # now we can apply the external dictionary to our texts syuzhet_vector <- get_sentiment(text_corpus, method="syuzhet") # the sentiment index you get is the difference between all the positive scores minus all the negative scores according to the dictionary syuzhet_vector president <- docnames(recent_corpus) results1 <- as.data.frame(cbind(president , syuzhet_vector)) results1 # Remember an important point: get_sentiment does not count if a positive or negative words appears once or twice. # It counts it always once! And if we want to count it more than once? # You convert it as a Quanteda dictionary and then use as usual the command dfm_lookup (see the EXTRA script on the home-page of the course) # Similarly: what if we want to apply such dictionary to an already "cleaned" tokenized object via Quanteda? # Two possibilities: a) either you convert it as a Quanteda dictionary (see the EXTRA script on the home-page of the course); # b) or you tokenizes your corpus and then convert it back to strings. Then you apply to them the dictionary (but remember the issue of wilcards!) # Let's follow the latter way recent_corpus <- corpus_subset(data_corpus_inaugural, Year > 1991) tok2 <- tokens(recentCorpus, remove_punct = TRUE, remove_numbers=TRUE, remove_symbols = TRUE, split_hyphens = TRUE, remove_separators = TRUE) tok2 <- tokens_remove(tok2, stopwords("en")) xxx <- sapply(tok2, function(x) paste(as.character(x), collapse = " ")) str(xxx) syuzhet_vector2 <- get_sentiment(xxx, method="syuzhet") syuzhet_vector2 # some slight differences compared to above syuzhet_vector # and what if you had apply trimming to a dfm? how to keep only the tokens from the dfm trimmed new object and apply to it the # dictionary? Mydfm <- dfm(tok2) # 2527 tokens nfeat(Mydfm) # let's trim the dfm new_dfm <- dfm_trim(Mydfm, min_termfreq = 0.50, termfreq_type = "quantile", max_docfreq = 0.2, docfreq_type = "prop") # fewer features compared to above nfeat(new_dfm) # let's extract the feature names present in our trimmed dfm feats <- featnames(new_dfm) length(feats ) # let's keep in our tokenized objects only the features also included in our trimmed dfm tok3 <- tokens_select(tok2, feats) yyy <- sapply(tok3, function(x) paste(as.character(x), collapse = " ")) syuzhet_vector3 <- get_sentiment(yyy, method="syuzhet") syuzhet_vector3 # syuzhet is a very nice package! To get some more info about it, take a look at this vignette: # https://cran.r-project.org/web/packages/syuzhet/vignettes/syuzhet-vignette.html # We can also employ the dictionary "nrc" x <- get_sentiment_dictionary(dictionary = 'nrc', language = "english") str(x) # is it a dictionary that can be read directly by Quanteda? Once again NO! is.dictionary(get_sentiment_dictionary(dictionary = "nrc", language = "english")) # Let's apply the nrc dictionary! nrc_vector <- get_sentiment(text_corpus , method="nrc") nrc_vector results2 <- as.data.frame(cbind(president , syuzhet_vector, nrc_vector)) results2 # let's correlate the results we got from the 2 dictionaries str(results2 ) results2$syuzhet_vector <- as.numeric(results2$syuzhet_vector) results2$nrc_vector <- as.numeric(results2$nrc_vector) str(results2 ) # positive correlation, but not a perfect one cor(results2$syuzhet_vector ,results2$nrc_vector) ggplot(results2, aes(x=syuzhet_vector, y=nrc_vector)) + geom_point() + geom_text(label=president, vjust = -0.5) + geom_smooth(method=lm) # So what to do? Which of the two dictionaries to keep for your analysis? First: you always must validate the dictionaries you employ! # Once done it, you can also compare the internal-coherence of the two dictionaries (GIVEN your corpus) # via a split-half reliability test and keeping the dictionary with a better result. # To understand how to do that, take a look at the EXTRA script in the home-page of the course # the nrc dictionary has also more "categories" beyond negative and positive (detecting other "emotions") get_sentiment_dictionary(dictionary = 'nrc', language = "english") # moreover the dictionary "nrc" cover several different languages... get_sentiment_dictionary(dictionary = 'nrc', language = "spanish") get_sentiment_dictionary(dictionary = 'nrc', language = "italian") get_sentiment_dictionary(dictionary = 'nrc', language = "arabic") get_sentiment_dictionary(dictionary = 'nrc', language = "japanese") get_sentiment_dictionary(dictionary = 'nrc', language = "turkish") # let's apply such new extended dictionary nrc_data_PR <- get_nrc_sentiment(text_corpus, language = "english") nrc_data_PR$president <- docnames(recent_corpus) str(nrc_data_PR) # overall % of emotions in the text relative to each other prop.table(nrc_data_PR[, 1:8]) colSums(prop.table(nrc_data_PR[, 1:8])) data <-as.data.frame(colSums(prop.table(nrc_data_PR[, 1:8]))) str(data) names(data)[1] <- "value" str(data) ggplot(data=data, aes(y=value, x=reorder(row.names(data), +value))) + geom_bar(stat="identity", fill="blue") + coord_flip() + ylab(label="Values") + xlab("Emotions") # let's focus now on each Presidential's speech prop.table(nrc_data_PR[, 1:8]) rowSums(prop.table(nrc_data_PR[, 1:8])) data2 <- prop.table(nrc_data_PR[, 1:8])/rowSums(prop.table(nrc_data_PR[, 1:8])) str(data2) # Let's focus on anger, fear and joy % in each Presidential's speech data3 <- data2[,c(1,4,5)] data3$President <- row.names(data3) str(data3) df.long<-melt(data3,id.vars=c("President")) str(df.long) ggplot(df.long,aes(President,value,fill=variable))+ geom_bar(position="dodge",stat="identity") + theme(axis.text.x = element_text(color="#993333", size=10, angle=90)) + coord_flip() + ylab(label="Emotional words") + xlab("President") ########## problems with dictionaries (as discussed in our Lecture) testText <- "This movie has good premises. Looks like it has a nice plot, and exceptional cast, first class actors and Stallone gives his best. But it sucks" testText testCorpus <- corpus(testText) tok2 <-tokens(testCorpus ) head(dfm_lookup(dfm(tok2), dictionary = data_dictionary_LSD2015 [1:2])) s_v <- "this movie has good premises, looks like it has a nice plot, an exceptional cast, first class actors and Stallone gives his best, but it sucks" syuzhet_vector <- get_sentiment(s_v, method="syuzhet") nrc_vector <- get_sentiment(s_v, method="nrc") syuzhet_vector nrc_vector ##########