rm(list=ls(all=TRUE)) setwd("C:/Users/luigi/Dropbox/TOPIC MODEL") getwd() library(quanteda) library(readtext) library(ggplot2) library(syuzhet) library(reshape2) library(gridExtra) library(rtweet) library(dplyr) library(cowplot) library(plyr) library(syuzhet) ############################### # dictionary that you can call from Quanteda ############################### # using the stemming procedure can be a problem in some cases (according to the dictionary employed - if it does or does not # contain stemming or wildcards such as "*") ################################# # 1) Create your own dictionary! ################################# # Let's create a dictionary with 3 entries myDict <- dictionary(list(terror = c("terrorism", "terrorists", "threat"), economy = c("jobs", "business", "grow", "work"), pop= c("people", "washington"))) myDict # let's apply this dictionary to our usual US Presidential inaugural speeches corpus recentCorpus <- corpus_subset(data_corpus_inaugural, Year > 1991) tok2 <- tokens(recentCorpus, remove_punct = TRUE, remove_numbers=TRUE, remove_symbols = TRUE, split_hyphens = TRUE, remove_separators = TRUE) tok2 <- tokens_remove(tok2, stopwords("en")) # note that we haven't stemmed the texts! Mydfm <- dfm(tok2) # you can use a dictionary on any given DfM via the function "dfm_lookup" byPresMat <- dfm_lookup(Mydfm , dictionary = myDict) byPresMat ################################# # 2) Using some existing dictionary! # For example: Import the Laver-Garry dictionary from Provalis Research (see: https://www.jstor.org/stable/2669268) ################################# dictfile <- tempfile() download.file("https://provalisresearch.com/Download/LaverGarry.zip", dictfile, mode = "wb") unzip(dictfile, exdir = (td <- tempdir())) lgdict <- dictionary(file = paste(td, "LaverGarry.cat", sep = "/")) # here we are stemmed words/wildcards... lgdict recentCorpus <- corpus_subset(data_corpus_inaugural, Year > 1991) tok2 <- tokens(recentCorpus, remove_punct = TRUE, remove_numbers=TRUE, remove_symbols = TRUE, split_hyphens = TRUE, remove_separators = TRUE) tok2 <- tokens_remove(tok2, stopwords("en")) tok2 <- tokens_wordstem (tok2) Mydfm <- dfm(tok2) # let's focus on those US Presidential Speeches after 1991 (whose dfm we created above) and let's apply the dictionary to our corpus lg_dfm <- dfm_lookup(Mydfm , dictionary = lgdict) lg_dfm # Let's focus on the categories "More State" and "Less State" Dictionary <-convert(lg_dfm, to="data.frame") str(Dictionary ) colnames(Dictionary ) names(Dictionary )[6] <- "More_State" names(Dictionary )[8] <- "Less_State" colnames(Dictionary ) # and let's identify the more "pro-state" Presidents (according to his inaugural speech) Dictionary$pro_state <- Dictionary$More_State/(Dictionary$Less_State+Dictionary$More_State) ggplot(data=Dictionary , aes(x=doc_id, y=pro_state)) + geom_bar(stat="identity") + coord_flip() ################################# # 3) sentiment dictionaries ################################# # Quanteda has integrated a sentiment dictionary constructed by Young & Soroka (2012) stored in data_dictionary_LSD2015 # called Lexicoder. The dictionary contains thousands of positive and negative words or word stems. lengths(data_dictionary_LSD2015) head(data_dictionary_LSD2015) head(data_dictionary_LSD2015[1:2]) sentiment <- dfm_lookup(Mydfm , dictionary = data_dictionary_LSD2015[1:2]) sentiment Dictionary <-convert(sentiment , to="data.frame") str(Dictionary ) Dictionary$Sentiment <- Dictionary$posit/(Dictionary$negat+Dictionary$posit) str(Dictionary ) ggplot(data=Dictionary , aes(x=doc_id, y=Sentiment)) + geom_bar(stat="identity") + coord_flip() # are Republican Presidents more "positive" than Democratic ones? ############################### # using other packages for dictionary analysis not included in Quanteda: using the syuzhet package ############################### # You can get access to different dictionaries on internet! # For example the dictionary "syuzhet": in this dictionary, every word has a different (positive or negative) weight # (contrary to previous dictionaries). # Only English language is covered!!! And no wildcards! get_sentiment_dictionary(dictionary = "syuzhet", language = "english") # Let's apply the syuzhet dictionary! # To apply such dictionary we need first to extract the texts from our corpus recent_corpus <- corpus_subset(data_corpus_inaugural, Year > 1991) ndoc(recent_corpus) # let's extract the texts from the corpus text_corpus <- as.character(recent_corpus)[1:ndoc(recent_corpus)] str(text_corpus) syuzhet_vector <- get_sentiment(text_corpus, method="syuzhet") syuzhet_vector # let's create a nice data frame with these results for later use president <- docnames(recent_corpus) results1 <- as.data.frame(cbind(president , syuzhet_vector)) results1 # We can also employ the dictionary "nrc" x <- get_sentiment_dictionary(dictionary = 'nrc', language = "english") str(x) # Let's apply now the nrc dictionary! nrc_vector <- get_sentiment(text_corpus , method="nrc") nrc_vector # let's create a data frame with the results we got from the 2 dictionaries results2 <- as.data.frame(cbind(president , syuzhet_vector, nrc_vector)) results2 # let's correlate the results we got from the 2 dictionaries str(results2 ) results2$syuzhet_vector <- as.numeric(results2$syuzhet_vector) results2$nrc_vector <- as.numeric(results2$nrc_vector) str(results2 ) cor(results2$syuzhet_vector ,results2$nrc_vector) plot(results2$syuzhet_vector, results2$nrc_vector, main="Scatterplot of Sentiment", xlab="syuzhet ", ylab="nrc ", pch=19) text(results2$syuzhet_vector, results2$nrc_vector, labels = results2$president, pos = 4, col = "royalblue" , cex = 0.8) # Add fit lines abline(lm(results2$nrc_vector~results2$syuzhet_vector), col="red") # regression line (y~x) # the nrc dictionary has also more "categories" beyond negative and positive (detecting other "emotions") get_sentiment_dictionary(dictionary = 'nrc', language = "english") # moreover the dictionary "nrc" cover several different languages... get_sentiment_dictionary(dictionary = 'nrc', language = "spanish") get_sentiment_dictionary(dictionary = 'nrc', language = "italian") get_sentiment_dictionary(dictionary = 'nrc', language = "turkish") get_sentiment_dictionary(dictionary = 'nrc', language = "arabic") Sys.setlocale("LC_CTYPE","japanese") get_sentiment_dictionary(dictionary = 'nrc', language = "japanese") # let's apply such new extended dictionary nrc_data_PR <- get_nrc_sentiment(text_corpus, language = "english") nrc_data_PR$president <- docnames(recent_corpus) str(nrc_data_PR) # % of emotions in the text relative to each other colSums(prop.table(nrc_data_PR[, 1:8])) # plot % of emotions in the text relative to each other barplot( sort(colSums(prop.table(nrc_data_PR[, 1:8]))), horiz = TRUE, cex.names = 0.7, las = 1, main = "Emotions in Sample text", xlab="Percentage" ) # Plotting using ggplot focusing on anger, disgust, fear and joy str(nrc_data_PR) nrc_data_PR2 <- nrc_data_PR[,c(1,3,4,5,11)] str(nrc_data_PR2) df.long<-melt(nrc_data_PR2,id.vars=c("president")) str(df.long) ggplot(df.long,aes(president,value,fill=variable))+ geom_bar(position="dodge",stat="identity") + theme(axis.text.x = element_text(color="#993333", size=10, angle=90)) + coord_flip() + ylab(label="Emotional words") + xlab("President") # (assuming that the dictionary is doing a good job) can we generalize from here that the Obama was the President using more fear words? # What is the important information we are not considering here? Of course, the relative length of each Presidential speech! # So let's check for that! # let's extract the number of words included in each Presidendial speech nrc_data_PR2 $tokens <- ntoken(recent_corpus) str(nrc_data_PR2) # let's weight the previous results according to such number of words nrc_data_PR3 <- nrc_data_PR2[1:4]/nrc_data_PR2$tokens str(nrc_data_PR3) nrc_data_PR3$president <- nrc_data_PR2$president df.long2<-melt(nrc_data_PR3,id.vars=c("president")) str(df.long2) ggplot(df.long2,aes(president,value,fill=variable))+ geom_bar(position="dodge",stat="identity") + theme(axis.text.x = element_text(color="#993333", size=10, angle=90)) + coord_flip() + ylab(label="Emotional words") + xlab("President") unweighted <- ggplot(df.long,aes(president,value,fill=variable))+ geom_bar(position="dodge",stat="identity") + theme(axis.text.x = element_text(color="#993333", size=10, angle=90)) + coord_flip() + ylab(label="Emotional words (unweighted)") + xlab("President") weighted <- ggplot(df.long2,aes(president,value,fill=variable))+ geom_bar(position="dodge",stat="identity") + theme(axis.text.x = element_text(color="#993333", size=10, angle=90)) + coord_flip() + ylab(label="Emotional words (weighted)") + xlab("President") library(cowplot) plot_grid(unweighted , weighted , labels = c("Unweighted", "Weighted" )) ####################################### ########## problems with dictionaries (as discussed in our Lecture) ####################################### testText <- "This movie has good premises. Looks like it has a nice plot, and exceptional cast, first class actors and Stallone gives his best. But it sucks" testText testCorpus <- corpus(testText) tok2 <-tokens(testCorpus ) head(dfm_lookup(dfm(tok2), dictionary = data_dictionary_LSD2015 [1:2])) s_v <- "this movie has good premises, looks like it has a nice plot, an exceptional cast, first class actors and Stallone gives his best, but it sucks" syuzhet_vector <- get_sentiment(s_v, method="syuzhet") nrc_vector <- get_sentiment(s_v, method="nrc") syuzhet_vector nrc_vector ############################### # An example with a twitter query ############################## rt <- search_tweets("Salvini", n = 1000, include_rts = FALSE, lang = "en") print(rt$lang[1:20]) length(rt$lang) # I want to convert a date str(rt$created_at) rt$date <- as.Date(rt$created_at) str(rt$date) table(rt$date) myCorpusTwitter<- corpus(rt) tok2 <- tokens(myCorpusTwitter, remove_punct = TRUE, remove_numbers=TRUE, remove_symbols = TRUE, split_hyphens = TRUE, remove_separators = TRUE, remove_url=TRUE) tok2 <- tokens_remove(tok2, stopwords("en")) tok2 <- tokens_wordstem (tok2) Salvini_dfm <- dfm(tok2) topfeatures(Salvini_dfm , 20) # 20 top words # Let's apply the nrc dictionaries nrc_data <- get_nrc_sentiment(rt$text, language="english") str(nrc_data) # I want to read the tweets that includes a number of anger words>2 rt$text[nrc_data$anger > 2] # let's plot the wordcloud of emotions all = c( paste(rt$text[nrc_data$anger > 0], collapse=" "), paste(rt$text[nrc_data$anticipation > 0], collapse=" "), paste(rt$text[nrc_data$disgust > 0], collapse=" "), paste(rt$text[nrc_data$fear > 0], collapse=" "), paste(rt$text[nrc_data$joy > 0], collapse=" "), paste(rt$text[nrc_data$sadness > 0], collapse=" "), paste(rt$text[nrc_data$surprise > 0], collapse=" "), paste(rt$text[nrc_data$trust > 0], collapse=" ") ) str(all) recentCorpus <- corpus(all) summary(recentCorpus ) colnames(nrc_data)[1:8] docnames(recentCorpus ) <- colnames(nrc_data)[1:8] summary(recentCorpus ) tok2 <- tokens(recentCorpus, remove_punct = TRUE, remove_numbers=TRUE, remove_symbols = TRUE, split_hyphens = TRUE, remove_separators = TRUE) tok2 <- tokens_remove(tok2, stopwords("en")) Mydfm <- dfm(tok2) Mydfm library(quanteda.textplots) set.seed(123) textplot_wordcloud(dfm_trim(Mydfm, min_termfreq = 1, verbose = FALSE), comparison = TRUE, colors = c("#00B2FF", "red", "#FF0099", "#6600CC", "green", "orange", "blue", "brown"), labelsize=1.5) ################################### # Let's estimate the daily average emotions with respect to Salvini and let's plot it ################################### str(nrc_data) nrc_data$date <- rt$date str(nrc_data) daily_emotions <- aggregate( nrc_data[,1:8], by=list(day=nrc_data$date), FUN=mean) str(daily_emotions) head(daily_emotions) library(psych) corr.test(daily_emotions [c(2:8)]) library(PerformanceAnalytics) chart.Correlation(daily_emotions[c(2:8)]) # Let's plot it library(reshape2) df.long<-melt(daily_emotions,id.vars=c("day")) str(df.long) head(df.long) ggplot(df.long, aes(x =day, y = value, color = variable)) + geom_point() + geom_smooth(method = "loess") + facet_wrap(~ variable, scale = "free_y", nrow = 2) + theme_bw() + labs( title = "Emotional analysis of Twitter statuses over time", subtitle = "Tweets aggregated by day on topics of Salvini") + theme(text = element_text(family = "Roboto Condensed"), plot.title = element_text(face = "bold"), legend.position = "bottom", axis.text = element_text(size = 9), legend.title = element_blank()) ############################### # MISC - for you to look at ############################### # how to convert an external dictionary to a Quanteda dictionary? x <- get_sentiment_dictionary(dictionary = "syuzhet", language = "english") str(x) names(x)[2] <- "sentiment" str(x) table(x$sentiment) # let's recode the values to negative and positive x$sentiment <- ifelse(x$sentiment< 0, "negative", ifelse(x$sentiment> 0, "positive", "netural")) table(x$sentiment) str(x) # now let's convert it to a Quanteda dictionary using the function as.dictionary dict <- as.dictionary(x) str(dict) is.dictionary(dict) recentCorpus <- corpus_subset(data_corpus_inaugural, Year > 1991) tok2 <- tokens(recentCorpus, remove_punct = TRUE, remove_numbers=TRUE, remove_symbols = TRUE, split_hyphens = TRUE, remove_separators = TRUE) tok2 <- tokens_remove(tok2, stopwords("en")) Mydfm <- dfm(tok2) # let's focus on those US Presidential Speeches after 1991 (whose dfm we created above) and let's apply the external dictionary to our corpus ext_dfm <- dfm_lookup(Mydfm , dictionary = dict) ext_dfm # And what if you want to keep the original weighting scores? # Note that by adopting the procedure below, you can also apply to your own dictionary a given set of weights as you prefer! x <- get_sentiment_dictionary(dictionary = "syuzhet", language = "english") str(x) weights <- x$value names(weights) <- x$word weights # let's extract the sentiment from these 3 random texts testText <- c("vision yawn yes yes pippo teen", "Husband husbands youthful zombies zombies teenager") testText testCorpus <- corpus(testText) tok3 <- tokens(testCorpus) myDfm <- dfm(tok3) myDfm # let's select only those features in the dfm also presented in the vocabulary above dfm_anew <- dfm_select(myDfm , pattern = x$word) dfm_anew # Now let's weight the reamining words in the dfm according to the weights vector. # We know that vision=0.5 (weight); yawn=-0.25; yes=0.8 (and it appears twice in text 1) # as a result final score for text 1: 0.5 -0.25 + 0.8*2 = 1.85 # And what about text 2? youthful=0.5 (weight); zombies=-0.25 (and it appears twice in text 2) # as a result final score for text 2: 0.5 -0.25*2 = 0 dfm_anew_weighted <- dfm_weight(dfm_anew, weights = weights, scheme = "count") dfm_anew_weighted # total scores rowSums(dfm_anew_weighted) # dfm_anew <- dfm_weight(dfm_anew, scheme = "prop") # remove if you don't want normalized scores # Note that by applying the original dictionary "syuzhet" you get different values than above! Why? # Cause "syuzhet" does not count if a positive or negative words appears once or twice. It counts any appearance of # a given word in a text >1 always as 1! get_sentiment(testText, method="syuzhet") # And indeed: # 0.5 -0.25 + 0.8=1.05 # 0.5 -0.25=0.25