rm(list = ls(all = TRUE))
getwd()
setwd("C:/Users/mw/Dropbox (VOICES)/TOPIC MODEL")
getwd()

library(quanteda)
library(readtext)
library(gridExtra)
library(ggplot2)
library(syuzhet)
library(plotly)
library(reshape2)

###############################
# dictionaries that you can call from Quanteda
###############################

# Stemming can be a problem in some cases (depending on the dictionary employed),
# so we avoid that option in the following examples.

#################################
# general dictionaries
#################################

#################################
# 1) import the Laver-Garry dictionary from Provalis Research (see: https://www.jstor.org/stable/2669268)
#################################

dictfile <- tempfile()
download.file("https://provalisresearch.com/Download/LaverGarry.zip", dictfile, mode = "wb")
unzip(dictfile, exdir = (td <- tempdir()))
lgdict <- dictionary(file = paste(td, "LaverGarry.cat", sep = "/"))
str(lgdict)
lgdict

recent_corpus <- corpus_subset(data_corpus_inaugural, Year > 1991)
summary(recent_corpus)

lg_dfm <- dfm(recent_corpus, dictionary = lgdict)
lg_dfm
str(lg_dfm)
lg_dfm@Dimnames

# Let's focus on the categories "More State" and "Less State"
Dictionary <- convert(lg_dfm, to = "data.frame")
str(Dictionary)
colnames(Dictionary)
names(Dictionary)[6] <- "More_State"   # column 6: economy, more state intervention
names(Dictionary)[8] <- "Less_State"   # column 8: economy, less state intervention
colnames(Dictionary)

p <- ggplot(data = Dictionary, aes(x = document, y = More_State)) +
  geom_bar(stat = "identity")
p2 <- ggplot(data = Dictionary, aes(x = document, y = Less_State)) +
  geom_bar(stat = "identity")
grid.arrange(p, p2, ncol = 2)

### Plotting both categories in a single graph
str(Dictionary)
Dictionary2 <- Dictionary[, c(1, 6, 8)]
str(Dictionary2)
df.long <- melt(Dictionary2, id.vars = c("document"))
str(df.long)
ggplot(df.long, aes(document, value, fill = variable)) +
  geom_bar(position = "dodge", stat = "identity") +
  theme(axis.text.x = element_text(color = "#993333", size = 10, angle = 90)) +
  coord_flip() +
  ylab("Frequency of More/Less State words") +
  xlab("President")

# What is the relationship between "More State" and "Less State" in the economy?
cor(Dictionary$More_State, Dictionary$Less_State)

plot(Dictionary$More_State, Dictionary$Less_State,
     main = "Scatterplot Example",
     xlab = "More_State", ylab = "Less_State", pch = 19)
text(Dictionary$More_State, Dictionary$Less_State,
     labels = Dictionary$document, pos = 4, col = "royalblue", cex = 0.8)
abline(lm(Dictionary$Less_State ~ Dictionary$More_State), col = "red")  # regression line (y ~ x)

#################################
# 2) import a LIWC-formatted dictionary from http://www.moralfoundations.org
#################################

download.file("https://goo.gl/5gmwXq", tf <- tempfile())
mfdict <- dictionary(file = tf, format = "LIWC")
str(mfdict)

liwcdfm <- dfm(recent_corpus, dictionary = mfdict)
liwcdfm
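# A hedged sketch (not part of the original script): the same convert() / melt() /
# ggplot() workflow used above for the Laver-Garry categories also works for the
# moral-foundations counts; the exact category names depend on the downloaded
# dictionary file, so we keep the code generic.
mf_df <- convert(liwcdfm, to = "data.frame")
mf_long <- melt(mf_df, id.vars = c("document"))
ggplot(mf_long, aes(document, value, fill = variable)) +
  geom_bar(position = "dodge", stat = "identity") +
  coord_flip() +
  ylab("Frequency of moral-foundations words") +
  xlab("President")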
#################################
# sentiment dictionaries
#################################

# Quanteda ships a sentiment dictionary constructed by Young & Soroka (2012),
# stored in data_dictionary_LSD2015.
# The dictionary contains thousands of positive and negative words or word stems.
lengths(data_dictionary_LSD2015)
head(data_dictionary_LSD2015)

news_toks <- tokens(recent_corpus, remove_punct = TRUE)
lsd_toks <- tokens_lookup(news_toks, data_dictionary_LSD2015[1:2])
head(lsd_toks, 2)

sentiment <- dfm(recent_corpus, dictionary = data_dictionary_LSD2015[1:2])
sentiment
Dictionary <- convert(sentiment, to = "data.frame")
str(Dictionary)
Dictionary$Sentiment <- Dictionary$positive - Dictionary$negative
str(Dictionary)

#################################
# Applying dictionaries to Party Manifestoes!
#################################

library(manifestoR)
mp_setapikey(key.file = NULL, key = "XXXXXXXXXXXXXXXXX")  # insert your own Manifesto Project API key

# Let's focus on the US manifestoes of the 2016 elections
cmp <- mp_maindataset()
us <- cmp[which(cmp$countryname == "United States" & cmp$date == 201611), ]
print(us[c("partyname", "party", "edate", "date", "partyabbrev")])
us$partyname
us$party

available_us2016 <- mp_availability(countryname == "United States" & date == 201611 &
                                      partyname %in% c("Democratic Party", "Republican Party"))
available_us2016

US2016 <- mp_corpus(available_us2016)
quanteda_US2016 <- corpus(US2016)
summary(head(quanteda_US2016))

# Which of the two party manifestoes was more negative?
# Build the DFM, grouping by party and keeping only the negative and positive categories
dfm_us2016 <- dfm(quanteda_US2016, tolower = TRUE, remove_punct = TRUE,
                  dictionary = data_dictionary_LSD2015[1:2], groups = "party")
dfm_us2016
str(dfm_us2016)
dfm_us2016@Dimnames$docs
dfm_us2016@Dimnames$docs <- us$partyname
dfm_us2016@Dimnames$docs
dfm_us2016

# What about the role of the State in the economy?
dfm_us2016_eco <- dfm(quanteda_US2016, tolower = TRUE, remove_punct = TRUE,
                      dictionary = lgdict, groups = "party")
dfm_us2016_eco
dfm_us2016_eco@Dimnames$docs <- us$partyname
dfm_us2016_eco

###############################
# using other packages for dictionary analysis: the syuzhet package
###############################

# syuzhet gives access to several dictionaries.
# In the "syuzhet" dictionary every word has its own (positive or negative) weight,
# contrary to other dictionaries; only English is covered.
get_sentiment_dictionary(dictionary = "syuzhet", language = "english")

# Let's apply the syuzhet dictionary!
recent_corpus <- corpus_subset(data_corpus_inaugural, Year > 1991)
summary(recent_corpus)
str(recent_corpus)
texts(recent_corpus)

syuzhet_vector <- get_sentiment(texts(recent_corpus), method = "syuzhet")
syuzhet_vector
president <- docnames(recent_corpus)
results1 <- cbind(president, syuzhet_vector)
results1

# The "nrc" dictionary covers many languages...
get_sentiment_dictionary(dictionary = "nrc", language = "english")
x <- get_sentiment_dictionary(dictionary = "nrc", language = "english")
str(x)
get_sentiment_dictionary(dictionary = "nrc", language = "spanish")
get_sentiment_dictionary(dictionary = "nrc", language = "italian")
get_sentiment_dictionary(dictionary = "nrc", language = "arabic")
get_sentiment_dictionary(dictionary = "nrc", language = "japanese")
get_sentiment_dictionary(dictionary = "nrc", language = "turkish")
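# A hedged sketch (not part of the original script): the translated NRC word lists
# do not necessarily have the same coverage, so compare the number of rows returned
# for a few languages before trusting cross-language comparisons.
sapply(c("english", "spanish", "italian"), function(lang) {
  nrow(get_sentiment_dictionary(dictionary = "nrc", language = lang))
})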
# Let's apply the nrc dictionary!
nrc_vector <- get_sentiment(texts(recent_corpus), method = "nrc")
nrc_vector

results2 <- as.data.frame(cbind(president, syuzhet_vector, nrc_vector))
results2
str(results2)
# cbind() coerced the numeric vectors to factors: convert them back to numeric
results2$syuzhet_vector <- as.numeric(levels(results2$syuzhet_vector))[results2$syuzhet_vector]
results2$nrc_vector <- as.numeric(levels(results2$nrc_vector))[results2$nrc_vector]
str(results2)

cor(results2$syuzhet_vector, results2$nrc_vector)

plot(results2$syuzhet_vector, results2$nrc_vector,
     main = "Scatterplot of Sentiment",
     xlab = "syuzhet", ylab = "nrc", pch = 19)
text(results2$syuzhet_vector, results2$nrc_vector,
     labels = results2$president, pos = 4, col = "royalblue", cex = 0.8)
# Add a fit line
abline(lm(results2$nrc_vector ~ results2$syuzhet_vector), col = "red")  # regression line (y ~ x)

# get_nrc_sentiment() returns a more general output, with several emotion categories
# ("voices") beyond negative and positive
nrc_data_PR <- get_nrc_sentiment(texts(recent_corpus), language = "english")
str(nrc_data_PR)

# let's estimate the "sentiment" as "positive - negative"
nrc_data_PR$sentiment <- nrc_data_PR$positive - nrc_data_PR$negative
str(nrc_data_PR)
# the same values as the nrc sentiment computed above
nrc_vector

# let's add the names of the presidents
nrc_data_PR$president <- docnames(recent_corpus)
str(nrc_data_PR)

# plot % of positive and negative words relative to each other
barplot(
  sort(colSums(prop.table(nrc_data_PR[, 9:10]))),
  horiz = TRUE, cex.names = 0.7, las = 1,
  main = "Sentiment in Sample text", xlab = "Percentage"
)

# % of emotions in the text relative to each other
colSums(prop.table(nrc_data_PR[, 1:8]))
# plot % of emotions in the text relative to each other
barplot(
  sort(colSums(prop.table(nrc_data_PR[, 1:8]))),
  horiz = TRUE, cex.names = 0.7, las = 1,
  main = "Emotions in Sample text", xlab = "Percentage"
)

# let's plot by President

# Plotting using plotly
p <- plot_ly(nrc_data_PR, x = ~president, y = ~anger, type = 'bar', name = 'Anger') %>%
  add_trace(y = ~fear, name = 'Fear') %>%
  add_trace(y = ~joy, name = 'Joy') %>%
  add_trace(y = ~trust, name = 'Trust') %>%
  layout(yaxis = list(title = 'Count'), barmode = 'group')
p

# Plotting using ggplot
str(nrc_data_PR)
nrc_data_PR2 <- nrc_data_PR[, c(1, 4, 5, 8, 12)]  # anger, fear, joy, trust + president
str(nrc_data_PR2)
df.long <- melt(nrc_data_PR2, id.vars = c("president"))
str(df.long)
ggplot(df.long, aes(president, value, fill = variable)) +
  geom_bar(position = "dodge", stat = "identity") +
  theme(axis.text.x = element_text(color = "#993333", size = 10, angle = 90)) +
  coord_flip() +
  ylab("Emotional words") +
  xlab("President")

# let's normalize the anger, fear, joy and trust values by the length of the Presidential speeches
tokens <- summary(recent_corpus)
str(tokens)

#### OR, ALTERNATIVELY (strongly recommended!)
tokens2 <- as.data.frame(ntoken(recent_corpus, remove_punct = TRUE, remove_numbers = TRUE))
str(tokens2)
colnames(tokens2)[1] <- "tokens"
str(tokens2)

nrc_data_PR$tokens <- tokens2$tokens
str(nrc_data_PR)
nrc_data_PR$anger_n <- nrc_data_PR$anger / nrc_data_PR$tokens
nrc_data_PR$fear_n <- nrc_data_PR$fear / nrc_data_PR$tokens
nrc_data_PR$joy_n <- nrc_data_PR$joy / nrc_data_PR$tokens
nrc_data_PR$trust_n <- nrc_data_PR$trust / nrc_data_PR$tokens
str(nrc_data_PR)

# Plotting using plotly
p2 <- plot_ly(nrc_data_PR, x = ~president, y = ~anger_n, type = 'bar', name = 'Anger') %>%
  add_trace(y = ~fear_n, name = 'Fear') %>%
  add_trace(y = ~joy_n, name = 'Joy') %>%
  add_trace(y = ~trust_n, name = 'Trust') %>%
  layout(yaxis = list(title = '%'), barmode = 'group')
p2
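# A hedged sketch (not part of the original script): the same dodged-bar ggplot
# used above for the raw counts, but on the length-normalised scores (the *_n
# columns created just above).
nrc_norm <- nrc_data_PR[, c("president", "anger_n", "fear_n", "joy_n", "trust_n")]
df.long2 <- melt(nrc_norm, id.vars = c("president"))
ggplot(df.long2, aes(president, value, fill = variable)) +
  geom_bar(position = "dodge", stat = "identity") +
  coord_flip() +
  ylab("Share of emotional words") +
  xlab("President")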
##########  problems with dictionaries  ##########

testText <- "This movie has good premises. Looks like it has a nice plot, an exceptional cast, first class actors and Stallone gives his best. But it sucks"
testText
testCorpus <- corpus(testText)
head(dfm(testCorpus, dictionary = data_dictionary_LSD2015))

s_v <- "this movie has good premises, looks like it has a nice plot, an exceptional cast, first class actors and Stallone gives his best, but it sucks"
syuzhet_vector <- get_sentiment(s_v, method = "syuzhet")
nrc_vector <- get_sentiment(s_v, method = "nrc")
syuzhet_vector
nrc_vector
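# A hedged sketch (not part of the original script): the full LSD2015 dictionary
# also ships neg_positive / neg_negative categories made of multi-word patterns,
# which tokens_lookup() matches as phrases. That mitigates some simple negations,
# but a discourse-level reversal such as "... but it sucks" is still invisible to
# pure word counting.
toks_test <- tokens(testCorpus, remove_punct = TRUE)
tokens_lookup(toks_test, data_dictionary_LSD2015)  # all four categories

##########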