# NOTE(review): clearing the workspace and hard-coding a machine-specific
# working directory make this script non-portable. Both calls are kept because
# code outside this section may rely on the working directory -- confirm and
# remove if nothing does.
rm(list = ls(all.names = TRUE))
setwd("C:/Users/luigi/Dropbox/TOPIC MODEL")
getwd()

library(quanteda)
library(readtext)
library(ggplot2)
library(syuzhet)
library(reshape2)
library(gridExtra)

###############################
# Convert an external dictionary to a quanteda dictionary
###############################

# Load the syuzhet sentiment lexicon as a data frame.
x <- get_sentiment_dictionary(dictionary = "syuzhet", language = "english")
str(x)

# Rename the second column to "sentiment" before handing the data frame to
# as.dictionary() below.
names(x)[2] <- "sentiment"
str(x)

# Collapse the numeric scores into categorical labels: scores below zero become
# "negative", scores above zero become "positive", and exact zeros "neutral".
x[["sentiment"]] <- ifelse(
  x[["sentiment"]] < 0,
  "negative",
  ifelse(x[["sentiment"]] > 0, "positive", "neutral")
)
table(x$sentiment)

# Convert the labelled data frame to a quanteda dictionary.
dict <- as.dictionary(x)
str(dict)
is.dictionary(dict)

# Build a document-feature matrix for the US inaugural speeches after 1991.
recentCorpus <- corpus_subset(data_corpus_inaugural, Year > 1991)
tok2 <- tokens(
  recentCorpus,
  remove_punct = TRUE,
  remove_numbers = TRUE,
  remove_symbols = TRUE,
  split_hyphens = TRUE,
  remove_separators = TRUE
)
tok2 <- tokens_remove(tok2, stopwords("en"))
Mydfm <- dfm(tok2)

# Apply the external dictionary to the dfm created above: each document gets
# one count per dictionary category.
ext_dfm <- dfm_lookup(Mydfm, dictionary = dict)
ext_dfm

#######################################
#######################################
# What if you want to keep the original weighting scores?
# With this procedure you can also create your own dictionary with your own
# weights.
# Reload the syuzhet lexicon, this time keeping its numeric scores.
x <- get_sentiment_dictionary(dictionary = "syuzhet", language = "english")
str(x)

# Named numeric vector: the names are the dictionary words and the values are
# their sentiment scores.
weights <- x$value
names(weights) <- x$word
weights

# Let's see an example with two short texts.
testText <- c(
  "vision yawn yes yes pippo teen",
  "Husband husbands youthful zombies zombies teenager"
)
testText

testCorpus <- corpus(testText)
tok3 <- tokens(testCorpus)
myDfm <- dfm(tok3)
myDfm

# Keep only the features that appear in the dictionary.
dfm_anew <- dfm_select(myDfm, pattern = x$word)
dfm_anew

# Multiply each feature count by its dictionary score. `scheme` is deliberately
# not passed: it is not needed when a named numeric `weights` vector is given.
dfm_anew_weighted <- dfm_weight(dfm_anew, weights = weights)
dfm_anew_weighted

# Scores for the two texts (sum of weighted counts per document).
rowSums(dfm_anew_weighted)

# Now apply the same dictionary to the two texts via get_sentiment().
get_sentiment(testText, method = "syuzhet")

# Why do we get a different value compared to the one above?
# Because get_sentiment() does not take into account how many times a positive
# or negative word appears in a text: each matched word is counted only once.
#
# Weights: zombies = -0.25; youthful = 0.5; vision = 0.5; yawn = -0.25;
# yes = 0.8
#
# First text: "vision yawn yes yes pippo teen"
#   according to get_sentiment():      0.5 - 0.25 + 0.8     = 1.05
#   if you count every occurrence:     0.5 - 0.25 + 0.8 * 2 = 1.85
#
# Second text: "Husband husbands youthful zombies zombies teenager"
#   according to get_sentiment():      0.5 - 0.25     = 0.25
#   if you count every occurrence:     0.5 - 0.25 * 2 = 0