# Remove every object, including hidden (dot-prefixed) ones, from the
# current environment before the script runs.
rm(list = ls(all.names = TRUE))
# Machine-specific working directory; getwd() echoes it for confirmation.
setwd("C:/Users/luigi/Dropbox/TOPIC MODEL")
getwd()
library(quanteda)
library(readtext)
library(ggplot2)
library(syuzhet)
library(reshape2)
library(gridExtra)

###############################
###############################
# Convert an external lexicon (syuzhet) into a quanteda dictionary.
# The lexicon is a data frame of words and numeric scores; its second column
# is renamed to "sentiment" and recoded into category labels before being
# handed to as.dictionary().
x <- get_sentiment_dictionary(dictionary = "syuzhet", language = "english")
str(x)
names(x)[2] <- "sentiment"
str(x)

# Recode the numeric scores into categories: below zero -> "negative",
# above zero -> "positive", exactly zero -> "neutral".
x[["sentiment"]] <- ifelse(
  x[["sentiment"]] < 0, "negative",
  ifelse(x[["sentiment"]] > 0, "positive", "neutral")
)
table(x$sentiment)
# Turn the recoded data frame into a quanteda dictionary with as.dictionary()
# and check the result.
dict <- as.dictionary(x)
str(dict)
is.dictionary(dict)

# Inaugural speeches delivered after 1991, tokenised with punctuation,
# numbers, symbols and separators removed and hyphenated words split.
recentCorpus <- corpus_subset(data_corpus_inaugural, Year > 1991)
tok2 <- tokens(
  recentCorpus,
  remove_punct = TRUE,
  remove_numbers = TRUE,
  remove_symbols = TRUE,
  split_hyphens = TRUE,
  remove_separators = TRUE
)
# Drop English stopwords, then build the document-feature matrix.
tok2 <- tokens_remove(tok2, stopwords("en"))
Mydfm <- dfm(tok2)

# Apply the external dictionary to the dfm of the post-1991 US presidential
# speeches and print the per-category counts.
ext_dfm <- dfm_lookup(Mydfm, dictionary = dict)
ext_dfm

#######################################
#######################################
# What if you want to keep the original weighting scores instead?
# The same procedure also lets you build your own dictionary with your own
# weights.

x <- get_sentiment_dictionary(dictionary = "syuzhet", language = "english")
str(x)
# Named numeric vector: each score is labelled with the word it belongs to.
weights <- setNames(x$value, x$word)
weights

# Worked example: two short texts scored with the lexicon weights.
testText <- c(
  "vision yawn yes yes pippo teen",
  "Husband husbands youthful zombies zombies teenager"
)
testText
testCorpus <- corpus(testText)
tok3 <- tokens(testCorpus)
myDfm <- dfm(tok3)
myDfm
# Keep only the features that match a word in the lexicon.
dfm_anew <- dfm_select(myDfm, pattern = x$word)
dfm_anew

# Multiply each feature's count by its lexicon weight. No `scheme` is passed:
# quanteda documents `weights` as applying when `scheme` is unused, so also
# supplying scheme = "count" was redundant.
dfm_anew_weighted <- dfm_weight(dfm_anew, weights = weights)
dfm_anew_weighted
# Scores for the two texts: the sum of the weighted counts in each row.
rowSums(dfm_anew_weighted)

# Now score the same two texts with the get_sentiment() function.
get_sentiment(testText, method = "syuzhet")
# Why do we get a different value compared to above?
# Because get_sentiment() does not take into account whether a positive or
# negative word appears once or twice: it always counts it once!

# Weights -- zombies: -0.25; youthful: 0.5; vision: 0.5; yawn: -0.25; yes: 0.8
# First text: "vision yawn yes yes pippo teen"
#   according to get_sentiment():    0.5 - 0.25 + 0.8     = 1.05
#   if you count the # of words:     0.5 - 0.25 + 0.8 * 2 = 1.85

# Second text: "Husband husbands youthful zombies zombies teenager"
#   according to get_sentiment():    0.5 - 0.25     = 0.25
#   if you count the # of words:     0.5 - 0.25 * 2 = 0