rm(list=ls(all=TRUE))
setwd("C:/Users/luigi/Dropbox/TOPIC MODEL")
getwd()
library(quanteda)
library(readtext)
library(ggplot2)
library(syuzhet)
library(reshape2)
library(gridExtra)

###############################
# dictionary that you can call from Quanteda
###############################

# using the stemming procedure can be a problem in some cases (according to the dictionary employed - if it does or does not
# contain stemming/wildcards such as "*" !!!)

#################################
# 1) Create your own dictionary! 
#################################

# Let's create a dictionary with 3 entries
myDict <- dictionary(list(terror = c("terror*", "threat"),
                          economy = c("job*", "business*", "grow", "work"),
   pop= c("people", "washington")))
myDict 

# let's apply this dictionary to our usual US Presidential inaugural speeches corpus after 1991
recentCorpus <- corpus_subset(data_corpus_inaugural, Year > 1991)
tok2 <- tokens(recentCorpus, remove_punct = TRUE, remove_numbers=TRUE, remove_symbols = TRUE, split_hyphens = TRUE, remove_separators = TRUE)
tok2 <- tokens_remove(tok2, stopwords("en"))
tok2 <- tokens_wordstem (tok2)
Mydfm <- dfm(tok2)

# the function to apply a dictionary to a DfM is dfm_lookup
byPresMat <- dfm_lookup(Mydfm  , dictionary = myDict)
byPresMat
Dictionary <-convert(byPresMat, to="data.frame")
head(Dictionary )

# you can also applay a dictionary to a tokenized object via the token_lookup option
byPresMatToks <- tokens_lookup(tok2, myDict )
print(byPresMatToks)

# the advantage of tokens_lookup is that allows you to include in a dictionary multi-word expressions
# for example:
head(kwic(tokens(data_corpus_inaugural), phrase(c ("United States", "New York"))))

# let's define a dictionary with 2 multi-word expressions
multiword_dict <- dictionary(list(country = "United States", 
                                  city = "New York"))
multiword_dict 

toks <- tokens(data_corpus_inaugural)
Mydfm <- dfm(toks)
byPresMat <- dfm_lookup(Mydfm  , dictionary = multiword_dict)
byPresMat
Dictionary <-convert(byPresMat, to="data.frame")
# all 0s!
head(Dictionary )

# however with tokens_lookup...
head(tokens_lookup(toks, dictionary = multiword_dict))

# so how to deal with multi-word expressions in a DfM? You could first use tokens_compound() to join elements of multi-word expressions
# by underscore, so they become “United_States” and “New_York”.

comp_toks <- tokens_compound(toks, pattern =phrase(c ("United States", "New York")))
head(tokens_select(comp_toks, pattern= c("United_States", "New_York")))

# and now let's define a new dictionary and let's create a new dfm
multiword_dict2 <- dictionary(list(country = "United_States", 
                                  city = "New_York"))
multiword_dict2 
Mydfm <- dfm(comp_toks)
byPresMat <- dfm_lookup(Mydfm  , dictionary = multiword_dict2)
byPresMat
Dictionary <-convert(byPresMat, to="data.frame")
head(Dictionary)
# same result as above via tokens_lookup
head(tokens_lookup(toks, dictionary = multiword_dict))

tail(Dictionary)
tail(tokens_lookup(toks, dictionary = multiword_dict))

#################################
# 2) Using some existing dictionary!
# For example: Import the Laver-Garry dictionary from Provalis Research (see: https://www.jstor.org/stable/2669268)
#################################

dictfile <- tempfile()
download.file("https://provalisresearch.com/Download/LaverGarry.zip", dictfile, mode = "wb")
unzip(dictfile, exdir = (td <- tempdir()))
lgdict <- dictionary(file = paste(td, "LaverGarry.cat", sep = "/"))
# here we are stemmed words/wildcards...
lgdict
# is it a dictionary that can be read by Quanteda? YES!
is.dictionary(lgdict)

recentCorpus <- corpus_subset(data_corpus_inaugural, Year > 1991)
tok2 <- tokens(recentCorpus, remove_punct = TRUE, remove_numbers=TRUE, remove_symbols = TRUE, split_hyphens = TRUE, remove_separators = TRUE)
tok2 <- tokens_remove(tok2, stopwords("en"))
tok2 <- tokens_wordstem (tok2)
Mydfm <- dfm(tok2)

# let's focus on those US Presidential Speeches after 1991 (whose dfm we created above) and let's apply the dictionary to our corpus
lg_dfm <- dfm_lookup(Mydfm  , dictionary = lgdict)
lg_dfm

# Let's focus on the categories "More State" and "Less State"
Dictionary <-convert(lg_dfm, to="data.frame")
str(Dictionary )

colnames(Dictionary )
names(Dictionary )[6] <- "More_State"
names(Dictionary )[8] <- "Less_State"
colnames(Dictionary )

# and let's identify the more "pro-state" Presidents (according to his inaugural speech)
Dictionary$pro_state <- Dictionary$More_State/(Dictionary$Less_State+Dictionary$More_State)

ggplot(data=Dictionary , aes(x=doc_id, y=pro_state)) +
  geom_bar(stat="identity") + coord_flip()

#################################
# 3) sentiment dictionaries
#################################

# Quanteda has integrated a sentiment dictionary constructed by Young & Soroka (2012) stored in data_dictionary_LSD2015
# called Lexicoder. The dictionary contains thousands of positive and negative words or word stems.

lengths(data_dictionary_LSD2015)
head(data_dictionary_LSD2015)
head(data_dictionary_LSD2015[1:2])

sentiment <- dfm_lookup(Mydfm  , dictionary = data_dictionary_LSD2015[1:2])
sentiment 
Dictionary <-convert(sentiment , to="data.frame")
str(Dictionary )
Dictionary$Sentiment <- Dictionary$posit/(Dictionary$negat+Dictionary$posit)
str(Dictionary )

ggplot(data=Dictionary , aes(x=doc_id, y=Sentiment)) +
  geom_bar(stat="identity") + coord_flip()

# are Republican presidents more "positive" than Democratic ones?

###############################
# using other packages for dictionary analysis not included in Quanteda: using the syuzhet package
###############################

# You can get access to different dictionaries
# the dictionary "syuzhet" for example: here every word in the dictionary has a different (positive or negative) weight 
# (contrary to previous dictionaries)
# Note: no wildcards are included!
get_sentiment_dictionary(dictionary = "syuzhet", language = "english")

# is it a dictionary that can be read by Quanteda? NO!
is.dictionary(get_sentiment_dictionary(dictionary = "syuzhet", language = "english"))

# So we cannot apply the syuzhet dictionary directly via dfm_lookup!
# To apply such dictionary we need first to extract the texts from our corpus:
recent_corpus <- corpus_subset(data_corpus_inaugural, Year > 1991)
ndoc(recent_corpus)
# let's extract the texts from our corpus
text_corpus <- as.character(recent_corpus)[1:ndoc(recent_corpus)]
str(text_corpus)
syuzhet_vector <- get_sentiment(text_corpus, method="syuzhet") 
# the sentiment index you get is the difference between all the positive scores minus all the negative scores according to the dictionary
syuzhet_vector
president <- docnames(recent_corpus)
results1 <- as.data.frame(cbind(president , syuzhet_vector))
results1

# Remember an important point: get_sentiment does not count if a positive or negative words appears once or twice.
# It counts it always once! And if we want to count it more than once? 
# You convert it as a Quanteda dictionary and then use dfm_lookup (see the EXTRA script on the home-page of the course)

# Similarly: what if we want to apply such dictionary to an already "cleaned" tokenized object via Quanteda?
# Two possibilities: a) once again, you convert it as a Quanteda dictionary (see the EXTRA script on the home-page of the course); 
# b) you tokenizes your corpus and then convert it back to strings and apply to them the dictionary (but remember the issue of wilcards!)
# Let's follow the latter way

recent_corpus <- corpus_subset(data_corpus_inaugural, Year > 1991)
tok2 <- tokens(recentCorpus, remove_punct = TRUE, remove_numbers=TRUE, remove_symbols = TRUE, split_hyphens = TRUE, remove_separators = TRUE)
tok2 <- tokens_remove(tok2, stopwords("en"))
xxx <- sapply(tok2, function(x) paste(as.character(x), collapse = " "))
str(xxx)
syuzhet_vector2 <- get_sentiment(xxx, method="syuzhet") 
syuzhet_vector2
# some slight differences compared to above
syuzhet_vector

# and what if you had apply trimming to a dfm? how to keep only the tokens from the dfm trimmed new object and apply to it the
# dictionary?

Mydfm <- dfm(tok2)
# 2527 tokens
nfeat(Mydfm)
# let's trim the dfm! We keep only the top 50% of the most frequent features (min_termfreq = 0.50) that appear less than 20% in all 
# the documents (max_docfreq = 0.2) using dfm_trim() to focus on common BUT distinguishing features.

new_dfm <-   dfm_trim(Mydfm, min_termfreq = 0.50, termfreq_type = "quantile", 
                     max_docfreq = 0.2, docfreq_type = "prop")
# fewer features compared to above
nfeat(new_dfm)
# let's extract the feature names present in our trimmed dfm
feats <- featnames(new_dfm) 
length(feats )
# let's keep in our tokenized objects only the features also included in our trimmed dfm
tok3 <- tokens_select(tok2, feats) 
yyy <- sapply(tok3, function(x) paste(as.character(x), collapse = " "))
syuzhet_vector3 <- get_sentiment(yyy, method="syuzhet") 
syuzhet_vector3

# syuzhet is a very nice package! To get some more info about it, take a look at this vignette:
# https://cran.r-project.org/web/packages/syuzhet/vignettes/syuzhet-vignette.html

# We can also employ the dictionary "nrc"
x <- get_sentiment_dictionary(dictionary = 'nrc', language = "english")
str(x)

# Let's apply the nrc dictionary!
nrc_vector <- get_sentiment(text_corpus , method="nrc")
nrc_vector
results2 <- as.data.frame(cbind(president , syuzhet_vector, nrc_vector))
results2

# let's correlate the results we got from the 2 dictionaries
str(results2 )
results2$syuzhet_vector <- as.numeric(results2$syuzhet_vector)
results2$nrc_vector <- as.numeric(results2$nrc_vector)
str(results2 )

# positive correlation, but not a perfect one

cor(results2$syuzhet_vector ,results2$nrc_vector)

ggplot(results2, aes(x=syuzhet_vector, y=nrc_vector)) +
  geom_point()  + 
  geom_text(label=president,  vjust = -0.5) +
  geom_smooth(method=lm)

# So what to do? Which of the two dictionaries to keep for your analysis? First: you should validate both dictionaries!
# Plus you can also compare the internal-coherence of the two dictionaries (GIVEN your corpus)
# via a split-half reliability test and keeping the dictionary with a better result.
# To understand how to do that, take a look at the EXTRA script in the home-page of the course

# the nrc dictionary has also more "categories" beyond negative and positive (detecting other "emotions")
get_sentiment_dictionary(dictionary = 'nrc', language = "english")

# moreover the dictionary "nrc" cover several different languages...
get_sentiment_dictionary(dictionary = 'nrc', language = "spanish")
get_sentiment_dictionary(dictionary = 'nrc', language = "italian")
get_sentiment_dictionary(dictionary = 'nrc', language = "arabic")
get_sentiment_dictionary(dictionary = 'nrc', language = "japanese")
get_sentiment_dictionary(dictionary = 'nrc', language = "turkish")

# let's apply such new extended dictionary
nrc_data_PR <- get_nrc_sentiment(text_corpus, language = "english")
nrc_data_PR$president <- docnames(recent_corpus)
str(nrc_data_PR)

# % of emotions in the text relative to each other
colSums(prop.table(nrc_data_PR[, 1:8]))

# plot % of emotions in the text relative to each other
barplot(
  sort(colSums(prop.table(nrc_data_PR[, 1:8]))), 
  horiz = TRUE, 
  cex.names = 0.7, 
  las = 1, 
  main = "Emotions in Sample text", xlab="Percentage"
  )

# plotting via ggplot
data <- nrc_data_PR[,c(1:8,11)]
data 
library(reshape2)
df.long<-melt(data,id.vars=c("president"))
head(df.long)

ggplot(data=df.long, aes(x=variable, y=value)) +
  geom_bar(stat="identity", fill="blue")  + coord_flip() + ylab(label="Emotions") +  xlab("President") 

# let's reorder the bars, etiher in ascending (+) or descending (-) order
ggplot(data=df.long, aes(x=reorder(variable, +value), y=value)) +
  geom_bar(stat="identity", fill="blue")  + coord_flip()  + ylab(label="Emotions") +  xlab("President") 

ggplot(data=df.long, aes(x=reorder(variable, -value), y=value)) +
  geom_bar(stat="identity", fill="blue")  + coord_flip()  + ylab(label="Emotions") +  xlab("President") 

# Let's focus on anger, disgust, fear and joy scores in each Presidential's speech
str(nrc_data_PR)
nrc_data_PR2 <- nrc_data_PR[,c(1,3,4,5,11)]
str(nrc_data_PR2)
df.long<-melt(nrc_data_PR2,id.vars=c("president"))
str(df.long)

ggplot(df.long,aes(president,value,fill=variable))+
 geom_bar(position="dodge",stat="identity") + theme(axis.text.x = element_text(color="#993333", size=10, angle=90)) + coord_flip() +  
ylab(label="Emotional words") +  xlab("President") 

# (assuming that the dictionary is doing a good job) can we generalize from the previous plot that Obama was the President using more fear words? 
# What is the important information we are not considering here?
# The fact that the Presidential speeches have a very different length!

nrc_data_PR2 $tokens <- ntoken(recent_corpus)
str(nrc_data_PR2)
# so let's normalize our results
nrc_data_PR3 <- nrc_data_PR2[1:4]/nrc_data_PR2$tokens
str(nrc_data_PR3)
nrc_data_PR3$president <- nrc_data_PR2$president
df.long2<-melt(nrc_data_PR3,id.vars=c("president"))
str(df.long2)

ggplot(df.long2,aes(president,value,fill=variable))+
 geom_bar(position="dodge",stat="identity") + theme(axis.text.x = element_text(color="#993333", size=10, angle=90)) + coord_flip() +  
ylab(label="Emotional words") +  xlab("President") 

unweighted <- ggplot(df.long,aes(president,value,fill=variable))+
 geom_bar(position="dodge",stat="identity") + theme(axis.text.x = element_text(color="#993333", size=10, angle=90)) + coord_flip() +  
ylab(label="Emotional words (unweighted)") +  xlab("President") 

weighted <- ggplot(df.long2,aes(president,value,fill=variable))+
 geom_bar(position="dodge",stat="identity") + theme(axis.text.x = element_text(color="#993333", size=10, angle=90)) + coord_flip() +  
ylab(label="Emotional words (weighted)") +  xlab("President") 

library(cowplot)
plot_grid(unweighted , weighted , labels = c("Unweighted", "Weighted" ))

########## problems with dictionaries (as discussed in our Lecture)

testText <- "This movie has good premises. Looks like it has a nice plot, and exceptional cast, 
first class actors and Stallone gives his best. But it sucks"
testText 
testCorpus <- corpus(testText)
tok2 <-tokens(testCorpus )
head(dfm_lookup(dfm(tok2), dictionary = data_dictionary_LSD2015 [1:2]))

s_v <- "this movie has good premises, looks like it has a nice plot, an exceptional cast, first class actors and Stallone gives his best, but it sucks"
syuzhet_vector <- get_sentiment(s_v, method="syuzhet")
nrc_vector <- get_sentiment(s_v, method="nrc")
syuzhet_vector
nrc_vector

##########