rm(list=ls(all=TRUE))
setwd("C:/Users/luigi/Dropbox/TOPIC MODEL")
getwd()
library(quanteda)
library(readtext)
library(ggplot2)
library(syuzhet)
library(reshape2)
library(gridExtra)
library(rtweet)
library(dplyr)
library(cowplot)
library(plyr)
library(syuzhet)

###############################
# dictionary that you can call from Quanteda
###############################

# using the stemming procedure can be a problem in some cases (according to the dictionary employed - if it does or does not
# contain stemming or wildcards such as "*")

#################################
# 1) Create your own dictionary! 
#################################

# Let's create a dictionary with 3 entries
myDict <- dictionary(list(terror = c("terrorism", "terrorists", "threat"),
                          economy = c("jobs", "business", "grow", "work"),
   pop= c("people", "washington")))
myDict 

# let's apply this dictionary to our usual US Presidential inaugural speeches corpus

recentCorpus <- corpus_subset(data_corpus_inaugural, Year > 1991)
tok2 <- tokens(recentCorpus, remove_punct = TRUE, remove_numbers=TRUE, remove_symbols = TRUE, split_hyphens = TRUE, remove_separators = TRUE)
tok2 <- tokens_remove(tok2, stopwords("en"))
# note that we haven't stemmed the texts!
Mydfm <- dfm(tok2)

# you can use a dictionary on any given DfM via the function "dfm_lookup"
byPresMat <- dfm_lookup(Mydfm  , dictionary = myDict)
byPresMat

#################################
# 2) Using some existing dictionary!
# For example: Import the Laver-Garry dictionary from Provalis Research (see: https://www.jstor.org/stable/2669268)
#################################

dictfile <- tempfile()
download.file("https://provalisresearch.com/Download/LaverGarry.zip", dictfile, mode = "wb")
unzip(dictfile, exdir = (td <- tempdir()))
lgdict <- dictionary(file = paste(td, "LaverGarry.cat", sep = "/"))
# here we are stemmed words/wildcards...
lgdict

recentCorpus <- corpus_subset(data_corpus_inaugural, Year > 1991)
tok2 <- tokens(recentCorpus, remove_punct = TRUE, remove_numbers=TRUE, remove_symbols = TRUE, split_hyphens = TRUE, remove_separators = TRUE)
tok2 <- tokens_remove(tok2, stopwords("en"))
tok2 <- tokens_wordstem (tok2)
Mydfm <- dfm(tok2)

# let's focus on those US Presidential Speeches after 1991 (whose dfm we created above) and let's apply the dictionary to our corpus
lg_dfm <- dfm_lookup(Mydfm  , dictionary = lgdict)
lg_dfm

# Let's focus on the categories "More State" and "Less State"
Dictionary <-convert(lg_dfm, to="data.frame")
str(Dictionary )

colnames(Dictionary )
names(Dictionary )[6] <- "More_State"
names(Dictionary )[8] <- "Less_State"
colnames(Dictionary )

# and let's identify the more "pro-state" Presidents (according to his inaugural speech)
Dictionary$pro_state <- Dictionary$More_State/(Dictionary$Less_State+Dictionary$More_State)

ggplot(data=Dictionary , aes(x=doc_id, y=pro_state)) +
  geom_bar(stat="identity") + coord_flip()

#################################
# 3) sentiment dictionaries
#################################

# Quanteda has integrated a sentiment dictionary constructed by Young & Soroka (2012) stored in data_dictionary_LSD2015
# called Lexicoder. The dictionary contains thousands of positive and negative words or word stems.

lengths(data_dictionary_LSD2015)
head(data_dictionary_LSD2015)
head(data_dictionary_LSD2015[1:2])

sentiment <- dfm_lookup(Mydfm  , dictionary = data_dictionary_LSD2015[1:2])
sentiment 
Dictionary <-convert(sentiment , to="data.frame")
str(Dictionary )
Dictionary$Sentiment <- Dictionary$posit/(Dictionary$negat+Dictionary$posit)
str(Dictionary )

ggplot(data=Dictionary , aes(x=doc_id, y=Sentiment)) +
  geom_bar(stat="identity") + coord_flip()

# are Republican Presidents more "positive" than Democratic ones?

###############################
# using other packages for dictionary analysis not included in Quanteda: using the syuzhet package
###############################

# You can get access to different dictionaries on internet!
# For example the dictionary "syuzhet": in this dictionary, every word has a different (positive or negative) weight 
# (contrary to previous dictionaries).
# Only English language is covered!!! And no wildcards!
get_sentiment_dictionary(dictionary = "syuzhet", language = "english")

# Let's apply the syuzhet dictionary!
# To apply such dictionary we need first to extract the texts from our corpus
recent_corpus <- corpus_subset(data_corpus_inaugural, Year > 1991)
ndoc(recent_corpus)
# let's extract the texts from the corpus
text_corpus <- as.character(recent_corpus)[1:ndoc(recent_corpus)]
str(text_corpus)
syuzhet_vector <- get_sentiment(text_corpus, method="syuzhet") 
syuzhet_vector

# let's create a nice data frame with these results for later use
president <- docnames(recent_corpus)
results1 <- as.data.frame(cbind(president , syuzhet_vector))
results1

# We can also employ the dictionary "nrc"
x <- get_sentiment_dictionary(dictionary = 'nrc', language = "english")
str(x)

# Let's apply now the nrc dictionary!
nrc_vector <- get_sentiment(text_corpus , method="nrc")
nrc_vector

# let's create a data frame with the results we got from the 2 dictionaries
results2 <- as.data.frame(cbind(president , syuzhet_vector, nrc_vector))
results2

# let's correlate the results we got from the 2 dictionaries
str(results2 )
results2$syuzhet_vector <- as.numeric(results2$syuzhet_vector)
results2$nrc_vector <- as.numeric(results2$nrc_vector)
str(results2 )
cor(results2$syuzhet_vector ,results2$nrc_vector)

plot(results2$syuzhet_vector, results2$nrc_vector, main="Scatterplot of Sentiment", 
  	xlab="syuzhet ", ylab="nrc ", pch=19)
text(results2$syuzhet_vector, results2$nrc_vector, labels = results2$president, pos = 4,  col = "royalblue" , cex = 0.8)
# Add fit lines
abline(lm(results2$nrc_vector~results2$syuzhet_vector), col="red") # regression line (y~x) 

# the nrc dictionary has also more "categories" beyond negative and positive (detecting other "emotions")
get_sentiment_dictionary(dictionary = 'nrc', language = "english")

# moreover the dictionary "nrc" cover several different languages...
get_sentiment_dictionary(dictionary = 'nrc', language = "spanish")
get_sentiment_dictionary(dictionary = 'nrc', language = "italian")
get_sentiment_dictionary(dictionary = 'nrc', language = "turkish")
get_sentiment_dictionary(dictionary = 'nrc', language = "arabic")

Sys.setlocale("LC_CTYPE","japanese")
get_sentiment_dictionary(dictionary = 'nrc', language = "japanese")

# let's apply such new extended dictionary
nrc_data_PR <- get_nrc_sentiment(text_corpus, language = "english")
nrc_data_PR$president <- docnames(recent_corpus)
str(nrc_data_PR)

# % of emotions in the text relative to each other
colSums(prop.table(nrc_data_PR[, 1:8]))

# plot % of emotions in the text relative to each other
barplot(
  sort(colSums(prop.table(nrc_data_PR[, 1:8]))), 
  horiz = TRUE, 
  cex.names = 0.7, 
  las = 1, 
  main = "Emotions in Sample text", xlab="Percentage"
  )

# Plotting using ggplot focusing on anger, disgust, fear and joy 
str(nrc_data_PR)
nrc_data_PR2 <- nrc_data_PR[,c(1,3,4,5,11)]
str(nrc_data_PR2)
df.long<-melt(nrc_data_PR2,id.vars=c("president"))
str(df.long)

ggplot(df.long,aes(president,value,fill=variable))+
 geom_bar(position="dodge",stat="identity") + theme(axis.text.x = element_text(color="#993333", size=10, angle=90)) + coord_flip() +  
ylab(label="Emotional words") +  xlab("President") 

# (assuming that the dictionary is doing a good job) can we generalize from here that the Obama was the President using more fear words? 
# What is the important information we are not considering here? Of course, the relative length of each Presidential speech!
# So let's check for that!

# let's extract the number of words included in each Presidendial speech
nrc_data_PR2 $tokens <- ntoken(recent_corpus)
str(nrc_data_PR2)
# let's weight the previous results according to such number of words
nrc_data_PR3 <- nrc_data_PR2[1:4]/nrc_data_PR2$tokens
str(nrc_data_PR3)
nrc_data_PR3$president <- nrc_data_PR2$president
df.long2<-melt(nrc_data_PR3,id.vars=c("president"))
str(df.long2)

ggplot(df.long2,aes(president,value,fill=variable))+
 geom_bar(position="dodge",stat="identity") + theme(axis.text.x = element_text(color="#993333", size=10, angle=90)) + coord_flip() +  
ylab(label="Emotional words") +  xlab("President") 

unweighted <- ggplot(df.long,aes(president,value,fill=variable))+
 geom_bar(position="dodge",stat="identity") + theme(axis.text.x = element_text(color="#993333", size=10, angle=90)) + coord_flip() +  
ylab(label="Emotional words (unweighted)") +  xlab("President") 

weighted <- ggplot(df.long2,aes(president,value,fill=variable))+
 geom_bar(position="dodge",stat="identity") + theme(axis.text.x = element_text(color="#993333", size=10, angle=90)) + coord_flip() +  
ylab(label="Emotional words (weighted)") +  xlab("President") 

library(cowplot)
plot_grid(unweighted , weighted , labels = c("Unweighted", "Weighted" ))

#######################################
########## problems with dictionaries (as discussed in our Lecture)
#######################################

testText <- "This movie has good premises. Looks like it has a nice plot, and exceptional cast, 
first class actors and Stallone gives his best. But it sucks"
testText 
testCorpus <- corpus(testText)
tok2 <-tokens(testCorpus )
head(dfm_lookup(dfm(tok2), dictionary = data_dictionary_LSD2015 [1:2]))

s_v <- "this movie has good premises, looks like it has a nice plot, an exceptional cast, first class actors and Stallone gives his best, but it sucks"
syuzhet_vector <- get_sentiment(s_v, method="syuzhet")
nrc_vector <- get_sentiment(s_v, method="nrc")
syuzhet_vector
nrc_vector

###############################
# An example with a twitter query
##############################

rt <- search_tweets("Salvini", n = 1000, include_rts = FALSE,  lang = "en")
print(rt$lang[1:20])
length(rt$lang)
# I want to convert a date
str(rt$created_at)
rt$date <- as.Date(rt$created_at)
str(rt$date)
table(rt$date)

myCorpusTwitter<- corpus(rt)
tok2 <- tokens(myCorpusTwitter, remove_punct = TRUE, remove_numbers=TRUE, remove_symbols = TRUE, split_hyphens = TRUE, 
remove_separators = TRUE, remove_url=TRUE)
tok2 <- tokens_remove(tok2, stopwords("en"))
tok2 <- tokens_wordstem (tok2)
Salvini_dfm <- dfm(tok2)
topfeatures(Salvini_dfm , 20)  # 20 top words

# Let's apply the nrc dictionaries 

nrc_data <- get_nrc_sentiment(rt$text, language="english")
str(nrc_data)

# I want to read the tweets that includes a number of anger words>2
rt$text[nrc_data$anger > 2]

# let's plot the wordcloud of emotions 

all = c(
paste(rt$text[nrc_data$anger > 0], collapse=" "),
paste(rt$text[nrc_data$anticipation > 0], collapse=" "),
paste(rt$text[nrc_data$disgust > 0], collapse=" "),
paste(rt$text[nrc_data$fear > 0], collapse=" "),
paste(rt$text[nrc_data$joy > 0], collapse=" "),
paste(rt$text[nrc_data$sadness > 0], collapse=" "),
paste(rt$text[nrc_data$surprise > 0], collapse=" "),
paste(rt$text[nrc_data$trust > 0], collapse=" ")
)

str(all)

recentCorpus <- corpus(all)
summary(recentCorpus )
colnames(nrc_data)[1:8]
docnames(recentCorpus ) <- colnames(nrc_data)[1:8]
summary(recentCorpus )
tok2 <- tokens(recentCorpus, remove_punct = TRUE, remove_numbers=TRUE, remove_symbols = TRUE, split_hyphens = TRUE, remove_separators = TRUE)
tok2 <- tokens_remove(tok2, stopwords("en"))
Mydfm <- dfm(tok2)
Mydfm

library(quanteda.textplots)
set.seed(123)
textplot_wordcloud(dfm_trim(Mydfm, min_termfreq = 1, verbose = FALSE), comparison = TRUE, 
colors = c("#00B2FF", "red", "#FF0099", "#6600CC", "green", "orange", "blue", "brown"),
labelsize=1.5)

###################################
# Let's estimate the daily average emotions with respect to Salvini and let's plot it
###################################

str(nrc_data)
nrc_data$date <- rt$date
str(nrc_data)

daily_emotions <- aggregate( nrc_data[,1:8], by=list(day=nrc_data$date), FUN=mean) 
str(daily_emotions)
head(daily_emotions)

library(psych)
corr.test(daily_emotions [c(2:8)])

library(PerformanceAnalytics)
chart.Correlation(daily_emotions[c(2:8)])

# Let's plot it
library(reshape2)
df.long<-melt(daily_emotions,id.vars=c("day"))
str(df.long)
head(df.long)

ggplot(df.long, aes(x =day, y = value, color = variable)) +
geom_point() +
geom_smooth(method = "loess") +
facet_wrap(~ variable, scale = "free_y", nrow = 2) +
theme_bw() + labs( title = "Emotional analysis of Twitter statuses over time",
         subtitle = "Tweets aggregated by day on topics of Salvini") +  
theme(text = element_text(family = "Roboto Condensed"),
          plot.title = element_text(face = "bold"),
          legend.position = "bottom",
          axis.text = element_text(size = 9),
          legend.title = element_blank()) 

###############################
# MISC - for you to look at
###############################
# how to convert an external dictionary to a Quanteda dictionary?

x <- get_sentiment_dictionary(dictionary = "syuzhet", language = "english")
str(x)
names(x)[2] <- "sentiment"
str(x)
table(x$sentiment)

# let's recode the values to negative and positive
x$sentiment <- ifelse(x$sentiment< 0, "negative", ifelse(x$sentiment> 0, "positive", "netural"))
table(x$sentiment)

str(x)
# now let's convert it to a Quanteda dictionary using the function as.dictionary
dict <- as.dictionary(x)
str(dict)
is.dictionary(dict)

recentCorpus <- corpus_subset(data_corpus_inaugural, Year > 1991)
tok2 <- tokens(recentCorpus, remove_punct = TRUE, remove_numbers=TRUE, remove_symbols = TRUE, split_hyphens = TRUE, remove_separators = TRUE)
tok2 <- tokens_remove(tok2, stopwords("en"))
Mydfm <- dfm(tok2)
# let's focus on those US Presidential Speeches after 1991 (whose dfm we created above) and let's apply the external dictionary to our corpus
ext_dfm <- dfm_lookup(Mydfm  , dictionary = dict)
ext_dfm

# And what if you want to keep the original weighting scores?
# Note that by adopting the procedure below, you can also apply to your own dictionary a given set of weights as you prefer!

x <- get_sentiment_dictionary(dictionary = "syuzhet", language = "english")
str(x)
weights <- x$value
names(weights) <- x$word
weights

# let's extract the sentiment from these 3 random texts
testText <- c("vision yawn yes yes pippo teen", "Husband husbands youthful zombies zombies teenager")
testText 
testCorpus <- corpus(testText)
tok3 <- tokens(testCorpus)
myDfm <- dfm(tok3)
myDfm
# let's select only those features in the dfm also presented in the vocabulary above
dfm_anew <- dfm_select(myDfm , pattern = x$word)
dfm_anew

# Now let's weight the reamining words in the dfm according to the weights vector.
# We know that vision=0.5 (weight); yawn=-0.25; yes=0.8 (and it appears twice in text 1)
# as a result final score for text 1: 0.5 -0.25 + 0.8*2 = 1.85
# And what about text 2? youthful=0.5 (weight); zombies=-0.25 (and it appears twice in text 2)
# as a result final score for text 2: 0.5 -0.25*2 = 0
dfm_anew_weighted <- dfm_weight(dfm_anew, weights = weights, scheme = "count") 
dfm_anew_weighted
# total scores
rowSums(dfm_anew_weighted)

# dfm_anew <- dfm_weight(dfm_anew, scheme = "prop") # remove if you don't want normalized scores

# Note that by applying the original dictionary "syuzhet" you get different values than above! Why? 
# Cause "syuzhet" does not count if a positive or negative words appears once or twice. It counts any appearance of
# a given word in a text >1 always as 1!
get_sentiment(testText, method="syuzhet")
# And indeed:
# 0.5 -0.25 + 0.8=1.05
# 0.5 -0.25=0.25