rm(list = ls(all = TRUE))
getwd()
setwd("C:/Users/mw/Dropbox (VOICES)/TOPIC MODEL")
getwd()

library(quanteda)
library(readtext)
library(gridExtra)
library(ggplot2)
library(syuzhet)
library(plotly)
library(reshape2)

###############################
# dictionaries that you can call from Quanteda
###############################

# Stemming can be a problem in some cases (depending on the dictionary employed),
# so we avoid that option in the following examples.

#################################
# general dictionaries
#################################

#################################
# 1) import the Laver-Garry dictionary from Provalis Research (see: https://www.jstor.org/stable/2669268)
#################################

dictfile <- tempfile()
download.file("https://provalisresearch.com/Download/LaverGarry.zip", dictfile, mode = "wb")
unzip(dictfile, exdir = (td <- tempdir()))
lgdict <- dictionary(file = paste(td, "LaverGarry.cat", sep = "/"))
str(lgdict)
lgdict

recent_corpus <- corpus_subset(data_corpus_inaugural, Year > 1991)
summary(recent_corpus)

lg_dfm <- dfm(recent_corpus, dictionary = lgdict)
lg_dfm
str(lg_dfm)
lg_dfm@Dimnames

# Let's focus on the categories "More State" and "Less State"
Dictionary <- convert(lg_dfm, to = "data.frame")
str(Dictionary)
colnames(Dictionary)
names(Dictionary)[6] <- "More_State"   # column 6: economy, more state intervention
names(Dictionary)[8] <- "Less_State"   # column 8: economy, less state intervention
colnames(Dictionary)

p <- ggplot(data = Dictionary, aes(x = document, y = More_State)) +
  geom_bar(stat = "identity")
p2 <- ggplot(data = Dictionary, aes(x = document, y = Less_State)) +
  geom_bar(stat = "identity")
grid.arrange(p, p2, ncol = 2)

### Plotting both categories in a single graph
str(Dictionary)
Dictionary2 <- Dictionary[, c(1, 6, 8)]
str(Dictionary2)
df.long <- melt(Dictionary2, id.vars = c("document"))
str(df.long)
ggplot(df.long, aes(document, value, fill = variable)) +
  geom_bar(position = "dodge", stat = "identity") +
  theme(axis.text.x = element_text(color = "#993333", size = 10, angle = 90)) +
  coord_flip() +
  ylab("Frequency of More/Less State words") +
  xlab("President")

# What is the relationship between "More State" and "Less State" in the economy?
cor(Dictionary$More_State, Dictionary$Less_State)

plot(Dictionary$More_State, Dictionary$Less_State,
     main = "Scatterplot Example",
     xlab = "More_State", ylab = "Less_State", pch = 19)
text(Dictionary$More_State, Dictionary$Less_State,
     labels = Dictionary$document, pos = 4, col = "royalblue", cex = 0.8)
abline(lm(Dictionary$Less_State ~ Dictionary$More_State), col = "red")  # regression line (y ~ x)

#################################
# 2) import a LIWC-formatted dictionary from http://www.moralfoundations.org
#################################

download.file("https://goo.gl/5gmwXq", tf <- tempfile())
mfdict <- dictionary(file = tf, format = "LIWC")
str(mfdict)

liwcdfm <- dfm(recent_corpus, dictionary = mfdict)
liwcdfm
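# A hedged sketch (not part of the original script): the same convert() / melt() /
# ggplot() workflow used above for the Laver-Garry categories also works for the
# moral-foundations counts; the exact category names depend on the downloaded
# dictionary file, so we keep the code generic.
mf_df <- convert(liwcdfm, to = "data.frame")
mf_long <- melt(mf_df, id.vars = c("document"))
ggplot(mf_long, aes(document, value, fill = variable)) +
  geom_bar(position = "dodge", stat = "identity") +
  coord_flip() +
  ylab("Frequency of moral-foundations words") +
  xlab("President")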
#################################
# sentiment dictionaries
#################################

# Quanteda ships a sentiment dictionary constructed by Young & Soroka (2012),
# stored in data_dictionary_LSD2015.
# The dictionary contains thousands of positive and negative words or word stems.
lengths(data_dictionary_LSD2015)
head(data_dictionary_LSD2015)

news_toks <- tokens(recent_corpus, remove_punct = TRUE)
lsd_toks <- tokens_lookup(news_toks, data_dictionary_LSD2015[1:2])
head(lsd_toks, 2)

sentiment <- dfm(recent_corpus, dictionary = data_dictionary_LSD2015[1:2])
sentiment
Dictionary <- convert(sentiment, to = "data.frame")
str(Dictionary)
Dictionary$Sentiment <- Dictionary$positive - Dictionary$negative
str(Dictionary)

#################################
# Applying dictionaries to Party Manifestoes!
#################################

library(manifestoR)
mp_setapikey(key.file = NULL, key = "XXXXXXXXXXXXXXXXX")  # insert your own Manifesto Project API key

# Let's focus on the US manifestoes of the 2016 elections
cmp <- mp_maindataset()
us <- cmp[which(cmp$countryname == "United States" & cmp$date == 201611), ]
print(us[c("partyname", "party", "edate", "date", "partyabbrev")])
us$partyname
us$party

available_us2016 <- mp_availability(countryname == "United States" & date == 201611 &
                                      partyname %in% c("Democratic Party", "Republican Party"))
available_us2016

US2016 <- mp_corpus(available_us2016)
quanteda_US2016 <- corpus(US2016)
summary(head(quanteda_US2016))

# Which of the two party manifestoes was more negative?
# Build the DFM, grouping by party and keeping only the negative and positive categories
dfm_us2016 <- dfm(quanteda_US2016, tolower = TRUE, remove_punct = TRUE,
                  dictionary = data_dictionary_LSD2015[1:2], groups = "party")
dfm_us2016
str(dfm_us2016)
dfm_us2016@Dimnames$docs
dfm_us2016@Dimnames$docs <- us$partyname
dfm_us2016@Dimnames$docs
dfm_us2016

# What about the role of the State in the economy?
dfm_us2016_eco <- dfm(quanteda_US2016, tolower = TRUE, remove_punct = TRUE,
                      dictionary = lgdict, groups = "party")
dfm_us2016_eco
dfm_us2016_eco@Dimnames$docs <- us$partyname
dfm_us2016_eco

###############################
# using other packages for dictionary analysis: the syuzhet package
###############################

# syuzhet gives access to several dictionaries.
# In the "syuzhet" dictionary every word has its own (positive or negative) weight,
# contrary to other dictionaries; only English is covered.
get_sentiment_dictionary(dictionary = "syuzhet", language = "english")

# Let's apply the syuzhet dictionary!
recent_corpus <- corpus_subset(data_corpus_inaugural, Year > 1991)
summary(recent_corpus)
str(recent_corpus)
texts(recent_corpus)

syuzhet_vector <- get_sentiment(texts(recent_corpus), method = "syuzhet")
syuzhet_vector
president <- docnames(recent_corpus)
results1 <- cbind(president, syuzhet_vector)
results1

# The "nrc" dictionary covers many languages...
get_sentiment_dictionary(dictionary = "nrc", language = "english")
x <- get_sentiment_dictionary(dictionary = "nrc", language = "english")
str(x)
get_sentiment_dictionary(dictionary = "nrc", language = "spanish")
get_sentiment_dictionary(dictionary = "nrc", language = "italian")
get_sentiment_dictionary(dictionary = "nrc", language = "arabic")
get_sentiment_dictionary(dictionary = "nrc", language = "japanese")
get_sentiment_dictionary(dictionary = "nrc", language = "turkish")
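# A hedged sketch (not part of the original script): the translated NRC word lists
# do not necessarily have the same coverage, so compare the number of rows returned
# for a few languages before trusting cross-language comparisons.
sapply(c("english", "spanish", "italian"), function(lang) {
  nrow(get_sentiment_dictionary(dictionary = "nrc", language = lang))
})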
# Let's apply the nrc dictionary!
nrc_vector <- get_sentiment(texts(recent_corpus), method = "nrc")
nrc_vector

results2 <- as.data.frame(cbind(president, syuzhet_vector, nrc_vector))
results2
str(results2)
# cbind() coerced the numeric vectors to factors: convert them back to numeric
results2$syuzhet_vector <- as.numeric(levels(results2$syuzhet_vector))[results2$syuzhet_vector]
results2$nrc_vector <- as.numeric(levels(results2$nrc_vector))[results2$nrc_vector]
str(results2)

cor(results2$syuzhet_vector, results2$nrc_vector)

plot(results2$syuzhet_vector, results2$nrc_vector,
     main = "Scatterplot of Sentiment",
     xlab = "syuzhet", ylab = "nrc", pch = 19)
text(results2$syuzhet_vector, results2$nrc_vector,
     labels = results2$president, pos = 4, col = "royalblue", cex = 0.8)
# Add a fit line
abline(lm(results2$nrc_vector ~ results2$syuzhet_vector), col = "red")  # regression line (y ~ x)

# get_nrc_sentiment() returns a more general output, with several emotion categories
# ("voices") beyond negative and positive
nrc_data_PR <- get_nrc_sentiment(texts(recent_corpus), language = "english")
str(nrc_data_PR)

# let's estimate the "sentiment" as "positive - negative"
nrc_data_PR$sentiment <- nrc_data_PR$positive - nrc_data_PR$negative
str(nrc_data_PR)
# the same values as the nrc sentiment computed above
nrc_vector

# let's add the names of the presidents
nrc_data_PR$president <- docnames(recent_corpus)
str(nrc_data_PR)

# plot % of positive and negative words relative to each other
barplot(
  sort(colSums(prop.table(nrc_data_PR[, 9:10]))),
  horiz = TRUE, cex.names = 0.7, las = 1,
  main = "Sentiment in Sample text", xlab = "Percentage"
)

# % of emotions in the text relative to each other
colSums(prop.table(nrc_data_PR[, 1:8]))
# plot % of emotions in the text relative to each other
barplot(
  sort(colSums(prop.table(nrc_data_PR[, 1:8]))),
  horiz = TRUE, cex.names = 0.7, las = 1,
  main = "Emotions in Sample text", xlab = "Percentage"
)

# let's plot by President

# Plotting using plotly
p <- plot_ly(nrc_data_PR, x = ~president, y = ~anger, type = 'bar', name = 'Anger') %>%
  add_trace(y = ~fear, name = 'Fear') %>%
  add_trace(y = ~joy, name = 'Joy') %>%
  add_trace(y = ~trust, name = 'Trust') %>%
  layout(yaxis = list(title = 'Count'), barmode = 'group')
p

# Plotting using ggplot
str(nrc_data_PR)
nrc_data_PR2 <- nrc_data_PR[, c(1, 4, 5, 8, 12)]  # anger, fear, joy, trust + president
str(nrc_data_PR2)
df.long <- melt(nrc_data_PR2, id.vars = c("president"))
str(df.long)
ggplot(df.long, aes(president, value, fill = variable)) +
  geom_bar(position = "dodge", stat = "identity") +
  theme(axis.text.x = element_text(color = "#993333", size = 10, angle = 90)) +
  coord_flip() +
  ylab("Emotional words") +
  xlab("President")

# let's normalize the anger, fear, joy and trust values by the length of the Presidential speeches
tokens <- summary(recent_corpus)
str(tokens)

#### OR, ALTERNATIVELY (strongly recommended!)
tokens2 <- as.data.frame(ntoken(recent_corpus, remove_punct = TRUE, remove_numbers = TRUE))
str(tokens2)
colnames(tokens2)[1] <- "tokens"
str(tokens2)

nrc_data_PR$tokens <- tokens2$tokens
str(nrc_data_PR)
nrc_data_PR$anger_n <- nrc_data_PR$anger / nrc_data_PR$tokens
nrc_data_PR$fear_n <- nrc_data_PR$fear / nrc_data_PR$tokens
nrc_data_PR$joy_n <- nrc_data_PR$joy / nrc_data_PR$tokens
nrc_data_PR$trust_n <- nrc_data_PR$trust / nrc_data_PR$tokens
str(nrc_data_PR)

# Plotting using plotly
p2 <- plot_ly(nrc_data_PR, x = ~president, y = ~anger_n, type = 'bar', name = 'Anger') %>%
  add_trace(y = ~fear_n, name = 'Fear') %>%
  add_trace(y = ~joy_n, name = 'Joy') %>%
  add_trace(y = ~trust_n, name = 'Trust') %>%
  layout(yaxis = list(title = '%'), barmode = 'group')
p2
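# A hedged sketch (not part of the original script): the same dodged-bar ggplot
# used above for the raw counts, but on the length-normalised scores (the *_n
# columns created just above).
nrc_norm <- nrc_data_PR[, c("president", "anger_n", "fear_n", "joy_n", "trust_n")]
df.long2 <- melt(nrc_norm, id.vars = c("president"))
ggplot(df.long2, aes(president, value, fill = variable)) +
  geom_bar(position = "dodge", stat = "identity") +
  coord_flip() +
  ylab("Share of emotional words") +
  xlab("President")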
##########  problems with dictionaries  ##########

testText <- "This movie has good premises. Looks like it has a nice plot, an exceptional cast, first class actors and Stallone gives his best. But it sucks"
testText
testCorpus <- corpus(testText)
head(dfm(testCorpus, dictionary = data_dictionary_LSD2015))

s_v <- "this movie has good premises, looks like it has a nice plot, an exceptional cast, first class actors and Stallone gives his best, but it sucks"
syuzhet_vector <- get_sentiment(s_v, method = "syuzhet")
nrc_vector <- get_sentiment(s_v, method = "nrc")
syuzhet_vector
nrc_vector
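# A hedged sketch (not part of the original script): the full LSD2015 dictionary
# also ships neg_positive / neg_negative categories made of multi-word patterns,
# which tokens_lookup() matches as phrases. That mitigates some simple negations,
# but a discourse-level reversal such as "... but it sucks" is still invisible to
# pure word counting.
toks_test <- tokens(testCorpus, remove_punct = TRUE)
tokens_lookup(toks_test, data_dictionary_LSD2015)  # all four categories

##########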