# Session setup.
# The former `rm(list = ls(all = TRUE))` was removed on purpose: every object
# used in the code below is created by this script, so wiping the caller's
# workspace is unnecessary and destroys unrelated work when the file is
# source()d.
# The project folder is machine-specific, so only switch to it when it exists
# instead of aborting with "cannot change working directory" elsewhere.
project_dir <- "C:/Users/mw/Dropbox (VOICES)/TOPIC MODEL"
if (dir.exists(project_dir)) {
  setwd(project_dir)
} else {
  message("Project folder not found; staying in: ", getwd())
}
getwd()
library(quanteda)
library(readtext)
library(ggplot2)
library(syuzhet)
library(plotly)
library(reshape2)
library(gridExtra)

###############################
# dictionary that you can call from Quanteda
###############################

# Stemming the text can be a problem when the entries of the dictionary employed are not stemmed themselves
# (stemmed tokens would then fail to match them). We therefore avoid stemming in the following examples.

#################################
# 1) Create your own dictionary! We already saw how to do that in Lab 1!
#################################

# Inaugural addresses delivered after 1900.
corp <- corpus_subset(data_corpus_inaugural, Year > 1900)

# A hand-made dictionary: each key is a category, each value the patterns that
# count towards it. "notincorpus" is a deliberately unmatched entry and
# "tax*" is a wildcard pattern (matched as a glob under quanteda's default
# valuetype, despite the key being called `taxregex`).
dict <- dictionary(list(
  christmas = c("Christmas", "Santa", "holiday"),
  opposition = c("Opposition", "reject", "notincorpus"),
  taxing = "taxing",
  taxation = "taxation",
  taxregex = "tax*",
  country = "america"
))

# Count dictionary hits per document.
# `dfm(corpus, dictionary = )` is deprecated in quanteda 3 and removed in
# quanteda 4, so the same steps are spelled out explicitly: tokenise,
# lower-case, look the tokens up, then tabulate. `tolower = FALSE` keeps dfm()
# from touching the dictionary keys after the lookup.
corp_toks <- tokens_tolower(tokens(corp))
head(dfm(tokens_lookup(corp_toks, dictionary = dict), tolower = FALSE))

# Grouping words by dictionary

# Three categories, each defined by a handful of exact words.
myDict <- dictionary(list(
  terror = c("terrorism", "terrorists", "threat"),
  economy = c("jobs", "business", "grow", "work"),
  pop = c("people", "washington")
))

# `dfm(corpus, dictionary = )` is deprecated in quanteda 3 and removed in
# quanteda 4; the explicit tokens() -> tokens_lookup() -> dfm() chain below
# performs the same counting and runs on old and new releases alike.

# Category counts for the speeches given after 1991 ...
recentCorpus <- corpus_subset(data_corpus_inaugural, Year > 1991)
byPresMat <- dfm(
  tokens_lookup(tokens_tolower(tokens(recentCorpus)), dictionary = myDict),
  tolower = FALSE
)
byPresMat

# ... and again for the longer window starting after 1937. Both objects are
# overwritten, so only this second version is available further down.
recentCorpus <- corpus_subset(data_corpus_inaugural, Year > 1937)
byPresMat <- dfm(
  tokens_lookup(tokens_tolower(tokens(recentCorpus)), dictionary = myDict),
  tolower = FALSE
)
byPresMat

#################################
# 2) Using some existing dictionary!
# For example: Import the Laver-Garry dictionary from Provalis Research (see: https://www.jstor.org/stable/2669268)
#################################

# Download the zipped Laver-Garry dictionary to a temporary file, unpack it
# into the session's temporary directory and load the .cat file it contains.
dictfile <- tempfile()
download.file(
  "https://provalisresearch.com/Download/LaverGarry.zip",
  destfile = dictfile,
  mode = "wb"  # binary mode, the archive must not be altered on Windows
)
td <- tempdir()
unzip(dictfile, exdir = td)
lgdict <- dictionary(file = file.path(td, "LaverGarry.cat"))

# Inspect the (nested) structure and the entries of the dictionary.
str(lgdict)
lgdict

# let's focus on those US Presidential Speeches after 1991
recent_corpus <- corpus_subset(data_corpus_inaugural, Year > 1991)
summary(recent_corpus)

# let's apply the dictionary to our corpus
# `dfm(corpus, dictionary = )` is deprecated in quanteda 3 and removed in
# quanteda 4. The explicit chain below (tokenise, lower-case, look up,
# tabulate) does the same job on every release; `tolower = FALSE` stops dfm()
# from lower-casing the dictionary keys, which become the column names.
lg_dfm <- dfm(
  tokens_lookup(tokens_tolower(tokens(recent_corpus)), dictionary = lgdict),
  tolower = FALSE
)
lg_dfm

# Let's focus on the categories "More State" and "Less State"
Dictionary <- convert(lg_dfm, to = "data.frame")
# The id column is called `document` in quanteda 1.x but `doc_id` from 2.0 on.
# Fix the name so the plots below work with either release.
names(Dictionary)[1] <- "document"
str(Dictionary)

# Columns 6 and 8 are renamed by position because their original names are not
# valid R identifiers. NOTE(review): presumably these are the ECONOMY
# "+STATE+" and "-STATE-" keys of the Laver-Garry dictionary -- confirm
# against the first colnames() output.
colnames(Dictionary)
names(Dictionary)[6] <- "More_State"
names(Dictionary)[8] <- "Less_State"
colnames(Dictionary)

# One bar chart per category, one bar per speech.
p <- ggplot(data = Dictionary, aes(x = document, y = More_State)) +
  geom_bar(stat = "identity")

p2 <- ggplot(data = Dictionary, aes(x = document, y = Less_State)) +
  geom_bar(stat = "identity")

# Show both charts side by side.
grid.arrange(p, p2, ncol = 2)

### Plotting in one single graph
str(Dictionary)
# Keep the document id plus the two renamed columns (positions 1, 6 and 8).
Dictionary2 <- Dictionary[, c(1, 6, 8)]
# quanteda >= 2.0 calls the id column `doc_id`; melt() and aes() below expect
# `document`, so pin the name here.
names(Dictionary2)[1] <- "document"
str(Dictionary2)

# Long format: one row per (document, category) pair, as ggplot needs for
# dodged bars.
df.long <- melt(Dictionary2, id.vars = c("document"))
str(df.long)

# The bars are inaugural speeches, not parties, so the document axis is
# labelled accordingly (it used to read "Party").
ggplot(df.long, aes(document, value, fill = variable)) +
  geom_bar(position = "dodge", stat = "identity") +
  theme(axis.text.x = element_text(color = "#993333", size = 10, angle = 90)) +
  coord_flip() +
  ylab(label = "Frequency More/Less State words") +
  xlab("Inaugural speech")

# How do the "More State" and "Less State" counts relate across speeches?
# (The author expects a negative correlation.)
cor(Dictionary[["More_State"]], Dictionary[["Less_State"]])

# Scatterplot of the two counts, one labelled point per speech ...
plot(
  x = Dictionary[["More_State"]],
  y = Dictionary[["Less_State"]],
  main = "Scatterplot Example",
  xlab = "More_State",
  ylab = "Less_State",
  pch = 19
)
text(
  x = Dictionary[["More_State"]],
  y = Dictionary[["Less_State"]],
  labels = Dictionary[["document"]],
  pos = 4,
  col = "royalblue",
  cex = 0.8
)
# ... with the least-squares line of Less_State on More_State drawn in red.
abline(lm(Less_State ~ More_State, data = Dictionary), col = "red")

#################################
# 3) sentiment dictionaries
#################################

# Quanteda has integrated a sentiment dictionary constructed by Young & Soroka (2012) stored in data_dictionary_LSD2015
# called Lexicoder. The dictionary contains thousands of positive and negative words or word stems.

# Size of each category, then a peek at the entries; `[1:2]` keeps only the
# first two keys of the dictionary.
lengths(data_dictionary_LSD2015)
head(data_dictionary_LSD2015)
head(data_dictionary_LSD2015[1:2])

# Count the entries of those two keys per speech.
# `dfm(corpus, dictionary = )` is deprecated in quanteda 3 and removed in
# quanteda 4, hence the explicit tokens() -> tokens_lookup() -> dfm() chain.
sentiment <- dfm(
  tokens_lookup(
    tokens_tolower(tokens(recent_corpus)),
    dictionary = data_dictionary_LSD2015[1:2]
  ),
  tolower = FALSE
)
sentiment

Dictionary <- convert(sentiment, to = "data.frame")
# The id column is `document` in quanteda 1.x and `doc_id` from 2.0 on; fix
# the name so the plot below works with either.
names(Dictionary)[1] <- "document"
str(Dictionary)

# Net sentiment: positive hits minus negative hits.
Dictionary$Sentiment <- Dictionary$positive - Dictionary$negative
str(Dictionary)

ggplot(data = Dictionary, aes(x = document, y = Sentiment)) +
  geom_bar(stat = "identity")

#################################
# Applying dictionaries to Party Manifestoes!
#################################

library(manifestoR)
# Replace the placeholder with your personal Manifesto Project API key.
mp_setapikey(key.file = NULL, key = "YOUR MANIFESTO API KEY")

# Restrict the main dataset to the US manifestoes dated 201611 (the 2016
# elections) and list the identifying columns of the rows that remain.
cmp <- mp_maindataset()
is_us_2016 <- cmp$countryname == "United States" & cmp$date == 201611
us <- cmp[which(is_us_2016), ]
print(us[c("partyname", "party", "edate", "date", "partyabbrev")])

# Availability is checked for the Democratic and Republican parties only ...
available_us2016 <- mp_availability(
  countryname == "United States" &
    date == 201611 &
    partyname %in% c("Democratic Party", "Republican Party")
)
str(available_us2016)

# ... while the corpus is requested for every US manifesto with that date.
US2016 <- mp_corpus(countryname == "United States" & date == 201611)
summary(US2016)

# Convert to a quanteda corpus and look at its first documents.
quanteda_US2016 <- corpus(US2016)
summary(head(quanteda_US2016))

# Which of the two party manifestoes was more negative?
# Tokenise once (dropping punctuation, lower-casing) and reuse the tokens for
# both dictionaries. `dfm(x, dictionary = , group = )` is gone in quanteda 4
# (and `group` only worked through partial matching of `groups`), so the
# lookup and the grouping by the `party` document variable are explicit.
party_codes <- docvars(quanteda_US2016, "party")
us2016_toks <- tokens_tolower(tokens(quanteda_US2016, remove_punct = TRUE))

# After dfm_group() the document names are the party codes. Replace them with
# the party names by matching on the code rather than by assuming that the
# rows of `parties` are in the same order as the rows of the dfm; codes with
# no match are kept unchanged. Uses docnames<- instead of writing to the
# @Dimnames slot directly.
label_parties <- function(x, parties) {
  codes <- docnames(x)
  hit <- match(codes, as.character(parties$party))
  labels <- as.character(parties$partyname)[hit]
  labels[is.na(labels)] <- codes[is.na(labels)]
  docnames(x) <- labels
  x
}

# making the DFM, grouping by party and considering only words negative and positive
dfm_us2016 <- dfm(
  tokens_lookup(us2016_toks, dictionary = data_dictionary_LSD2015[1:2]),
  tolower = FALSE
)
dfm_us2016 <- dfm_group(dfm_us2016, groups = party_codes)
dfm_us2016
str(dfm_us2016)
us$partyname
dfm_us2016 <- label_parties(dfm_us2016, us)
docnames(dfm_us2016)
dfm_us2016

# What about the role of State in economy?
# `lgdict[2]` keeps only the second top-level key of the Laver-Garry
# dictionary; see the str() output for its sub-categories.
str(lgdict)
dfm_us2016_eco <- dfm(
  tokens_lookup(us2016_toks, dictionary = lgdict[2]),
  tolower = FALSE
)
dfm_us2016_eco <- dfm_group(dfm_us2016_eco, groups = party_codes)
dfm_us2016_eco
dfm_us2016_eco <- label_parties(dfm_us2016_eco, us)
dfm_us2016_eco

# Sentiment per party: positive minus negative hits.
Dictionary <- convert(dfm_us2016, to = "data.frame")
# The id column is `document` in quanteda 1.x and `doc_id` from 2.0 on; fix
# the name so aes(x = document) below works with either.
names(Dictionary)[1] <- "document"
str(Dictionary)
Dictionary$Sentiment <- Dictionary$positive - Dictionary$negative
str(Dictionary)

# State-in-the-economy counts per party.
Dictionary2 <- convert(dfm_us2016_eco, to = "data.frame")
names(Dictionary2)[1] <- "document"
str(Dictionary2)
# Columns 2 and 4 are renamed by position. NOTE(review): presumably the
# "+STATE+" and "-STATE-" keys -- confirm against the colnames() output.
colnames(Dictionary2)
names(Dictionary2)[2] <- "More_State"
names(Dictionary2)[4] <- "Less_State"
colnames(Dictionary2)
# Pro-market score: "less state" hits minus "more state" hits.
Dictionary2$ProMarket <- Dictionary2$Less_State - Dictionary2$More_State
str(Dictionary2)

p <- ggplot(data = Dictionary, aes(x = document, y = Sentiment)) +
  geom_bar(stat = "identity")

p2 <- ggplot(data = Dictionary2, aes(x = document, y = ProMarket)) +
  geom_bar(stat = "identity")

# Sentiment and pro-market score side by side.
grid.arrange(p, p2, ncol = 2)

###############################
# using other packages for dictionary analysis: using the syuzhet package
###############################

# You can get access to different dictionaries.
# In the "syuzhet" dictionary, for example, every word carries its own
# (positive or negative) weight, contrary to other dictionaries.
# Only English is covered!
get_sentiment_dictionary(dictionary = "syuzhet", language = "english")

# Let's apply the syuzhet dictionary!
recent_corpus <- corpus_subset(data_corpus_inaugural, Year > 1991)
str(recent_corpus)

# Extract the texts with as.character() instead of reaching into
# `recent_corpus$documents$texts`: that internal layout only exists in
# quanteda 1.x. unname() keeps the scores an unnamed vector, as before.
syuzhet_vector <- get_sentiment(
  unname(as.character(recent_corpus)),
  method = "syuzhet"
)
syuzhet_vector

president <- docnames(recent_corpus)
# data.frame() keeps the scores numeric; cbind() would first coerce both
# columns to character.
results1 <- data.frame(president, syuzhet_vector, stringsAsFactors = FALSE)
results1

# The "nrc" dictionary covers several different languages.
# English version: print it, keep a copy in `x` and inspect its structure.
x <- get_sentiment_dictionary(dictionary = "nrc", language = "english")
x
str(x)

# Print the same dictionary for a few other languages.
invisible(lapply(
  c("spanish", "italian", "arabic", "japanese", "turkish"),
  function(language) {
    print(get_sentiment_dictionary(dictionary = "nrc", language = language))
  }
))

# Let's apply the nrc dictionary!
# as.character() is the public way to get the texts out of a corpus; the
# internal `$documents$texts` layout only exists in quanteda 1.x.
nrc_vector <- get_sentiment(
  unname(as.character(recent_corpus)),
  method = "nrc"
)
nrc_vector

# Build the table with data.frame() so both score columns stay numeric.
# The previous cbind() + `as.numeric(levels(x))[x]` round trip only works
# when the columns are factors; since R 4.0 (stringsAsFactors = FALSE) they
# are character, levels() is NULL and every score became NA.
results2 <- data.frame(
  president,
  syuzhet_vector,
  nrc_vector,
  stringsAsFactors = FALSE
)
results2

# let's correlate the results we got from the 2 dictionaries
str(results2)
cor(results2$syuzhet_vector, results2$nrc_vector)

plot(
  results2$syuzhet_vector, results2$nrc_vector,
  main = "Scatterplot of Sentiment",
  xlab = "syuzhet ", ylab = "nrc ", pch = 19
)
text(
  results2$syuzhet_vector, results2$nrc_vector,
  labels = results2$president, pos = 4, col = "royalblue", cex = 0.8
)
# Add fit line: least-squares regression of nrc on syuzhet (y ~ x)
abline(lm(nrc_vector ~ syuzhet_vector, data = results2), col = "red")

# the nrc dictionary has also a variant with more "categories" beyond negative and positive
# Texts are extracted with as.character(); the internal
# `$documents$texts` layout only exists in quanteda 1.x.
nrc_data_PR <- get_nrc_sentiment(
  unname(as.character(recent_corpus)),
  language = "english"
)
str(nrc_data_PR)
# let's add the name of the presidents to the data frame
nrc_data_PR$president <- docnames(recent_corpus)
str(nrc_data_PR)

# Share of each emotion relative to all emotion words: prop.table() divides
# the first eight columns by their grand total, colSums() then adds up the
# speeches. NOTE(review): columns 1:8 are taken to be the emotion columns,
# as the comments in this script state -- check the str() output.
emotion_share <- colSums(prop.table(nrc_data_PR[, 1:8]))
emotion_share

# plot % of emotions in the text relative to each other
barplot(
  sort(emotion_share),
  horiz = TRUE,
  cex.names = 0.7,
  las = 1,
  main = "Emotions in Sample text", xlab = "Percentage"
)

# Same information with ggplot, restricted to a few columns
str(nrc_data_PR)
# Keep columns 1, 4, 5, 8 and 11; the last one must be `president`, which the
# reshaping below uses as id. NOTE(review): check the str() output to see
# which emotions the first four positions correspond to.
keep_cols <- c(1, 4, 5, 8, 11)
nrc_data_PR2 <- nrc_data_PR[keep_cols]
str(nrc_data_PR2)

# Long format: one row per (president, emotion) pair.
df.long <- melt(nrc_data_PR2, id.vars = "president")
str(df.long)

# Dodged horizontal bars, one group per speech.
ggplot(df.long, aes(x = president, y = value, fill = variable)) +
  geom_bar(stat = "identity", position = "dodge") +
  theme(axis.text.x = element_text(color = "#993333", size = 10, angle = 90)) +
  coord_flip() +
  ylab(label = "Emotional words") +
  xlab("President")

########## problems with dictionaries (as discussed in our Lecture)

# A review that is positive word by word but negative overall.
testText <- "This movie has good premises. Looks like it has a nice plot, and exceptional cast, 
first class actors and Stallone gives his best. But it sucks"
testText

# Score it with the Lexicoder dictionary. The corpus built here is now
# actually used (it was created but ignored before), and the lookup goes
# through tokens_lookup() because `dfm(x, dictionary = )` is deprecated in
# quanteda 3 and removed in quanteda 4.
testCorpus <- corpus(testText)
head(dfm(
  tokens_lookup(
    tokens_tolower(tokens(testCorpus)),
    dictionary = data_dictionary_LSD2015
  ),
  tolower = FALSE
))

# The same sentence scored with the two syuzhet-package dictionaries.
# Note: this overwrites `syuzhet_vector` and `nrc_vector` from the sections
# above.
s_v <- "this movie has good premises, looks like it has a nice plot, an exceptional cast, first class actors and Stallone gives his best, but it sucks"
syuzhet_vector <- get_sentiment(s_v, method = "syuzhet")
nrc_vector <- get_sentiment(s_v, method = "nrc")
syuzhet_vector
nrc_vector

##########