# Start from an empty workspace: remove every object in the global
# environment, including hidden (dot-prefixed) ones.
rm(list = ls(all.names = TRUE))

# Print the working directory, switch to the project folder, then print it
# again to confirm the change.
# NOTE(review): the path below is machine-specific; the script will stop here
# on any computer where this folder does not exist.
getwd()
setwd("C:/Users/luigi/Dropbox/TOPIC MODEL")
getwd()
library(rtweet)
library(ggplot2)
library(dplyr)
library(readtext)
library(quanteda)
library(cowplot)
library(gridExtra)
library(plyr)
library(syuzhet)

# Search Twitter for up to 1000 English-language tweets matching "Salvini",
# leaving retweets out of the results.
rt <- search_tweets(
  "Salvini",
  n = 1000,
  include_rts = FALSE,
  lang = "en",
  type = "mixed"
)

# Quick look at the language field: first 20 values and total count.
print(rt$lang[1:20])
length(rt$lang)

# Derive a calendar-day column from the tweet creation timestamp, inspecting
# the structure before and after, then tabulate the tweets per day.
str(rt$created_at)
rt$date <- as.Date(rt$created_at)
str(rt$date)
table(rt$date)

# Turn the downloaded tweets into a quanteda corpus.
myCorpusTwitter <- corpus(rt)

# Tokenise, dropping punctuation, numbers, symbols, separators and URLs, and
# splitting hyphenated words.
tok2 <- tokens(
  myCorpusTwitter,
  remove_punct = TRUE,
  remove_numbers = TRUE,
  remove_symbols = TRUE,
  split_hyphens = TRUE,
  remove_separators = TRUE,
  remove_url = TRUE
)

# Remove English stopwords, then stem what is left.
tok2 <- tokens_wordstem(tokens_remove(tok2, stopwords("en")))

# Document-feature matrix of the tweets and its 20 most frequent features.
Salvini_dfm <- dfm(tok2)
topfeatures(Salvini_dfm, 20)

###########################################################################
# Goal: track the overall daily sentiment about Salvini across all the tweets
###########################################################################

# Count, for every tweet, the features matched by the first two keys of the
# Lexicoder Sentiment Dictionary.
sentiment <- dfm_lookup(Salvini_dfm, dictionary = data_dictionary_LSD2015[1:2])
head(sentiment, 10)

# Convert the lookup result to a plain data frame (one row per tweet).
Dictionary <- convert(sentiment, to = "data.frame")
str(Dictionary)

# Net sentiment per tweet = positive matches minus negative matches.
# The columns are referenced by their full names: the abbreviated `$posit` /
# `$negat` only resolved through `$` partial matching, which fails silently
# (returns NULL) on tibbles or as soon as another column shares the prefix.
Dictionary$Sentiment <- Dictionary$positive - Dictionary$negative
str(Dictionary)
summary(Dictionary$Sentiment)

# To compare sentiment with tweet volume, first attach the per-tweet sentiment
# score to the data frame downloaded from Twitter.
rt$Sentiment <- Dictionary$Sentiment
colnames(rt)

# Collapse to one row per day: number of tweets and mean sentiment.
daily <- ddply(
  .data = rt,
  .variables = ~ date,
  .fun = summarize,
  num_tweets = length(Sentiment),
  ave_sentiment = mean(Sentiment)
)
str(daily)

# Correlation between average daily sentiment and the volume of discussion.
with(daily, cor(ave_sentiment, num_tweets))

# Daily average sentiment as a dashed red line.
# Note: this reuses the name `sentiment`, replacing the dictionary lookup
# result with the plot object.
sentiment <- ggplot(daily, aes(x = date, y = ave_sentiment)) +
  geom_line(linetype = "dashed", colour = "red") +
  labs(title = "Salvini Sentiment", x = "Day", y = "Sentiment") +
  theme_gray(base_size = 12)

# Daily number of tweets.
volume <- ggplot(daily, aes(x = date, y = num_tweets)) +
  geom_line() +
  labs(title = "Salvini #", x = "Day", y = "Volume") +
  theme_gray(base_size = 12)

# Stack the two panels in a single column: sentiment on top, volume below.
grid.arrange(sentiment, volume, ncol = 1)

###################################
# Let's apply the nrc dictionaries with more categories than simply positive-negative
###################################

# Score every tweet against the NRC lexicon (one row per tweet).
nrc_data <- get_nrc_sentiment(rt$text, language = "english")
str(nrc_data)

# Read the tweets that contain more than two anger words.
rt$text[nrc_data[["anger"]] > 2]

# Attach the tweet texts to the emotion scores.
nrc_data[["text"]] <- rt$text
str(nrc_data)

# Comparison word cloud of the emotions.
# First regroup the texts by emotion: the unit of analysis (the rows) becomes
# one pseudo-document per emotion, made of every tweet that contains at least
# one word of that emotion, pasted together.
# NOTE: the name `all` is kept for compatibility with the rest of the script
# even though it shadows base::all as a variable.
all <- vapply(
  c("anger", "anticipation", "disgust", "fear",
    "joy", "sadness", "surprise", "trust"),
  function(emotion) paste(rt$text[nrc_data[[emotion]] > 0], collapse = " "),
  character(1),
  USE.NAMES = FALSE
)

str(all)

recentCorpus <- corpus(all)
summary(recentCorpus)

# Label each pseudo-document with the emotion it represents (the first eight
# columns of nrc_data, in the same order used to build `all`).
colnames(nrc_data)[1:8]
docnames(recentCorpus) <- colnames(nrc_data)[1:8]
summary(recentCorpus)

# Same preprocessing as for the tweet-level dfm.
tok2 <- tokens(
  recentCorpus,
  remove_punct = TRUE,
  remove_numbers = TRUE,
  remove_symbols = TRUE,
  split_hyphens = TRUE,
  remove_separators = TRUE,
  remove_url = TRUE
)
tok2 <- tokens_remove(tok2, stopwords("en"))
# Fixed: the original stemmed an undefined object `tok`, so this step either
# errored or never reached the tokens that feed the dfm.
tok2 <- tokens_wordstem(tok2, language = "english")
Mydfm <- dfm(tok2)
Mydfm

# Drop the feature "salvini": every tweet contains it, so it is not
# informative.
MyDfm2 <- dfm_remove(Mydfm, c("salvini"))

library(quanteda.textplots)
set.seed(123)
# Argument names follow the current textplot_wordcloud() interface:
# `min_count` (the original `min.count` is not an argument of the function, so
# the threshold was not applied), `rotation` (was `rot.per`) and `color`
# (was `colors`). One colour per emotion document.
textplot_wordcloud(
  MyDfm2,
  min_count = 6,
  rotation = 0.25,
  comparison = TRUE,
  color = c("#00B2FF", "red", "#FF0099", "#6600CC",
            "green", "orange", "blue", "brown"),
  labelsize = 1.5
)

###################################
# Let's estimate the daily average of each emotion about Salvini and plot the results
###################################

# Attach the calendar day of each tweet to its emotion scores.
str(nrc_data)
nrc_data$date <- rt$date
str(nrc_data)

# Average each of the eight emotion columns by day. The result has a `day`
# column followed by the eight emotion means.
daily_emotions <- aggregate(
  nrc_data[, 1:8],
  by = list(day = nrc_data$date),
  FUN = mean
)
str(daily_emotions)
head(daily_emotions)

# Pairwise correlations between the daily emotion averages.
# Every column except `day` is used: the original index `c(2:8)` stopped one
# column short and silently left the last emotion (trust) out of both analyses.
library(psych)
corr.test(daily_emotions[-1])

library(PerformanceAnalytics)
chart.Correlation(daily_emotions[-1])

# Reshape to long format (one row per day and emotion) so that each emotion
# can be drawn in its own facet.
library(reshape2)
df.long <- melt(daily_emotions, id.vars = c("day"))
str(df.long)
head(df.long)

# One panel per emotion, each with its own y scale, showing the daily means
# and a loess trend.
# `scales` is spelled out in full: the original `scale =` only worked through
# partial argument matching.
# NOTE(review): the "Roboto Condensed" font must be available to the graphics
# device; confirm it is installed where the script runs.
ggplot(df.long, aes(x = day, y = value, color = variable)) +
  geom_point() +
  geom_smooth(method = "loess") +
  facet_wrap(~ variable, scales = "free_y", nrow = 2) +
  theme_bw() +
  labs(
    title = "Emotional analysis of Twitter statuses over time",
    subtitle = "Tweets aggregated by day on topics of Salvini"
  ) +
  theme(
    text = element_text(family = "Roboto Condensed"),
    plot.title = element_text(face = "bold"),
    legend.position = "bottom",
    axis.text = element_text(size = 9),
    legend.title = element_blank()
  )