# Start from an empty workspace so objects left over from a previous session
# cannot leak into this analysis. `all.names` is spelled out in full rather
# than relying on partial matching of `all`.
rm(list = ls(all.names = TRUE))
getwd()
# Switch to the project folder only when it exists on this machine; an
# unconditional setwd() on a user-specific absolute path would abort the
# whole script everywhere else.
project_dir <- "C:/Users/luigi/Dropbox/TOPIC MODEL"
if (dir.exists(project_dir)) {
  setwd(project_dir)
} else {
  message("Project directory not found; staying in ", getwd())
}
getwd()
library(rtweet)
library(ggplot2)
library(dplyr)
library(readtext)
library(quanteda)
library(cowplot)
library(gridExtra)
library(plyr)
library(syuzhet)
library(wordcloud)

# Search Twitter for up to 1000 tweets matching "Salvini", restricted to
# English (lang = "en") and with retweets left out.
rt <- search_tweets(
  "Salvini",
  n = 1000,
  include_rts = FALSE,
  lang = "en"
)

# Quick look at the language field: first 20 values, then how many came back.
print(rt$lang[1:20])
length(rt$lang)

# Derive a calendar-date column from the creation timestamp and tabulate the
# number of tweets per day.
str(rt$created_at)
rt$date <- as.Date(rt$created_at)
str(rt$date)
table(rt$date)

# Build a quanteda corpus from the downloaded tweets.
myCorpusTwitter <- corpus(rt)

# Tokenise, dropping punctuation, numbers, symbols and separators, and
# splitting hyphenated words.
tok2 <- tokens(
  myCorpusTwitter,
  remove_punct = TRUE,
  remove_numbers = TRUE,
  remove_symbols = TRUE,
  split_hyphens = TRUE,
  remove_separators = TRUE
)

# Remove English stop-words first, then stem what is left.
tok2 <- tokens_wordstem(tokens_remove(tok2, stopwords("en")))

# Document-feature matrix and its 20 most frequent features.
Salvini_dfm <- dfm(tok2)
topfeatures(Salvini_dfm, 20)

###########################################################################
# Suppose we want to track the overall daily sentiment about Salvini across all the tweets
###########################################################################

# Count, per tweet, the matches against the first two keys of the
# Lexicoder Sentiment Dictionary (negative and positive).
sentiment <- dfm_lookup(
  Salvini_dfm,
  dictionary = data_dictionary_LSD2015[1:2]
)
head(sentiment, 10)

# Move the counts into a plain data frame, one row per tweet.
Dictionary <- convert(sentiment, to = "data.frame")
str(Dictionary)

# Net sentiment = positive matches minus negative matches. The columns are
# addressed by their full names: the abbreviations `posit` / `negat` only
# worked through `$` partial matching, which breaks silently on tibbles or as
# soon as another column shares the same prefix.
Dictionary$Sentiment <- Dictionary[["positive"]] - Dictionary[["negative"]]
str(Dictionary)
summary(Dictionary$Sentiment)

# Goal: compare sentiment with volume. First attach each tweet's sentiment
# score to the data frame obtained from Twitter.
rt$Sentiment <- Dictionary$Sentiment
colnames(rt)

# Per-day summary: number of tweets and mean sentiment.
daily <- ddply(
  .data = rt,
  .variables = ~date,
  .fun = summarize,
  num_tweets = length(Sentiment),
  ave_sentiment = mean(Sentiment)
)
str(daily)

# Correlation between the daily mean sentiment and the daily tweet volume.
cor(daily$ave_sentiment, daily$num_tweets)

# Upper panel: daily mean sentiment as a dashed red line.
sentiment <- ggplot(daily, aes(x = date, y = ave_sentiment)) +
  geom_line(linetype = "dashed", colour = "red") +
  ggtitle("Salvini Sentiment") +
  xlab("Day") +
  ylab("Sentiment") +
  theme_gray(base_size = 12)

# Lower panel: number of tweets per day.
volume <- ggplot(daily, aes(x = date, y = num_tweets)) +
  geom_line() +
  ggtitle("Salvini #") +
  xlab("Day") +
  ylab("Volume") +
  theme_gray(base_size = 12)

# Stack the two panels in a single column.
grid.arrange(sentiment, volume, ncol = 1)

###################################
# Let's apply the NRC dictionary, which has more categories than just positive/negative
###################################

# Score every tweet with the NRC lexicon (English).
nrc_data <- get_nrc_sentiment(rt$text, language = "english")
str(nrc_data)

# Show the tweets that contain more than two anger words.
rt$text[nrc_data$anger > 2]

# Horizontal bar chart of the first eight NRC columns, each expressed as its
# share of the total count and sorted in increasing order.
barplot(
  sort(colSums(prop.table(nrc_data[, 1:8]))),
  horiz = TRUE,
  cex.names = 0.7,
  las = 1,
  main = "Emotions in tweets discussing about Salvini",
  xlab = "Percentage"
)

# let's plot the wordcloud of emotions

# One long document per emotion: all tweets with at least one word of that
# emotion, pasted together with single spaces. The result is an unnamed
# character vector of length eight, in the order listed here.
all <- vapply(
  c(
    "anger", "anticipation", "disgust", "fear",
    "joy", "sadness", "surprise", "trust"
  ),
  function(emotion) {
    paste(rt$text[nrc_data[[emotion]] > 0], collapse = " ")
  },
  character(1),
  USE.NAMES = FALSE
)

str(all)

# clean the text
library(tm)
library(wordcloud)

# Normalise raw tweet text so it is suitable for word-frequency analysis.
#
# Steps, in order: lower-case; drop URLs; drop @mentions; drop the stand-alone
# retweet marker "rt"; strip punctuation and digits; collapse whitespace runs
# to one space; trim leading and trailing whitespace.
#
# x:       character vector of tweets (may be empty).
# returns: character vector of the same length as `x`.
clean.text <- function(x) {
  x <- tolower(x)
  # URLs go first, while "://", "." and "/" are still present, so the whole
  # link is removed rather than whatever survives punctuation stripping.
  x <- gsub("http\\S+", "", x)
  # @mentions
  x <- gsub("@\\w+", "", x)
  # Retweet marker, as a whole word only: a bare "rt" pattern would also eat
  # the letters inside ordinary words ("party" -> "pay").
  x <- gsub("\\brt\\b", "", x)
  # punctuation
  x <- gsub("[[:punct:]]", "", x)
  # numbers
  x <- gsub("[[:digit:]]", "", x)
  # Collapse runs of whitespace to ONE space; replacing them with nothing
  # would glue neighbouring words together.
  x <- gsub("[[:space:]]+", " ", x)
  # Leading and trailing blanks
  trimws(x)
}

# Normalise the eight emotion documents with this script's clean.text().
all <- clean.text(all)
# Drop English stop-words.
all <- removeWords(all, stopwords("english"))
# Wrap the cleaned documents in a tm corpus, one document per emotion.
corpus <- Corpus(VectorSource(all))
# Term-document matrix, converted to an ordinary matrix.
tdm <- as.matrix(TermDocumentMatrix(corpus))
# Label each document column with its emotion.
colnames(tdm) <- c(
  "anger", "anticipation", "disgust", "fear",
  "joy", "sadness", "surprise", "trust"
)

# Comparison word cloud: a narrow top panel holds the title, the larger
# bottom panel holds the cloud itself.
layout(matrix(c(1, 2), nrow = 2), heights = c(1, 4))
par(mar = rep(0, 4))
plot.new()
text(
  x = 0.5,
  y = 0.5,
  "Emotion Comparison Word Cloud for tweets about Salvini"
)
# One colour per emotion column of `tdm`, at most 250 words, placed in
# decreasing order of frequency (random.order = FALSE).
comparison.cloud(
  tdm,
  random.order = FALSE,
  colors = c(
    "#00B2FF", "red", "#FF0099", "#6600CC",
    "green", "orange", "blue", "brown"
  ),
  title.size = 1.5,
  max.words = 250
)

###################################
# Let's estimate the daily average emotions with respect to Salvini and let's plot it
###################################

# Attach each tweet's calendar date to its NRC scores.
str(nrc_data)
nrc_data$date <- rt$date
str(nrc_data)

# Daily mean of the first eight NRC columns. The result has the grouping
# column `day` first, followed by the eight averaged columns.
daily_emotions <- aggregate(
  nrc_data[, 1:8],
  by = list(day = nrc_data$date),
  FUN = mean
)
str(daily_emotions)
head(daily_emotions)

# Correlations among the daily emotion averages. Every column except `day` is
# used: the earlier index 2:8 stopped one column short and silently left the
# last emotion out, because `day` occupies position 1.
library(psych)
corr.test(daily_emotions[-1])

library(PerformanceAnalytics)
chart.Correlation(daily_emotions[-1])

# Let's plot it
library(reshape2)
# Reshape to long format: one row per (day, emotion) pair, with the emotion
# name in `variable` and its daily mean in `value`.
df.long <- melt(daily_emotions, id.vars = c("day"))
str(df.long)
head(df.long)

# One facet per emotion, each with its own y-axis range, showing the daily
# points and a loess smoother. `scales` is written out in full; the original
# `scale =` only worked through partial argument matching.
ggplot(df.long, aes(x = day, y = value, color = variable)) +
  geom_point() +
  geom_smooth(method = "loess") +
  facet_wrap(~ variable, scales = "free_y", nrow = 2) +
  theme_bw() +
  labs(
    title = "Emotional analysis of Twitter statuses over time",
    subtitle = "Tweets aggregated by day on topics of Salvini"
  ) +
  # NOTE(review): presumably the "Roboto Condensed" font has to be available
  # to the graphics device -- confirm on the target machine.
  theme(
    text = element_text(family = "Roboto Condensed"),
    plot.title = element_text(face = "bold"),
    legend.position = "bottom",
    axis.text = element_text(size = 9),
    legend.title = element_blank()
  )