rm(list = ls(all = TRUE))
getwd()
### set your working directory here!
setwd("C:/Users/mw/Dropbox (VOICES)/TOPIC MODEL")
getwd()

library(rtweet)
library(ggplot2)
library(dplyr)
library(tidytext)
library(readtext)
library(quanteda)
library(httpuv)
library(maps)
library(leaflet)
library(lattice)
library(cowplot)
library(gridExtra)
library(ggthemes)
library(plyr)
library(syuzhet)

# authenticate with your own Twitter app credentials
token <- create_token(
  app = "my_twitter_research_app",
  consumer_key = "YOUR NUMBER",
  consumer_secret = "YOUR NUMBER",
  access_token = "YOUR NUMBER",
  access_secret = "YOUR NUMBER")
get_token()

rt <- search_tweets("Icardi", n = 2000, include_rts = FALSE, lang = "en")
print(rt$lang[1:20])
colnames(rt)

# Convert the POSIXct time stamp to a date. You can also change the time
# zone here (e.g. tz = "Hongkong"); I keep Greenwich Mean Time (GMT)
str(rt$created_at)
rt$date <- as.Date(rt$created_at, tz = "GMT")
str(rt$date)

myCorpusTwitter <- corpus(rt)
summary(myCorpusTwitter)
head(myCorpusTwitter)
texts(myCorpusTwitter)[1:2]
# number of documents
ndoc(myCorpusTwitter)
# inspect the document-level variables
head(docvars(myCorpusTwitter))

###########################################################################
# Suppose we want to see the trend of the overall daily sentiment about
# Icardi across all the tweets
###########################################################################
sentiment <- dfm(myCorpusTwitter,
                 remove = stopwords("english"),
                 remove_punct = TRUE,
                 remove_numbers = TRUE,
                 tolower = TRUE,
                 stem = TRUE,
                 remove_twitter = TRUE,
                 remove_url = TRUE,
                 dictionary = data_dictionary_LSD2015[1:2])
head(sentiment, 10)
str(sentiment)

Dictionary <- convert(sentiment, to = "data.frame")
str(Dictionary)
# net sentiment = positive minus negative dictionary hits per tweet
Dictionary$Sentiment <- Dictionary$positive - Dictionary$negative
str(Dictionary)
summary(Dictionary$Sentiment)

# Suppose we want to plot the sentiment against the volume of tweets
rt$Sentiment <- Dictionary$Sentiment
colnames(rt)

# get daily summaries of the results (sentiment and number of tweets)
daily <- ddply(rt, ~ date, summarize,
               num_tweets = length(Sentiment),
               ave_sentiment = mean(Sentiment))
str(daily)

# correlation between the volume of discussion and the sentiment
cor(daily$ave_sentiment, daily$num_tweets)

# plot the daily sentiment vs. volume (use new object names so we do not
# overwrite the dfm called "sentiment")
sentiment_plot <- ggplot(daily, aes(x = date, y = ave_sentiment)) +
  geom_line(linetype = "dashed", colour = "red") +
  ggtitle("Icardi Sentiment") + xlab("Day") + ylab("Sentiment") +
  theme_gray(base_size = 12)
volume_plot <- ggplot(daily, aes(x = date, y = num_tweets)) +
  geom_line() +
  ggtitle("Icardi #") + xlab("Day") + ylab("Volume") +
  theme_gray(base_size = 12)
grid.arrange(sentiment_plot, volume_plot, ncol = 1)

# the same volume plot with some alternative themes from ggthemes
ggplot(daily, aes(x = date, y = num_tweets)) + geom_line() +
  ggtitle("Icardi #") + xlab("Day") + ylab("Volume") +
  theme_economist()
ggplot(daily, aes(x = date, y = num_tweets)) + geom_line() +
  ggtitle("Icardi #") + xlab("Day") + ylab("Volume") +
  theme_stata() + scale_color_stata()
ggplot(daily, aes(x = date, y = num_tweets)) + geom_line() +
  ggtitle("Icardi #") + xlab("Day") + ylab("Volume") +
  theme_wsj() + scale_colour_wsj("colors6")

# Let's apply other dictionaries!
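###########################################################################
# Side note: quanteda >= 3 deprecates passing remove/remove_punct/etc.
# directly to dfm() as done above. A rough equivalent with the tokens()
# pipeline -- a sketch only, so check it against your installed quanteda
# version (the object names toks and sentiment_v3 are just examples):
###########################################################################
toks <- tokens(myCorpusTwitter,
               remove_punct = TRUE,
               remove_numbers = TRUE,
               remove_url = TRUE)
toks <- tokens_remove(toks, stopwords("english"))
# no stemming needed before the lookup: the LSD2015 entries are wildcard patterns
sentiment_v3 <- dfm(tokens_lookup(toks, dictionary = data_dictionary_LSD2015[1:2]))
head(sentiment_v3, 10)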
# save your results as a csv file
write_as_csv(rt, "twitter.csv", prepend_ids = TRUE, na = "",
             fileEncoding = "UTF-8")
# and then re-open it
x <- read.csv("twitter.csv")
str(x)
x$text <- as.character(x$text)

# score the tweets with two further general-purpose dictionaries
syuzhet_vector <- get_sentiment(x$text, method = "syuzhet")
nrc_vector <- get_sentiment(x$text, method = "nrc")
x$sentiment_syuzhet <- syuzhet_vector
x$sentiment_nrc <- nrc_vector

# correlation among the three sentiment scores
library(PerformanceAnalytics)
attach(x)
set_sentiment <- cbind(Sentiment, sentiment_nrc, sentiment_syuzhet)
chart.Correlation(set_sentiment)

# Another general-purpose dictionary: the NRC emotion lexicon
nrc_data <- get_nrc_sentiment(x$text, language = "english")
str(nrc_data)

# share of positive vs. negative term counts across all tweets
colSums(prop.table(nrc_data[, 9:10]))
# plot those shares
barplot(
  sort(colSums(prop.table(nrc_data[, 9:10]))),
  horiz = TRUE,
  cex.names = 0.7,
  las = 1,
  main = "Sentiment in Sample text",
  xlab = "Percentage"
)

# inspect the tweets with a high anger score
angry_items <- which(nrc_data$anger > 2)
str(angry_items)
x$text[angry_items]
# the same selection in a single step
x$text[nrc_data$anger > 2]

# plot the shares of the eight NRC emotions
barplot(
  sort(colSums(prop.table(nrc_data[, 1:8]))),
  horiz = TRUE,
  cex.names = 0.7,
  las = 1,
  main = "Emotions in Sample text",
  xlab = "Percentage"
)

# collapse the tweets into one long document per emotion
all <- c(
  paste(x$text[nrc_data$anger > 0], collapse = " "),
  paste(x$text[nrc_data$anticipation > 0], collapse = " "),
  paste(x$text[nrc_data$disgust > 0], collapse = " "),
  paste(x$text[nrc_data$fear > 0], collapse = " "),
  paste(x$text[nrc_data$joy > 0], collapse = " "),
  paste(x$text[nrc_data$sadness > 0], collapse = " "),
  paste(x$text[nrc_data$surprise > 0], collapse = " "),
  paste(x$text[nrc_data$trust > 0], collapse = " ")
)

# clean the text
library(tm)
library(wordcloud)

# function to make the text suitable for analysis
clean.text <- function(x) {
  # convert to lower case
  x <- tolower(x)
  # remove the retweet marker "rt" (word boundaries so that "rt" inside
  # other words, e.g. "support", stays intact)
  x <- gsub("\\brt\\b", "", x)
  # remove @mentions
  x <- gsub("@\\w+", "", x)
  # remove punctuation
  x <- gsub("[[:punct:]]", "", x)
  # remove numbers
  x <- gsub("[[:digit:]]", "", x)
  # remove http links
  x <- gsub("http\\w+", "", x)
  # collapse tabs and runs of spaces into a single space
  x <- gsub("[ |\t]{2,}", " ", x)
  # remove blank spaces at the beginning
  x <- gsub("^ ", "", x)
  # remove blank spaces at the end
  x <- gsub(" $", "", x)
  return(x)
}
all <- clean.text(all)

# remove stop-words and the query term itself (everything is lower-case
# after clean.text)
all <- removeWords(all, c(stopwords("english"), "icardi"))

# create corpus
corpus <- Corpus(VectorSource(all))
# create term-document matrix
tdm <- TermDocumentMatrix(corpus)
# convert it to a plain matrix
tdm <- as.matrix(tdm)
# add column names, one per emotion document
colnames(tdm) <- c('anger', 'anticipation', 'disgust', 'fear',
                   'joy', 'sadness', 'surprise', 'trust')

# Plot the comparison wordcloud
layout(matrix(c(1, 2), nrow = 2), heights = c(1, 4))
par(mar = rep(0, 4))
plot.new()
text(x = 0.5, y = 0.5, 'Emotion Comparison Word Cloud')
comparison.cloud(tdm, random.order = FALSE,
                 colors = c("#00B2FF", "red", "#FF0099", "#6600CC",
                            "green", "orange", "blue", "brown"),
                 title.size = 1.5, max.words = 250)
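###########################################################################
# Optional: write the comparison cloud to a PNG file instead of the
# plotting window -- a minimal sketch; the file name "emotion_cloud.png"
# and the 800x800 size are just example values:
###########################################################################
png("emotion_cloud.png", width = 800, height = 800)
layout(matrix(c(1, 2), nrow = 2), heights = c(1, 4))
par(mar = rep(0, 4))
plot.new()
text(x = 0.5, y = 0.5, 'Emotion Comparison Word Cloud')
comparison.cloud(tdm, random.order = FALSE,
                 colors = c("#00B2FF", "red", "#FF0099", "#6600CC",
                            "green", "orange", "blue", "brown"),
                 title.size = 1.5, max.words = 250)
dev.off()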