rm(list = ls(all = TRUE))
getwd()
setwd("C:/Users/luigi/Dropbox/TOPIC MODEL")
getwd()

library(rtweet)
library(ggplot2)
library(dplyr)
library(readtext)
library(quanteda)
library(cowplot)
library(gridExtra)
library(plyr)
library(syuzhet)
library(psych)
library(PerformanceAnalytics)
library(reshape2)
library(quanteda.textplots)
library(quanteda.textstats)
library(leaflet)

# Let's run a query about Giorgia Meloni (excluding tweets that mention @Chris_Meloni - a popular
# Twitter user with no relation at all to Giorgia Meloni!)
rt <- search_tweets("Meloni -@Chris_Meloni", n = 2000, include_rts = FALSE, lang = "en", type = "mixed")

# Note the following: Quanteda has trouble with the Saxon genitive. When you tokenize a text containing
# the expression "meloni's", it treats it as a single token. If you later want to remove the feature
# "meloni" from your dfm (for example via dfm_remove), you still can, but you won't remove "meloni's",
# given that it is a distinct feature from "meloni". So if you intend to do that, it is better to deal
# with it from the very beginning:
rt$text <- gsub("'", " ", rt$text) # replace the apostrophe with a space

print(rt$lang[1:20])
length(rt$lang)

# I want to convert the timestamp to a date
str(rt$created_at)
rt$date <- as.Date(rt$created_at)
str(rt$date)
table(rt$date)

myCorpusTwitter <- corpus(rt)
tok2 <- tokens(myCorpusTwitter, remove_punct = TRUE, remove_numbers = TRUE, remove_symbols = TRUE,
               split_hyphens = TRUE, remove_separators = TRUE, remove_url = TRUE)
tok2 <- tokens_remove(tok2, stopwords("en"))
tok2 <- tokens_wordstem(tok2)
Meloni_dfm <- dfm(tok2)
topfeatures(Meloni_dfm, 20)

# min_nchar: the minimum length in characters for the features to be kept (i.e., drop one-character features)
Meloni_dfm <- dfm_remove(Meloni_dfm, min_nchar = 2)
topfeatures(Meloni_dfm, 30)
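# Before moving on, a quick toy illustration (a minimal sketch on a made-up string, not part of the
# pipeline above) of the Saxon-genitive point made earlier: without the gsub() fix, "meloni's" survives
# as its own feature, and removing "meloni" leaves it untouched.
toy <- c(d1 = "Meloni's speech divided Meloni supporters")
featnames(dfm(tokens(tolower(toy))))                       # both "meloni's" and "meloni" show up
featnames(dfm_remove(dfm(tokens(tolower(toy))), "meloni")) # "meloni's" is still there
featnames(dfm(tokens(tolower(gsub("'", " ", toy)))))       # after the fix, only "meloni" remains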
# Let's plot the network of the top-30 most frequent features recovered above, in terms of their
# co-occurrence in tweets within a window of size 5. That means that we focus on the five words before
# and after a given target word (say "meloni") in each tweet, and we count the number of times any of
# the other top-30 most frequent features appears there.
tok3 <- tokens_tolower(tok2)
co_occurance <- fcm(tok3, context = "window", window = 5)
co_occurance
feat <- names(topfeatures(Meloni_dfm, 30))
select <- fcm_select(co_occurance, pattern = feat)
set.seed(123)
textplot_network(select)

# In this case window = 1, i.e., we count only those words that appear right next to each other
tok3 <- tokens_tolower(tok2)
co_occurance2 <- fcm(tok3, context = "window", window = 1)
co_occurance2
feat <- names(topfeatures(Meloni_dfm, 30))
select2 <- fcm_select(co_occurance2, pattern = feat)
set.seed(123)
textplot_network(select2)

###########################################################################
# Let's suppose we want to see the trend of the overall sentiment per day about Meloni across all the tweets
###########################################################################
sentiment <- dfm_lookup(Meloni_dfm, dictionary = data_dictionary_LSD2015[1:2])
head(sentiment, 10)
Dictionary <- convert(sentiment, to = "data.frame")
str(Dictionary)
Dictionary$Sentiment <- Dictionary$positive - Dictionary$negative
str(Dictionary)
summary(Dictionary$Sentiment)
rt$Sentiment <- Dictionary$Sentiment # add the sentiment values back to the data frame we got via Twitter
colnames(rt)

# Get daily summaries of the results (average sentiment and number of tweets)
daily <- ddply(rt, ~ date, summarize, num_tweets = length(Sentiment), ave_sentiment = mean(Sentiment))
str(daily)

# Plot the daily sentiment vs. volume
sentiment <- ggplot(daily, aes(x = date, y = ave_sentiment)) +
  geom_line(linetype = "dashed", colour = "red") +
  ggtitle("Meloni Sentiment") + xlab("Day") + ylab("Sentiment") +
  theme_gray(base_size = 12)
volume <- ggplot(daily, aes(x = date, y = num_tweets)) +
  geom_line() +
  ggtitle("Meloni #") + xlab("Day") + ylab("Volume") +
  theme_gray(base_size = 12)
grid.arrange(sentiment, volume, ncol = 1)
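# Side note (a sketch, not required by the pipeline): the same daily summary computed above with
# plyr::ddply() can be obtained with dplyr, which is already loaded. Since plyr was attached after
# dplyr, its summarise() masks the dplyr one, so the dplyr:: prefix is spelled out explicitly here.
daily_alt <- rt %>%
  dplyr::group_by(date) %>%
  dplyr::summarise(num_tweets = dplyr::n(), ave_sentiment = mean(Sentiment))
str(daily_alt)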
###################################
# Let's apply the NRC dictionary, which has more categories than simply positive/negative
###################################
# takes around 50 seconds
system.time(nrc_data <- get_nrc_sentiment(rt$text, language = "english"))
str(nrc_data)

# I want to read the tweets that include more than 3 anger words
rt$text[nrc_data$anger > 3]

# Let's add the column with the texts back to the results we just obtained
nrc_data$text <- rt$text
str(nrc_data)

# Let's plot the wordcloud of emotions!
# First: let's group the tweets according to their emotions, i.e., our unit of analysis (our rows) will
# now be the tweets grouped by the 8 emotions (not the single tweets!). For example, we paste all the
# tweets that contain at least 2 anger words, i.e., [nrc_data$anger > 1], together into one single big
# text belonging to the "anger" emotion, and so forth for all the other emotions.
all <- c(
  paste(rt$text[nrc_data$anger > 1], collapse = " "),
  paste(rt$text[nrc_data$anticipation > 1], collapse = " "),
  paste(rt$text[nrc_data$disgust > 1], collapse = " "),
  paste(rt$text[nrc_data$fear > 1], collapse = " "),
  paste(rt$text[nrc_data$joy > 1], collapse = " "),
  paste(rt$text[nrc_data$sadness > 1], collapse = " "),
  paste(rt$text[nrc_data$surprise > 1], collapse = " "),
  paste(rt$text[nrc_data$trust > 1], collapse = " ")
)
str(all)
recentCorpus <- corpus(all)
summary(recentCorpus)

# Let's assign the proper emotion names as the docnames of our corpus, and also save them as a docvar
colnames(nrc_data)[1:8]
docnames(recentCorpus) <- colnames(nrc_data)[1:8]
docvars(recentCorpus, field = "Emotions") <- colnames(nrc_data)[1:8]
summary(recentCorpus)

tok2 <- tokens(recentCorpus, remove_punct = TRUE, remove_numbers = TRUE, remove_symbols = TRUE,
               split_hyphens = TRUE, remove_separators = TRUE, remove_url = TRUE)
tok2 <- tokens_remove(tok2, stopwords("en"))
tok2 <- tokens_wordstem(tok2, language = "english") # note: stem the same object we keep using (tok2, not tok)
Mydfm <- dfm(tok2)
Mydfm <- dfm_remove(Mydfm, min_nchar = 2)
Mydfm

# Let's remove the features "giorgia" and "meloni" (not very informative, given that all tweets include them)
MyDfm2 <- dfm_remove(Mydfm, c("giorgia", "meloni"))
topfeatures(MyDfm2, groups = Emotions)

set.seed(123)
textplot_wordcloud(MyDfm2, min_count = 6, rotation = 0.25, comparison = TRUE,
                   color = c("#00B2FF", "red", "#FF0099", "#6600CC", "green", "orange", "blue", "brown"),
                   labelsize = 1.5)

# As we learnt last week, we can also save (and then eventually plot) the frequency of the top features
# in a dfm (in this case the top-20 features by emotion)
features_dfm <- textstat_frequency(MyDfm2, n = 20, groups = Emotions)
str(features_dfm)

# Let's now plot, for example, the top-20 words that appear in the anger tweets and their relative frequency
ggplot(features_dfm[which(features_dfm$group == "anger"), ],
       aes(x = reorder(feature, frequency), y = frequency, fill = frequency)) +
  geom_bar(stat = "identity") + coord_flip() + xlab(label = "Anger words")

###################################
# Let's estimate the daily average emotions with respect to Meloni and plot them
###################################
str(nrc_data)
nrc_data$date <- rt$date
str(nrc_data)
daily_emotions <- aggregate(nrc_data[, 1:8], by = list(day = nrc_data$date), FUN = mean)
str(daily_emotions)
head(daily_emotions)
corr.test(daily_emotions[c(2:9)])         # columns 2:9 hold the 8 emotions (column 1 is the day)
chart.Correlation(daily_emotions[c(2:9)])

# Let's plot it
df.long <- melt(daily_emotions, id.vars = c("day"))
str(df.long)
head(df.long)
ggplot(df.long, aes(x = day, y = value, color = variable)) +
  geom_point() +
  geom_smooth(method = "loess") +
  facet_wrap(~ variable, scale = "free_y", nrow = 2) +
  theme_bw() +
  labs(title = "Emotional analysis of Twitter statuses over time",
       subtitle = "Tweets aggregated by day on topics of Meloni") +
  theme(text = element_text(family = "Roboto Condensed"), # requires the Roboto Condensed font installed
        plot.title = element_text(face = "bold"),
        legend.position = "bottom",
        axis.text = element_text(size = 9),
        legend.title = element_blank())
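# Side note (a sketch, an optional robustness check rather than part of the exercise):
# get_nrc_sentiment() returns raw word counts, so longer tweets mechanically score higher on every
# emotion. A common normalization is to divide each emotion count by the tweet's length in tokens
# before aggregating by day.
tweet_len <- ntoken(tokens(rt$text))              # number of tokens per tweet
nrc_norm <- nrc_data[, 1:8] / pmax(tweet_len, 1)  # pmax avoids division by zero on empty texts
daily_emotions_norm <- aggregate(nrc_norm, by = list(day = rt$date), FUN = mean)
head(daily_emotions_norm)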
###################################
###################################
###################################
# Let's run a sentiment analysis with geographical data
###################################
###################################
###################################
lookup_coords("United States")
rt <- search_tweets("trump", n = 5000, geocode = lookup_coords("United States"))

# How many tweets with a non-missing place name have we recovered from our query?
table(!is.na(rt$place_name))

myCorpusTwitter <- corpus(rt)
tok2 <- tokens(myCorpusTwitter, remove_punct = TRUE, remove_numbers = TRUE, remove_symbols = TRUE,
               split_hyphens = TRUE, remove_separators = TRUE, remove_url = TRUE)
tok2 <- tokens_remove(tok2, stopwords("en"))
tok2 <- tokens_wordstem(tok2)
trump_dfm <- dfm(tok2)
topfeatures(trump_dfm, 20) # top-20 words

sentiment <- dfm_lookup(trump_dfm, dictionary = data_dictionary_LSD2015[1:2])
head(sentiment, 10)
Dictionary <- convert(sentiment, to = "data.frame")
str(Dictionary)
Dictionary$Diff <- Dictionary$positive - Dictionary$negative
Dictionary$Sentiment <- as.factor(ifelse(Dictionary$Diff > 0, "pos", "neg"))
table(Dictionary$Sentiment)

# Create lat/lng variables using all available tweet and profile geo-location data
rtll <- lat_lng(rt)
rtll$Sentiment <- Dictionary$Sentiment

# Plot the results with the sentiment
map.data <- map_data("usa") # for the US you just specify "usa" rather than subsetting "world"
str(map.data)
points <- data.frame(x = rtll$lng, y = rtll$lat, Sentiment = rtll$Sentiment)
ggplot(map.data) +
  geom_map(aes(map_id = region), map = map.data, fill = "white", color = "grey20", size = 0.25) +
  expand_limits(x = map.data$long, y = map.data$lat) +
  theme(axis.line = element_blank(),
        axis.text = element_blank(),
        axis.ticks = element_blank(),
        axis.title = element_blank(),
        plot.margin = unit(0 * c(-1.5, -1.5, -1.5, -1.5), "lines")) +
  geom_point(data = points, aes(x, y, color = Sentiment), size = 3, alpha = 1/3) +
  scale_color_manual(values = c("red", "green"))

# Alternative plot via leaflet
str(rtll$Sentiment)
getColor <- ifelse(rtll$Sentiment == "pos", "green", "red")
icons <- awesomeIcons(icon = "ios-close", iconColor = "black", library = "ion", markerColor = getColor)
m <- leaflet(rtll)
m <- addTiles(m)
m <- addAwesomeMarkers(m, lng = rtll$lng, lat = rtll$lat, icon = icons, popup = rtll$text)
m
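# Side note (a sketch, assuming the ~5,000 markers overlap too much to read at the national zoom
# level): leaflet can also cluster nearby markers via clusterOptions; clicking or zooming in expands
# each cluster into its individual sentiment markers.
m2 <- leaflet(rtll)
m2 <- addTiles(m2)
m2 <- addAwesomeMarkers(m2, lng = rtll$lng, lat = rtll$lat, icon = icons, popup = rtll$text,
                        clusterOptions = markerClusterOptions())
m2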