# Sentiment and emotion analysis of tweets mentioning "Salvini".
#
# Steps:
#   1. Download tweets with search_tweets() and build a stemmed
#      document-feature matrix (dfm).
#   2. Score every tweet with the Lexicoder Sentiment Dictionary (LSD2015)
#      and plot the daily average sentiment next to the daily tweet volume.
#   3. Score every tweet with the NRC emotion lexicon, draw a comparison
#      word cloud of the eight emotions, and plot the daily average emotions.
#
# NOTE(review): search_tweets() presumably needs Twitter API credentials to
# be configured beforehand; nothing in this script sets them up.
#
# The workspace is intentionally NOT wiped with rm(list = ls()); run the
# script in a fresh R session instead.

# Packages ----
# All packages are attached up front, in the order the script originally
# loaded them (plyr after dplyr).
library(rtweet)
library(ggplot2)
library(dplyr)
library(readtext)
library(quanteda)
library(cowplot)
library(gridExtra)
library(plyr)
library(syuzhet)
library(quanteda.textplots)
library(psych)
library(PerformanceAnalytics)
library(reshape2)

# Working directory ----
# The project folder is machine-specific, so only switch to it when it
# actually exists; otherwise stay in the current directory.
getwd()
project_dir <- "C:/Users/luigi/Dropbox/TOPIC MODEL"
if (dir.exists(project_dir)) {
  setwd(project_dir)
}
getwd()

# Download the tweets ----
rt <- search_tweets(
  "Salvini",
  n = 1000,
  include_rts = FALSE,
  lang = "en",
  type = "mixed"
)
# head() avoids padding with NA when fewer than 20 tweets are returned.
print(head(rt$lang, 20))
length(rt$lang)

# Convert the creation timestamp to a calendar date.
str(rt$created_at)
rt$date <- as.Date(rt$created_at)
str(rt$date)
table(rt$date)

# Corpus, tokens and dfm ----
myCorpusTwitter <- corpus(rt)
tok2 <- tokens(
  myCorpusTwitter,
  remove_punct = TRUE,
  remove_numbers = TRUE,
  remove_symbols = TRUE,
  split_hyphens = TRUE,
  remove_separators = TRUE,
  remove_url = TRUE
)
tok2 <- tokens_remove(tok2, stopwords("en"))
tok2 <- tokens_wordstem(tok2)
Salvini_dfm <- dfm(tok2)
topfeatures(Salvini_dfm, 20) # 20 top words

###########################################################################
# Trend of the overall sentiment per day about Salvini over all the tweets
###########################################################################

# Keep only the first two LSD2015 keys (selected by position with [1:2]).
sentiment <- dfm_lookup(
  Salvini_dfm,
  dictionary = data_dictionary_LSD2015[1:2]
)
head(sentiment, 10)
Dictionary <- convert(sentiment, to = "data.frame")
str(Dictionary)
# Use the full column names; the abbreviated `posit` / `negat` only worked
# through `$` partial matching on data frames.
Dictionary$Sentiment <- Dictionary$positive - Dictionary$negative
str(Dictionary)
summary(Dictionary$Sentiment)

# Plot the sentiment vs. the volume.
# Add the sentiment values back to the data frame downloaded from Twitter.
rt$Sentiment <- Dictionary$Sentiment
colnames(rt)

# Daily summaries of the results (average sentiment and number of tweets).
# plyr::summarize is named explicitly so the result does not depend on
# whether plyr or dplyr was attached last.
daily <- plyr::ddply(
  rt, ~ date, plyr::summarize,
  num_tweets = length(Sentiment),
  ave_sentiment = mean(Sentiment)
)
str(daily)

# Correlation between the volume of discussion and sentiment.
cor(daily$ave_sentiment, daily$num_tweets)

# Plot the daily sentiment vs. volume.
# The plots get their own names so the `sentiment` dfm is not overwritten.
sentiment_plot <- ggplot(daily, aes(x = date, y = ave_sentiment)) +
  geom_line(linetype = "dashed", colour = "red") +
  ggtitle("Salvini Sentiment") +
  xlab("Day") +
  ylab("Sentiment") +
  theme_gray(base_size = 12)

volume_plot <- ggplot(daily, aes(x = date, y = num_tweets)) +
  geom_line() +
  ggtitle("Salvini #") +
  xlab("Day") +
  ylab("Volume") +
  theme_gray(base_size = 12)

grid.arrange(sentiment_plot, volume_plot, ncol = 1)

###################################
# Apply the NRC dictionaries, which have more categories than simply
# positive-negative
###################################

nrc_data <- get_nrc_sentiment(rt$text, language = "english")
str(nrc_data)

# The first eight columns of the NRC result are used as the emotions
# throughout the rest of the script.
emotion_cols <- colnames(nrc_data)[1:8]
emotion_cols

# Read the tweets that include more than two anger words.
rt$text[nrc_data$anger > 2]

# Add back the column with the texts.
nrc_data$text <- rt$text
str(nrc_data)

# Word cloud of emotions ----
# First group the texts according to their emotions: the unit of analysis
# (the rows) becomes one pooled text per emotion, built from every tweet
# that contains at least one word of that emotion.
emotion_texts <- vapply(
  emotion_cols,
  function(emotion) {
    paste(rt$text[nrc_data[[emotion]] > 0], collapse = " ")
  },
  character(1),
  USE.NAMES = FALSE
)
str(emotion_texts)

recentCorpus <- corpus(emotion_texts)
summary(recentCorpus)

# Add the proper emotion name to the docnames of the corpus.
docnames(recentCorpus) <- emotion_cols
summary(recentCorpus)

tok2 <- tokens(
  recentCorpus,
  remove_punct = TRUE,
  remove_numbers = TRUE,
  remove_symbols = TRUE,
  split_hyphens = TRUE,
  remove_separators = TRUE,
  remove_url = TRUE
)
tok2 <- tokens_remove(tok2, stopwords("en"))
# Stem the same token object that feeds the dfm (the original stemmed an
# undefined object `tok`, which raised an error).
tok2 <- tokens_wordstem(tok2, language = "english")
Mydfm <- dfm(tok2)
Mydfm

# Remove the feature "salvini" (not very informative given that all tweets
# include such feature).
MyDfm2 <- dfm_remove(Mydfm, c("salvini"))

set.seed(123)
textplot_wordcloud(
  MyDfm2,
  min_count = 6,
  rotation = 0.25,
  comparison = TRUE,
  color = c(
    "#00B2FF", "red", "#FF0099", "#6600CC",
    "green", "orange", "blue", "brown"
  ),
  labelsize = 1.5
)

###################################
# Estimate the daily average emotions with respect to Salvini and plot them
###################################

str(nrc_data)
nrc_data$date <- rt$date
str(nrc_data)

daily_emotions <- aggregate(
  nrc_data[, emotion_cols],
  by = list(day = nrc_data$date),
  FUN = mean
)
str(daily_emotions)
head(daily_emotions)

# Correlations between the daily emotion averages. Columns are selected by
# name so that all eight emotions are included and `day` is left out.
corr.test(daily_emotions[emotion_cols])
chart.Correlation(daily_emotions[emotion_cols])

# Reshape to long format (one row per day and emotion) for plotting.
df.long <- melt(daily_emotions, id.vars = c("day"))
str(df.long)
head(df.long)

# NOTE(review): the "Roboto Condensed" font presumably has to be installed
# and registered on the plotting device.
ggplot(df.long, aes(x = day, y = value, color = variable)) +
  geom_point() +
  geom_smooth(method = "loess") +
  facet_wrap(~ variable, scales = "free_y", nrow = 2) +
  theme_bw() +
  labs(
    title = "Emotional analysis of Twitter statuses over time",
    subtitle = "Tweets aggregated by day on topics of Salvini"
  ) +
  theme(
    text = element_text(family = "Roboto Condensed"),
    plot.title = element_text(face = "bold"),
    legend.position = "bottom",
    axis.text = element_text(size = 9),
    legend.title = element_blank()
  )