# Sentiment and emotion analysis of English-language tweets about "Salvini".
#
# Steps:
#   1. Download tweets with rtweet and build a stemmed quanteda dfm.
#   2. Score every tweet with the positive/negative keys of the LSD2015
#      dictionary and plot the daily average sentiment against tweet volume.
#   3. Score every tweet with the NRC emotion lexicon (syuzhet), then draw a
#      bar plot and a comparison word cloud of the eight emotions.
#   4. Average the NRC emotions per day, inspect their correlations and plot
#      their trends.

# NOTE(review): wiping the global environment and changing the working
# directory are side effects best avoided in shared scripts; they are kept
# here so the script behaves as before on the author's machine.
rm(list = ls(all = TRUE))

getwd()
# Only switch directory when the folder exists, so the script does not abort
# on machines that lack this personal path.
project_dir <- "C:/Users/luigi/Dropbox/TOPIC MODEL"
if (dir.exists(project_dir)) {
  setwd(project_dir)
}
getwd()

# The attach order is kept exactly as in the original script because it
# decides which package wins when two of them export the same name (plyr is
# attached after dplyr; tm, psych, ... are attached further down).
library(rtweet)
library(ggplot2)
library(dplyr)
library(readtext)
library(quanteda)
library(cowplot)
library(gridExtra)
library(plyr)
library(syuzhet)
library(wordcloud)

rt <- search_tweets("Salvini", n = 1000, include_rts = FALSE, lang = "en")
# head() avoids padding the output with NA when fewer than 20 tweets return.
print(head(rt$lang, 20))
length(rt$lang)

# Convert the creation timestamp to a calendar date.
str(rt$created_at)
rt$date <- as.Date(rt$created_at)
str(rt$date)
table(rt$date)

myCorpusTwitter <- corpus(rt)
tok2 <- tokens(
  myCorpusTwitter,
  remove_punct = TRUE,
  remove_numbers = TRUE,
  remove_symbols = TRUE,
  split_hyphens = TRUE,
  remove_separators = TRUE
)
tok2 <- tokens_remove(tok2, stopwords("en"))
# NOTE(review): the dictionary lookup below runs on stemmed tokens; stems may
# not match every dictionary pattern -- confirm this is intended.
tok2 <- tokens_wordstem(tok2)
Salvini_dfm <- dfm(tok2)
topfeatures(Salvini_dfm, 20) # 20 top words

###########################################################################
# Trend of the overall sentiment per day about Salvini over all the tweets
###########################################################################

sentiment <- dfm_lookup(Salvini_dfm, dictionary = data_dictionary_LSD2015[1:2])
head(sentiment, 10)
Dictionary <- convert(sentiment, to = "data.frame")
str(Dictionary)
# Use the full column names instead of relying on `$` partial matching
# ("posit" / "negat").
Dictionary$Sentiment <- Dictionary$positive - Dictionary$negative
str(Dictionary)
summary(Dictionary$Sentiment)

# Plot the sentiment vs. the volume.
# Add the sentiment values back to the data frame obtained from Twitter.
rt$Sentiment <- Dictionary$Sentiment
colnames(rt)

# Daily summaries of the results (average sentiment and number of tweets).
# The plyr functions are namespace-qualified so the call does not depend on
# plyr masking dplyr's summarize().
daily <- plyr::ddply(
  rt, ~ date, plyr::summarize,
  num_tweets = length(Sentiment),
  ave_sentiment = mean(Sentiment)
)
str(daily)

# Correlation between the volume of discussion and sentiment.
cor(daily$ave_sentiment, daily$num_tweets)

# Plot the daily sentiment vs. volume.
sentiment <- ggplot(daily, aes(x = date, y = ave_sentiment)) +
  geom_line(linetype = "dashed", colour = "red") +
  ggtitle("Salvini Sentiment") +
  xlab("Day") +
  ylab("Sentiment") +
  theme_gray(base_size = 12)

volume <- ggplot(daily, aes(x = date, y = num_tweets)) +
  geom_line() +
  ggtitle("Salvini #") +
  xlab("Day") +
  ylab("Volume") +
  theme_gray(base_size = 12)

grid.arrange(sentiment, volume, ncol = 1)

###################################
# Apply the NRC dictionaries, which have more categories than simply
# positive-negative
###################################

nrc_data <- get_nrc_sentiment(rt$text, language = "english")
str(nrc_data)

# The eight emotion columns used throughout the rest of the script.
emotion_cols <- c(
  "anger", "anticipation", "disgust", "fear",
  "joy", "sadness", "surprise", "trust"
)

# Read the tweets that include more than two anger words.
rt$text[nrc_data$anger > 2]

barplot(
  sort(colSums(prop.table(nrc_data[, emotion_cols]))),
  horiz = TRUE,
  cex.names = 0.7,
  las = 1,
  main = "Emotions in tweets discussing about Salvini",
  xlab = "Percentage"
)

# Word cloud of emotions: one pseudo-document per emotion, made of every
# tweet that contains at least one word of that emotion.
# (`all` is kept as the object name; calls to base all() still resolve to the
# function.)
all <- c(
  paste(rt$text[nrc_data$anger > 0], collapse = " "),
  paste(rt$text[nrc_data$anticipation > 0], collapse = " "),
  paste(rt$text[nrc_data$disgust > 0], collapse = " "),
  paste(rt$text[nrc_data$fear > 0], collapse = " "),
  paste(rt$text[nrc_data$joy > 0], collapse = " "),
  paste(rt$text[nrc_data$sadness > 0], collapse = " "),
  paste(rt$text[nrc_data$surprise > 0], collapse = " "),
  paste(rt$text[nrc_data$trust > 0], collapse = " ")
)
str(all)

# Clean the text.
library(tm)
library(wordcloud)

# Make raw tweet text suitable for a term-document matrix.
#
# x: character vector of texts.
# Returns a character vector of the same length: lower-cased, with the
# standalone retweet marker "rt", @mentions, punctuation, digits and links
# removed, and whitespace collapsed to single spaces and trimmed.
clean.text <- function(x) {
  x <- tolower(x)
  # Remove only the standalone token "rt"; an unanchored "rt" would also
  # strip the letters from inside ordinary words (e.g. "party" -> "pay").
  x <- gsub("\\brt\\b", "", x)
  # Remove @mentions.
  x <- gsub("@\\w+", "", x)
  # Remove punctuation.
  x <- gsub("[[:punct:]]", "", x)
  # Remove numbers.
  x <- gsub("[[:digit:]]", "", x)
  # Remove links. This must come after the punctuation and digit removal:
  # by then a URL has become one run of letters starting with "http".
  x <- gsub("http\\w+", "", x)
  # Collapse whitespace runs to a single space. Replacing them with "" (as
  # the script previously did) glued neighbouring words together.
  x <- gsub("[[:space:]]+", " ", x)
  # Remove blank space at the beginning and at the end.
  trimws(x)
}

all <- clean.text(all)
# Remove stop-words.
all <- removeWords(all, stopwords("english"))
# Create corpus.
corpus <- Corpus(VectorSource(all))
# Create term-document matrix.
tdm <- TermDocumentMatrix(corpus)
# Convert to a plain matrix.
tdm <- as.matrix(tdm)
# Add column names (same order as the pseudo-documents built above).
colnames(tdm) <- c(
  'anger', 'anticipation', 'disgust', 'fear',
  'joy', 'sadness', 'surprise', 'trust'
)

# Plot comparison word cloud: a narrow title panel above the cloud itself.
layout(matrix(c(1, 2), nrow = 2), heights = c(1, 4))
par(mar = rep(0, 4))
plot.new()
text(x = 0.5, y = 0.5, 'Emotion Comparison Word Cloud for tweets about Salvini')
comparison.cloud(
  tdm,
  random.order = FALSE,
  colors = c(
    "#00B2FF", "red", "#FF0099", "#6600CC",
    "green", "orange", "blue", "brown"
  ),
  title.size = 1.5,
  max.words = 250
)

###################################
# Estimate the daily average emotions with respect to Salvini and plot them
###################################

str(nrc_data)
nrc_data$date <- rt$date
str(nrc_data)

daily_emotions <- aggregate(
  nrc_data[, emotion_cols],
  by = list(day = nrc_data$date),
  FUN = mean
)
str(daily_emotions)
head(daily_emotions)

# Select the emotions by name: column 1 of daily_emotions is `day`, so the
# former index range 2:8 silently left out the eighth emotion ("trust").
library(psych)
corr.test(daily_emotions[emotion_cols])

library(PerformanceAnalytics)
chart.Correlation(daily_emotions[emotion_cols])

# Plot the daily emotions, one panel per emotion.
library(reshape2)
df.long <- melt(daily_emotions, id.vars = c("day"))
str(df.long)
head(df.long)

ggplot(df.long, aes(x = day, y = value, color = variable)) +
  geom_point() +
  geom_smooth(method = "loess") +
  # `scales` spelled out in full rather than relying on partial matching.
  facet_wrap(~ variable, scales = "free_y", nrow = 2) +
  theme_bw() +
  labs(
    title = "Emotional analysis of Twitter statuses over time",
    subtitle = "Tweets aggregated by day on topics of Salvini"
  ) +
  theme(
    text = element_text(family = "Roboto Condensed"),
    plot.title = element_text(face = "bold"),
    legend.position = "bottom",
    axis.text = element_text(size = 9),
    legend.title = element_blank()
  )