# Sentiment and emotion analysis of English-language tweets mentioning
# "Salvini": dictionary-based sentiment per tweet and per day, comparison of
# several sentiment dictionaries, and an emotion comparison word cloud.
#
# Run this script in a fresh R session. It deliberately does not clear the
# workspace (rm(list = ls())) and does not attach() any data frame.

# Setup ----

# Only switch to the project directory when it exists on this machine, so the
# script does not abort on other computers.
getwd()
project_dir <- "C:/Users/mw/Dropbox (VOICES)/TOPIC MODEL"
if (dir.exists(project_dir)) {
  setwd(project_dir)
}
getwd()

library(rtweet)
library(ggplot2)
library(dplyr)
library(readtext)
library(quanteda)
library(httpuv)
library(maps)
library(leaflet)
library(lattice)
library(cowplot)
library(gridExtra)
# plyr is attached after dplyr and therefore masks several dplyr verbs;
# the plyr calls below are namespace-qualified to make that explicit.
library(plyr)
library(syuzhet)
library(wordcloud)

# Download the tweets ----

rt <- search_tweets("Salvini", n = 2000, include_rts = FALSE, lang = "en")
print(rt$lang[1:20])
length(rt$lang)

# Convert the POSIXct timestamp to a calendar date. Greenwich Mean Time (GMT)
# is used here; another time zone can be chosen, e.g. tz = "Hongkong".
str(rt$created_at)
rt$date <- as.Date(rt$created_at, tz = "GMT")
str(rt$date)
table(rt$date)

# NOTE(review): texts() and the pre-processing arguments passed to dfm() below
# follow the quanteda API this script was written against; newer quanteda
# releases deprecate them -- confirm against the installed version.
myCorpusTwitter <- corpus(rt)
texts(myCorpusTwitter)[1:2]
# number of documents
ndoc(myCorpusTwitter)
# inspect the document-level variables
head(docvars(myCorpusTwitter))

# Daily trend of the overall sentiment about Salvini ----

# Count positive and negative Lexicoder (LSD2015) entries in every tweet.
sentiment_dfm <- dfm(
  myCorpusTwitter,
  remove = stopwords("english"),
  remove_punct = TRUE,
  remove_numbers = TRUE,
  tolower = TRUE,
  stem = TRUE,
  remove_url = TRUE,
  dictionary = data_dictionary_LSD2015[1:2]
)
head(sentiment_dfm, 10)

Dictionary <- convert(sentiment_dfm, to = "data.frame")
str(Dictionary)
# Net sentiment = positive hits minus negative hits. The full column names are
# spelled out instead of relying on partial matching of `$`.
Dictionary$Sentiment <- Dictionary[["positive"]] - Dictionary[["negative"]]
str(Dictionary)
summary(Dictionary$Sentiment)

# Sentiment vs. volume ----

# Add the sentiment values back to the data frame obtained from Twitter.
rt$Sentiment <- Dictionary$Sentiment
colnames(rt)

# Daily summaries: number of tweets and average sentiment.
daily <- plyr::ddply(
  rt,
  ~ date,
  plyr::summarize,
  num_tweets = length(Sentiment),
  ave_sentiment = mean(Sentiment)
)
str(daily)

# correlation between the volume of discussion and sentiment
cor(daily$ave_sentiment, daily$num_tweets)

# plot the daily sentiment vs. volume
sentiment_plot <- ggplot(daily, aes(x = date, y = ave_sentiment)) +
  geom_line(linetype = "dashed", colour = "red") +
  ggtitle("Salvini Sentiment") +
  xlab("Day") +
  ylab("Sentiment") +
  theme_gray(base_size = 12)

volume_plot <- ggplot(daily, aes(x = date, y = num_tweets)) +
  geom_line() +
  ggtitle("Salvini #") +
  xlab("Day") +
  ylab("Volume") +
  theme_gray(base_size = 12)

grid.arrange(sentiment_plot, volume_plot, ncol = 1)

# Other sentiment dictionaries ----

colnames(rt)
rt$sentiment_syuzhet <- get_sentiment(rt$text, method = "syuzhet")
rt$sentiment_nrc <- get_sentiment(rt$text, method = "nrc")
colnames(rt)

# Correlation between the three sentiment scores. Loaded here rather than at
# the top so the set of masked functions earlier in the script is unchanged.
library(PerformanceAnalytics)
set_sentiment <- cbind(
  Sentiment = rt$Sentiment,
  sentiment_nrc = rt$sentiment_nrc,
  sentiment_syuzhet = rt$sentiment_syuzhet
)
chart.Correlation(set_sentiment)

# NRC emotions ----

# A general dictionary with more categories than just positive/negative.
nrc_data <- get_nrc_sentiment(rt$text, language = "english")
str(nrc_data)

# Read the tweets that contain more than two anger words.
rt$text[nrc_data$anger > 2]

barplot(
  sort(colSums(prop.table(nrc_data[, 1:8]))),
  horiz = TRUE,
  cex.names = 0.7,
  las = 1,
  main = "Emotions in tweets discussing about Salvini",
  xlab = "Percentage"
)

# Emotion comparison word cloud ----

emotions <- c(
  "anger", "anticipation", "disgust", "fear",
  "joy", "sadness", "surprise", "trust"
)

# One long string per emotion: every tweet with at least one word of that
# emotion, pasted together.
emotion_texts <- vapply(
  emotions,
  function(emotion) {
    paste(rt$text[nrc_data[[emotion]] > 0], collapse = " ")
  },
  character(1),
  USE.NAMES = FALSE
)

# Loaded here (after quanteda) as in the original script, so that
# stopwords() in the dfm() call above still resolves to quanteda's version.
library(tm)

# Make raw tweet text suitable for word counting.
#
# x: character vector of texts.
# Returns a character vector of the same length: lower-cased, with URLs,
# @mentions, the standalone retweet marker "rt", punctuation and digits
# removed, whitespace runs collapsed to one space, and no leading or trailing
# whitespace.
clean.text <- function(x) {
  x <- tolower(x)
  # Remove links first, while their punctuation still delimits them.
  x <- gsub("http\\S+", "", x)
  # Remove @mentions.
  x <- gsub("@\\w+", "", x)
  # Remove the retweet marker only as a whole word; a bare "rt" pattern would
  # also eat the letters inside words such as "party" or "support".
  x <- gsub("\\brt\\b", "", x)
  x <- gsub("[[:punct:]]", "", x)
  x <- gsub("[[:digit:]]", "", x)
  # Collapse whitespace runs to a single space; deleting them outright would
  # glue the neighbouring words together.
  x <- gsub("\\s+", " ", x)
  trimws(x)
}

emotion_texts <- clean.text(emotion_texts)
# remove stop-words
emotion_texts <- removeWords(emotion_texts, tm::stopwords("english"))

# Term-document matrix with one column (document) per emotion.
emotion_corpus <- Corpus(VectorSource(emotion_texts))
tdm <- as.matrix(TermDocumentMatrix(emotion_corpus))
colnames(tdm) <- emotions

# Plot the comparison word cloud: a narrow title panel above the cloud.
layout(matrix(c(1, 2), nrow = 2), heights = c(1, 4))
old_par <- par(mar = rep(0, 4))
plot.new()
text(
  x = 0.5,
  y = 0.5,
  "Emotion Comparison Word Cloud for tweets about Salvini"
)
comparison.cloud(
  tdm,
  random.order = FALSE,
  colors = c(
    "#00B2FF", "red", "#FF0099", "#6600CC",
    "green", "orange", "blue", "brown"
  ),
  title.size = 1.5,
  max.words = 250
)
# Restore the graphics state so later plots are not affected.
par(old_par)
layout(1)