rm(list = ls(all = TRUE))
getwd()
### set your working directory here!
setwd("C:/Users/mw/Dropbox (VOICES)/TOPIC MODEL")
getwd()

library(rtweet)
library(ggplot2)
library(dplyr)
library(tidytext)
library(readtext)
library(quanteda)
library(httpuv)
library(maps)
library(leaflet)
library(lattice)
library(cowplot)
library(gridExtra)
library(ggthemes)
library(plyr)
library(syuzhet)

# authenticate with your own Twitter app credentials
token <- create_token(
  app = "my_twitter_research_app",
  consumer_key = "YOUR NUMBER",
  consumer_secret = "YOUR NUMBER",
  access_token = "YOUR NUMBER",
  access_secret = "YOUR NUMBER")
get_token()

rt <- search_tweets("Icardi", n = 2000, include_rts = FALSE, lang = "en")
print(rt$lang[1:20])
colnames(rt)

# Convert the POSIXct time stamp to a date. You can also change the time
# zone here (e.g. tz = "Hongkong"); I keep Greenwich Mean Time (GMT)
str(rt$created_at)
rt$date <- as.Date(rt$created_at, tz = "GMT")
str(rt$date)

myCorpusTwitter <- corpus(rt)
summary(myCorpusTwitter)
head(myCorpusTwitter)
texts(myCorpusTwitter)[1:2]
# number of documents
ndoc(myCorpusTwitter)
# inspect the document-level variables
head(docvars(myCorpusTwitter))

###########################################################################
# Suppose we want to see the trend of the overall daily sentiment about
# Icardi across all the tweets
###########################################################################
sentiment <- dfm(myCorpusTwitter,
                 remove = stopwords("english"),
                 remove_punct = TRUE,
                 remove_numbers = TRUE,
                 tolower = TRUE,
                 stem = TRUE,
                 remove_twitter = TRUE,
                 remove_url = TRUE,
                 dictionary = data_dictionary_LSD2015[1:2])
head(sentiment, 10)
str(sentiment)

Dictionary <- convert(sentiment, to = "data.frame")
str(Dictionary)
# net sentiment = positive minus negative dictionary hits per tweet
Dictionary$Sentiment <- Dictionary$positive - Dictionary$negative
str(Dictionary)
summary(Dictionary$Sentiment)

# Suppose we want to plot the sentiment against the volume of tweets
rt$Sentiment <- Dictionary$Sentiment
colnames(rt)

# get daily summaries of the results (sentiment and number of tweets)
daily <- ddply(rt, ~ date, summarize,
               num_tweets = length(Sentiment),
               ave_sentiment = mean(Sentiment))
str(daily)

# correlation between the volume of discussion and the sentiment
cor(daily$ave_sentiment, daily$num_tweets)

# plot the daily sentiment vs. volume (use new object names so we do not
# overwrite the dfm called "sentiment")
sentiment_plot <- ggplot(daily, aes(x = date, y = ave_sentiment)) +
  geom_line(linetype = "dashed", colour = "red") +
  ggtitle("Icardi Sentiment") + xlab("Day") + ylab("Sentiment") +
  theme_gray(base_size = 12)
volume_plot <- ggplot(daily, aes(x = date, y = num_tweets)) +
  geom_line() +
  ggtitle("Icardi #") + xlab("Day") + ylab("Volume") +
  theme_gray(base_size = 12)
grid.arrange(sentiment_plot, volume_plot, ncol = 1)

# the same volume plot with some alternative themes from ggthemes
ggplot(daily, aes(x = date, y = num_tweets)) + geom_line() +
  ggtitle("Icardi #") + xlab("Day") + ylab("Volume") +
  theme_economist()
ggplot(daily, aes(x = date, y = num_tweets)) + geom_line() +
  ggtitle("Icardi #") + xlab("Day") + ylab("Volume") +
  theme_stata() + scale_color_stata()
ggplot(daily, aes(x = date, y = num_tweets)) + geom_line() +
  ggtitle("Icardi #") + xlab("Day") + ylab("Volume") +
  theme_wsj() + scale_colour_wsj("colors6")

# Let's apply other dictionaries!
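###########################################################################
# Side note: quanteda >= 3 deprecates passing remove/remove_punct/etc.
# directly to dfm() as done above. A rough equivalent with the tokens()
# pipeline -- a sketch only, so check it against your installed quanteda
# version (the object names toks and sentiment_v3 are just examples):
###########################################################################
toks <- tokens(myCorpusTwitter,
               remove_punct = TRUE,
               remove_numbers = TRUE,
               remove_url = TRUE)
toks <- tokens_remove(toks, stopwords("english"))
# no stemming needed before the lookup: the LSD2015 entries are wildcard patterns
sentiment_v3 <- dfm(tokens_lookup(toks, dictionary = data_dictionary_LSD2015[1:2]))
head(sentiment_v3, 10)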
# save your results as a csv file
write_as_csv(rt, "twitter.csv", prepend_ids = TRUE, na = "",
             fileEncoding = "UTF-8")
# and then re-open it
x <- read.csv("twitter.csv")
str(x)
x$text <- as.character(x$text)

# score the tweets with two further general-purpose dictionaries
syuzhet_vector <- get_sentiment(x$text, method = "syuzhet")
nrc_vector <- get_sentiment(x$text, method = "nrc")
x$sentiment_syuzhet <- syuzhet_vector
x$sentiment_nrc <- nrc_vector

# correlation among the three sentiment scores
library(PerformanceAnalytics)
attach(x)
set_sentiment <- cbind(Sentiment, sentiment_nrc, sentiment_syuzhet)
chart.Correlation(set_sentiment)

# Another general-purpose dictionary: the NRC emotion lexicon
nrc_data <- get_nrc_sentiment(x$text, language = "english")
str(nrc_data)

# share of positive vs. negative term counts across all tweets
colSums(prop.table(nrc_data[, 9:10]))
# plot those shares
barplot(
  sort(colSums(prop.table(nrc_data[, 9:10]))),
  horiz = TRUE,
  cex.names = 0.7,
  las = 1,
  main = "Sentiment in Sample text",
  xlab = "Percentage"
)

# inspect the tweets with a high anger score
angry_items <- which(nrc_data$anger > 2)
str(angry_items)
x$text[angry_items]
# the same selection in a single step
x$text[nrc_data$anger > 2]

# plot the shares of the eight NRC emotions
barplot(
  sort(colSums(prop.table(nrc_data[, 1:8]))),
  horiz = TRUE,
  cex.names = 0.7,
  las = 1,
  main = "Emotions in Sample text",
  xlab = "Percentage"
)

# collapse the tweets into one long document per emotion
all <- c(
  paste(x$text[nrc_data$anger > 0], collapse = " "),
  paste(x$text[nrc_data$anticipation > 0], collapse = " "),
  paste(x$text[nrc_data$disgust > 0], collapse = " "),
  paste(x$text[nrc_data$fear > 0], collapse = " "),
  paste(x$text[nrc_data$joy > 0], collapse = " "),
  paste(x$text[nrc_data$sadness > 0], collapse = " "),
  paste(x$text[nrc_data$surprise > 0], collapse = " "),
  paste(x$text[nrc_data$trust > 0], collapse = " ")
)

# clean the text
library(tm)
library(wordcloud)

# function to make the text suitable for analysis
clean.text <- function(x) {
  # convert to lower case
  x <- tolower(x)
  # remove the retweet marker "rt" (word boundaries so that "rt" inside
  # other words, e.g. "support", stays intact)
  x <- gsub("\\brt\\b", "", x)
  # remove @mentions
  x <- gsub("@\\w+", "", x)
  # remove punctuation
  x <- gsub("[[:punct:]]", "", x)
  # remove numbers
  x <- gsub("[[:digit:]]", "", x)
  # remove http links
  x <- gsub("http\\w+", "", x)
  # collapse tabs and runs of spaces into a single space
  x <- gsub("[ |\t]{2,}", " ", x)
  # remove blank spaces at the beginning
  x <- gsub("^ ", "", x)
  # remove blank spaces at the end
  x <- gsub(" $", "", x)
  return(x)
}
all <- clean.text(all)

# remove stop-words and the query term itself (everything is lower-case
# after clean.text)
all <- removeWords(all, c(stopwords("english"), "icardi"))

# create corpus
corpus <- Corpus(VectorSource(all))
# create term-document matrix
tdm <- TermDocumentMatrix(corpus)
# convert it to a plain matrix
tdm <- as.matrix(tdm)
# add column names, one per emotion document
colnames(tdm) <- c('anger', 'anticipation', 'disgust', 'fear',
                   'joy', 'sadness', 'surprise', 'trust')

# Plot the comparison wordcloud
layout(matrix(c(1, 2), nrow = 2), heights = c(1, 4))
par(mar = rep(0, 4))
plot.new()
text(x = 0.5, y = 0.5, 'Emotion Comparison Word Cloud')
comparison.cloud(tdm, random.order = FALSE,
                 colors = c("#00B2FF", "red", "#FF0099", "#6600CC",
                            "green", "orange", "blue", "brown"),
                 title.size = 1.5, max.words = 250)
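###########################################################################
# Optional: write the comparison cloud to a PNG file instead of the
# plotting window -- a minimal sketch; the file name "emotion_cloud.png"
# and the 800x800 size are just example values:
###########################################################################
png("emotion_cloud.png", width = 800, height = 800)
layout(matrix(c(1, 2), nrow = 2), heights = c(1, 4))
par(mar = rep(0, 4))
plot.new()
text(x = 0.5, y = 0.5, 'Emotion Comparison Word Cloud')
comparison.cloud(tdm, random.order = FALSE,
                 colors = c("#00B2FF", "red", "#FF0099", "#6600CC",
                            "green", "orange", "blue", "brown"),
                 title.size = 1.5, max.words = 250)
dev.off()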