rm(list = ls(all = TRUE))
getwd()
setwd("C:/Users/luigi/Dropbox/TOPIC MODEL")
getwd()

library(rtweet)
library(ggplot2)
library(dplyr)
library(readtext)
library(quanteda)
library(cowplot)
library(gridExtra)
library(plyr)
library(syuzhet)
library(psych)
library(PerformanceAnalytics)
library(reshape2)
library(quanteda.textplots)
library(quanteda.textstats)
library(leaflet)

# Let's run a query about Giorgia Meloni (excluding tweets that mention @Chris_Meloni - a popular
# Twitter user with no relation at all to Giorgia Meloni!)
rt <- search_tweets("Meloni -@Chris_Meloni", n = 2000, include_rts = FALSE, lang = "en", type = "mixed")

# Note the following: Quanteda has trouble with the Saxon genitive. When you tokenize a text containing
# the expression "meloni's", it treats it as a single token. If you later want to remove the feature
# "meloni" from your dfm (for example via dfm_remove), you still can, but you won't remove "meloni's",
# given that it is a distinct feature from "meloni". So if you intend to do that, it is better to deal
# with it from the very beginning:
rt$text <- gsub("'", " ", rt$text) # replace the apostrophe with a space

print(rt$lang[1:20])
length(rt$lang)

# I want to convert the timestamp to a date
str(rt$created_at)
rt$date <- as.Date(rt$created_at)
str(rt$date)
table(rt$date)

myCorpusTwitter <- corpus(rt)
tok2 <- tokens(myCorpusTwitter, remove_punct = TRUE, remove_numbers = TRUE, remove_symbols = TRUE,
               split_hyphens = TRUE, remove_separators = TRUE, remove_url = TRUE)
tok2 <- tokens_remove(tok2, stopwords("en"))
tok2 <- tokens_wordstem(tok2)
Meloni_dfm <- dfm(tok2)
topfeatures(Meloni_dfm, 20)

# min_nchar: the minimum length in characters for the features to be kept (i.e., drop one-character features)
Meloni_dfm <- dfm_remove(Meloni_dfm, min_nchar = 2)
topfeatures(Meloni_dfm, 30)
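# Before moving on, a quick toy illustration (a minimal sketch on a made-up string, not part of the
# pipeline above) of the Saxon-genitive point made earlier: without the gsub() fix, "meloni's" survives
# as its own feature, and removing "meloni" leaves it untouched.
toy <- c(d1 = "Meloni's speech divided Meloni supporters")
featnames(dfm(tokens(tolower(toy))))                       # both "meloni's" and "meloni" show up
featnames(dfm_remove(dfm(tokens(tolower(toy))), "meloni")) # "meloni's" is still there
featnames(dfm(tokens(tolower(gsub("'", " ", toy)))))       # after the fix, only "meloni" remains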
# Let's plot the network of the top-30 most frequent features recovered above, in terms of their
# co-occurrence in tweets within a window of size 5. That means that we focus on the five words before
# and after a given target word (say "meloni") in each tweet, and we count the number of times any of
# the other top-30 most frequent features appears there.
tok3 <- tokens_tolower(tok2)
co_occurance <- fcm(tok3, context = "window", window = 5)
co_occurance
feat <- names(topfeatures(Meloni_dfm, 30))
select <- fcm_select(co_occurance, pattern = feat)
set.seed(123)
textplot_network(select)

# In this case window = 1, i.e., we count only those words that appear right next to each other
tok3 <- tokens_tolower(tok2)
co_occurance2 <- fcm(tok3, context = "window", window = 1)
co_occurance2
feat <- names(topfeatures(Meloni_dfm, 30))
select2 <- fcm_select(co_occurance2, pattern = feat)
set.seed(123)
textplot_network(select2)

###########################################################################
# Let's suppose we want to see the trend of the overall sentiment per day about Meloni across all the tweets
###########################################################################
sentiment <- dfm_lookup(Meloni_dfm, dictionary = data_dictionary_LSD2015[1:2])
head(sentiment, 10)
Dictionary <- convert(sentiment, to = "data.frame")
str(Dictionary)
Dictionary$Sentiment <- Dictionary$positive - Dictionary$negative
str(Dictionary)
summary(Dictionary$Sentiment)
rt$Sentiment <- Dictionary$Sentiment # add the sentiment values back to the data frame we got via Twitter
colnames(rt)

# Get daily summaries of the results (average sentiment and number of tweets)
daily <- ddply(rt, ~ date, summarize, num_tweets = length(Sentiment), ave_sentiment = mean(Sentiment))
str(daily)

# Plot the daily sentiment vs. volume
sentiment <- ggplot(daily, aes(x = date, y = ave_sentiment)) +
  geom_line(linetype = "dashed", colour = "red") +
  ggtitle("Meloni Sentiment") + xlab("Day") + ylab("Sentiment") +
  theme_gray(base_size = 12)
volume <- ggplot(daily, aes(x = date, y = num_tweets)) +
  geom_line() +
  ggtitle("Meloni #") + xlab("Day") + ylab("Volume") +
  theme_gray(base_size = 12)
grid.arrange(sentiment, volume, ncol = 1)
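# Side note (a sketch, not required by the pipeline): the same daily summary computed above with
# plyr::ddply() can be obtained with dplyr, which is already loaded. Since plyr was attached after
# dplyr, its summarise() masks the dplyr one, so the dplyr:: prefix is spelled out explicitly here.
daily_alt <- rt %>%
  dplyr::group_by(date) %>%
  dplyr::summarise(num_tweets = dplyr::n(), ave_sentiment = mean(Sentiment))
str(daily_alt)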
###################################
# Let's apply the NRC dictionary, which has more categories than simply positive/negative
###################################
# takes around 50 seconds
system.time(nrc_data <- get_nrc_sentiment(rt$text, language = "english"))
str(nrc_data)

# I want to read the tweets that include more than 3 anger words
rt$text[nrc_data$anger > 3]

# Let's add the column with the texts back to the results we just obtained
nrc_data$text <- rt$text
str(nrc_data)

# Let's plot the wordcloud of emotions!
# First: let's group the tweets according to their emotions, i.e., our unit of analysis (our rows) will
# now be the tweets grouped by the 8 emotions (not the single tweets!). For example, we paste all the
# tweets that contain at least 2 anger words, i.e., [nrc_data$anger > 1], together into one single big
# text belonging to the "anger" emotion, and so forth for all the other emotions.
all <- c(
  paste(rt$text[nrc_data$anger > 1], collapse = " "),
  paste(rt$text[nrc_data$anticipation > 1], collapse = " "),
  paste(rt$text[nrc_data$disgust > 1], collapse = " "),
  paste(rt$text[nrc_data$fear > 1], collapse = " "),
  paste(rt$text[nrc_data$joy > 1], collapse = " "),
  paste(rt$text[nrc_data$sadness > 1], collapse = " "),
  paste(rt$text[nrc_data$surprise > 1], collapse = " "),
  paste(rt$text[nrc_data$trust > 1], collapse = " ")
)
str(all)
recentCorpus <- corpus(all)
summary(recentCorpus)

# Let's assign the proper emotion names as the docnames of our corpus, and also save them as a docvar
colnames(nrc_data)[1:8]
docnames(recentCorpus) <- colnames(nrc_data)[1:8]
docvars(recentCorpus, field = "Emotions") <- colnames(nrc_data)[1:8]
summary(recentCorpus)

tok2 <- tokens(recentCorpus, remove_punct = TRUE, remove_numbers = TRUE, remove_symbols = TRUE,
               split_hyphens = TRUE, remove_separators = TRUE, remove_url = TRUE)
tok2 <- tokens_remove(tok2, stopwords("en"))
tok2 <- tokens_wordstem(tok2, language = "english") # note: stem the same object we keep using (tok2, not tok)
Mydfm <- dfm(tok2)
Mydfm <- dfm_remove(Mydfm, min_nchar = 2)
Mydfm

# Let's remove the features "giorgia" and "meloni" (not very informative, given that all tweets include them)
MyDfm2 <- dfm_remove(Mydfm, c("giorgia", "meloni"))
topfeatures(MyDfm2, groups = Emotions)

set.seed(123)
textplot_wordcloud(MyDfm2, min_count = 6, rotation = 0.25, comparison = TRUE,
                   color = c("#00B2FF", "red", "#FF0099", "#6600CC", "green", "orange", "blue", "brown"),
                   labelsize = 1.5)

# As we learnt last week, we can also save (and then eventually plot) the frequency of the top features
# in a dfm (in this case the top-20 features by emotion)
features_dfm <- textstat_frequency(MyDfm2, n = 20, groups = Emotions)
str(features_dfm)

# Let's now plot, for example, the top-20 words that appear in the anger tweets and their relative frequency
ggplot(features_dfm[which(features_dfm$group == "anger"), ],
       aes(x = reorder(feature, frequency), y = frequency, fill = frequency)) +
  geom_bar(stat = "identity") + coord_flip() + xlab(label = "Anger words")

###################################
# Let's estimate the daily average emotions with respect to Meloni and plot them
###################################
str(nrc_data)
nrc_data$date <- rt$date
str(nrc_data)
daily_emotions <- aggregate(nrc_data[, 1:8], by = list(day = nrc_data$date), FUN = mean)
str(daily_emotions)
head(daily_emotions)
corr.test(daily_emotions[c(2:9)])         # columns 2:9 hold the 8 emotions (column 1 is the day)
chart.Correlation(daily_emotions[c(2:9)])

# Let's plot it
df.long <- melt(daily_emotions, id.vars = c("day"))
str(df.long)
head(df.long)
ggplot(df.long, aes(x = day, y = value, color = variable)) +
  geom_point() +
  geom_smooth(method = "loess") +
  facet_wrap(~ variable, scale = "free_y", nrow = 2) +
  theme_bw() +
  labs(title = "Emotional analysis of Twitter statuses over time",
       subtitle = "Tweets aggregated by day on topics of Meloni") +
  theme(text = element_text(family = "Roboto Condensed"), # requires the Roboto Condensed font installed
        plot.title = element_text(face = "bold"),
        legend.position = "bottom",
        axis.text = element_text(size = 9),
        legend.title = element_blank())
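# Side note (a sketch, an optional robustness check rather than part of the exercise):
# get_nrc_sentiment() returns raw word counts, so longer tweets mechanically score higher on every
# emotion. A common normalization is to divide each emotion count by the tweet's length in tokens
# before aggregating by day.
tweet_len <- ntoken(tokens(rt$text))              # number of tokens per tweet
nrc_norm <- nrc_data[, 1:8] / pmax(tweet_len, 1)  # pmax avoids division by zero on empty texts
daily_emotions_norm <- aggregate(nrc_norm, by = list(day = rt$date), FUN = mean)
head(daily_emotions_norm)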
###################################
###################################
###################################
# Let's run a sentiment analysis with geographical data
###################################
###################################
###################################
lookup_coords("United States")
rt <- search_tweets("trump", n = 5000, geocode = lookup_coords("United States"))

# How many tweets with a non-missing place name have we recovered from our query?
table(!is.na(rt$place_name))

myCorpusTwitter <- corpus(rt)
tok2 <- tokens(myCorpusTwitter, remove_punct = TRUE, remove_numbers = TRUE, remove_symbols = TRUE,
               split_hyphens = TRUE, remove_separators = TRUE, remove_url = TRUE)
tok2 <- tokens_remove(tok2, stopwords("en"))
tok2 <- tokens_wordstem(tok2)
trump_dfm <- dfm(tok2)
topfeatures(trump_dfm, 20) # top-20 words

sentiment <- dfm_lookup(trump_dfm, dictionary = data_dictionary_LSD2015[1:2])
head(sentiment, 10)
Dictionary <- convert(sentiment, to = "data.frame")
str(Dictionary)
Dictionary$Diff <- Dictionary$positive - Dictionary$negative
Dictionary$Sentiment <- as.factor(ifelse(Dictionary$Diff > 0, "pos", "neg"))
table(Dictionary$Sentiment)

# Create lat/lng variables using all available tweet and profile geo-location data
rtll <- lat_lng(rt)
rtll$Sentiment <- Dictionary$Sentiment

# Plot the results with the sentiment
map.data <- map_data("usa") # for the US you just specify "usa" rather than subsetting "world"
str(map.data)
points <- data.frame(x = rtll$lng, y = rtll$lat, Sentiment = rtll$Sentiment)
ggplot(map.data) +
  geom_map(aes(map_id = region), map = map.data, fill = "white", color = "grey20", size = 0.25) +
  expand_limits(x = map.data$long, y = map.data$lat) +
  theme(axis.line = element_blank(),
        axis.text = element_blank(),
        axis.ticks = element_blank(),
        axis.title = element_blank(),
        plot.margin = unit(0 * c(-1.5, -1.5, -1.5, -1.5), "lines")) +
  geom_point(data = points, aes(x, y, color = Sentiment), size = 3, alpha = 1/3) +
  scale_color_manual(values = c("red", "green"))

# Alternative plot via leaflet
str(rtll$Sentiment)
getColor <- ifelse(rtll$Sentiment == "pos", "green", "red")
icons <- awesomeIcons(icon = "ios-close", iconColor = "black", library = "ion", markerColor = getColor)
m <- leaflet(rtll)
m <- addTiles(m)
m <- addAwesomeMarkers(m, lng = rtll$lng, lat = rtll$lat, icon = icons, popup = rtll$text)
m
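# Side note (a sketch, assuming the ~5,000 markers overlap too much to read at the national zoom
# level): leaflet can also cluster nearby markers via clusterOptions; clicking or zooming in expands
# each cluster into its individual sentiment markers.
m2 <- leaflet(rtll)
m2 <- addTiles(m2)
m2 <- addAwesomeMarkers(m2, lng = rtll$lng, lat = rtll$lat, icon = icons, popup = rtll$text,
                        clusterOptions = markerClusterOptions())
m2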