####################################################
# the very first time you use streamR
####################################################

rm(list=ls(all=TRUE))
getwd()
### set your working directory here!
setwd("C:/Users/mw/Dropbox (VOICES)/TOPIC MODEL")
getwd()

library(ROAuth)
library(RCurl)

download.file(url="http://curl.haxx.se/ca/cacert.pem", destfile="cacert.pem")
requestURL <- "https://api.twitter.com/oauth/request_token"
accessURL <- "https://api.twitter.com/oauth/access_token"
authURL <- "https://api.twitter.com/oauth/authorize"
consumerKey <- "YOUR NUMBER"
consumerSecret <- "YOUR NUMBER"
my_oauth <- OAuthFactory$new(consumerKey = consumerKey,
                             consumerSecret = consumerSecret,
                             requestURL = requestURL,
                             accessURL = accessURL,
                             authURL = authURL)
my_oauth$handshake(cainfo = system.file("CurlSSL", "cacert.pem", package = "RCurl"))
save(my_oauth, file = "my_oauth.Rdata")

####################################################
# after the first time
####################################################

rm(list=ls(all=TRUE))
getwd()
setwd("C:/Users/mw/Dropbox (VOICES)/TOPIC MODEL")
getwd()

library(streamR)
library(ROAuth)
library(RCurl)
load("my_oauth.Rdata")
library(ggplot2)
library(grid)
library(leaflet)
library(readtext)
library(quanteda)
library(dplyr)   # needed below for count(), mutate() and top_n()
library(maps)    # map_data() needs the maps package installed

# with streamR you capture tweets via the Streaming API.
# filterStream opens a connection to the Streaming API that returns all tweets
# containing one or more of the keywords given in the track argument.
# streamR gives you access both to the spritzer (a 1% random sample of all the
# tweets posted at that specific moment) [with streamR::sampleStream] and to
# tweets mentioning a set of keywords [with streamR::filterStream]
# (in the latter case it returns all matching tweets, unless their number
# exceeds 1% of *all* tweets posted in a given moment).

# "timeout" is a numeric value: the maximum length of time (in seconds) the
# connection to the stream stays open. The connection is closed automatically
# after this period. For example, setting timeout to 10800 keeps the connection
# open for 3 hours. The default is 0, which keeps the connection open permanently.

# with the "follow" option, you can pass a string or numeric vector of Twitter
# user IDs, indicating the users whose public statuses should be delivered on
# the stream (a sketch follows the first exploration of tweets.df below).

# in the example below, I save the tweets in a file to which I could append
# further tweets later on. Of course, you can once again restrict the
# "language" of your research if you want.

filterStream("tweetsTrump.json", track = c("Trump"), timeout = 30, oauth = my_oauth)

# recover the tweets saved in your file.
# If "simplify" is TRUE, parseTweets returns a data frame with only tweet and
# user fields (i.e., no geographic information or url entities).
tweets.df <- parseTweets("tweetsTrump.json")
str(tweets.df)
print(tweets.df$text[1:5])
# more than 140 characters!
nchar(tweets.df$text[1:250])
table(tweets.df$user_lang)
tweets.df$location
sum(is.na(tweets.df$location))
sum(!is.na(tweets.df$location))
table(tweets.df$place_lat)
table(tweets.df$place_lon)
tweets.df$time_zone
summary(tweets.df$followers_count)
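# before moving on: a minimal sketch (not in the original script) of the
# "follow" and "language" arguments mentioned above. The user ID below is a
# placeholder: replace it with the numeric ID of the account whose public
# statuses you want to stream (you can recover IDs, e.g., from the
# user_id_str field of tweets you have already parsed)
filterStream("tweetsUser.json", follow = c("123456789"), language = "en",
             timeout = 30, oauth = my_oauth)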
# how many locations are represented? (this is more or less the same graph we
# plotted in rtweet!)
length(unique(tweets.df$location))
count(tweets.df, location, sort = TRUE)

# let's sort by count and just plot the top locations
count <- count(tweets.df, location, sort = TRUE)
str(count)

# sum the value in the row "United States" with the value in the row "USA"
x1 <- count[which(count$location=='United States'), ]
x2 <- count[which(count$location=='USA'), ]
x3 <- x1$n + x2$n
x3
count$n[count$location=="United States"] <- x3
# drop the now-redundant "USA" row (adjust the index to match your data!)
count <- count[-c(3), ]
str(count)

# delete the NAs
count <- na.omit(count)
str(count)

count <- mutate(count, location = reorder(location, n))
count <- top_n(count, 20)
ggplot(count, aes(x = location, y = n)) +
  geom_col() +
  coord_flip() +
  labs(x = "Location", y = "Count",
       title = "Where Twitter users are from - unique locations")

# if you want to get the tweets from a specific location, you can, but with a
# different approach than in rtweet (plus it works well this time!).
# you need to provide a vector of longitude, latitude pairs (with the southwest
# corner coming first), specifying bounding boxes to filter public statuses by.
# how to do that? check here: http://boundingbox.klokantech.com/ and select "csv"
# in this example we want to download the tweets from Japan (i.e., all the
# tweets from Japan, given that track is NULL)
filterStream("tweetsJP.json", locations = c(129.29, 31.05, 159.43, 45.54),
             timeout = 100, oauth = my_oauth)
tweets.df <- parseTweets("tweetsJP.json", verbose = FALSE)
str(tweets.df)
tweets.df$place_lat
tweets.df$place_lon
table(tweets.df$country_code)
# keep only the tweets actually geolocated in Japan
tweets.df <- tweets.df[which(tweets.df$country_code=='JP'), ]
table(tweets.df$country_code)

# plot the results
map.data <- map_data("world2", "japan")
points <- data.frame(x = tweets.df$lon, y = tweets.df$lat)
ggplot(map.data) +
  geom_map(aes(map_id = region), map = map.data,
           fill = "white", color = "grey20", size = 0.25) +
  expand_limits(x = map.data$long, y = map.data$lat) +
  theme(axis.line = element_blank(), axis.text = element_blank(),
        axis.ticks = element_blank(), axis.title = element_blank(),
        plot.margin = unit(0 * c(-1.5, -1.5, -1.5, -1.5), "lines")) +
  geom_point(data = points, aes(x = x, y = y),
             size = 3, alpha = 1/3, color = "darkblue")

# plot your graph with leaflet!
m <- leaflet(tweets.df)
m <- addTiles(m)  # add default OpenStreetMap map tiles
m <- addMarkers(m, lng = tweets.df$place_lon, lat = tweets.df$place_lat,
                popup = tweets.df$text)
m

# different style of graph
m <- leaflet(tweets.df)
m <- addTiles(m)  # add default OpenStreetMap map tiles
m <- addCircleMarkers(m, lng = tweets.df$place_lon, lat = tweets.df$place_lat,
                      popup = tweets.df$text)
m
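# a minimal sketch (not in the original script) of how you could export one of
# these interactive maps to a standalone HTML file; it assumes the htmlwidgets
# package (a leaflet dependency) is installed
library(htmlwidgets)
saveWidget(m, file = "tweets_japan_map.html")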
# a lot of the markers are clumped together rather closely. We can cluster them
# by specifying clusterOptions as follows (zoom in and out of the graph!).
# The number inside each circle is the total number of observations in that
# area: areas with more observations are marked by yellow circles, areas with
# fewer by green circles.
m <- leaflet(tweets.df)
m <- addTiles(m)  # add default OpenStreetMap map tiles
m <- addMarkers(m, lng = tweets.df$place_lon, lat = tweets.df$place_lat,
                clusterOptions = markerClusterOptions())
m

# plot your graph coloring the markers according to the time_zone
str(tweets.df)
table(tweets.df$time_zone)
tweets.df$time_zone2 <- tweets.df$time_zone
str(tweets.df)
# recode the missing time zones as "other"
tweets.df$time_zone2[is.na(tweets.df$time_zone2)] <- "other"
str(tweets.df)
table(tweets.df$time_zone2)
newdata <- tweets.df[which(tweets.df$time_zone2=='Tokyo'), ]
str(newdata)

# assign a marker color to each tweet depending on its time zone
tweets.df$x <- sapply(tweets.df$time_zone2, function(time_zone2) {
  if (time_zone2 == 'Tokyo') {
    "green"
  } else if (time_zone2 == 'Osaka') {
    "orange"
  } else {
    "red"
  }
})
str(tweets.df)
table(tweets.df$x)

icons <- awesomeIcons(
  icon = 'ios-close',
  iconColor = 'black',
  library = 'ion',
  markerColor = tweets.df$x
)
m <- leaflet(tweets.df)
m <- addTiles(m)  # add default OpenStreetMap map tiles
m <- addAwesomeMarkers(m, lng = tweets.df$place_lon, lat = tweets.df$place_lat,
                       popup = tweets.df$text, icon = icons)
m

# plot your graph according to followers_count (by quartiles)
summary(tweets.df$followers_count)
# change the cut-offs below according to the results of your own collection:
# take the 1st Qu., Median, and 3rd Qu.
tweets.df$x2 <- sapply(tweets.df$followers_count, function(followers_count) {
  if (followers_count <= 155) {
    "green"
  } else if (followers_count > 155 && followers_count <= 377) {
    "orange"
  } else if (followers_count > 377 && followers_count <= 762) {
    "red"
  } else {
    "blue"
  }
})
table(tweets.df$x2)

# same as
tweets.df$x3 <- sapply(tweets.df$followers_count, function(followers_count) {
  if (followers_count <= 155) {
    "green"
  } else if (followers_count <= 377) {
    "orange"
  } else if (followers_count <= 762) {
    "red"
  } else {
    "blue"
  }
})
table(tweets.df$x3)

# check if correct
tweets.df$green <- tweets.df$followers_count <= 155
table(tweets.df$green)

# different options for awesomeIcons: https://github.com/lvoogdt/Leaflet.awesome-markers
# you can also create your own icons! check here:
# https://rstudio.github.io/leaflet/markers.html (see the sketch further below)
icons <- awesomeIcons(
  icon = 'glass',
  iconColor = 'white',
  markerColor = tweets.df$x3
)
m <- leaflet(tweets.df)
m <- addTiles(m)  # add default OpenStreetMap map tiles
m <- addAwesomeMarkers(m, lng = tweets.df$place_lon, lat = tweets.df$place_lat,
                       popup = tweets.df$text, icon = icons)
m
table(tweets.df$x3)
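# a minimal sketch (not in the original script) of a custom icon built with
# leaflet::makeIcon(), following the markers page linked above; the PNG url is
# taken from the Leaflet.js custom-icons tutorial and is only illustrative,
# so swap in any image you like
myIcon <- makeIcon(
  iconUrl = "https://leafletjs.com/examples/custom-icons/leaf-green.png",
  iconWidth = 38, iconHeight = 95
)
m <- leaflet(tweets.df)
m <- addTiles(m)
m <- addMarkers(m, lng = tweets.df$place_lon, lat = tweets.df$place_lat,
                popup = tweets.df$text, icon = myIcon)
m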
# The function sampleStream allows the user to capture a small random sample
# (around 1%) of all tweets being sent at each moment.
sampleStream("tweetsSample.json", timeout = 30, oauth = my_oauth)
tweets_sample <- parseTweets("tweetsSample.json", verbose = TRUE)
mean(as.numeric(tweets_sample$friends_count))
table(is.na(tweets_sample$lat))
# share of the 5 most frequent languages in the sample
round(sort(table(tweets_sample$lang), decreasing = T)[1:5] / sum(table(tweets_sample$lang)), 2)

# plot the results
map.data <- map_data("world")
points <- data.frame(x = as.numeric(tweets_sample$lon),
                     y = as.numeric(tweets_sample$lat))
ggplot(map.data) +
  geom_map(aes(map_id = region), map = map.data,
           fill = "white", color = "grey20", size = 0.25) +
  expand_limits(x = map.data$long, y = map.data$lat) +
  theme(axis.line = element_blank(), axis.text = element_blank(),
        axis.ticks = element_blank(), axis.title = element_blank(),
        plot.margin = unit(0 * c(-1.5, -1.5, -1.5, -1.5), "lines")) +
  geom_point(data = points, aes(x = x, y = y),
             size = 3, alpha = 1/3, color = "darkblue")

############################################################
# passing your tweets collected via streamR to quanteda
############################################################

tweets.df <- parseTweets("tweetsTrump.json")
myCorpusTwitter <- corpus(tweets.df)
summary(myCorpusTwitter)
str(myCorpusTwitter)
texts(myCorpusTwitter)[1]
options(width = 200)

# remove_twitter = TRUE means that when building the dfm we also remove the
# Twitter characters @ and #
myDfm <- dfm(myCorpusTwitter, remove = stopwords("english"),
             remove_punct = TRUE, remove_numbers = TRUE, tolower = TRUE,
             stem = TRUE, remove_twitter = TRUE, remove_url = TRUE)
topfeatures(myDfm, 20)  # 20 top words
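# a short follow-up sketch (not in the original script): turn the topfeatures()
# counts into a data frame and plot them with ggplot2, mirroring the location
# bar chart built earlier; it only uses packages already loaded above
top20 <- topfeatures(myDfm, 20)
topDf <- data.frame(feature = names(top20), n = as.numeric(top20))
topDf <- mutate(topDf, feature = reorder(feature, n))
ggplot(topDf, aes(x = feature, y = n)) +
  geom_col() +
  coord_flip() +
  labs(x = "Feature", y = "Count", title = "Top 20 features in the Trump tweets")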