rm(list = ls(all = TRUE))
getwd()
### set your working directory here!
setwd("C:/Users/mw/Dropbox (VOICES)/TOPIC MODEL")
getwd()

library(rtweet)
library(ggplot2)
library(dplyr)
library(tidytext)
library(readtext)
library(quanteda)
library(httpuv)
library(maps)
library(leaflet)

token <- create_token(
  app = "my_twitter_research_app",
  consumer_key = "YOUR NUMBER",
  consumer_secret = "YOUR NUMBER",
  access_token = "YOUR NUMBER",
  access_secret = "YOUR NUMBER")

## check that the token is loaded
get_token()
identical(token, get_token())

# search_tweets() returns Twitter statuses matching a user-provided search query.
# IT ONLY RETURNS DATA FROM THE PAST 6-9 DAYS.
# Search for up to 1000 (non-retweeted) tweets containing the #rstats hashtag
rt <- search_tweets("#rstats", n = 1000, include_rts = FALSE)

# print tweet text
print(rt$text[1:5])

# tweets can be longer than 140 characters!
nchar(rt$text[1:500])
print(rt$text[493])

colnames(rt)

# you can save your results as a csv file
write_as_csv(rt, "twitter.csv", prepend_ids = TRUE, na = "", fileEncoding = "UTF-8")
# and then re-open it
x <- read.csv("twitter.csv")
str(x)

## plot time series of tweets
ts_plot(rt, "3 hours") +
  ggplot2::theme_minimal() +
  ggplot2::theme(plot.title = ggplot2::element_text(face = "bold")) +
  ggplot2::labs(
    x = NULL, y = NULL,
    title = "Frequency of #rstats Twitter statuses from past 9 days",
    subtitle = "Twitter status (tweet) counts aggregated using three-hour intervals",
    caption = "\nSource: Data collected from Twitter's REST API via rtweet"
  )

## plot the time series of tweet frequency at different resolutions
ts_plot(rt, by = "secs")
ts_plot(rt, by = "mins")
ts_plot(rt, by = "days")

# Next, let's figure out who is tweeting about R using the #rstats hashtag.
# You can access the data of the users discussing #rstats via users_data()
users_data(rt)

# view the column with screen names - top 6
head(rt$screen_name)
# get a list of unique usernames
unique(rt$screen_name)

# You can similarly use the search_users() function to see which users are tweeting
# with a particular hashtag. This function returns just a data.frame of the users
# and information about their accounts.

# which users are tweeting with #rstats (max = 1000)
users <- search_users("#rstats", n = 1000)
# just view the first 2 users - the data frame is large!
head(users, n = 2)

# once again, you can save your results as a csv file
write_as_csv(users, "users.csv", prepend_ids = TRUE, na = "", fileEncoding = "UTF-8")
# and then re-open it
xu <- read.csv("users.csv")
str(xu)

# What is the difference from search_tweets()? With search_tweets() you retrieve a given
# number of tweets, while with search_users() you retrieve a given number of unique users.
# A user who tweets a lot about #rstats counts as a single author in search_users(),
# but his/her tweets appear several times in the data.frame returned by search_tweets().
# Indeed, compare the two results:
length(unique(users$user_id))
length(unique(rt$user_id))

# Let's learn a bit more about these people tweeting about R. First, where are they from?

# how many languages are represented (und = undetermined)
length(unique(users$lang))
count(users, lang, sort = TRUE)

# how many locations are represented
length(unique(users$location))
count(users, location, sort = TRUE)
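# Side note (not part of the original workflow): the language distribution could be plotted
# the same way as the locations below. A minimal sketch, assuming the users data.frame from
# search_users() above; "lang_count" is a hypothetical name introduced here for illustration.
lang_count <- count(users, lang, sort = TRUE)
lang_count <- mutate(lang_count, lang = reorder(lang, n))
lang_count <- top_n(lang_count, 10)
ggplot(lang_count, aes(x = lang, y = n)) +
  geom_col() +
  coord_flip() +
  labs(x = "Language", y = "Count", title = "Most common languages of #rstats users")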
# Let's sort by count and plot just the top locations. To do this you use top_n().
# Note that count() here tallies how many users report each location.
count <- count(users, location, sort = TRUE)
str(count)
# drop users with an empty location field
count <- count[count$location != "", ]
str(count)
count <- mutate(count, location = reorder(location, n))
count <- top_n(count, 20)
ggplot(count, aes(x = location, y = n)) +
  geom_col() +
  coord_flip() +
  labs(x = "Location", y = "Count",
       title = "Where Twitter users are from - unique locations")

# Twitter rate limits cap the number of search results returned to 18,000 every 15 minutes.
# To request more than that, simply set retryonratelimit = TRUE and rtweet will wait
# out the rate-limit resets for you.

## search for 20,000 tweets containing the word "data" (do not run it!)
## rt <- search_tweets("data", n = 20000, retryonratelimit = TRUE)

## search for tweets containing "rstats", including retweets
rtR <- search_tweets("rstats", n = 1000)

## plot multiple time series - retweets vs. non-retweets
ts_plot(group_by(rtR, is_retweet), "hours")

## A more complex search: search for any tweets mentioning "salvini" or "dimaio" written in Italian
ita <- search_tweets("salvini OR dimaio", n = 100, include_rts = FALSE, lang = "it")
print(ita$lang[1:20])
print(ita$text[1:20])

## Get friends
# Retrieve a list of all the accounts a user follows.

## get user IDs of accounts followed by CNN
cnn_fds <- get_friends("cnn")
str(cnn_fds)
length(cnn_fds$user_id)
## look up data on those accounts
cnn_fds_data <- lookup_users(cnn_fds$user_id)
head(cnn_fds_data$name)

## Get followers
# Retrieve a list of the accounts following a user.

## get user IDs of accounts following CNN (just the first 1000 in this example)
cnn_flw <- get_followers("cnn", n = 1000)
## look up data on those accounts
cnn_flw_data <- lookup_users(cnn_flw$user_id)
head(cnn_flw_data$name)

# Or if you really want ALL of their followers:
## how many followers does Curini have?
curini_flw <- get_followers("Curini", retryonratelimit = TRUE)
length(curini_flw$user_id)
Curini <- lookup_users(curini_flw$user_id)
head(Curini$name)

## Get favorites
# Get the 300 most recently favorited statuses by Curini.
fav <- get_favorites("Curini", n = 300)
print(fav$text[1:20])
print(fav$lang[1:20])

## Get trends
# Discover what is currently trending in San Francisco.
sf <- get_trends("san francisco")
sf$trend

## Get timelines
# Get the most recent 200 tweets from some important US political figures
tmls <- get_timeline(
  c("SenSchumer", "SenGillibrand", "realDonaldTrump"),
  n = 200
)
table(tmls$name)

## examine all Twitter activity using weekly intervals
ts_plot(tmls, "weeks")
## group by screen name and plot each time series
ts_plot(dplyr::group_by(tmls, screen_name), "weeks")

# Get the most recent 100 tweets from cnn, BBCWorld, and foxnews.
tmls <- get_timelines(c("cnn", "BBCWorld", "foxnews"), n = 100)
table(tmls$name)
## group by account name and plot each time series, by day
ts_plot(group_by(tmls, name), "days")
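# If you want a quick static comparison of how many tweets were retrieved per account,
# rather than a time series, here is a minimal sketch (not part of the original script),
# assuming the tmls data.frame returned by get_timelines() above and its screen_name column.
ggplot(count(tmls, screen_name), aes(x = screen_name, y = n)) +
  geom_col() +
  labs(x = "Account", y = "Tweets retrieved",
       title = "Number of timeline tweets retrieved per account")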
#########################################################################################
# geographical analysis: problems!
# It works just for the US at the moment
#########################################################################################

Sys.getenv("GOOGLE_MAPS_KEY")
identical(Sys.getenv("GOOGLE_MAPS_KEY"), rtweet:::find_google_geocode_key())

## search for 1000 tweets (in English, no retweets) sent from the US and discussing dinner
rt <- search_tweets(
  "dinner", n = 1000, include_rts = FALSE, lang = "en",
  geocode = lookup_coords("usa"))

## create lat/lng variables using all available tweet and profile geo-location data
rtll <- lat_lng(rt)
str(rtll)

## plot state boundaries
par(mar = c(0, 0, 0, 0))
map("state", lwd = .25)
## plot lat and lng points onto the state map
with(rtll, points(lng, lat, pch = 20, cex = .75, col = rgb(0, .3, .7, .75)))

## alternative plot with leaflet
m2 <- leaflet(rtll)
m2 <- addTiles(m2)  # add the default OpenStreetMap map tiles
m2 <- addMarkers(m2, lng = rtll$lng, lat = rtll$lat, popup = rtll$text)
m2

#########################
## Passing your results to Quanteda
#########################

rt <- search_tweets("#rstats", n = 500, include_rts = FALSE, lang = "en")
print(rt$lang[1:20])
colnames(rt)

# I want to convert the POSIXct time format to a date. Here I choose Greenwich Mean Time (GMT);
# you can also change the time zone, e.g. by selecting tz = "Hongkong"
str(rt$created_at)
rt$date <- as.Date(rt$created_at, tz = "GMT")
str(rt$date)

myCorpusTwitter <- corpus(rt)
summary(myCorpusTwitter)
head(myCorpusTwitter)
texts(myCorpusTwitter)[1:2]

# number of documents
ndoc(myCorpusTwitter)
# inspect the document-level variables
head(docvars(myCorpusTwitter))

# remove_twitter = TRUE means that the Twitter characters @ and # are removed when building the dfm
myDfm <- dfm(myCorpusTwitter, remove = stopwords("english"), remove_punct = TRUE,
             remove_numbers = TRUE, tolower = TRUE, stem = TRUE,
             remove_twitter = TRUE, remove_url = TRUE)
topfeatures(myDfm, 20)  # 20 top words

# inspect the document-feature matrix for the first four documents and first 10 features
myDfm[1:4, 1:10]
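# Possible next step (a sketch, not part of the original script): a dfm like this is the
# typical input for a topic model. quanteda's convert() can turn it into the document-term
# matrix format expected by other packages, e.g. the topicmodels package.
# "myDtm" is a hypothetical object name introduced here for illustration.
myDtm <- convert(myDfm, to = "topicmodels")
str(myDtm, max.level = 1)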