rm(list = ls(all = TRUE))
getwd()
# setwd("C:/Users/mw/Dropbox (VOICES)/TOPIC MODEL")
setwd("C:/Users/luigi/Dropbox/TOPIC MODEL")
getwd()

library(rtweet)
library(readtext)
library(quanteda)
library(ggplot2)
library(ggmap)
library(httpuv)
library(dplyr)
library(stringr)

# "search_tweets" command: it implements the REST API search.
# It returns Twitter statuses matching a user-provided search query.
# It ONLY RETURNS DATA FROM THE PAST 6-9 DAYS
# (exception: user timelines, as we will see - the 3,200 most recent tweets are available).

# Search for up to 100 (non-retweeted) tweets written in English containing the #rstats hashtag.
rt <- search_tweets("#rstats", n = 100, lang = "en", include_rts = FALSE)
length(rt$text)

# days covered by our analysis
since <- rt$created_at[100]
latest <- rt$created_at[1]
cat("Twitter data", "\n", paste("From:", since), "\n", paste("  To:", latest))

# print tweet text
print(rt$text[1:5])

# lots of info about each single tweet
colnames(rt)

# What are the most popular hashtags at the moment? We'll use regular expressions to extract them.
ht <- str_extract_all(rt$text, '#[A-Za-z0-9_]+')
ht <- unlist(ht)
head(sort(table(ht), decreasing = TRUE))

# And who are the most frequently mentioned users?
handles <- str_extract_all(rt$text, '@[0-9_A-Za-z]+')
handles_vector <- unlist(handles)
head(sort(table(handles_vector), decreasing = TRUE), n = 10)

# How many tweets mention "data"?
length(grep("data", rt$text, ignore.case = TRUE))

# The query to be searched must be a character string of at most 500 characters.
# Spaces behave like the boolean "AND" operator. To search for tweets containing at least one of
# multiple possible terms, separate the search terms with "OR" (in caps).
# For example, the query q = "data science" looks for tweets containing both "data" and "science",
# located anywhere in the tweet and in any order.
# When "OR" is entered between search terms, e.g. q = "data OR science", Twitter returns any tweet
# that contains either "data" or "science".
# It is also possible to search for exact phrases using double quotes.
# To do this, either wrap single quotes around a search query using double quotes, e.g., q = '"data science"',
# or escape each internal double quote with a single backslash, e.g., q = "\"data science\"".

# example:
rt <- search_tweets('"data science"', n = 100, lang = "en", include_rts = FALSE)

# you can then save your results as a csv file
write_as_csv(rt, "twitter.csv", prepend_ids = TRUE, na = "", fileEncoding = "UTF-8")

## plot time series of tweets
ts_plot(rt, "1 hours") +
  theme_minimal() +
  theme(plot.title = element_text(face = "bold")) +
  labs(
    x = NULL, y = NULL,
    title = "Frequency of #rstats Twitter statuses from past 6-9 days",
    subtitle = "Twitter status (tweet) counts aggregated using one-hour intervals",
    caption = "Source: Data collected from Twitter's REST API via rtweet"
  )

## plot the time series of tweet frequency at other resolutions
ts_plot(rt, by = "mins")
ts_plot(rt, by = "days")

# Next, let's figure out who is tweeting about R using the #rstats hashtag.
# you can access the data on the users discussing #rstats via users_data()
users_data(rt)

# view column with screen names
head(rt$screen_name)

# get a list of unique usernames
unique(rt$screen_name)
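# A small optional sketch (base R only, reusing the "rt" object from above): how many tweets does
# each unique author contribute? A frequency table on screen_name shows how skewed authorship can be.
head(sort(table(rt$screen_name), decreasing = TRUE), n = 10)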
# You can similarly use the search_users() function to see which users are tweeting with a particular hashtag.
# This function returns just a data.frame of the users and information about their accounts.

# what users are tweeting with #rstats (max = 100)
users <- search_users("#rstats", n = 100)

# once again, you can then save your results as a csv file
write_as_csv(users, "users.csv", prepend_ids = TRUE, na = "", fileEncoding = "UTF-8")

# What's the difference with search_tweets? With search_tweets you retrieve a given number of tweets;
# with search_users you retrieve a given number of UNIQUE users. If a user tweets a lot about #rstats,
# he/she counts as "1 author" when using search_users, but his/her tweets appear several times
# in the data.frame you get out of search_tweets. Indeed, compare the two results:
length(unique(users$user_id))
length(unique(rt$user_id))

# Let's learn a bit more about these people tweeting about R. First, where are they from?

# how many languages are represented (und = undetermined)
length(unique(users$lang))
count(users, lang, sort = TRUE)

# how many locations are represented
length(unique(users$location))
count(users, location, sort = TRUE)

# Let's sort by count and just plot the top 4 locations. To do this you use top_n().
# Note that count() here groups the data by location.
count <- count(users, location, sort = TRUE)
str(count)
# drop users with an empty location field
count <- count[-which(count$location == ""), ]
str(count)
count <- mutate(count, location = reorder(location, n))
count <- top_n(count, 4)
ggplot(count, aes(x = location, y = n)) +
  geom_col() +
  coord_flip() +
  labs(x = "Location", y = "Count",
       title = "Where Twitter users are from - unique locations")

# Twitter rate limits cap the number of search results returned to 18,000 every 15 minutes.
# To request more than that, simply set retryonratelimit = TRUE and rtweet will wait out the
# rate-limit resets for you.

## search for 20,000 tweets containing the word data (do not run it!)
## rt <- search_tweets("data", n = 20000, retryonratelimit = TRUE)

## search for tweets containing "#rstats", this time including retweets
rtR <- search_tweets("#rstats", n = 100)

## plot multiple time series - retweets vs non-retweets
ts_plot(group_by(rtR, is_retweet), "mins")

# What is the most retweeted tweet?
x <- rtR[which.max(rtR$retweet_count), ]
print(x$retweet_count)
print(x$text)
print(x$screen_name)

## Get friends
# Retrieve a list of all the accounts a user follows.

## get user IDs of accounts followed by CNN
cnn_fds <- get_friends("cnn")
str(cnn_fds)
length(cnn_fds$user_id)

## lookup data on those accounts
cnn_fds_data <- lookup_users(cnn_fds$user_id)
head(cnn_fds_data$name)

## Get followers
# Retrieve a list of the accounts following a user.

## get user IDs of accounts following CNN (just the first 100 in this example)
cnn_flw <- get_followers("cnn", n = 100)

## lookup data on those accounts
cnn_flw_data <- lookup_users(cnn_flw$user_id)
head(cnn_flw_data$name)

# Or if you really want ALL of their followers:

## how many followers does Curini have?
curini_flw <- get_followers("Curini", retryonratelimit = TRUE)
length(curini_flw$user_id)

# and if you want the full list of them you can use lookup_users (but it will take you a while)
# Curini <- lookup_users(curini_flw$user_id)
# head(Curini$name)

## Get favorites
# Get the 10 most recently favorited statuses by Joe Biden.
fav <- get_favorites("JoeBiden", n = 10)
print(fav$text[1:10])
print(fav$lang[1:10])
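# A minimal optional sketch (base R intersect() only, reusing the CNN objects created above):
# how many of the accounts CNN follows are also among the 100 followers we just fetched?
# With only a 100-follower sample this is merely a lower bound on reciprocal ties.
reciprocal <- intersect(cnn_fds$user_id, cnn_flw$user_id)
length(reciprocal)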
## Get trends
# Discover what's currently trending in San Francisco.
sf <- get_trends("san francisco")
sf$trend

## Get timelines
# Get the most recent 100 tweets from some important US political figures
tmls <- get_timeline(c("BernieSanders", "JoeBiden"), n = 100)
table(tmls$name)

## group by screen name and plot each time series (the "mins"/"days" variants seen above work here too)
ts_plot(group_by(tmls, screen_name), "days")

#########################
## Passing your rtweet results to Quanteda
#########################

rt <- search_tweets("#rstats", n = 100, include_rts = FALSE, lang = "en")
print(rt$lang[1:20])
colnames(rt)

# I want to convert the POSIXct time format to a date; here I choose Greenwich Mean Time (GMT)
# (you could also change the time zone, e.g. by selecting tz = "Hongkong")
str(rt$created_at)
rt$date <- as.Date(rt$created_at, "GMT")
str(rt$date)

myCorpusTwitter <- corpus(rt)
texts(myCorpusTwitter)[1:2]

# number of documents
ndoc(myCorpusTwitter)

# inspect the document-level variables
head(docvars(myCorpusTwitter))

myDfm <- dfm(myCorpusTwitter, remove = stopwords("english"), remove_punct = TRUE,
             remove_numbers = TRUE, tolower = TRUE, stem = TRUE, remove_url = TRUE)
topfeatures(myDfm, 20)  # 20 top words

# Let's see our document-feature matrix for the first four documents and first 10 words
myDfm[1:4, 1:10]
topfeatures(myDfm)

# let's draw a network of the 50 most used features in our dfm (you will learn A LOT about network analysis with Rob Fahey!)
top_feat <- names(topfeatures(myDfm, 50))
dfm_select <- dfm_select(myDfm, pattern = top_feat, selection = "keep")
str(dfm_select)  # 50 features
topfeatures(dfm_select)

# let's construct a feature co-occurrence matrix of the most recurring features (measuring the co-occurrences of features)
tag_fcm <- fcm(dfm_select)
head(tag_fcm)

# Let's plot the results!
# textplot_network allows you to plot a network directly from Quanteda.
# Currently the size of the network is limited to 1000 features. To draw more complex networks
# you should use the package "igraph" (see below).
jpeg('network1.jpg')
set.seed(144)
textplot_network(tag_fcm, min_freq = 0.8)
dev.off()

# if you are using RStudio, rather than the old-school R GUI as I do,
# then the graph should also appear by simply typing textplot_network(tag_fcm, min_freq = 0.8)

# N.B. you could also decide to plot the dfm "dfm_select" directly, rather than the fcm "tag_fcm", by writing:
# textplot_network(dfm_select, min_freq = 0.8)
# You will get the same graph as above. In this case textplot_network first builds an fcm for you, and then plots it!
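# A quick optional sketch (quanteda functions already used above; assuming your quanteda version
# keeps #-prefixed tokens in the dfm, as older releases did unless remove_twitter = TRUE):
# hashtag frequencies can be pulled straight from the dfm with a wildcard pattern.
tag_dfm <- dfm_select(myDfm, pattern = "#*", selection = "keep")
topfeatures(tag_dfm, 10)  # most frequent hashtags in our #rstats sample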
#########################
## An example of a possible analysis [review it by yourself!]
#########################

rt <- search_tweets("liberal OR conservative", n = 1000, include_rts = FALSE, lang = "en")
print(rt$text[1:10])

myCorpusTwitter <- corpus(rt)
myDfmTwitter <- dfm(myCorpusTwitter, remove = stopwords("english"), remove_punct = TRUE,
                    remove_numbers = TRUE, tolower = TRUE, stem = FALSE, remove_url = TRUE)
topfeatures(myDfmTwitter)

dict <- dictionary(list(lib = c("liberal*"), cons = c("conservativ*")))
dict

# let's see how many times the words liberal* and conservativ* appear in each tweet
dfm_lookup(myDfmTwitter, dict)[, 1]
dfm_lookup(myDfmTwitter, dict)[, 2]

# let's save this info and merge the two counts
liberal <- dfm_lookup(myDfmTwitter, dict)[, 1]
conservative <- dfm_lookup(myDfmTwitter, dict)[, 2]
df_tot <- merge(liberal, conservative, by = "doc_id")
str(df_tot)

# let's label a tweet as a "conservative" one if it uses the word conservativ* more often than liberal*
df_tot <- mutate(df_tot, conservative = if_else(cons > lib, 1, 0))
str(df_tot)

# let's add this info back to our dfm as a document-level variable
docvars(myDfmTwitter)$conservative <- df_tot$conservative

# let's see the difference in the language
tstat_key <- textstat_keyness(myDfmTwitter, target = myDfmTwitter$conservative == 1)
textplot_keyness(tstat_key)
head(tstat_key, 10)
tail(tstat_key, 10)

# let's plot two different feature networks
dfm_cons <- dfm_subset(myDfmTwitter, conservative == 1)
dfm_lib <- dfm_subset(myDfmTwitter, conservative == 0)
feat_cons <- names(topfeatures(dfm_cons, 50))
feat_lib <- names(topfeatures(dfm_lib, 50))
dfm_cons_select <- dfm_select(dfm_cons, pattern = feat_cons, selection = "keep")
dfm_lib_select <- dfm_select(dfm_lib, pattern = feat_lib, selection = "keep")
tag_fcm_con <- fcm(dfm_cons_select)
head(tag_fcm_con)
tag_fcm_lib <- fcm(dfm_lib_select)
head(tag_fcm_lib)

library(cowplot)
pdf("network together.pdf")
set.seed(144)
a <- textplot_network(tag_fcm_con, min_freq = 0.8)
b <- textplot_network(tag_fcm_lib, min_freq = 0.8)
plot_grid(a, b, labels = c("Conservative", "Liberal"))
dev.off()
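# A short optional sanity check (base R only, using the objects created above): how many tweets
# ended up labeled "conservative" (1) versus not (0), in counts and in shares?
table(docvars(myDfmTwitter)$conservative)
prop.table(table(docvars(myDfmTwitter)$conservative))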