rm(list = ls(all = TRUE))
setwd("C:/Users/luigi/Dropbox/TOPIC MODEL")
getwd()

library(rtweet)
library(readtext)
library(quanteda)
library(ggplot2)
library(ggmap)
library(httpuv)
library(dplyr)
library(stringr)
library(quanteda.textstats)
library(quanteda.textplots)

# The "search_tweets" command implements the REST API search:
# it returns Twitter statuses matching a user-provided search query.
# It ONLY RETURNS DATA FROM THE PAST 6-9 DAYS
# (exception: user timelines, as we will see - the 3,200 most recent tweets are available).

# Search for up to 200 tweets written in English containing the #rstats hashtag, excluding retweets.
rt <- search_tweets("#rstats", n = 200, lang = "en", include_rts = FALSE)
length(rt$text)

# days covered by our analysis
since <- rt$created_at[nrow(rt)]
latest <- rt$created_at[1]
cat("Twitter data", "\n", paste("From:", since), "\n", paste(" To:", latest))

# The 'created_at' field provides a timestamp for when the tweet was authored.
# This timestamp is useful, but remember: it is expressed in Greenwich Mean Time!
# Unless the tweet happens to come from that timezone, its time needs to be adjusted
# to account for this discrepancy. Not always an easy task!

# By default, search_tweets returns the most recent tweets first. You can instead specify
# type = "mixed" to get a blend of recent and popular tweets that spreads over a couple of
# days, or type = "popular" to retrieve only popular tweets.

# print tweet text
strwrap(print(rt$text[1:5]))

# lots of info about each single tweet
dplyr::glimpse(rt)
colnames(rt)

# What are the most popular hashtags at the moment? We can use regular expressions to extract them
ht <- str_extract_all(rt$text, '#[A-Za-z0-9_]+')
ht <- unlist(ht)
head(sort(table(ht), decreasing = TRUE))

# And who are the most frequently mentioned users?
handles <- str_extract_all(rt$text, '@[0-9_A-Za-z]+')
handles_vector <- unlist(handles)
head(sort(table(handles_vector), decreasing = TRUE), n = 10)

# How many tweets mention "data"? With ignore.case = FALSE the pattern matching is case
# sensitive; with ignore.case = TRUE it is not. Note that grep also matches "data" inside
# longer words such as "datascience".
length(grep("data", rt$text, ignore.case = TRUE))
grep("data", rt$text, ignore.case = TRUE, value = TRUE)
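# An optional sketch building on the hashtag counts computed above: plot the 10 most
# frequent hashtags with ggplot2 (already loaded). The object 'ht' is the vector of
# extracted hashtags created earlier; the object names and labels here are just illustrative.
ht_top <- as.data.frame(head(sort(table(ht), decreasing = TRUE), 10))
colnames(ht_top) <- c("hashtag", "n")
ggplot(ht_top, aes(x = reorder(hashtag, n), y = n)) +
  geom_col() +
  coord_flip() +
  labs(x = NULL, y = "Frequency", title = "Top 10 hashtags in the #rstats sample")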
# The search query must be a character string of at most 500 characters.
# Spaces behave like the boolean "AND" operator. To search for tweets containing at least
# one of several possible terms, separate the terms with "OR" (in caps).
# For example, the query q = "data science" looks for tweets containing both "data" and
# "science", located anywhere in the tweet and in any order.
# When "OR" is entered between search terms, e.g. q = "data OR science", Twitter returns
# any tweet that contains either "data" or "science".
# It is also possible to search for exact phrases using double quotes. To do this, either
# wrap single quotes around a query that uses double quotes, e.g., q = '"data science"',
# or escape each internal double quote with a backslash, e.g., q = "\"data science\"".
# example: rt <- search_tweets('"data science"', n = 100, lang = "en", include_rts = FALSE)

# An obvious (but still very important) point about the query you make:
# as a researcher, when you acquire your corpus you need to ensure that the texts under
# examination are related to the research question you are interested in and are
# theoretically consistent.
# You therefore want to generate a list of keywords for your query that includes all
# relevant texts in the corpus (i.e., minimizes false negatives) and excludes irrelevant
# texts (i.e., minimizes false positives).

# you can then save your results as a csv file
# write_as_csv(rt, "twitter.csv", prepend_ids = TRUE, na = "", fileEncoding = "UTF-8")

# Another interesting option is the "filter" operator in a search query.
# For example, let's retrieve only tweets that include some type of media (mainly photos)
# and discuss "war"
rt1 <- search_tweets("war filter:media", n = 200, include_rts = FALSE)
length(rt1$text)
# print tweet text
strwrap(print(rt1$text[1:5]))
dplyr::glimpse(rt1)
rt1$status_url[1:5]

# just tweets including a link to news articles
rt1 <- search_tweets("war filter:news", n = 200, include_rts = FALSE)
rt1$status_url[1:10]

# you can also combine both requests (i.e., tweets including either media or a news link;
# note the OR below)
rt1 <- search_tweets("war filter:news OR media", n = 200, include_rts = FALSE)

# or you can filter in a negative way
rt1 <- search_tweets("war -filter:media", n = 200, include_rts = FALSE)
rt1$status_url[1:5]

# other possible filters:
# Exclude quotes via "-filter:quote"
# Exclude replies via "-filter:replies"
# Return only verified accounts via "filter:verified"
# Return only retweets via "filter:retweets"

## plot time series of tweets
ts_plot(rt, "hours") +
  theme_minimal() +
  theme(plot.title = element_text(face = "bold")) +
  labs(
    x = NULL, y = NULL,
    title = "Frequency of #rstats Twitter statuses from past 6-9 days",
    subtitle = "Twitter status (tweet) counts aggregated using one-hour intervals",
    caption = "Source: Data collected from Twitter's REST API via rtweet"
  )

## plot time series of tweet frequency at other resolutions
ts_plot(rt, by = "mins")
ts_plot(rt, by = "days")

# Next, let's figure out who is tweeting about R using the #rstats hashtag.
# You can access the user-level data behind these tweets via users_data()
users_data(rt)

# view column with screen names
head(rt$screen_name)
# get a list of unique usernames
unique(rt$screen_name)

# You can similarly use the search_users() function to download a given number of users
# with #rstats in their profiles (max = 1,000)
users <- search_users("#rstats", n = 100)
# once again, you can then save your results as a csv file
# write_as_csv(users, "users.csv", prepend_ids = TRUE, na = "", fileEncoding = "UTF-8")

# What's the difference with search_tweets? With search_tweets you retrieve a given number
# of tweets; with search_users you retrieve a given number of UNIQUE users. If a user
# tweets a lot about #rstats, he/she counts as one author when using search_users, but
# his/her tweets appear several times in the data frame returned by search_tweets.
# Indeed, compare the two results:
length(unique(users$user_id))
length(unique(rt$user_id))

# Let's learn a bit more about these people tweeting about R.
# How many languages are represented? (und = undetermined)
length(unique(users$lang))
count(users, lang, sort = TRUE)

# And from which locations are they tweeting? More on this next week!
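# An optional sketch building on the language counts just above: a bar chart of the
# languages declared by the #rstats users. This assumes the 'lang' column returned by
# search_users() is populated (on newer API responses it may be mostly NA).
lang_counts <- count(users, lang, sort = TRUE)
lang_counts <- filter(lang_counts, !is.na(lang))  # drop missing languages, if any
ggplot(head(lang_counts, 10), aes(x = reorder(lang, n), y = n)) +
  geom_col() +
  coord_flip() +
  labs(x = NULL, y = "Number of users", title = "Top languages among #rstats users")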
# Twitter rate limits cap the number of search results returned to 18,000 every 15 minutes.
# To request more than that, simply set retryonratelimit = TRUE and rtweet will wait for
# rate limit resets for you.
## search for 20,000 tweets containing the word data (do not run it!)
## rt <- search_tweets("data", n = 20000, retryonratelimit = TRUE)

# As an alternative: use the bearer_token()! It only works with valid tokens generated from
# a user-created Twitter app (which requires a Twitter developer account). Unlike the default
# token returned by create_token, bearer-token requests cannot perform user actions (e.g.,
# posting tweets, reading DMs), and the information returned by Twitter will not include
# user-specific variables (e.g., whether the user follows a certain account).
# The upside of this authentication method is that it affords more generous rate limits.
# For example, the rate limit for the standard search API is 18,000 tweets per fifteen
# minutes; with a bearer token it is 45,000 tweets per fifteen minutes.
# rt <- search_tweets("#rstats", n = 30000, token = bearer_token())

## search for tweets containing "#rstats", this time including retweets
rtR <- search_tweets("#rstats", n = 100)
## plot multiple time series: retweets vs non-retweets
ts_plot(group_by(rtR, is_retweet), "mins")

# What is the most retweeted tweet?
x <- rtR[which.max(rtR$retweet_count), ]
print(x$retweet_count)
print(x$text)
print(x$screen_name)

## Get friends
# Retrieve a list of all the accounts a user follows.
## get user IDs of accounts followed by CNN
cnn_fds <- get_friends("cnn")
str(cnn_fds)
length(cnn_fds$user_id)
## look up data on those accounts
cnn_fds_data <- lookup_users(cnn_fds$user_id)
head(cnn_fds_data$name)

## Get followers
# Retrieve a list of the accounts following a user.
## get user IDs of accounts following CNN (just the first 100 in this example)
cnn_flw <- get_followers("cnn", n = 100)
## look up data on those accounts
cnn_flw_data <- lookup_users(cnn_flw$user_id)
head(cnn_flw_data$name)

# Or if you really want ALL of their followers:
## how many followers does Curini have?
curini_flw <- get_followers("Curini", retryonratelimit = TRUE)
length(curini_flw$user_id)
# If you want the list of them, you can use lookup_users (but it will take a while)
# Curini <- lookup_users(curini_flw$user_id)
# head(Curini$name)

## Get favorites
# Get the 10 most recently favorited statuses by Joe Biden.
fav <- get_favorites("JoeBiden", n = 10)
print(fav$text[1:10])
print(fav$lang[1:10])

## Get trends
# Discover what's currently trending in London.
trends_london <- get_trends("london")
trends_london$trend

## Get timelines: for example, let's get the most recent 100 tweets from some important
## US political figures.
# N.B. to do that, you need to pass your token to rtweet.
# calling my personal token
tokenTwitter <- readRDS("twittertoken.rds")
# You should create your own along the lines of the example below:
# tokenTwitter <- create_token(
#   app = [your_twitter_api_app],
#   consumer_key = [your_api_consumer_key],
#   consumer_secret = [your_api_consumer_secret])
# How to retrieve this info? Read this link:
# https://cran.r-project.org/web/packages/rtweet/vignettes/auth.html

tmls <- get_timeline(c("BernieSanders", "JoeBiden"), token = tokenTwitter, n = 100)
table(tmls$name)
## group the timelines by screen name and plot each time series
ts_plot(group_by(tmls, screen_name), "days")
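# A small optional follow-up sketch using the 'tmls' data frame built above: find each
# account's single most retweeted status in the sample (column names are assumed to follow
# the rtweet conventions used earlier in this script).
tmls %>%
  group_by(screen_name) %>%
  slice_max(retweet_count, n = 1, with_ties = FALSE) %>%
  select(screen_name, retweet_count, text)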
# The cap for a user timeline is 3,200 tweets.
# How to move beyond this limit? You can apply for an academic license on Twitter!
# Please take a look here:
# https://blog.twitter.com/developer/en_us/topics/tools/2021/enabling-the-future-of-academic-research-with-the-twitter-api

#########################
## A search query with an emoji
#########################
# for the list of emoji: https://github.com/hadley/emo
library(emo)
x <- emo::ji("smile")
x
smile <- search_tweets(x, n = 200, lang = "en", include_rts = FALSE)
print(smile$text[1:5])

#########################
## Passing your rtweet results to Quanteda
#########################
rt <- search_tweets("#rstats", n = 100, include_rts = FALSE, lang = "en")
myCorpusTwitter <- corpus(rt)
as.character(myCorpusTwitter)[1:2]
# number of documents
ndoc(myCorpusTwitter)
# inspect the document-level variables
head(docvars(myCorpusTwitter))

tok <- tokens(myCorpusTwitter, remove_punct = TRUE, remove_numbers = TRUE,
              remove_symbols = TRUE, split_hyphens = TRUE,
              remove_separators = TRUE, remove_url = TRUE)
tok <- tokens_remove(tok, stopwords("english"))
tok <- tokens_wordstem(tok, language = "english")
myDfm <- dfm(tok)
topfeatures(myDfm, 20)  # 20 top words

# Let's see the document-feature matrix for the first four documents and first 10 features
myDfm[1:4, 1:10]

# We already saw how to answer these questions above, but now we can do it directly with
# quanteda functions.
# What are the most popular hashtags at the moment?
dfm_hashtag <- dfm_select(myDfm, pattern = c("#*"))
topfeatures(dfm_hashtag, 20)  # 20 top hashtags

# And who are the most frequently mentioned users?
dfm_at <- dfm_select(myDfm, pattern = c("@*"))
topfeatures(dfm_at, 20)  # 20 top accounts

# How many tweets mention "data"?
kwic(tok, "*data*")
x <- kwic(tok, "*data*")
unique(x$docname)

# Before creating your DfM, you may want to clean up the tweets (if you need it...)
txt <- "This is a @username and #hashtag. https://twitter.com/home"
txt

# preserve social media tags (default)
tokens(txt, remove_punct = TRUE)

# remove hashtag tags
# (in words: replace each "#" with "")
x2 <- gsub("#", "", txt)
# then tokenize x2
tokens(x2, remove_punct = TRUE)

# remove hashtag tags and "@" (in words: replace each "#" and "@" with "")
x2 <- gsub("\\#|@", "", txt)
# then tokenize x2
tokens(x2, remove_punct = TRUE)

# remove URLs
x2 <- gsub("http.*", "", txt)
# then tokenize x2
tokens(x2, remove_punct = TRUE)

# NOTE: you can remove URLs directly via the tokens command, as we did above
tokens(txt, remove_punct = TRUE, remove_url = TRUE)

# remove social media tags and URLs at the same time
x2 <- gsub("\\#|@|http.*", "", txt)
# then tokenize x2
tokens(x2, remove_punct = TRUE)
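# An alternative, token-level sketch: instead of editing the raw text with gsub(), note
# that quanteda keeps "#" and "@" attached to their words (as shown above), so hashtags
# and mentions can simply be dropped after tokenization using glob patterns.
toks_txt <- tokens(txt, remove_punct = TRUE, remove_url = TRUE)
tokens_remove(toks_txt, pattern = c("#*", "@*"))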
#########################
## An example of a possible analysis applying something we learnt in Lab 1!
#########################
# Let's make two queries: one using the query "liberal*" and the other using the query
# "conservative*"
lib <- search_tweets("liberal*", n = 1000, include_rts = FALSE, lang = "en")
cons <- search_tweets("conservative*", n = 1000, include_rts = FALSE, lang = "en")
print(lib$text[1:10])
print(cons$text[1:10])

## create a query variable
lib$query <- "Liberal"
cons$query <- "Conservative"

## row bind into a single data frame
df <- rbind(lib, cons)

# let's graph the time trend
ts_plot(group_by(df, query), by = "15 mins")

myCorpusTwitter <- corpus(df)
tok <- tokens(myCorpusTwitter, remove_punct = TRUE, remove_numbers = TRUE,
              remove_symbols = TRUE, split_hyphens = TRUE,
              remove_separators = TRUE, remove_url = TRUE)
tok <- tokens_remove(tok, stopwords("english"))
tok <- tokens_wordstem(tok, language = "english")
myDfm <- dfm(tok)
topfeatures(myDfm, 20)

# Let's look at the differences in language between Conservative and Liberal tweets.
# FIRST: compute the lexical diversity
myDfm2 <- dfm_group(myDfm, groups = query)
lexdiv <- textstat_lexdiv(myDfm2)
str(lexdiv)

ggplot(data = lexdiv, aes(x = document, y = TTR)) +
  geom_bar(stat = "identity", color = "blue", fill = "white") +
  coord_flip()

# SECOND: a comparison tag-cloud
set.seed(123)
textplot_wordcloud(dfm_trim(myDfm2, min_termfreq = 20, verbose = FALSE), comparison = TRUE)

# let's drop the two features "liber" and "conserv" from the DfM to make the graph much
# more interpretable!
myDfm2 <- dfm_remove(myDfm2, c("liber", "conserv"))
set.seed(123)
textplot_wordcloud(dfm_trim(myDfm2, min_termfreq = 20, verbose = FALSE), comparison = TRUE)

# THIRD: a keyness analysis using the "Conservative" tweets as the target group
table(myDfm$query)
tstat_key <- textstat_keyness(myDfm, target = myDfm$query == "Conservative")
textplot_keyness(tstat_key)

# let's drop once again the two features "liber" and "conserv" from the DfM to make the
# graph much more interpretable!
myDfm2 <- dfm_remove(myDfm, c("liber", "conserv"))
tstat_key <- textstat_keyness(myDfm2, target = myDfm2$query == "Conservative")
textplot_keyness(tstat_key)
head(tstat_key, 10)
tail(tstat_key, 10)
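# A final optional sketch: as a complement to the keyness analysis, compare the 15 most
# frequent (stemmed) features within each query group using textstat_frequency() from
# quanteda.textstats (already loaded).
freq_by_query <- textstat_frequency(myDfm2, n = 15, groups = query)
subset(freq_by_query, group == "Conservative")
subset(freq_by_query, group == "Liberal")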