rm(list=ls(all=TRUE))
getwd()
setwd("C:/Users/mw/Dropbox (VOICES)/TOPIC MODEL")
getwd()

library(twitteR)
library(quanteda)
library(readtext)

consumer_key <- "YOUR CONSUMER KEY"
consumer_secret <- "YOUR CONSUMER SECRET"
access_token <- "YOUR ACCESS TOKEN"
access_secret <- "YOUR ACCESS SECRET"
setup_twitter_oauth(consumer_key, consumer_secret, access_token, access_secret)

# The searchString is always required. Terms can contain spaces, and multiple terms should be
# separated with "+" (for example, "Lakers + Knicks" returns all the tweets that contain both
# the word "Lakers" AND the word "Knicks"), or combined with a logical operator such as OR
# (for example, "Lakers OR Knicks" returns all the tweets that contain either "Lakers" or
# "Knicks" or both).
# n: the maximum number of tweets to return. The default is 25.
# lang: if not NULL, restricts tweets to the given language, given by an ISO 639-1 code
# (https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes)
# resultType: specifies the type of search results received in the API response.
# The default is "mixed". Allowed values are "mixed" (includes popular + real-time results),
# "recent" (returns the most recent results) and "popular" (returns only the most popular results).

# How many searches can you make? Clients may not make more than 150 requests per hour.
# See this paper http://www.sciencedirect.com/science/article/pii/S0378873314000057 for a
# discussion of the biases of different methods of data collection.

# Example: a search for the keyword "Trump" in 5 different languages
# (Italian, English, Japanese, Arabic, Chinese)
tweets <- searchTwitter("Trump", n=50, lang="it")
head(tweets)
tweets <- searchTwitter("Trump", n=50, lang="en")
head(tweets)
tweets <- searchTwitter("Trump", n=50, lang="en", resultType="recent")
head(tweets)
tweets <- searchTwitter("Trump", n=50, lang="en", resultType="popular")
head(tweets)
tweets <- searchTwitter("Trump", n=50, lang="ja")
head(tweets)
tweets <- searchTwitter("Trump", n=50, lang="ar")
head(tweets)
tweets <- searchTwitter("Trump", n=50, lang="zh")
head(tweets)

# let's drop the retweets from the preview
tweets <- searchTwitter("Trump", n=50, lang="en")
head(tweets)
head(strip_retweets(tweets, strip_manual=TRUE, strip_mt=TRUE))

# this command transforms the tweets you have downloaded into a data frame
x10 <- twListToDF(tweets)
str(x10)

library(ggplot2)

# tweets with and without hashtags (#)
ggplot(x10, aes(factor(grepl("#", x10$text)))) +
  geom_bar(fill = "midnightblue") +
  theme(legend.position="none", axis.title.x = element_blank()) +
  ylab("Number of tweets") +
  ggtitle("Tweets with Hashtags") +
  scale_x_discrete(labels=c("No hashtags", "Tweets with hashtags"))

# tweets and retweets
ggplot(x10, aes(factor(grepl("RT", x10$text)))) +
  geom_bar(fill = "midnightblue") +
  theme(legend.position="none", axis.title.x = element_blank()) +
  ylab("Number of tweets") +
  ggtitle("Retweeted Tweets") +
  scale_x_discrete(labels=c("Not retweeted", "Retweeted tweets"))

# tweets and replied tweets
ggplot(x10, aes(factor(!is.na(replyToSN)))) +
  geom_bar(fill = "midnightblue") +
  theme(legend.position="none", axis.title.x = element_blank()) +
  ylab("Number of tweets") +
  ggtitle("Replied Tweets") +
  scale_x_discrete(labels=c("Not in reply", "Replied tweets"))

# date and time of the tweets (note: timestamps are reported in UTC)
# https://www.timeanddate.com/worldclock/timezone/utc
x10$created
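# The "created" timestamps above are in UTC. A minimal sketch of converting them to a local
# time zone before looking at daily patterns, using base R only (the target zone
# "Europe/Rome" is just an assumption; pick your own):
x10$created_local <- as.POSIXct(format(x10$created, tz = "Europe/Rome"),
                                tz = "Europe/Rome")
head(x10$created_local)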
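# Because of the rate limits discussed above, it can be useful to check how many search calls
# you have left before launching a larger query (such as the n=1000 search below). twitteR
# provides getCurRateLimitInfo() for this; a quick sketch (note that the exact resource
# names returned by the API may differ):
rate <- getCurRateLimitInfo()
head(rate)
rate[rate$resource == "/search/tweets", ]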
## Search between dates (at most roughly the last 7 days: the Search API goes no further back)
tweets <- searchTwitter('Icardi', since='2017-12-10', until='2017-12-13', lang="it", n=1000)
head(tweets)
length(tweets)
x10 <- twListToDF(tweets)
str(x10)
x10$created

# plotting over time
ggplot(data = x10, aes(x = created)) +
  geom_histogram(aes(fill = ..count..)) +
  theme(legend.position = "none") +
  xlab("Time") +
  ylab("Number of tweets") +
  scale_fill_gradient(low = "midnightblue", high = "aquamarine4")

# plotting over time, differentiating between tweets, retweets and replies
x10$retweet <- as.factor(grepl("RT", x10$text))
str(x10)
x10$type <- "tweet"
x10$type[!is.na(x10$replyToSN)] <- "reply"
x10$type[x10$retweet == "TRUE"] <- "RT"
x10$type <- factor(x10$type, levels = c("tweet", "reply", "RT"))
str(x10)
table(x10$type)
ggplot(data = x10, aes(x = created, fill = type)) +
  geom_histogram() +
  xlab("Time") +
  ylab("Number of tweets") +
  scale_fill_manual(values = c("midnightblue", "deepskyblue4", "aquamarine3"))

# How to retrieve data from the past?
# Generally people study some major event that has already happened (e.g., the Arab Spring,
# an election, etc.). Using the Twitter API this is impossible, as you can only go back a
# small amount, as discussed above.
# However, if you have the ability to look ahead, it is easy to set up a prospective study
# by collecting data and automatically persisting it to a database.
# How to do that?
library(RSQLite)
library(DBI)
# you can also create your own storage online, for example via Amazon:
# http://www.jason-french.com/blog/2014/07/03/using-r-with-mysql-databases/
# and using:
# library(RMySQL)

# create your database and save it in your working directory with the name "db"
mydb <- dbConnect(RSQLite::SQLite(), "db.sqlite")
register_db_backend(mydb)
tweets <- searchTwitter("Trump", n=50, lang="en")
# store your search in the database
store_tweets_db(tweets)
# load the database
from_db = load_tweets_db()
data <- twListToDF(from_db)
str(data)

tweets <- searchTwitter("Renzi", n=50, lang="it")
head(tweets)
store_tweets_db(tweets)
from_db = load_tweets_db()
# now your database has 100 entries!
data <- twListToDF(from_db)
str(data)

# after you finish your session, write
dbDisconnect(mydb)
# if you also write the following command, your database is erased:
# unlink("db.sqlite")

# if you want to reopen your database and add new observations to it,
# simply type the following:
mydb <- dbConnect(RSQLite::SQLite(), "db.sqlite")
register_db_backend(mydb)
tweets <- searchTwitter("Obama", n=50, lang="en")
store_tweets_db(tweets)
from_db = load_tweets_db()
# now your database has 150 entries!
data <- twListToDF(from_db)
str(data)
dbDisconnect(mydb)

# search a specific account
searchTwitter('from:realDonaldTrump', resultType="recent", n=10)
x <- searchTwitter('from:realDonaldTrump', resultType="recent", n=10)
str(x)
some_txt2 <- sapply(x, function(x) x$getText())
some_txt2
# I want to delete the retweets
x2 <- head(strip_retweets(x, strip_manual=TRUE, strip_mt=TRUE))
some_txt3 <- sapply(x2, function(x) x$getText())
str(some_txt3)
some_txt3

# as an alternative (although the previous approach is preferable)
userTimeline('realDonaldTrump', n=10, includeRts=TRUE)
# without retweets
userTimeline('realDonaldTrump', n=10)

# looking at users
trump <- getUser("realDonaldTrump")
str(trump)
trump$getDescription()
trump$getFollowersCount()
trump$getFriendsCount()
# in chronological order
trump$getFriends(n=5)
trump$getFavorites(n=5)
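# getUser() inspects one account at a time; to compare several accounts at once you can use
# lookupUsers(), also from twitteR. A small sketch (the screen names are arbitrary examples,
# and the columns shown are those twListToDF() typically returns for user objects):
pols <- lookupUsers(c("realDonaldTrump", "BarackObama", "matteorenzi"))
pols_df <- twListToDF(pols)
pols_df[, c("screenName", "followersCount", "friendsCount", "statusesCount")]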
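# a user's timeline can also be converted into a data frame, which makes it easy to summarize
# engagement. A sketch, assuming the retweetCount and favoriteCount columns that twListToDF()
# usually produces for status objects:
tl <- userTimeline('realDonaldTrump', n=50, includeRts=FALSE)
tl_df <- twListToDF(tl)
summary(tl_df$retweetCount)
summary(tl_df$favoriteCount)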
# geolocation analysis
# let's geolocate the search in New York City!
library(maps)
data(world.cities)
str(world.cities)
world.cities[which(world.cities$name == "New York"),]
library(ggmap)
geocode("New York")
geocode("Italy")
library(leaflet)
x2 <- searchTwitter("#dinner", geocode='40.75,-74,10km', n=300)
head(x2)
length(x2)
nyc <- twListToDF(x2)
str(nyc)
# However, most of these tweets have no coordinates. Why is that? Only explicitly geotagged
# tweets carry longitude/latitude values; if you check the users of the other tweets, you
# should find that they simply set a profile location that falls within 10 km of 40.75,-74.
sum(is.na(nyc$longitude))
sum(!is.na(nyc$longitude))
table(nyc$longitude)
table(nyc$latitude)
str(nyc)
# transform longitude & latitude to numeric
nyc$longitude <- as.numeric(nyc$longitude)
nyc$latitude <- as.numeric(nyc$latitude)
str(nyc)
# plot your graph!
m <- leaflet(nyc)
m <- addTiles(m)
m <- addMarkers(m, lng=nyc$longitude, lat=nyc$latitude, popup=nyc$text)
m

# alternatively (there are several other ways to map data with R! Explore them by yourself!)
library(ggplot2)
library(grid)
map.data <- map_data("state")
points <- data.frame(x = nyc$longitude, y = nyc$latitude)
ggplot(map.data) +
  geom_map(aes(map_id = region), map = map.data, fill = "white", color = "grey20", size = 0.25) +
  expand_limits(x = map.data$long, y = map.data$lat) +
  theme(axis.line = element_blank(), axis.text = element_blank(),
        axis.ticks = element_blank(), axis.title = element_blank(),
        plot.margin = unit(0 * c(-1.5, -1.5, -1.5, -1.5), "lines")) +
  geom_point(data = points, aes(x = x, y = y), size = 3, alpha = 1/3, color = "darkblue")

# zoom in
nyc_plot <- ggplot(map.data) +
  geom_map(aes(map_id = region), map = map.data, fill = "white", color = "grey20", size = 0.25) +
  expand_limits(x = map.data$long, y = map.data$lat) +
  theme(axis.line = element_blank(), axis.text = element_blank(),
        axis.ticks = element_blank(), axis.title = element_blank(),
        plot.margin = unit(0 * c(-1.5, -1.5, -1.5, -1.5), "lines")) +
  geom_point(data = points, aes(x = x, y = y), size = 3, alpha = 1/3, color = "darkblue")
nyc_plot
nyc_plot + coord_fixed(xlim=c(-74.12,-73.9), ylim=c(40.58,40.87), ratio = 1.5)

# finding trends on Twitter
avail_trends = availableTrendLocations()
head(avail_trends)
str(avail_trends)
fix(avail_trends)  # opens an interactive spreadsheet view of the data frame
table(avail_trends$country)
avail_trends[which(avail_trends$country == "United Kingdom"),]
trends = getTrends(44418)  # 44418 is the WOEID (Where On Earth ID) for London
head(trends)
world.cities[which(world.cities$name == "London"),]
close_trends = closestTrendLocations(51.52, -0.10)
head(close_trends)

# tweets from which sources?
r_tweets <- searchTwitter("renzi", n=100)
str(r_tweets)
sources <- sapply(r_tweets, function(x) x$getStatusSource())
sources
# now some cleaning: each source looks like '<a href="...">Twitter for iPhone</a>', so
# strip the closing tag, split on ">", and keep the human-readable part
sources <- gsub("</a>", "", sources)
sources <- strsplit(sources, ">")
sources <- sapply(sources, function(x) ifelse(length(x) > 1, x[2], x[1]))
table(sources)
source_table = table(sources)
pie(source_table[source_table > 10])

# get the tweets and pass everything to quanteda for later text analysis
x <- searchTwitter("renzi", n=10, lang="it")
str(x)
x20 <- twListToDF(x)
str(x20)
head(x20)
# save it directly as a corpus!
myCorpusTwitter <- corpus(x20)
summary(myCorpusTwitter)
# remove_twitter = TRUE means that, when building the dfm, the Twitter characters @ and # are removed
myDfm3 <- dfm(myCorpusTwitter, remove = stopwords("italian"), remove_punct = TRUE,
              remove_numbers = TRUE, tolower = TRUE, stem = TRUE,
              remove_twitter = TRUE, remove_url = TRUE)
topfeatures(myDfm3, 20)  # 20 top words

# exercise: download 50 tweets about Trump written in English, create a dfm without stemming
# and apply wordfish to them
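# a possible starting point for the exercise: a sketch written against the same (older)
# quanteda API used above, where textmodel_wordfish() is part of quanteda itself (in recent
# versions it has moved to the quanteda.textmodels package)
xT <- searchTwitter("Trump", n=50, lang="en")
trump_df <- twListToDF(xT)
trumpCorpus <- corpus(trump_df)
trumpDfm <- dfm(trumpCorpus, remove = stopwords("english"), remove_punct = TRUE,
                remove_numbers = TRUE, tolower = TRUE, stem = FALSE,
                remove_twitter = TRUE, remove_url = TRUE)
# dir = c(1, 2) anchors the first two tweets; an arbitrary choice for illustration
wf <- textmodel_wordfish(trumpDfm, dir = c(1, 2))
summary(wf)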