rm(list = ls(all = TRUE))
getwd()
### set your working directory here!
setwd("C:/Users/mw/Dropbox (VOICES)/TOPIC MODEL")
getwd()

library(rtweet)
library(ggplot2)
library(dplyr)
library(tidytext)
library(readtext)
library(quanteda)
library(httpuv)
library(maps)
library(leaflet)

token <- create_token(
  app = "my_twitter_research_app",
  consumer_key = "YOUR NUMBER",
  consumer_secret = "YOUR NUMBER",
  access_token = "YOUR NUMBER",
  access_secret = "YOUR NUMBER")

## check that the token is loaded
get_token()
identical(token, get_token())

# search_tweets() returns Twitter statuses matching a user-provided search query.
# IT ONLY RETURNS DATA FROM THE PAST 6-9 DAYS.
# Search for up to 1000 (non-retweeted) tweets containing the #rstats hashtag
rt <- search_tweets("#rstats", n = 1000, include_rts = FALSE)

# print tweet text
print(rt$text[1:5])

# tweets can be longer than 140 characters!
nchar(rt$text[1:500])
print(rt$text[493])

colnames(rt)

# you can save your results as a csv file
write_as_csv(rt, "twitter.csv", prepend_ids = TRUE, na = "", fileEncoding = "UTF-8")
# and then re-open it
x <- read.csv("twitter.csv")
str(x)

## plot time series of tweets
ts_plot(rt, "3 hours") +
  ggplot2::theme_minimal() +
  ggplot2::theme(plot.title = ggplot2::element_text(face = "bold")) +
  ggplot2::labs(
    x = NULL, y = NULL,
    title = "Frequency of #rstats Twitter statuses from past 9 days",
    subtitle = "Twitter status (tweet) counts aggregated using three-hour intervals",
    caption = "\nSource: Data collected from Twitter's REST API via rtweet"
  )

## plot the time series of tweet frequency at different resolutions
ts_plot(rt, by = "secs")
ts_plot(rt, by = "mins")
ts_plot(rt, by = "days")

# Next, let's figure out who is tweeting about R using the #rstats hashtag.
# You can access the data of the users discussing #rstats via users_data()
users_data(rt)

# view the column with screen names - top 6
head(rt$screen_name)
# get a list of unique usernames
unique(rt$screen_name)

# You can similarly use the search_users() function to see which users are tweeting
# with a particular hashtag. This function returns just a data.frame of the users
# and information about their accounts.

# which users are tweeting with #rstats (max = 1000)
users <- search_users("#rstats", n = 1000)
# just view the first 2 users - the data frame is large!
head(users, n = 2)

# once again, you can save your results as a csv file
write_as_csv(users, "users.csv", prepend_ids = TRUE, na = "", fileEncoding = "UTF-8")
# and then re-open it
xu <- read.csv("users.csv")
str(xu)

# What is the difference from search_tweets()? With search_tweets() you retrieve a given
# number of tweets, while with search_users() you retrieve a given number of unique users.
# A user who tweets a lot about #rstats counts as a single author in search_users(),
# but his/her tweets appear several times in the data.frame returned by search_tweets().
# Indeed, compare the two results:
length(unique(users$user_id))
length(unique(rt$user_id))

# Let's learn a bit more about these people tweeting about R. First, where are they from?

# how many languages are represented (und = undetermined)
length(unique(users$lang))
count(users, lang, sort = TRUE)

# how many locations are represented
length(unique(users$location))
count(users, location, sort = TRUE)
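# Side note (not part of the original workflow): the language distribution could be plotted
# the same way as the locations below. A minimal sketch, assuming the users data.frame from
# search_users() above; "lang_count" is a hypothetical name introduced here for illustration.
lang_count <- count(users, lang, sort = TRUE)
lang_count <- mutate(lang_count, lang = reorder(lang, n))
lang_count <- top_n(lang_count, 10)
ggplot(lang_count, aes(x = lang, y = n)) +
  geom_col() +
  coord_flip() +
  labs(x = "Language", y = "Count", title = "Most common languages of #rstats users")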
# Let's sort by count and plot just the top locations. To do this you use top_n().
# Note that count() here tallies how many users report each location.
count <- count(users, location, sort = TRUE)
str(count)
# drop users with an empty location field
count <- count[count$location != "", ]
str(count)
count <- mutate(count, location = reorder(location, n))
count <- top_n(count, 20)
ggplot(count, aes(x = location, y = n)) +
  geom_col() +
  coord_flip() +
  labs(x = "Location", y = "Count",
       title = "Where Twitter users are from - unique locations")

# Twitter rate limits cap the number of search results returned to 18,000 every 15 minutes.
# To request more than that, simply set retryonratelimit = TRUE and rtweet will wait
# out the rate-limit resets for you.

## search for 20,000 tweets containing the word "data" (do not run it!)
## rt <- search_tweets("data", n = 20000, retryonratelimit = TRUE)

## search for tweets containing "rstats", including retweets
rtR <- search_tweets("rstats", n = 1000)

## plot multiple time series - retweets vs. non-retweets
ts_plot(group_by(rtR, is_retweet), "hours")

## A more complex search: search for any tweets mentioning "salvini" or "dimaio" written in Italian
ita <- search_tweets("salvini OR dimaio", n = 100, include_rts = FALSE, lang = "it")
print(ita$lang[1:20])
print(ita$text[1:20])

## Get friends
# Retrieve a list of all the accounts a user follows.

## get user IDs of accounts followed by CNN
cnn_fds <- get_friends("cnn")
str(cnn_fds)
length(cnn_fds$user_id)
## look up data on those accounts
cnn_fds_data <- lookup_users(cnn_fds$user_id)
head(cnn_fds_data$name)

## Get followers
# Retrieve a list of the accounts following a user.

## get user IDs of accounts following CNN (just the first 1000 in this example)
cnn_flw <- get_followers("cnn", n = 1000)
## look up data on those accounts
cnn_flw_data <- lookup_users(cnn_flw$user_id)
head(cnn_flw_data$name)

# Or if you really want ALL of their followers:
## how many followers does Curini have?
curini_flw <- get_followers("Curini", retryonratelimit = TRUE)
length(curini_flw$user_id)
Curini <- lookup_users(curini_flw$user_id)
head(Curini$name)

## Get favorites
# Get the 300 most recently favorited statuses by Curini.
fav <- get_favorites("Curini", n = 300)
print(fav$text[1:20])
print(fav$lang[1:20])

## Get trends
# Discover what is currently trending in San Francisco.
sf <- get_trends("san francisco")
sf$trend

## Get timelines
# Get the most recent 200 tweets from some important US political figures
tmls <- get_timeline(
  c("SenSchumer", "SenGillibrand", "realDonaldTrump"),
  n = 200
)
table(tmls$name)

## examine all Twitter activity using weekly intervals
ts_plot(tmls, "weeks")
## group by screen name and plot each time series
ts_plot(dplyr::group_by(tmls, screen_name), "weeks")

# Get the most recent 100 tweets from cnn, BBCWorld, and foxnews.
tmls <- get_timelines(c("cnn", "BBCWorld", "foxnews"), n = 100)
table(tmls$name)
## group by account name and plot each time series, by day
ts_plot(group_by(tmls, name), "days")
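# If you want a quick static comparison of how many tweets were retrieved per account,
# rather than a time series, here is a minimal sketch (not part of the original script),
# assuming the tmls data.frame returned by get_timelines() above and its screen_name column.
ggplot(count(tmls, screen_name), aes(x = screen_name, y = n)) +
  geom_col() +
  labs(x = "Account", y = "Tweets retrieved",
       title = "Number of timeline tweets retrieved per account")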
#########################################################################################
# geographical analysis: problems!
# It works just for the US at the moment
#########################################################################################

Sys.getenv("GOOGLE_MAPS_KEY")
identical(Sys.getenv("GOOGLE_MAPS_KEY"), rtweet:::find_google_geocode_key())

## search for 1000 tweets (in English, no retweets) sent from the US and discussing dinner
rt <- search_tweets(
  "dinner", n = 1000, include_rts = FALSE, lang = "en",
  geocode = lookup_coords("usa"))

## create lat/lng variables using all available tweet and profile geo-location data
rtll <- lat_lng(rt)
str(rtll)

## plot state boundaries
par(mar = c(0, 0, 0, 0))
map("state", lwd = .25)
## plot lat and lng points onto the state map
with(rtll, points(lng, lat, pch = 20, cex = .75, col = rgb(0, .3, .7, .75)))

## alternative plot with leaflet
m2 <- leaflet(rtll)
m2 <- addTiles(m2)  # add the default OpenStreetMap map tiles
m2 <- addMarkers(m2, lng = rtll$lng, lat = rtll$lat, popup = rtll$text)
m2

#########################
## Passing your results to Quanteda
#########################

rt <- search_tweets("#rstats", n = 500, include_rts = FALSE, lang = "en")
print(rt$lang[1:20])
colnames(rt)

# I want to convert the POSIXct time format to a date. Here I choose Greenwich Mean Time (GMT);
# you can also change the time zone, e.g. by selecting tz = "Hongkong"
str(rt$created_at)
rt$date <- as.Date(rt$created_at, tz = "GMT")
str(rt$date)

myCorpusTwitter <- corpus(rt)
summary(myCorpusTwitter)
head(myCorpusTwitter)
texts(myCorpusTwitter)[1:2]

# number of documents
ndoc(myCorpusTwitter)
# inspect the document-level variables
head(docvars(myCorpusTwitter))

# remove_twitter = TRUE means that the Twitter characters @ and # are removed when building the dfm
myDfm <- dfm(myCorpusTwitter, remove = stopwords("english"), remove_punct = TRUE,
             remove_numbers = TRUE, tolower = TRUE, stem = TRUE,
             remove_twitter = TRUE, remove_url = TRUE)
topfeatures(myDfm, 20)  # 20 top words

# inspect the document-feature matrix for the first four documents and first 10 features
myDfm[1:4, 1:10]
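# Possible next step (a sketch, not part of the original script): a dfm like this is the
# typical input for a topic model. quanteda's convert() can turn it into the document-term
# matrix format expected by other packages, e.g. the topicmodels package.
# "myDtm" is a hypothetical object name introduced here for illustration.
myDtm <- convert(myDfm, to = "topicmodels")
str(myDtm, max.level = 1)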