rm(list=ls(all=TRUE))
getwd()
setwd("C:/Users/mw/Dropbox (VOICES)/TOPIC MODEL")
getwd()

library(twitteR)
library(quanteda)
library(readtext)

consumer_key <- "YOUR CONSUMER KEY"
consumer_secret <- "YOUR CONSUMER SECRET"
access_token <- "YOUR ACCESS TOKEN"
access_secret <- "YOUR ACCESS SECRET"
setup_twitter_oauth(consumer_key, consumer_secret, access_token, access_secret)

# The searchString is always required. Terms can contain spaces, and multiple terms should be
# separated with "+" (for example, "Lakers + Knicks" returns all the tweets that contain both
# the word "Lakers" AND the word "Knicks"), or combined with a logical operator such as OR
# (for example, "Lakers OR Knicks" returns all the tweets that contain either "Lakers" or
# "Knicks" or both).
# n: the maximum number of tweets to return. The default is 25.
# lang: if not NULL, restricts tweets to the given language, given by an ISO 639-1 code
# (https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes)
# resultType: specifies the type of search results received in the API response.
# The default is "mixed". Allowed values are "mixed" (includes popular + real-time results),
# "recent" (returns the most recent results) and "popular" (returns only the most popular results).

# How many searches can you make? Clients may not make more than 150 requests per hour.
# See this paper http://www.sciencedirect.com/science/article/pii/S0378873314000057 for a
# discussion of the biases of different methods of data collection.

# Example: a search for the keyword "Trump" in 5 different languages
# (Italian, English, Japanese, Arabic, Chinese)
tweets <- searchTwitter("Trump", n=50, lang="it")
head(tweets)
tweets <- searchTwitter("Trump", n=50, lang="en")
head(tweets)
tweets <- searchTwitter("Trump", n=50, lang="en", resultType="recent")
head(tweets)
tweets <- searchTwitter("Trump", n=50, lang="en", resultType="popular")
head(tweets)
tweets <- searchTwitter("Trump", n=50, lang="ja")
head(tweets)
tweets <- searchTwitter("Trump", n=50, lang="ar")
head(tweets)
tweets <- searchTwitter("Trump", n=50, lang="zh")
head(tweets)

# let's drop the retweets from the preview
tweets <- searchTwitter("Trump", n=50, lang="en")
head(tweets)
head(strip_retweets(tweets, strip_manual=TRUE, strip_mt=TRUE))

# this command transforms the tweets you have downloaded into a data frame
x10 <- twListToDF(tweets)
str(x10)

library(ggplot2)

# tweets with and without hashtags (#)
ggplot(x10, aes(factor(grepl("#", x10$text)))) +
  geom_bar(fill = "midnightblue") +
  theme(legend.position="none", axis.title.x = element_blank()) +
  ylab("Number of tweets") +
  ggtitle("Tweets with Hashtags") +
  scale_x_discrete(labels=c("No hashtags", "Tweets with hashtags"))

# tweets and retweets
ggplot(x10, aes(factor(grepl("RT", x10$text)))) +
  geom_bar(fill = "midnightblue") +
  theme(legend.position="none", axis.title.x = element_blank()) +
  ylab("Number of tweets") +
  ggtitle("Retweeted Tweets") +
  scale_x_discrete(labels=c("Not retweeted", "Retweeted tweets"))

# tweets and replied tweets
ggplot(x10, aes(factor(!is.na(replyToSN)))) +
  geom_bar(fill = "midnightblue") +
  theme(legend.position="none", axis.title.x = element_blank()) +
  ylab("Number of tweets") +
  ggtitle("Replied Tweets") +
  scale_x_discrete(labels=c("Not in reply", "Replied tweets"))

# date and time of the tweets (note: timestamps are reported in UTC)
# https://www.timeanddate.com/worldclock/timezone/utc
x10$created
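# The "created" timestamps above are in UTC. A minimal sketch of converting them to a local
# time zone before looking at daily patterns, using base R only (the target zone
# "Europe/Rome" is just an assumption; pick your own):
x10$created_local <- as.POSIXct(format(x10$created, tz = "Europe/Rome"),
                                tz = "Europe/Rome")
head(x10$created_local)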
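# Because of the rate limits discussed above, it can be useful to check how many search calls
# you have left before launching a larger query (such as the n=1000 search below). twitteR
# provides getCurRateLimitInfo() for this; a quick sketch (note that the exact resource
# names returned by the API may differ):
rate <- getCurRateLimitInfo()
head(rate)
rate[rate$resource == "/search/tweets", ]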
## Search between dates (at most roughly the last 7 days: the Search API goes no further back)
tweets <- searchTwitter('Icardi', since='2017-12-10', until='2017-12-13', lang="it", n=1000)
head(tweets)
length(tweets)
x10 <- twListToDF(tweets)
str(x10)
x10$created

# plotting over time
ggplot(data = x10, aes(x = created)) +
  geom_histogram(aes(fill = ..count..)) +
  theme(legend.position = "none") +
  xlab("Time") +
  ylab("Number of tweets") +
  scale_fill_gradient(low = "midnightblue", high = "aquamarine4")

# plotting over time, differentiating between tweets, retweets and replies
x10$retweet <- as.factor(grepl("RT", x10$text))
str(x10)
x10$type <- "tweet"
x10$type[!is.na(x10$replyToSN)] <- "reply"
x10$type[x10$retweet == "TRUE"] <- "RT"
x10$type <- factor(x10$type, levels = c("tweet", "reply", "RT"))
str(x10)
table(x10$type)
ggplot(data = x10, aes(x = created, fill = type)) +
  geom_histogram() +
  xlab("Time") +
  ylab("Number of tweets") +
  scale_fill_manual(values = c("midnightblue", "deepskyblue4", "aquamarine3"))

# How to retrieve data from the past?
# Generally people study some major event that has already happened (e.g., the Arab Spring,
# an election, etc.). Using the Twitter API this is impossible, as you can only go back a
# small amount, as discussed above.
# However, if you have the ability to look ahead, it is easy to set up a prospective study
# by collecting data and automatically persisting it to a database.
# How to do that?
library(RSQLite)
library(DBI)
# you can also create your own storage online, for example via Amazon:
# http://www.jason-french.com/blog/2014/07/03/using-r-with-mysql-databases/
# and using:
# library(RMySQL)

# create your database and save it in your working directory with the name "db"
mydb <- dbConnect(RSQLite::SQLite(), "db.sqlite")
register_db_backend(mydb)
tweets <- searchTwitter("Trump", n=50, lang="en")
# store your search in the database
store_tweets_db(tweets)
# load the database
from_db = load_tweets_db()
data <- twListToDF(from_db)
str(data)

tweets <- searchTwitter("Renzi", n=50, lang="it")
head(tweets)
store_tweets_db(tweets)
from_db = load_tweets_db()
# now your database has 100 entries!
data <- twListToDF(from_db)
str(data)

# after you finish your session, write
dbDisconnect(mydb)
# if you also write the following command, your database is erased:
# unlink("db.sqlite")

# if you want to reopen your database and add new observations to it,
# simply type the following:
mydb <- dbConnect(RSQLite::SQLite(), "db.sqlite")
register_db_backend(mydb)
tweets <- searchTwitter("Obama", n=50, lang="en")
store_tweets_db(tweets)
from_db = load_tweets_db()
# now your database has 150 entries!
data <- twListToDF(from_db)
str(data)
dbDisconnect(mydb)

# search a specific account
searchTwitter('from:realDonaldTrump', resultType="recent", n=10)
x <- searchTwitter('from:realDonaldTrump', resultType="recent", n=10)
str(x)
some_txt2 <- sapply(x, function(x) x$getText())
some_txt2
# I want to delete the retweets
x2 <- head(strip_retweets(x, strip_manual=TRUE, strip_mt=TRUE))
some_txt3 <- sapply(x2, function(x) x$getText())
str(some_txt3)
some_txt3

# as an alternative (although the previous approach is preferable)
userTimeline('realDonaldTrump', n=10, includeRts=TRUE)
# without retweets
userTimeline('realDonaldTrump', n=10)

# looking at users
trump <- getUser("realDonaldTrump")
str(trump)
trump$getDescription()
trump$getFollowersCount()
trump$getFriendsCount()
# in chronological order
trump$getFriends(n=5)
trump$getFavorites(n=5)
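# getUser() inspects one account at a time; to compare several accounts at once you can use
# lookupUsers(), also from twitteR. A small sketch (the screen names are arbitrary examples,
# and the columns shown are those twListToDF() typically returns for user objects):
pols <- lookupUsers(c("realDonaldTrump", "BarackObama", "matteorenzi"))
pols_df <- twListToDF(pols)
pols_df[, c("screenName", "followersCount", "friendsCount", "statusesCount")]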
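# a user's timeline can also be converted into a data frame, which makes it easy to summarize
# engagement. A sketch, assuming the retweetCount and favoriteCount columns that twListToDF()
# usually produces for status objects:
tl <- userTimeline('realDonaldTrump', n=50, includeRts=FALSE)
tl_df <- twListToDF(tl)
summary(tl_df$retweetCount)
summary(tl_df$favoriteCount)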
# geolocation analysis
# let's geolocate the search in New York City!
library(maps)
data(world.cities)
str(world.cities)
world.cities[which(world.cities$name == "New York"),]
library(ggmap)
geocode("New York")
geocode("Italy")
library(leaflet)
x2 <- searchTwitter("#dinner", geocode='40.75,-74,10km', n=300)
head(x2)
length(x2)
nyc <- twListToDF(x2)
str(nyc)
# However, most of these tweets have no coordinates. Why is that? Only explicitly geotagged
# tweets carry longitude/latitude values; if you check the users of the other tweets, you
# should find that they simply set a profile location that falls within 10 km of 40.75,-74.
sum(is.na(nyc$longitude))
sum(!is.na(nyc$longitude))
table(nyc$longitude)
table(nyc$latitude)
str(nyc)
# transform longitude & latitude to numeric
nyc$longitude <- as.numeric(nyc$longitude)
nyc$latitude <- as.numeric(nyc$latitude)
str(nyc)
# plot your graph!
m <- leaflet(nyc)
m <- addTiles(m)
m <- addMarkers(m, lng=nyc$longitude, lat=nyc$latitude, popup=nyc$text)
m

# alternatively (there are several other ways to map data with R! Explore them by yourself!)
library(ggplot2)
library(grid)
map.data <- map_data("state")
points <- data.frame(x = nyc$longitude, y = nyc$latitude)
ggplot(map.data) +
  geom_map(aes(map_id = region), map = map.data, fill = "white", color = "grey20", size = 0.25) +
  expand_limits(x = map.data$long, y = map.data$lat) +
  theme(axis.line = element_blank(), axis.text = element_blank(),
        axis.ticks = element_blank(), axis.title = element_blank(),
        plot.margin = unit(0 * c(-1.5, -1.5, -1.5, -1.5), "lines")) +
  geom_point(data = points, aes(x = x, y = y), size = 3, alpha = 1/3, color = "darkblue")

# zoom in
nyc_plot <- ggplot(map.data) +
  geom_map(aes(map_id = region), map = map.data, fill = "white", color = "grey20", size = 0.25) +
  expand_limits(x = map.data$long, y = map.data$lat) +
  theme(axis.line = element_blank(), axis.text = element_blank(),
        axis.ticks = element_blank(), axis.title = element_blank(),
        plot.margin = unit(0 * c(-1.5, -1.5, -1.5, -1.5), "lines")) +
  geom_point(data = points, aes(x = x, y = y), size = 3, alpha = 1/3, color = "darkblue")
nyc_plot
nyc_plot + coord_fixed(xlim=c(-74.12,-73.9), ylim=c(40.58,40.87), ratio = 1.5)

# finding trends on Twitter
avail_trends = availableTrendLocations()
head(avail_trends)
str(avail_trends)
fix(avail_trends)  # opens an interactive spreadsheet view of the data frame
table(avail_trends$country)
avail_trends[which(avail_trends$country == "United Kingdom"),]
trends = getTrends(44418)  # 44418 is the WOEID (Where On Earth ID) for London
head(trends)
world.cities[which(world.cities$name == "London"),]
close_trends = closestTrendLocations(51.52, -0.10)
head(close_trends)

# tweets from which sources?
r_tweets <- searchTwitter("renzi", n=100)
str(r_tweets)
sources <- sapply(r_tweets, function(x) x$getStatusSource())
sources
# now some cleaning: each source looks like '<a href="...">Twitter for iPhone</a>', so
# strip the closing tag, split on ">", and keep the human-readable part
sources <- gsub("</a>", "", sources)
sources <- strsplit(sources, ">")
sources <- sapply(sources, function(x) ifelse(length(x) > 1, x[2], x[1]))
table(sources)
source_table = table(sources)
pie(source_table[source_table > 10])

# get the tweets and pass everything to quanteda for later text analysis
x <- searchTwitter("renzi", n=10, lang="it")
str(x)
x20 <- twListToDF(x)
str(x20)
head(x20)
# save it directly as a corpus!
myCorpusTwitter <- corpus(x20)
summary(myCorpusTwitter)
# remove_twitter = TRUE means that, when building the dfm, the Twitter characters @ and # are removed
myDfm3 <- dfm(myCorpusTwitter, remove = stopwords("italian"), remove_punct = TRUE,
              remove_numbers = TRUE, tolower = TRUE, stem = TRUE,
              remove_twitter = TRUE, remove_url = TRUE)
topfeatures(myDfm3, 20)  # 20 top words

# exercise: download 50 tweets about Trump written in English, create a dfm without stemming
# and apply wordfish to them
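# a possible starting point for the exercise: a sketch written against the same (older)
# quanteda API used above, where textmodel_wordfish() is part of quanteda itself (in recent
# versions it has moved to the quanteda.textmodels package)
xT <- searchTwitter("Trump", n=50, lang="en")
trump_df <- twListToDF(xT)
trumpCorpus <- corpus(trump_df)
trumpDfm <- dfm(trumpCorpus, remove = stopwords("english"), remove_punct = TRUE,
                remove_numbers = TRUE, tolower = TRUE, stem = FALSE,
                remove_twitter = TRUE, remove_url = TRUE)
# dir = c(1, 2) anchors the first two tweets; an arbitrary choice for illustration
wf <- textmodel_wordfish(trumpDfm, dir = c(1, 2))
summary(wf)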