rm(list=ls(all=TRUE))

getwd()
setwd("C:/Users/mw/Dropbox (Personale)/TOPIC MODEL")
getwd()

library(rtweet)
library(ggplot2)
library(dplyr)
library(tidytext)
library(readtext)
library(quanteda)
library(httpuv)
library(maps)
library(leaflet)
library(lattice)
library(cowplot)
library(gridExtra)
library(ggthemes)
library(plyr)
library(syuzhet)
library(igraph)

token <- create_token(
  app = "my_twitter_research_app",
  consumer_key = "XXXXXXXXXXXXXXXXXX",
  consumer_secret = "XXXXXXXXXXXXXXXXXX",
  access_token = "XXXXXXXXXXXXXXXXXX",
  access_secret = "XXXXXXXXXXXXXXXXXX")

get_token()
## check that the token is loaded
identical(token, get_token())

######################################################################################
######################################################################################
############# Recover the latest 3,200 tweets from Donald Trump [as of 6 November 2018, 8:30 am Italian time]
######################################################################################
######################################################################################

tmls <- get_timeline(c("realDonaldTrump"), n = 3200, include_rts = TRUE)
str(tmls)
table(tmls$is_retweet)

#########################
## Passing your results to quanteda
#########################

table(tmls$name)
print(tmls$lang[1:20])
print(tmls$text[1:20])
print(tmls$created_at[1:20])
str(tmls$created_at)

tmls$date <- as.Date(tmls$created_at, "GMT")
str(tmls$date)
tmls$date_number <- as.numeric(tmls$date)
str(tmls$date_number)
tmls$date_factor <- as.factor(tmls$date)
str(tmls$date_factor)
colnames(tmls)

## examine all Twitter activity using weekly intervals
ts_plot(tmls, "weeks")

myCorpusTwitter <- corpus(tmls)
summary(myCorpusTwitter)
head(myCorpusTwitter)
texts(myCorpusTwitter)[1:2]
# number of documents
ndoc(myCorpusTwitter)
# inspect the document-level variables
head(docvars(myCorpusTwitter))

library(stm)

# remove_twitter = TRUE drops the Twitter characters @ and # when building the dfm
myDfm <- dfm(myCorpusTwitter,
             remove = stopwords("english"),
             remove_punct = TRUE,
             remove_numbers = TRUE,
             tolower = TRUE,
             stem = TRUE,
             remove_twitter = TRUE,
             remove_url = TRUE)

topfeatures(myDfm, 20)  # 20 top words
# inspect the document-term matrix for the first 10 documents and first 10 words
myDfm[1:10, 1:10]

# trim rare features from the dfm
myDfm.trim <- dfm_trim(myDfm, min_docfreq = 0.05)

# convert the dfm from quanteda to stm: this is a crucial step!
# note that you also have to list the document variables that are present in the corpus
Dfmstm2 <- convert(myDfm.trim, to = "stm", docvars = docvars(myCorpusTwitter))
summary(warnings())
str(Dfmstm2)
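# Optional sanity check (a minimal sketch, no new packages assumed): compare the size of
# the dfm before and after trimming, and how many documents and features were passed to stm,
# since tweets left with no features can be dropped during the conversion.
dim(myDfm)                  # documents x features before trimming
dim(myDfm.trim)             # documents x features after trimming
length(Dfmstm2$documents)   # number of documents kept for stm
length(Dfmstm2$vocab)       # vocabulary size passed to stm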
###########################
# How many topics (K)?
###########################

K <- c(5:20)
storage <- searchK(Dfmstm2$documents, Dfmstm2$vocab, K,
                   max.em.its = 500,
                   prevalence = ~ date_number,
                   data = Dfmstm2$meta,
                   init.type = "Spectral")
str(storage)

plot(storage$results$semcoh, storage$results$exclus,
     xlab = "Semantic coherence", ylab = "Exclusivity",
     col = "blue", pch = 19, cex = 1, lty = "solid", lwd = 2)
text(storage$results$semcoh, storage$results$exclus,
     labels = storage$results$K, cex = 1, pos = 4)

########## I keep K = 10
stmFitted <- stm(Dfmstm2$documents, Dfmstm2$vocab, K = 10,
                 max.em.its = 75,
                 prevalence = ~ date_number,
                 data = Dfmstm2$meta,
                 init.type = "Spectral")
str(stmFitted)

################################################################
# Interpreting the STM by plotting and inspecting results
################################################################

# Summary visualization: understanding the extracted topics through words and example documents
# First: which word/topic associations?
labelTopics(stmFitted)
plot(stmFitted)
# note that these graphs also show you the frequency distribution of topics across the documents
plot(stmFitted, type = "summary", labeltype = c("frex"))
plot(stmFitted, type = "hist", labeltype = c("frex"))
plot(stmFitted, type = "labels", labeltype = c("frex"))

str(stmFitted)
str(stmFitted$theta)
pca <- as.data.frame(stmFitted$theta)
str(pca)
barplot(apply(pca, 2, mean), main = "Topic Distribution", xlab = "", col = c("red"))

################################################################
# Additional tools for interpretation and visualization
################################################################

# Use the topic correlation graph estimated by topicCorr() and the igraph package to plot a network
# where nodes are topics and edges indicate a positive correlation.
# If there are no edges, there are only negative correlations!
library(igraph)
mod.out.corr <- topicCorr(stmFitted)
str(mod.out.corr)
mod.out.corr

set.seed(12345)
par(mar = c(0, 0, 0, 0))
plot(mod.out.corr,
     vertex.color = "grey",
     vertex.label.color = "black",  # change color of labels
     vertex.label.cex = .75,        # change size of labels to 75% of original size
     edge.curved = .25,             # add a 25% curve to the edges
     edge.color = "grey20",         # change edge color to grey
     edge.width = 3,                # increase the size of the edges
     vertex.shape = "square")
legend("topleft",
       legend = c("Topic 1", "Topic 2", "Topic 3", "Topic 4", "Topic 5",
                  "Topic 6", "Topic 7", "Topic 8", "Topic 9", "Topic 10"),
       cex = 0.8)

#########################################
# Estimating topic prevalence
#########################################

prep <- estimateEffect(1:10 ~ date_number, stmFitted, meta = Dfmstm2$meta, uncertainty = "Global")
plot(prep, "date_number", xaxt = "n", printlegend = FALSE,
     method = "continuous", topics = 1, model = stmFitted)
title("Topic 1")
abline(h = 0, col = "blue")
# label the x-axis with actual dates (axis.Date() expects Date values, not a factor)
axis.Date(side = 1, tmls$date, format = "%d/%m/%Y")
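# Optional: inspect a few representative tweets per topic with stm's findThoughts() and plotQuote().
# A minimal sketch, assuming no tweets were dropped when the trimmed dfm was converted to the stm
# format (otherwise texts(myCorpusTwitter) must first be subset to the documents used in the model).
thoughts1 <- findThoughts(stmFitted, texts = texts(myCorpusTwitter), n = 3, topics = 1)
plotQuote(thoughts1$docs[[1]], width = 60, main = "Topic 1")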
#########################
## Applying dictionaries
#########################

# Let's apply other dictionaries!

# save your results as a csv file
write_as_csv(tmls, "twitter.csv", prepend_ids = TRUE, na = "", fileEncoding = "UTF-8")
# and then re-open it
x <- read.csv("twitter.csv")
str(x)
x$text <- as.character(x$text)
str(x)

# Another general dictionary: the NRC sentiment and emotion lexicon (via syuzhet)
nrc_data <- get_nrc_sentiment(x$text, language = "english")
str(nrc_data)

# relative share of negative and positive sentiment
colSums(prop.table(nrc_data[, 9:10]))

# plot the share of negative and positive sentiment
barplot(
  sort(colSums(prop.table(nrc_data[, 9:10]))),
  horiz = TRUE,
  cex.names = 0.7,
  las = 1,
  main = "Sentiment in Sample text",
  xlab = "Percentage"
)

# plot the share of the eight emotions
barplot(
  sort(colSums(prop.table(nrc_data[, 1:8]))),
  horiz = TRUE,
  cex.names = 0.7,
  las = 1,
  main = "Emotions in Sample text",
  xlab = "Percentage"
)

# collapse the tweets associated with each emotion into one document per emotion
all = c(
  paste(x$text[nrc_data$anger > 0], collapse = " "),
  paste(x$text[nrc_data$anticipation > 0], collapse = " "),
  paste(x$text[nrc_data$disgust > 0], collapse = " "),
  paste(x$text[nrc_data$fear > 0], collapse = " "),
  paste(x$text[nrc_data$joy > 0], collapse = " "),
  paste(x$text[nrc_data$sadness > 0], collapse = " "),
  paste(x$text[nrc_data$surprise > 0], collapse = " "),
  paste(x$text[nrc_data$trust > 0], collapse = " ")
)

# clean the text
library(tm)
library(wordcloud)

# function to make the text suitable for analysis
clean.text = function(x) {
  # to lower case
  x = tolower(x)
  # remove rt (note: this also strips "rt" inside words)
  x = gsub("rt", "", x)
  # remove @mentions
  x = gsub("@\\w+", "", x)
  # remove punctuation
  x = gsub("[[:punct:]]", "", x)
  # remove numbers
  x = gsub("[[:digit:]]", "", x)
  # remove http links
  x = gsub("http\\w+", "", x)
  # remove tabs and runs of blanks
  x = gsub("[ |\t]{2,}", "", x)
  # remove blank spaces at the beginning
  x = gsub("^ ", "", x)
  # remove blank spaces at the end
  x = gsub(" $", "", x)
  return(x)
}

all = clean.text(all)
# remove stop-words
all = removeWords(all, c(stopwords("english"), 'Trump', "trump"))
# create corpus
corpus = Corpus(VectorSource(all))
# create term-document matrix
tdm = TermDocumentMatrix(corpus)
# convert to matrix
tdm = as.matrix(tdm)
# add column names
colnames(tdm) = c('anger', 'anticipation', 'disgust', 'fear',
                  'joy', 'sadness', 'surprise', 'trust')

# Plot comparison wordcloud
layout(matrix(c(1, 2), nrow = 2), heights = c(1, 4))
par(mar = rep(0, 4))
plot.new()
text(x = 0.5, y = 0.5, 'Emotion Comparison Word Cloud')
comparison.cloud(tdm, random.order = FALSE,
                 colors = c("#00B2FF", "red", "#FF0099", "#6600CC",
                            "green", "orange", "blue", "brown"),
                 title.size = 1.5, max.words = 250)

##########################
##########################
# create the left-right economic scale
##########################
##########################

dictfile <- tempfile()
download.file("https://provalisresearch.com/Download/LaverGarry.zip", dictfile, mode = "wb")
unzip(dictfile, exdir = (td <- tempdir()))
lgdict <- dictionary(file = paste(td, "LaverGarry.cat", sep = "/"))
str(lgdict)
lgdict

# define the comparison corpus of recent inaugural speeches before applying the dictionary
recentCorpus <- corpus_subset(data_corpus_inaugural, Year > 1991)
lg_dfm <- dfm(recentCorpus, dictionary = lgdict)
lg_dfm

# extract the "More State" and "Less State" word lists from the Laver-Garry dictionary
x <- unlist(lgdict, recursive = FALSE)
str(x)
more_state <- as.data.frame(x[5])
str(more_state)
colnames(more_state)[1] <- "words"
more_state$words2 <- as.character(more_state$words)
str(more_state)
less_state <- as.data.frame(x[7])
str(less_state)
colnames(less_state)[1] <- "words"
less_state$words2 <- as.character(less_state$words)
str(less_state)

######################################
# Eco scale
######################################

eco <- dictionary(list(less_state = less_state$words2, more_state = more_state$words2))
eco
byPresMat <- dfm(recentCorpus, dictionary = eco)
byPresMat

# apply the full Laver-Garry dictionary to the Trump tweets
Trump_corpus <- corpus(tmls)
Trump_dfm <- dfm(Trump_corpus, dictionary = lgdict)
Trump_dfm
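# Raw dictionary counts depend on tweet length. As an optional robustness check (a minimal
# sketch, assuming a quanteda version in which dfm_weight() accepts scheme = "prop"), the
# counts can be expressed as within-tweet proportions before comparing categories.
Trump_dfm_prop <- dfm_weight(Trump_dfm, scheme = "prop")
head(convert(Trump_dfm_prop, to = "data.frame"))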
# Let's focus on the categories "More State" and "Less State"
Dictionary <- convert(Trump_dfm, to = "data.frame")
colnames(Dictionary)
names(Dictionary)[6] <- "More_State"
names(Dictionary)[8] <- "Less_State"
colnames(Dictionary)

Dictionary$pro_market <- Dictionary$Less_State - Dictionary$More_State
table(Dictionary$pro_market)
Dictionary$date <- tmls$date_factor

# get daily summaries of the results (pro-market score and number of tweets)
daily <- ddply(Dictionary, ~ date, summarize,
               num_tweets = length(pro_market),
               ave_promarket = mean(pro_market))
str(daily)

# correlation between the volume of discussion and the pro-market score
cor(daily$ave_promarket, daily$num_tweets)

# plot the daily pro-market score vs. volume
pro_market <- ggplot(daily, aes(x = date, y = ave_promarket, group = 1)) +
  geom_line(linetype = "dashed", colour = "red") +
  ggtitle("Pro-Market Trump") +
  xlab("Day") +
  ylab("Pro-Market") +
  theme_gray(base_size = 12)

volume <- ggplot(daily, aes(x = date, y = num_tweets, group = 1)) +
  geom_line() +
  ggtitle("Trump tweets per day") +
  xlab("Day") +
  ylab("Volume") +
  theme_gray(base_size = 12)

grid.arrange(pro_market, volume, ncol = 1)
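# Optionally save the combined figure to disk: a minimal sketch using arrangeGrob() from
# gridExtra (already loaded) together with ggsave(); the file name and size are illustrative.
g <- arrangeGrob(pro_market, volume, ncol = 1)
ggsave("trump_promarket_volume.png", g, width = 8, height = 6)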