rm(list=ls(all=TRUE))
setwd("C:/Users/luigi/Dropbox/TOPIC MODEL")
getwd()

library(quanteda)
library(readtext)
library(caTools)
library(caret)
library(naivebayes)
library(car)
library(ggplot2)
library(dplyr)
library(reshape2)

# The data we will be using are English social media disaster tweets discussed in
# this article: https://arxiv.org/pdf/1705.02009.pdf
# It consists of a number of tweets about accidents mixed with a selection of control tweets (not about accidents)

#####################################################
# FIRST STEP: let's create the DfM for the training-set
#####################################################

x <- read.csv("train_disaster.csv", stringsAsFactors = FALSE)
str(x)

# class-label variable: choose_one (0 = tweets not relevant; 1 = relevant tweets)
table(x$choose_one)
prop.table(table(x$choose_one))
nrow(x)

myCorpusTwitterTrain <- corpus(x)
tok2 <- tokens(myCorpusTwitterTrain, remove_punct = TRUE, remove_numbers = TRUE,
               remove_symbols = TRUE, split_hyphens = TRUE,
               remove_separators = TRUE, remove_url = TRUE)
tok2 <- tokens_remove(tok2, stopwords("en"))
# let's also remove the unicode symbols
tok2 <- tokens_remove(tok2, c("0*"))
tok2 <- tokens_wordstem(tok2)
Dfm_train <- dfm(tok2)

# Let's trim the DfM so as to keep only features that appear in 2 or more tweets (tweets are short texts!)
# and let's keep only features with at least 2 characters
Dfm_train <- dfm_trim(Dfm_train, min_docfreq = 2, verbose = TRUE)
Dfm_train <- dfm_remove(Dfm_train, min_nchar = 2)
topfeatures(Dfm_train, 20)  # 20 top words

# Always check whether, after trimming, some texts are left with just 0s!
# 6 in this case
Dfm_train[ntoken(Dfm_train) == 0, ]
Dfm_train <- Dfm_train[ntoken(Dfm_train) != 0, ]
Dfm_train[ntoken(Dfm_train) == 0, ]

#####################################################
# SECOND STEP: let's create the DfM for the test-set
#####################################################

x10 <- read.csv("test_disaster.csv", stringsAsFactors = FALSE)
str(x10)
nrow(x10)

myCorpusTwitterTest <- corpus(x10)
tok <- tokens(myCorpusTwitterTest, remove_punct = TRUE, remove_numbers = TRUE,
              remove_symbols = TRUE, split_hyphens = TRUE,
              remove_separators = TRUE, remove_url = TRUE)
tok <- tokens_remove(tok, stopwords("en"))
tok <- tokens_remove(tok, c("0*"))
tok <- tokens_wordstem(tok)
Dfm_test <- dfm(tok)
Dfm_test <- dfm_trim(Dfm_test, min_docfreq = 2, verbose = TRUE)
Dfm_test <- dfm_remove(Dfm_test, min_nchar = 2)
topfeatures(Dfm_test, 20)  # 20 top words

# Always check whether, after trimming, some texts are left with just 0s!
# 11 in this case
Dfm_test[ntoken(Dfm_test) == 0, ]
Dfm_test <- Dfm_test[ntoken(Dfm_test) != 0, ]
Dfm_test[ntoken(Dfm_test) == 0, ]
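# OPTIONAL (not in the original workflow): since the train and test pipelines above are identical,
# a hypothetical helper such as build_dfm() would keep the two preprocessing chains from drifting apart.
# This is just a sketch using the same settings as above, not a replacement for the code you just ran.
build_dfm <- function(csv_file) {
  dat <- read.csv(csv_file, stringsAsFactors = FALSE)
  toks <- tokens(corpus(dat), remove_punct = TRUE, remove_numbers = TRUE,
                 remove_symbols = TRUE, split_hyphens = TRUE,
                 remove_separators = TRUE, remove_url = TRUE)
  toks <- tokens_remove(toks, stopwords("en"))
  toks <- tokens_remove(toks, c("0*"))   # unicode leftovers, as above
  toks <- tokens_wordstem(toks)
  out <- dfm(toks)
  out <- dfm_trim(out, min_docfreq = 2, verbose = TRUE)
  out <- dfm_remove(out, min_nchar = 2)
  out[ntoken(out) != 0, ]                # drop texts left with only 0s
}
# usage (equivalent to the two steps above):
# Dfm_train <- build_dfm("train_disaster.csv"); Dfm_test <- build_dfm("test_disaster.csv")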
#####################################################
# THIRD STEP: let's make the features identical between train and test-set by passing Dfm_train
# to dfm_match() as a pattern. After this step, we can "predict" by employing only the features
# included in the training-set. This step is always advisable whenever the training-set and the
# test-set come from two separate datasets
#####################################################

setequal(featnames(Dfm_train), featnames(Dfm_test))
length(Dfm_test@Dimnames$features)
length(Dfm_train@Dimnames$features)
test_dfm <- dfm_match(Dfm_test, features = featnames(Dfm_train))
length(test_dfm@Dimnames$features)
setequal(featnames(Dfm_train), featnames(test_dfm))

#####################################################
# FOURTH STEP: let's convert the two DfMs into matrices for the ML algorithms to work
#####################################################

train <- as.matrix(Dfm_train)
test <- as.matrix(test_dfm)

#####################################################
# FIFTH STEP: let's estimate a ML model
#####################################################

#####################################################
# Let's start with a Naive Bayes model
#####################################################

# We will use the naivebayes package. Another possible package you can consider is the fastNaiveBayes package.
# Given our training-set, we have to use a Multinomial rather than a Bernoulli distribution, given that our
# features can take values other than just 0/1 (i.e., they are raw counts, not a one-hot encoding). Indeed:
table(Dfm_train@x)
# To run a Bernoulli model with naivebayes, just replace "multinomial_naive_bayes" with "bernoulli_naive_bayes" in the command below

# Which is our DV?
str(Dfm_train@docvars$choose_one)
# That's an integer variable. Not good for a classification! It should always be a factor variable,
# so we will use as.factor to transform it into a factor variable.
# Note that instead of y=as.factor(Dfm_train@docvars$choose_one) in the NB call we could of course have used
# y=as.factor(x$choose_one)! However, I would always suggest extracting the DV from the docvars of the DfM,
# especially if you have trimmed it...
system.time(NB <- multinomial_naive_bayes(x = train, y = as.factor(Dfm_train@docvars$choose_one), laplace = 1))
summary(NB)
prop.table(table(Dfm_train@docvars$choose_one))  # prior probabilities

# Let's look at the association between words and probabilities (i.e., the matrix of class-conditional
# parameter estimates - i.e., the likelihood!).
# Take a look at "casualti" and "cream". The likelihood of the former is higher for a tweet discussing a disaster (=1)
# than for an irrelevant tweet (=0), i.e., p(casualti|1) > p(casualti|0); the opposite happens for the word "cream"
head(NB$params)
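# To make the mechanics of the Multinomial NB concrete, here is a minimal sketch (not part of the original
# workflow) that scores a single test tweet by hand: the class score is the log prior plus the word counts
# multiplied by the log class-conditional probabilities stored in NB$params (rows = features, columns = classes).
# It uses the first row of the test matrix; the column order of test matches the row order of NB$params
# because test_dfm was matched to the training features.
prior <- prop.table(table(Dfm_train@docvars$choose_one))   # p(class), from the training labels (ordered "0", "1")
log_scores <- test[1, , drop = FALSE] %*% log(NB$params)   # 1 x 2 matrix: sum of count * log p(word | class)
log_scores <- log_scores + as.numeric(log(prior))          # add the log prior (same class order as the columns)
log_scores
colnames(log_scores)[which.max(log_scores)]                # manually predicted class
predict(NB, test[1, , drop = FALSE])                       # should agree with the package's own prediction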
# Let's investigate this issue a bit more. Why should we care about it?
# Often, ML models are considered "black boxes" due to their complex inner workings. However, because of their complexity,
# they are typically more accurate for predicting nonlinear or rare phenomena. Unfortunately, more accuracy often comes at the
# expense of interpretability, and interpretability is crucial. It is not enough to identify a ML model that optimizes
# predictive performance; understanding and trusting model results is a hallmark of good (social and political) science!
# Luckily, several advancements have been made over the years to aid in interpreting ML models.
# Interpreting ML models is an emerging field that has become known as "Interpretable Machine Learning" (IML).
# Approaches to model interpretability can be broadly categorized as providing global or local explanations.
# Global interpretations help us understand the inputs and their entire modeled relationship with the prediction target.
# Local interpretations help us understand model predictions for a single row of data or a group of similar rows.
# Global interpretability in particular is about understanding how the model makes predictions, based on a holistic view of
# its features and how they influence the underlying model structure.
# It answers questions about which features are relatively influential, as well as how these features influence the response variable.
# In this latter case, we assess the magnitude of a variable's relationship with the response as compared to other variables used in the model.
# In text analytics (given that we are dealing with huge DfMs wherein the features per se are not the main focus of our interest)
# we are mainly interested in global interpretations; however, if you use ML for other aims, local interpretations become VERY important!!!

# So let's do some global interpretation for our NB!
NB_prob <- as.data.frame(NB$params)
NB_prob$Feature <- row.names(NB_prob)
str(NB_prob)

# let's compute, for each feature, the difference between the relevant and irrelevant conditional probabilities
NB_prob$diff <- NB_prob[,2] - NB_prob[,1]
str(NB_prob)
# the features that present the highest absolute value of this difference can also be considered the most important in
# affecting the overall performance of the algorithm
NB_prob$imp <- abs(NB_prob[,2] - NB_prob[,1])
str(NB_prob)
print(head(NB_prob[order(NB_prob$imp, decreasing=TRUE),], 15))    # most relevant words overall

# clearly, the features with the highest positive values of NB_prob$diff are the words
# most relevant for the "disaster" tweets (=1); those with the highest negative values are the words
# most relevant for the "irrelevant" tweets (=0)
print(head(NB_prob[order(NB_prob$diff, decreasing=TRUE),], 15))   # most relevant words for "disaster" tweets
print(head(NB_prob[order(NB_prob$diff, decreasing=FALSE),], 15))  # most relevant words for the other (irrelevant) tweets

NB_prob$sign <- ifelse(NB_prob$diff > 0, "relevant", "irrelevant")
str(NB_prob)

# let's extract the 20 most relevant and the 20 most irrelevant contributing features
NB_prob2 <- top_n(NB_prob, 20, diff)
NB_prob2
NB_prob3 <- top_n(NB_prob, -20, diff)
NB_prob3
NB_prob_new <- rbind(NB_prob2, NB_prob3)
# reorder the features
NB_prob_new <- mutate(NB_prob_new, Feature = reorder(Feature, diff))

ggplot(NB_prob_new, aes(Feature, diff, fill = sign)) +
  geom_col(show.legend = TRUE) +
  theme_classic() +
  theme(legend.position = "bottom") +
  coord_flip() +
  ylab("Difference in the conditional probabilities") +
  scale_fill_manual(values = c("#000033", "#006699")) +
  labs(title = "Disaster tweets",
       subtitle = "Irrelevant (-) versus Relevant (+) words - Naive Bayes Model",
       fill = "Disaster")

# let's FINALLY predict the test-set
predicted_nb <- predict(NB, test)
table(predicted_nb)
prop.table(table(predicted_nb))
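# OPTIONAL (a sketch, not part of the original script): if test_disaster.csv also carries the gold
# "choose_one" label (an assumption; it may not), you can evaluate the predictions with caret.
# The reference labels must be taken from the docvars of test_dfm, because the tweets left empty after
# trimming were dropped, so the original x10 data frame is no longer aligned with the predictions.
if ("choose_one" %in% names(docvars(test_dfm))) {
  truth <- as.factor(docvars(test_dfm)$choose_one)
  print(confusionMatrix(predicted_nb, truth, positive = "1"))
}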