rm(list=ls(all=TRUE))
setwd("C:/Users/luigi/Dropbox/TOPIC MODEL")
getwd()

library(quanteda)
library(readtext)
library(caTools)
library(e1071)
library(randomForest)
library(caret)
library(naivebayes)
library(car)
library(ggplot2)
library(dplyr)
library(reshape2)
library(iml)
library(future)
library(future.callr)
library(gridExtra)
library(xgboost)
library(Ckmeans.1d.dp)
library(callr)

# The data we will be using are English social media disaster tweets discussed in this article:
# https://arxiv.org/pdf/1705.02009.pdf
# The corpus consists of a number of tweets about accidents, mixed with a selection of control tweets (not about accidents).

#####################################################
# FIRST STEP: let's create the DfM for the training-set
#####################################################
x <- read.csv("train_disaster.csv", stringsAsFactors=FALSE)
str(x)
# class-label variable: choose_one (0 = irrelevant tweets; 1 = relevant tweets)
table(x$choose_one)
prop.table(table(x$choose_one))
nrow(x)

myCorpusTwitterTrain <- corpus(x)
Dfm_train <- dfm(myCorpusTwitterTrain, remove = c(stopwords("english")), remove_punct = TRUE,
                 remove_numbers = TRUE, tolower = TRUE, remove_symbols = TRUE,
                 remove_separators = TRUE, remove_url = TRUE, split_hyphens = TRUE)

# Let's trim the DfM in order to keep only tokens that appear in 2 or more tweets (tweets are short texts!),
# and let's keep only features with at least 2 characters
Dfm_train <- dfm_trim(Dfm_train, min_docfreq = 2, verbose = TRUE)
Dfm_train <- dfm_remove(Dfm_train, min_nchar = 2)
topfeatures(Dfm_train, 20)  # 20 top words

#####################################################
# SECOND STEP: let's create the DfM for the test-set
#####################################################
x10 <- read.csv("test_disaster.csv", stringsAsFactors=FALSE)
str(x10)
nrow(x10)
myCorpusTwitterTest <- corpus(x10)
Dfm_test <- dfm(myCorpusTwitterTest, remove = c(stopwords("english")), remove_punct = TRUE,
                remove_numbers = TRUE, tolower = TRUE, remove_symbols = TRUE,
                remove_separators = TRUE, remove_url = TRUE, split_hyphens = TRUE)
Dfm_test <- dfm_trim(Dfm_test, min_docfreq = 2, verbose = TRUE)
Dfm_test <- dfm_remove(Dfm_test, min_nchar = 2)
topfeatures(Dfm_test, 20)  # 20 top words

#####################################################
# THIRD STEP: let's make the features identical between the train and test-set by passing Dfm_train
# to dfm_match() as a pattern. After this step, we can "predict" by employing only the features
# included in the training-set.
#####################################################
setequal(featnames(Dfm_train), featnames(Dfm_test))
length(Dfm_test@Dimnames$features)
length(Dfm_train@Dimnames$features)
test_dfm <- dfm_match(Dfm_test, features = featnames(Dfm_train))
length(test_dfm@Dimnames$features)
setequal(featnames(Dfm_train), featnames(test_dfm))

#####################################################
# FOURTH STEP: let's convert the two DfMs into matrices for the ML algorithms to work
#####################################################
train <- as.matrix(Dfm_train)  # dense matrix
object.size(train)
# a compressed sparse matrix!
trainSP <- as(Dfm_train, "dgCMatrix")  # compressed matrix
object.size(trainSP)
object.size(train)/object.size(trainSP)
# This is going to make a HUGE difference when you have large train and test matrices!
# We will mainly use normal (i.e., not compressed) matrices in our class, given that we will always work with small datasets.
# But keep this in mind!
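# A quick aside before we also convert the test-set: quanteda's sparsity() reports the share of zero cells
# in a dfm (usually very high for short texts like tweets), which is why the compressed format saves so much memory.
# A minimal check:
sparsity(Dfm_train)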
test <- as.matrix(test_dfm)

#####################################################
# FIFTH STEP: let's estimate a ML model
#####################################################

#####################################################
# Let's start with a Naive Bayes model
#####################################################
# We will use the naivebayes package. Another possible package you can consider is the fastNaiveBayes package.
# Given our training-set, we have to use a Multinomial rather than a Bernoulli distribution, given that our
# features can assume values other than just 0/1 (i.e., counts rather than a one-hot encoding). Indeed:
table(Dfm_train@x)
# To run a Bernoulli model with naivebayes, just replace "multinomial_naive_bayes" with "bernoulli_naive_bayes"
# in the command below (a minimal sketch is shown just before the Global Interpretation step).

# Which is our DV?
str(Dfm_train@docvars$choose_one)
# That's an integer variable. Not good for a classification! The DV should always be a factor variable,
# so we will use as.factor to transform it into one.
# Note that instead of y=as.factor(Dfm_train@docvars$choose_one) in the formula of the NB we could have used
# y=as.factor(x$choose_one) of course!
set.seed(123)  # define a seed to be able to replicate the results!
system.time(NB <- multinomial_naive_bayes(x=train, y=as.factor(Dfm_train@docvars$choose_one), laplace = 1))
summary(NB)
prop.table(table(Dfm_train@docvars$choose_one))  # prior probabilities
# Let's look at the association between words and probabilities (i.e., the matrix with class conditional parameter estimates).
# Take a look at "video" and "cream": the former increases the probability of a relevant tweet compared to the latter.
head(NB$params)

# Let's investigate this issue a bit more. Why should we care about it?
# Often, ML models are considered "black boxes" due to their complex inner workings. However, because of their complexity,
# they are typically more accurate for predicting nonlinear, faint, or rare phenomena. Unfortunately, more accuracy often comes at the
# expense of interpretability, and interpretability is crucial. It is not enough to identify a machine learning model that optimizes
# predictive performance; understanding and trusting model results is a hallmark of good science!
# Luckily, several advancements have been made over the years to aid in interpreting ML models.
# Interpreting ML models is an emerging field that has become known as "Interpretable Machine Learning" (IML).
# Approaches to model interpretability can be broadly categorized as providing global or local explanations.
# Global interpretations help us understand the inputs and their entire modeled relationship with the prediction target.
# Local interpretations help us understand model predictions for a single row of data or a group of similar rows.
# Global interpretability is about understanding how the model makes predictions, based on a holistic view of its features and how they
# influence the underlying model structure. It answers questions regarding which features are relatively influential, as well as how these
# features influence the response variable. In this latter case, we assess the magnitude of a variable's relationship with the response
# as compared to other variables used in the model.
# In text analytics (given that we are dealing with huge DfMs wherein the features per se are not the main focus of our interest)
# we are mainly interested in global interpretations; however, if you use ML for other aims, local interpretations become VERY important!
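# Before turning to interpretation: a minimal sketch of the Bernoulli variant mentioned above.
# This is only an illustration (not part of our workflow) and assumes you first binarise the dfm,
# since bernoulli_naive_bayes() expects 0/1 features.
# train_bin <- as.matrix(dfm_weight(Dfm_train, scheme = "boolean"))
# NB_bin <- bernoulli_naive_bayes(x = train_bin,
#                                 y = as.factor(Dfm_train@docvars$choose_one),
#                                 laplace = 1)
# summary(NB_bin)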
# So let's do some Global Interpretation for our NB!
NB_prob <- as.data.frame(NB$params)
NB_prob$Feature <- row.names(NB_prob)
str(NB_prob)
# let's identify the features with the largest difference between the relevant and irrelevant conditional probabilities
NB_prob$diff <- NB_prob[,2]-NB_prob[,1]
str(NB_prob)
print(head(NB_prob[order(NB_prob$diff, decreasing=TRUE),], 15))   # relevant words
print(head(NB_prob[order(NB_prob$diff, decreasing=FALSE),], 15))  # irrelevant words
NB_prob$sign <- ifelse(NB_prob$diff>0, "relevant", "irrelevant")
str(NB_prob)
# let's extract the 20 most irrelevant and the 20 most relevant contributing features
NB_prob2 <- top_n(NB_prob, 20, diff)
NB_prob2
NB_prob3 <- top_n(NB_prob, -20, diff)
NB_prob3
NB_prob_new <- rbind(NB_prob2, NB_prob3)
# reorder the features
NB_prob_new <- mutate(NB_prob_new, Feature = reorder(Feature, diff))
ggplot(NB_prob_new, aes(Feature, diff, fill = sign)) +
  geom_col(show.legend = F) +
  coord_flip() +
  ylab("Difference in the conditional probabilities") +
  scale_fill_manual(values = c("orange", "blue")) +
  labs(title = "Social Disaster tweets", subtitle = "Irrelevant (-) versus Relevant (+) words - NB")
# let's store the graph
NB_graph <- ggplot(NB_prob_new, aes(Feature, diff, fill = sign)) +
  geom_col(show.legend = F) +
  coord_flip() +
  ylab("Difference in the conditional probabilities") +
  scale_fill_manual(values = c("orange", "blue")) +
  labs(title = "Social Disaster tweets", subtitle = "Irrelevant (-) versus Relevant (+) words - NB")

# let's FINALLY predict the test-set
predicted_nb <- predict(NB, test)
table(predicted_nb)
prop.table(table(predicted_nb))

#####################################################
# Let's run a Random Forest
#####################################################
# We will use the randomForest package. If you are employing compressed matrices, please use the ranger package
# (a minimal ranger sketch appears right after the OOB-error comparison below); ranger is also usually faster than randomForest.
# Note that as hyperparameter (or tuning parameter) I select a specific value for the number of trees.
# A RF also has other hyperparameters. More on this later on.
set.seed(123)  # define a seed to be able to replicate the results!
system.time(RF <- randomForest(y = as.factor(Dfm_train@docvars$choose_one), x = train,
                               importance = TRUE, do.trace = TRUE, ntree = 500))
# A natural benefit of the bootstrap resampling process is that random forests have an out-of-bag (OOB) sample
# that provides a reasonable approximation of the test error. This gives you a built-in validation set without any extra work.
# However, REMEMBER: this is less efficient than tuning several hyperparameters via k-fold cross-validation, as we will see later on!
str(RF$err.rate)
RF$err.rate[nrow(RF$err.rate),]  # our final error (overall, and for each of the two classes: irrelevant and relevant tweets)
plot(RF)  # errors for the two classes (red = irrelevant; green = relevant: we make a much larger error for the latter class); the average error is in black
error <- as.data.frame(RF$err.rate)
# number of trees with the lowest OOB error
which.min(error$OOB)  # 72
set.seed(123)
system.time(RF2 <- randomForest(y = as.factor(Dfm_train@docvars$choose_one), x = train,
                                importance = TRUE, ntree = 72, do.trace = TRUE))
RF$err.rate[nrow(RF$err.rate),]    # final error of RF (overall, and for each of the two classes)
RF2$err.rate[nrow(RF2$err.rate),]  # final error of RF2 (overall, and for each of the two classes)
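# As mentioned at the start of this RF section, with a compressed dgCMatrix (such as trainSP created above)
# you could use ranger instead of randomForest. A minimal sketch, NOT run here and only as an illustration
# (it assumes the ranger package is installed, in a recent version offering the x/y interface):
# library(ranger)
# RF_ranger <- ranger(x = trainSP, y = as.factor(Dfm_train@docvars$choose_one),
#                     num.trees = 500, importance = "impurity", seed = 123)
# RF_ranger$prediction.error  # OOB prediction error, comparable to RF$err.rate above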
# Now let's do some Global Interpretation for our RF!
# What about the importance of each feature for our trained model?
# Variable importance is a standard metric in machine learning: it indicates the amount of information a variable provides
# to the model for predicting the outcome.
head(RF$importance[,3:4])
# let's plot the result
varImpPlot(RF)
# Each feature's importance is assessed based on two criteria:
# - MeanDecreaseAccuracy: gives a rough estimate of the loss in prediction performance when that particular variable is omitted from the training set.
#   Caveat: if two variables are somewhat redundant, then omitting one of them may not lead to massive gains in prediction performance,
#   but would make the second variable more important.
# - MeanDecreaseGini: GINI is a measure of node impurity. Think of it like this: if you use this feature to split the data, how pure will the nodes be?
#   Highest purity means that each node contains only elements of a single class.
#   Assessing the decrease in GINI when that feature is omitted leads to an understanding of how important that feature is to split the data correctly.
# Please note that these measures are used to rank variables in terms of importance; their absolute values can therefore be disregarded.

# The problem is that we only get, for example, the overall GINI statistics, without differentiating the words most important for specific classes.
# Which are the most important words for the relevant label? And for the irrelevant one?
# let's extract the matrix for GINI and Accuracy
importance_RF <- as.data.frame(RF$importance[,3:4])
str(importance_RF)
importance_RF$Feature <- row.names(importance_RF)
str(importance_RF)
# same words we get with varImpPlot(RF)
print(head(importance_RF[order(importance_RF$MeanDecreaseGini, decreasing=TRUE),]))
# Let's predict the training-set texts and store a new variable with these predictions in the docvars of our training-set dfm.
# Why that?
# Because we will use this variable to identify the "sign" of the most important features.
predicted_rf <- predict(RF, train, type="class")
table(predicted_rf)
Dfm_train@docvars$predRF <- predicted_rf
table(Dfm_train@docvars$predRF)
# NOTE: no perfect prediction of the training-set
table(Dfm_train@docvars$choose_one, Dfm_train@docvars$predRF)
# Let's assign a feature to the relevant/irrelevant class according to its frequency (if, for example, the word "collapse" appears
# 10 times among predicted relevant tweets and 5 times among predicted irrelevant tweets,
# we classify it as pertaining to the "relevant tweets" world; and so on).
# Let's count the number of times a word appears in the tweets that we classified as accident-related or not, as above
sums <- list()
for (v in 0:1){
  sums[[v+1]] <- colSums(train[Dfm_train@docvars[,"predRF"]==v,])
}
sums <- do.call(cbind, sums)
sums
# let's compute the sign (either 1 or 2) according to whether a word appears more often in the tweets classified as not accident-related or otherwise
sign <- apply(sums, 1, which.max)
# get the feature names
names <- dimnames(train)[[2]]
str(names)
df <- data.frame(
  Feature = names,
  sign = sign-1,  # let's recode the sign between 0 and 1
  stringsAsFactors=F)
str(df)
importanceRF <- merge(importance_RF, df, by="Feature")
str(importanceRF)
## best predictors
for (v in 0:1){
  cat("\n\n")
  cat("value==", v)
  importanceRF <- importanceRF[order(importanceRF$MeanDecreaseGini, decreasing=TRUE),]
  print(head(importanceRF[importanceRF$sign==v,], n=10))
  cat("\n")
  cat(paste(unique(head(importanceRF$Feature[importanceRF$sign==v], n=10)), collapse=", "))
}
# let's draw a graph with our results, focusing on MeanDecreaseGini
importanceRF$Gini <- ifelse(importanceRF$sign>0, importanceRF$MeanDecreaseGini, importanceRF$MeanDecreaseGini*-1)
# the top 20 relevant words
importance2RF <- top_n(importanceRF, 20, Gini)
importance2RF
# the top 20 irrelevant words
importance3RF <- top_n(importanceRF, -20, Gini)
importance3RF
importance_newRF <- rbind(importance2RF, importance3RF)
str(importance_newRF)
# reorder the features
importance_newRF <- mutate(importance_newRF, Feature = reorder(Feature, Gini))
importance_newRF$sign2 <- ifelse(importance_newRF$sign>0, "relevant", "irrelevant")
ggplot(importance_newRF, aes(Feature, Gini, fill = sign2)) +
  geom_col(show.legend = F) +
  coord_flip() +
  ylab("Mean Decrease Gini (we recoded as negative the values for the irrelevant features)") +
  scale_fill_manual(values = c("orange", "blue")) +
  labs(title = "Social Disaster tweets", subtitle = "Irrelevant (-) versus Relevant (+) features - RF")
# let's store the graph
RF_graph <- ggplot(importance_newRF, aes(Feature, Gini, fill = sign2)) +
  geom_col(show.legend = F) +
  coord_flip() +
  ylab("Mean Decrease Gini (we recoded as negative the values for the irrelevant features)") +
  scale_fill_manual(values = c("orange", "blue")) +
  labs(title = "Social Disaster tweets", subtitle = "Irrelevant (-) versus Relevant (+) features - RF")

#### And with 3 categories? Try to find that out by yourself!

# let's FINALLY predict the test-set
system.time(predicted_rf <- predict(RF, test, type="class"))
table(predicted_rf)
prop.table(table(predicted_rf))

#####################################################
# Let's run a SVM
#####################################################
# Note that here I select a linear kernel (in my experience a linear kernel does fine with text data)
# and, as hyperparameter, a specific value for the cost C (=1). An SVM also has other hyperparameters.
# More on this later on.
set.seed(123)  # define a seed to be able to replicate the results!
system.time(SV <- svm(y = as.factor(Dfm_train@docvars$choose_one), x = train, kernel = 'linear', cost = 1))
# how many support vectors?
length(SV$index)
nrow(train)  # 254 support vectors out of 400 tweets in the training-set data frame
head(SV$coefs)

# Now let's do some Global Interpretation for our SVM!
# The coefficients that you're pulling out are the weights for the support vectors.
# Looking at the estimated coefficients is not that informative by itself, because they only tell us which support vectors were estimated in the model.
# But we can get a sense of which observations are more "important", or "separate" the data better, by extracting the support vectors
# from the data matrix together with their corresponding coefficients (times the training labels). Note that this only works with linear kernels.

# Let's predict the training-set texts and store a new variable with these predictions in the docvars of our training-set dfm.
# Why that? Because we will use this variable to identify the "sign" of the most important features.
predicted_sv <- predict(SV, train, type="class")
table(predicted_sv)
Dfm_train@docvars$predSV <- predicted_sv
table(Dfm_train@docvars$predSV)
# NOTE: no perfect prediction of the training-set (that's a good thing for the permutation method we will employ below!)
table(Dfm_train@docvars$choose_one, Dfm_train@docvars$predSV)
# let's identify the 254 support vectors (and their corresponding texts!)
str(x)
df_vector <- data.frame(
  vector = x$text[SV$index],
  coef = SV$coefs,
  relevant = predicted_sv[SV$index],
  stringsAsFactors = F
)
str(df_vector)
# irrelevant tweets (among the support vectors)
df_vector <- df_vector[order(df_vector$coef, decreasing=TRUE),]
head(df_vector[,c("coef", "relevant", "vector")], n=10)
# relevant tweets (among the support vectors)
df_vector <- df_vector[order(df_vector$coef),]
head(df_vector[,c("coef", "relevant", "vector")], n=10)

# Let's now focus on identifying the most relevant features for the SVM. For doing that we will rely on a "permutation" approach.
# What do we mean by permutation? Basically, we do the following: we randomly permute the real values of each single feature
# in the dfm. A feature is "important" if permuting its values increases the model error, because in that case the model relied on the feature
# for the prediction. The idea is that if we randomly permute the values of an important feature in the training data, the training performance
# will degrade (since permuting the values of a feature effectively destroys any relationship between that feature and the target variable).
# A feature is "unimportant" if permuting its values leaves the model error unchanged, because in that case the model ignored the
# feature for the prediction. (Note: the model is NOT refit to the training data after randomly permuting the values of a feature.)

# To run a permutation, you first need to convert the DfM into a data frame, not a matrix (the iml package requires that)
train2 <- convert(Dfm_train, to = "data.frame")
# iml does not want features that begin with @, #, etc.
# The command make.names creates syntactically valid feature names:
# a valid name consists of letters, numbers and the dot or underscore characters, and starts with a letter or with a dot not followed
# by a number. Features such as ".2way" are not valid, for example.
# The character "X" is prepended if necessary. All invalid characters are translated to ".".
# A missing value is translated to "NA".
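# A tiny illustration of what make.names() does to typical Twitter-style features
# (purely illustrative; run it to see how hashtags, handles and leading digits are handled):
make.names(c("#earthquake", "@cnn", ".2way", "fire"))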
colnames(train2) <- make.names(colnames(train2))
colnames(train2)
# let's rerun the SVM with the data frame
set.seed(123)
svm_model <- svm(as.factor(Dfm_train@docvars$choose_one) ~ ., data = train2[-1], kernel = 'linear', cost = 1)
# Why train2[-1]? Because the first column in train2 refers to doc_id, not to a feature!
head(colnames(train2))
# let's create an object that holds the model and the data
mod <- Predictor$new(svm_model, data = train2[-1], y = as.factor(Dfm_train@docvars$choose_one), type = "prob")
# plan("callr", workers = 6) allows you to run the permutation by taking advantage of the cores in your computer
# (I have 6 cores, for example, therefore workers=6; if you have 4 cores write workers=4, etc.).
# This is so-called "parallelization": the computation per feature is distributed among several cores of your machine.
# This can save you LOTS of time!
system.time({
  plan("callr", workers = 6)
  set.seed(123)
  imp2 <- FeatureImp$new(mod, loss = "ce", n.repetitions=1)
})
# Any feature with a value > 1 is important for estimating the model.
# Why larger than 1? Because feature importance is estimated as a ratio (that's the default in iml): error.permutation/error.orig.
# If you get a value > 1 it means that the error increases when a given feature is permuted, therefore that
# feature is important! As an alternative you can also estimate feature importance as a difference: error.permutation - error.orig.
# You can also repeat the analysis (i.e., the shuffling of the features) X times (for example: n.repetitions=5).
# The higher the number of repetitions, the more stable and accurate the results become (plus: you will get a confidence interval as well!).
# However, in our case, to save time we just asked for 1 repetition.
imp2
res <- imp2$results[,c(1,3)]
str(res)

##########################################
# OK, we now know which features are important. But what about their relationship with the class labels?
# Let's estimate that for both the relevant and the irrelevant class labels
##########################################
# let's replicate make.names here so that both train and train2 present the same list of features!
colnames(train) <- make.names(colnames(train))
train2 <- as.data.frame(train)
train3 <- train2[SV$index,]  # let's extract from the data frame only the support vectors!
train4 <- as.matrix(train3)
Dfm_train2 <- Dfm_train@docvars$predSV[SV$index]  # let's extract from the DV only the support vectors!
Dfm_train2
# let's replicate exactly what we already did for the RF case
sums <- list()
for (v in 0:1){
  sums[[v+1]] <- colSums(train4[Dfm_train2==v,], na.rm = TRUE)
}
sums <- do.call(cbind, sums)
sign <- apply(sums, 1, which.max)
# get the feature names
names <- dimnames(train)[[2]]
str(names)
df <- data.frame(
  feature = names,
  sign = sign-1,
  stringsAsFactors=F)
str(df)
str(res)
importance <- merge(res, df, by="feature")
str(importance)
## best predictors
for (v in 0:1){
  cat("\n\n")
  cat("value==", v)
  importance <- importance[order(importance$importance, decreasing=TRUE),]
  print(head(importance[importance$sign==v,], n=10))
  cat("\n")
  cat(paste(unique(head(importance$feature[importance$sign==v], n=10)), collapse=", "))
}
str(importance)
# let's draw a graph with our results!
importance$perm <- ifelse(importance$sign>0, importance$importance, importance$importance*-1)
str(importance)
# the top 10 relevant words
importance2 <- top_n(importance, 10, perm)
importance2
# the top 10 irrelevant words
importance3 <- top_n(importance, -10, perm)
importance3
importance_new <- rbind(importance2, importance3)
str(importance_new)
# reorder the features
importance_new <- mutate(importance_new, feature = reorder(feature, perm))
importance_new$sign2 <- ifelse(importance_new$sign>0, "Relevant", "Not Relevant")
importance_new <- mutate(importance_new, feature = reorder(feature, perm))
ggplot(importance_new, aes(feature, perm, fill = sign2)) +
  geom_col(show.legend = F) +
  coord_flip() +
  ylab("Permutation results (we recoded as negative the values for the irrelevant features)") +
  scale_fill_manual(values = c("orange", "blue")) +
  labs(title = "Social Disaster tweets", subtitle = "Irrelevant (-) versus Relevant (+) features - SVM")
# let's store the graph
SVM_graph <- ggplot(importance_new, aes(feature, perm, fill = sign2)) +
  geom_col(show.legend = F) +
  coord_flip() +
  ylab("Permutation results (we recoded as negative the values for the irrelevant features)") +
  scale_fill_manual(values = c("orange", "blue")) +
  labs(title = "Social Disaster tweets", subtitle = "Irrelevant (-) versus Relevant (+) features - SVM")

# let's FINALLY predict the test-set
system.time(predicted_svm <- predict(SV, test))
table(predicted_svm)
prop.table(table(predicted_svm))

# P.S. How does an SVM work with multi-category classes? In one-vs-one SVM (as in our example),
# each classifier is trained to distinguish one class from another.
# For M classes you have M(M-1)/2 combinations, which is also the number of resulting classifiers.
# For example, with 3 classes we get 3 classifiers: class1-vs-class2, class1-vs-class3, class2-vs-class3.
# Practically, in each case you pretend that only 2 classes exist, and you train a classifier to distinguish between them.
# During the prediction phase, you choose the class with a majority vote among all the classifiers.

#####################################################
# Let's run a Gradient Boosting
#####################################################
# Let's first re-create the matrix as it was before the renaming we did for the permutation step above
# (i.e., before colnames(train) <- make.names(colnames(train)))
train <- as.matrix(Dfm_train)  # dense matrix
# You always need to add the number of classes to be classified in the formula if the DV is a multi-categorical variable.
# In the present case we have just two classes (irrelevant & relevant), so we do not need to care.
# Other situations could be different (as we will see).
numberOfClasses <- length(unique(Dfm_train@docvars$choose_one))
numberOfClasses
# Your DV should always be numeric and start from 0. If that is not the case, you need to create such a variable.
# In our case it is already a numeric variable starting from 0, so we do not need to bother about it.
# You write objective = "binary:logistic" if you have just two classes, as in our example.
# Logistic regression for binary classification returns the predicted probability (not the class).
# Otherwise use objective = "multi:softmax": multiclass classification using the softmax objective, which returns the predicted class (not probabilities).
# As discussed above, in that case you also need to set an additional num_class parameter defining the number of unique classes.
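# A purely hypothetical sketch of such a multiclass call (NOT run here, since our outcome is binary;
# "labels_3classes" is a made-up 0/1/2 vector used only for illustration):
# xgb_multi <- xgboost(data = train,
#                      label = labels_3classes,
#                      nrounds = 500, eta = 1, nthread = 4,
#                      objective = "multi:softmax",
#                      num_class = 3,              # number of unique classes
#                      eval_metric = "merror",     # multiclass classification error rate
#                      verbose = 0)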
# Alternatively, you can use objective = "multi:softprob": same as softmax, but it returns the predicted probability of each data point belonging to each class.

# There are several hyperparameters in a GBC, as we have discussed (back to this when we discuss the grid-search).
# Here I select a specific value for the number of trees ("nrounds") as well as for "eta".
# eta controls how quickly the algorithm proceeds down the gradient descent:
# smaller values reduce the chance of overfitting but also increase the time needed to find the optimal fit (the default is 0.3).
# NOTE: "nthread" (number of threads) implements a parallelization on your computer, and this makes everything faster!
set.seed(123)
system.time(xgb.fit1 <- xgboost(
  data = train,
  label = Dfm_train@docvars$choose_one,
  nrounds = 500,
  eta = 1,
  nthread = 4,
  objective = "binary:logistic",  # for binary outcomes
  eval_metric = "error",          # binary classification error rate
  verbose = 1                     # not silent; if you want it silent, set verbose = 0
))
print(head(xgb.fit1))

# Let's do some Global Interpretation for our Gradient Boosting model!
# compute the feature importance matrix
importance_matrix <- xgb.importance(model = xgb.fit1)
head(importance_matrix)
# Gain: the relative contribution of the corresponding feature to the model, calculated by taking each feature's contribution
# for each tree in the model. In more detail: at each split in each tree, xgboost computes the improvement in the split criterion
# (i.e., the reduction in the classification error). It then averages the improvement made by each variable across all the trees in which the variable is used.
# The variables with the largest average decrease in the classification error are considered most important.
# Cover: the relative number of observations related to this feature. For example, suppose you have 100 observations, 4 features and 3 trees,
# and suppose feature1 is used to decide the leaf node for 10, 5, and 2 observations in tree1, tree2 and tree3 respectively;
# then the metric will count cover for this feature as 10+5+2 = 17 observations. This is calculated for all 4 features, and the cover is
# expressed as a percentage over all features' cover metrics.
# Frequency: the percentage representing the relative number of times a particular feature occurs in the trees of the model.
# In the above example, if feature1 occurred in 2 splits, 1 split and 3 splits in tree1, tree2 and tree3 respectively, then the weight for feature1
# will be 2+1+3 = 6. The frequency for feature1 is calculated as its percentage weight over the weights of all features.

# let's focus on GAIN (as we generally always do)
importance <- importance_matrix[order(importance_matrix$Gain, decreasing=TRUE),]
head(importance, n=20)
# The problem here is that we just get the GAIN statistics without differentiating the words most important for the classes.
# For example: which are the most important words for the relevant label? And for the irrelevant one?
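# Side note: a stock, Gain-based importance plot also comes with xgboost (this is likely why Ckmeans.1d.dp
# was loaded in the preamble: xgb.ggplot.importance uses it to cluster the bars). A minimal sketch:
xgb.ggplot.importance(importance_matrix, top_n = 20)
# Like the table above, however, this plot does not tell us which class each word is associated with.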
# So let's improve on that, following the same path we used for RF and SVM!
predicted_xgb <- round(predict(xgb.fit1, train))
table(predicted_xgb)
Dfm_train@docvars$predXGB <- predicted_xgb
sums <- list()
for (v in 0:1){
  sums[[v+1]] <- colSums(train[Dfm_train@docvars[,"predXGB"]==v,])
}
sums <- do.call(cbind, sums)
sign <- apply(sums, 1, which.max)
str(sign)
# get the feature names
names <- dimnames(train)[[2]]
str(names)
df <- data.frame(
  Feature = names,
  sign = sign-1,
  stringsAsFactors=F)
str(df)
importance <- merge(importance_matrix, df, by="Feature")
str(importance)
## best predictors
for (v in 0:1){
  cat("\n\n")
  cat("value==", v)
  importance <- importance[order(importance$Gain, decreasing=TRUE),]
  print(head(importance[importance$sign==v,], n=10))
  cat("\n")
  cat(paste(unique(head(importance$Feature[importance$sign==v], n=10)), collapse=", "))
}
# Let's draw a graph!
str(importance)
importance$Gain_OK <- ifelse(importance$sign>0, importance$Gain, importance$Gain*-1)
importance2 <- top_n(importance, 20, Gain_OK)
importance2
importance3 <- top_n(importance, -20, Gain_OK)
importance3
importance_new <- rbind(importance2, importance3)
str(importance_new)
# reorder the features
importance_new <- mutate(importance_new, Feature = reorder(Feature, Gain_OK))
importance_new$sign2 <- ifelse(importance_new$sign>0, "relevant", "irrelevant")
importance_new <- mutate(importance_new, Feature = reorder(Feature, Gain_OK))
ggplot(importance_new, aes(Feature, Gain_OK, fill = sign2)) +
  geom_col(show.legend = F) +
  coord_flip() +
  ylab("Gain (we recoded as negative the values for the irrelevant features)") +
  scale_fill_manual(values = c("orange", "blue")) +
  labs(title = "Social Disaster tweets", subtitle = "Irrelevant (-) versus Relevant (+) words - XGB")
# let's store the graph
XGB_graph <- ggplot(importance_new, aes(Feature, Gain_OK, fill = sign2)) +
  geom_col(show.legend = F) +
  coord_flip() +
  ylab("Gain (we recoded as negative the values for the irrelevant features)") +
  scale_fill_manual(values = c("orange", "blue")) +
  labs(title = "Social Disaster tweets", subtitle = "Irrelevant (-) versus Relevant (+) words - XGB")

# let's FINALLY predict the test data
predicted_xgb <- predict(xgb.fit1, test)
table(predicted_xgb)
# What's that??? Remember: logistic regression for binary classification in XGB returns the predicted probability (not the class).
# Therefore let's use "round"
predicted_xgb <- round(predict(xgb.fit1, test))
table(predicted_xgb)
prop.table(table(predicted_xgb))

######################################################
######################################################
# Let's compare the out-of-sample results we got via Naive Bayes, RF, SVM & XGB
######################################################
######################################################
prop.table(table(predicted_nb))
prop.table(table(predicted_svm))
prop.table(table(predicted_rf))
prop.table(table(predicted_xgb))
results <- as.data.frame(rbind(prop.table(table(predicted_nb)), prop.table(table(predicted_rf)),
                               prop.table(table(predicted_svm)), prop.table(table(predicted_xgb))))
str(results)
results$algorithm <- c("NB", "RF", "SVM", "XGB")
str(results)
# Let's plot the results!
df.long <- melt(results, id.vars=c("algorithm"))
str(df.long)
ggplot(df.long, aes(algorithm, value, fill = variable)) +
  geom_bar(position="dodge", stat="identity") +
  theme(axis.text.x = element_text(color="#993333", size=10, angle=90)) +
  coord_flip() +
  ylab(label="Tweet class in the test-set") +
  xlab("algorithm") +
  scale_fill_discrete(name = "Prediction", labels = c("Irrelevant", "Relevant"))

# let's plot the figures of feature importance
NB_graph
RF_graph
library(gridExtra)
grid.arrange(NB_graph, RF_graph, SVM_graph, XGB_graph, nrow=2)  # plot everything together

# And so, which result should we trust more than the others? To answer that, we need to introduce the Cross-Validation procedure!
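# Until then, one quick descriptive check we can already run (a minimal sketch, not a substitute for cross-validation):
# how often do two of the classifiers agree on the very same test tweets? For example, NB versus RF:
table(NB = predicted_nb, RF = predicted_rf)
mean(as.character(predicted_nb) == as.character(predicted_rf))  # share of test tweets on which NB and RF agree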