rm(list=ls(all=TRUE))
setwd("C:/Users/luigi/Dropbox/TOPIC MODEL")
getwd()

library(quanteda)
library(readtext)
library(caTools)
library(e1071)
library(randomForest)
library(caret)
library(naivebayes)
library(car)
library(ggplot2)
library(dplyr)
library(reshape2)
library(iml)
library(future)
library(future.callr)
library(gridExtra)

# The data we will be using are some English social media disaster tweets discussed in
# this article: https://arxiv.org/pdf/1705.02009.pdf
# It consists of a number of tweets regarding accidents mixed in with a selection of control tweets (not about accidents)

#####################################################
# FIRST STEP: let's create the DfM for the training-set
#####################################################

x <- read.csv("train_disaster.csv", stringsAsFactors=FALSE)
str(x)
# class-label variable: choose_one (0=tweets not relevant; 1=relevant tweets)
table(x$choose_one)
prop.table(table(x$choose_one))
nrow(x)

myCorpusTwitterTrain <- corpus(x)
tok2 <- tokens(myCorpusTwitterTrain, remove_punct = TRUE, remove_numbers = TRUE,
               remove_symbols = TRUE, split_hyphens = TRUE,
               remove_separators = TRUE, remove_url = TRUE)
tok2 <- tokens_remove(tok2, stopwords("en"))
# let's also remove the unicode symbols
tok2 <- tokens_remove(tok2, c("0*"))
tok2 <- tokens_wordstem(tok2)
Dfm_train <- dfm(tok2)

# Let's trim the dfm in order to keep only features that appear in 2 or more tweets (tweets are short texts!)
# and let's keep only features with at least 2 characters
Dfm_train <- dfm_trim(Dfm_train, min_docfreq = 2, verbose = TRUE)
Dfm_train <- dfm_remove(Dfm_train, min_nchar = 2)
topfeatures(Dfm_train, 20)  # 20 top words

#####################################################
# SECOND STEP: let's create the DfM for the test-set
#####################################################

x10 <- read.csv("test_disaster.csv", stringsAsFactors=FALSE)
str(x10)
nrow(x10)

myCorpusTwitterTest <- corpus(x10)
tok <- tokens(myCorpusTwitterTest, remove_punct = TRUE, remove_numbers = TRUE,
              remove_symbols = TRUE, split_hyphens = TRUE,
              remove_separators = TRUE, remove_url = TRUE)
tok <- tokens_remove(tok, stopwords("en"))
tok <- tokens_remove(tok, c("0*"))
tok <- tokens_wordstem(tok)
Dfm_test <- dfm(tok)
Dfm_test <- dfm_trim(Dfm_test, min_docfreq = 2, verbose = TRUE)
Dfm_test <- dfm_remove(Dfm_test, min_nchar = 2)
topfeatures(Dfm_test, 20)  # 20 top words
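# Side note (just a sketch, not required for what follows): since the training-set and the test-set must go through
# exactly the same preprocessing, you could also wrap the shared steps above in a small helper function and apply it
# to both corpora. "preprocess_tokens" is a hypothetical name; the steps are the same ones already used above.
preprocess_tokens <- function(corp) {
  tk <- tokens(corp, remove_punct = TRUE, remove_numbers = TRUE, remove_symbols = TRUE,
               split_hyphens = TRUE, remove_separators = TRUE, remove_url = TRUE)
  tk <- tokens_remove(tk, stopwords("en"))
  tk <- tokens_remove(tk, c("0*"))  # unicode symbols, as above
  tokens_wordstem(tk)
}
# e.g.: dfm_remove(dfm_trim(dfm(preprocess_tokens(myCorpusTwitterTest)), min_docfreq = 2, verbose = TRUE), min_nchar = 2)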
#####################################################
# THIRD STEP: let's make the features identical between train and test-set
# by passing Dfm_train to dfm_match() as a pattern.
# After this step, we can "predict" by employing only the features included in the training-set
#####################################################

setequal(featnames(Dfm_train), featnames(Dfm_test))
length(Dfm_test@Dimnames$features)
length(Dfm_train@Dimnames$features)
test_dfm <- dfm_match(Dfm_test, features = featnames(Dfm_train))
length(test_dfm@Dimnames$features)
setequal(featnames(Dfm_train), featnames(test_dfm))

#####################################################
# FOURTH STEP: let's convert the two DfMs into matrices for the ML algorithms to work
#####################################################

train <- as.matrix(Dfm_train)
test <- as.matrix(test_dfm)

#####################################################
# FIFTH STEP: let's estimate a ML model
#####################################################

#####################################################
# let's run a Random Forest
#####################################################

# we will use the randomForest package. If you are employing compressed (sparse) matrices (not our case here), please use the ranger package.
# The ranger package is also usually faster than the randomForest one.
# Note that as hyperparameter (or tuning-parameter) I select a specific value for the number of trees. A RF also has other hyperparameters.
# More on this next week!

set.seed(123)  # define a set.seed to be able to replicate the results!
system.time(RF <- randomForest(y = as.factor(Dfm_train@docvars$choose_one), x = train,
                               importance = TRUE, do.trace = TRUE, ntree = 500))
RF

# As discussed, a natural benefit of the bootstrap resampling process is that random forests have an out-of-bag (OOB) sample
# that provides a reasonable approximation of the test error. This gives you a built-in validation set without any extra work on your part.
# However, REMEMBER: this is less thorough than tuning several hyperparameters via k-fold cross-validation, as we will see later!

str(RF$err.rate)
RF$err.rate[nrow(RF$err.rate),]  # our final error (overall, and for each of the two classes: irrelevant and relevant tweets)

plot(RF, type = "l", col = c("red", "black", "springgreen4"),
     main = "Class errors in Random Forest Model / OOB")
legend("topright", horiz = F, cex = 0.7,
       fill = c("black", "red", "springgreen4"),
       c("No-disaster tweets", "Average error", "Disaster tweets"))

error <- as.data.frame(RF$err.rate)
# number of trees with the lowest OOB error
which.min(error$OOB)  # 123

set.seed(123)
system.time(RF2 <- randomForest(y = as.factor(Dfm_train@docvars$choose_one), x = train,
                                importance = TRUE, ntree = 123, do.trace = TRUE))
RF$err.rate[nrow(RF$err.rate),]   # our final error (overall, and for each of the two classes)
RF2$err.rate[nrow(RF2$err.rate),] # our final error (overall, and for each of the two classes)

# Now let's do some Global Interpretation for our RF!
# What about the importance of each feature for our trained model?
# Variable importance is a standard metric in machine learning: it indicates the amount of information a variable provides
# to the model for predicting the outcome

head(RF$importance[,3:4])  # let's grab the results
varImpPlot(RF)

# Each feature's importance is assessed based on two criteria (see also the small sketch right after this list):
# -MeanDecreaseAccuracy: gives a rough estimate of the loss in prediction performance when the information of that particular variable
#  is removed from the training set. Caveat: if two variables are somewhat redundant, then removing one of them may not lead to a massive
#  drop in prediction performance, but it would make the second variable look more important.
# -MeanDecreaseGini: GINI is a measure of node impurity. Think of it like this: if you use this feature to split the data,
#  how pure will the resulting nodes be? Highest purity means that each node contains only elements of a single class.
#  The larger the total decrease in GINI produced by the splits on a feature, the more important that feature is to split the data correctly.
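# To get an intuition for "node impurity", here is the Gini impurity formula (1 minus the sum of the squared
# class proportions) computed by hand for two hypothetical nodes (just an illustration, not part of the model above):
gini <- function(p) 1 - sum(p^2)
gini(c(0.5, 0.5))  # 0.5  -> maximally impure node: the two classes are perfectly mixed
gini(c(0.9, 0.1))  # 0.18 -> much purer node: almost all tweets in the node belong to one class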
# let's extract the matrix for GINI and Accuracy
importance_RF <- as.data.frame(RF$importance[,3:4])
str(importance_RF)
importance_RF$Feature <- row.names(importance_RF)
str(importance_RF)

# same words we get with varImpPlot(RF)
print(head(importance_RF[order(importance_RF$MeanDecreaseGini, decreasing=TRUE),]))

# Note that these measures are used to rank variables in terms of importance and, thus, their absolute values can be disregarded.
# As a result, we just get, for example, the GINI statistics overall, without differentiating the words most important for specific classes.
# Which are, however, the most important words for the relevant label? And for the irrelevant one?

# let's predict the training-set texts and let's store a new variable with such predictions in the docvars of our training-set dfm.
# Why that? Because we will use such a variable to identify the "sign" of the most important features
predicted_rf <- predict(RF, train, type="class")
table(predicted_rf)
Dfm_train@docvars$predRF <- predicted_rf
str(Dfm_train@docvars)

# let's assign a feature to the relevant/irrelevant (0/1) class according to its frequency (if, for example, the word "collapse" appears
# 10 times among predicted relevant tweets and 5 times among predicted irrelevant tweets,
# we classify it as pertaining to the "relevant tweets" world; and so on)

# let's estimate the number of times a word appears in those tweets that we have classified as related to accidents or otherwise
sums <- list()
for (v in 0:1){
  sums[[v+1]] <- colSums(train[Dfm_train@docvars[,"predRF"]==v,])
}
sums <- do.call(cbind, sums)
sums

# let's assign the sign (either 1 or 2) according to whether a word appears more often in tweets classified as not related to accidents (1) or related to them (2)
sign <- apply(sums, 1, which.max)

# get the feature names
names <- dimnames(train)[[2]]
str(names)

df <- data.frame(
  Feature = names,
  sign = sign - 1,  # let's recode the sign between 0 and 1
  stringsAsFactors = FALSE)
str(df)

importanceRF <- merge(importance_RF, df, by="Feature")
str(importanceRF)

## best predictors
for (v in 0:1){
  cat("\n\n")
  cat("value==", v)
  importanceRF <- importanceRF[order(importanceRF$MeanDecreaseGini, decreasing=TRUE),]
  print(head(importanceRF[importanceRF$sign==v,], n=10))
  cat("\n")
  cat(paste(unique(head(importanceRF$Feature[importanceRF$sign==v], n=10)), collapse=", "))
}

# let's draw a graph with our results focusing on MeanDecreaseGini
importanceRF$Gini <- ifelse(importanceRF$sign>0, importanceRF$MeanDecreaseGini, importanceRF$MeanDecreaseGini*-1)

# the top 20 relevant words
importance2RF <- top_n(importanceRF, 20, Gini)
importance2RF
# the top 20 irrelevant words
importance3RF <- top_n(importanceRF, -20, Gini)
importance3RF
importance_newRF <- rbind(importance2RF, importance3RF)
str(importance_newRF)

# reorder the features
importance_newRF <- mutate(importance_newRF, Feature = reorder(Feature, Gini))
importance_newRF$sign2 <- ifelse(importance_newRF$sign>0, "relevant", "irrelevant")
ggplot(importance_newRF, aes(Feature, Gini, fill = sign2)) +
  geom_bar(stat="identity", fill= "white") +
  theme_classic() +
  geom_col(show.legend = TRUE) +
  theme(legend.position="bottom") +
  coord_flip() +
  ylab("Mean Decrease Gini coefficient") +
  scale_fill_manual(values = c("#000033", "#006699")) +
  labs(title = "Disaster tweets",
       subtitle = "Irrelevant (-) versus Relevant (+) words - Random Forest Model",
       fill = "Disaster")

# let's store the graph
RF_graph <- ggplot(importance_newRF, aes(Feature, Gini, fill = sign2)) +
  geom_bar(stat="identity", fill= "white") +
  theme_classic() +
  geom_col(show.legend = TRUE) +
  theme(legend.position="bottom") +
  coord_flip() +
  ylab("Mean Decrease Gini coefficient") +
  scale_fill_manual(values = c("#000033", "#006699")) +
  labs(title = "Disaster tweets",
       subtitle = "Irrelevant (-) versus Relevant (+) words - Random Forest Model",
       fill = "Disaster")

# let's FINALLY predict the test-set
system.time(predicted_rf <- predict(RF, test, type="class"))
table(predicted_rf)
prop.table(table(predicted_rf))

#####################################################
# let's run a SVM
#####################################################

# note that here I select a linear kernel (in my experience a linear kernel does fine with text data)
# and, as hyperparameter, a specific value for the cost C (=1). A SVM also has other hyperparameters. More on this later on
system.time(SV <- svm(y = as.factor(Dfm_train@docvars$choose_one), x = train, kernel = 'linear', cost = 1))

# how many support vectors?
length(SV$index)
nrow(train)  # 244 out of 400 tweets in the training-set data-frame

# The coefficients that you pull out are the weights for the support vectors.
# Looking at the estimated coefficients is, however, not that informative, because they only tell us which support vectors were estimated in the model,
# i.e., which are the texts with the largest weight in our estimation
head(SV$coefs)

# let's read such texts, however
vectors <- x[SV$index,]
nrow(vectors)
str(vectors)  # you see, for example, that tweets 2 to 4 are missing because they are not support vectors!
vectors$coefs <- SV$coefs[,1]
str(vectors)
vectors <- vectors[order(vectors$coefs),]
head(vectors[,c("coefs", "text")], n=10)
vectors <- vectors[order(vectors$coefs, decreasing=TRUE),]
head(vectors[,c("coefs", "text")], n=10)

# So now let's do some more proper Global Interpretation for our SVM!
# In particular, let's now focus on identifying the most relevant features for the SVM. For doing that we will rely on a "permutational" approach.
# What do we mean by permutation? Basically, what we do is the following: you randomly permute the real values of each single feature
# in your dfm. A feature is "important" if permuting its values increases the model error, because the model relied on the feature for
# the prediction. That is: if we randomly permute the values of an important feature in the training data, the training performance
# would degrade (since permuting the values of a feature effectively destroys any relationship between that feature and the target variable).
# This gives us a nice interpretation: feature importance is the increase in model error when the feature's information is destroyed.
# A feature is "unimportant" if permuting its values leaves the model error unchanged, because the model ignored the
# feature for the prediction.
# Note that permutation feature importance does not require retraining the model. Some other methods suggest deleting a feature, retraining the model
# and then comparing the model errors. Since retraining a machine learning model can take a long time, "only" permuting a feature can save a lot
# of time. Moreover, we are interested in the feature importance of a fixed model:
# re-training with a reduced dataset creates each time a different model than the one we are interested in.
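# Before turning to the iml package below, here is a minimal hand-rolled sketch of the permutation idea just described,
# applied to the SVM we already trained (SV): shuffle one column of the training matrix, re-predict, and check how much
# the in-sample accuracy drops. The feature chosen here is simply the most frequent one, purely for illustration.
baseline_acc <- mean(predict(SV, train) == Dfm_train@docvars$choose_one)
f <- names(topfeatures(Dfm_train, 1))        # any feature would do
train_perm <- train
set.seed(123)
train_perm[, f] <- sample(train_perm[, f])   # destroy the relationship between this feature and the target
perm_acc <- mean(predict(SV, train_perm) == Dfm_train@docvars$choose_one)
baseline_acc - perm_acc  # a positive drop suggests the model was relying on this feature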
# Permuting a feature and measuring the increase in loss is not the only way to measure the importance of a feature.
# An alternative to permutation feature importance are variance-based measures. Here you focus not on the error of the model,
# but on how much the model's output varies for a feature, without considering what that means for performance.
# This definition of importance differs from the loss-based definition used for permutation feature importance.
# This is evident in cases where a model has overfit.
# If a model overfits and uses a feature that is unrelated to the output, then permutation feature importance
# would assign an importance of zero, because this feature does not contribute to producing correct predictions.
# A variance-based importance measure, on the other hand, might assign the feature high importance, as the prediction can change a lot
# when the feature is changed. Model variance (explained by the features) and feature importance correlate strongly
# when the model generalizes well (i.e. it does not overfit). If you are interested in variance-based measures, you can take a look
# at the shapper and fastshap packages.

# For running a permutation, you first need to convert the DfM into a data frame, not into a matrix (the iml package requires that!)
# Moreover, you also need to pay attention to your DV if it is a string. For example, if you have two class labels such as "relevant" and "not-relevant",
# you need to change the latter to "nonrelevant" (i.e., no space, "-" or anything else) before transforming it into a factor.
# Otherwise the iml package will crash. I discovered it once, after spending 2 hours on iml!!!
# Not our problem here!
as.factor(Dfm_train@docvars$choose_one)

train2 <- convert(Dfm_train, to = "data.frame")

# iml does not want features that begin with @, #, etc.
# the command make.names allows us to automatically place the character "X" at the beginning of a feature in such cases,
# while all "invalid" characters are translated to "."
# By adding unique=TRUE, the two words @UKIP and #UKIP will be treated differently!
str(train2)
colnames(train2) <- make.names(colnames(train2), unique = TRUE)
colnames(train2)
str(train2)

# let's rerun the SVM with the data frame (NOT with the original matrix)
svm_model <- svm(as.factor(Dfm_train@docvars$choose_one) ~ ., data = train2[-1], kernel = 'linear', cost = 1)
# why train2[-1]? Because the first column in train2 refers to doc_id, not to a feature!
head(colnames(train2))

# let's create an object that holds the model and the data
mod <- Predictor$new(svm_model, data = train2[-1], y = as.factor(Dfm_train@docvars$choose_one), type = "prob")

# plan("callr", workers = 6) allows you to run the permutation by taking advantage of 6 cores in your computer.
# If you have 4 cores write workers=4, etc.
# This is so-called "parallelization". It means that the computation per feature is distributed among several cores of your machine.
# This can save you LOTS of time!
system.time({
  plan("callr", workers = 6)
  set.seed(123)
  imp2 <- FeatureImp$new(mod, loss = "ce", n.repetitions = 1)
})  # around 25 secs on my laptop

# any feature with a value > 1 is important for the model.
# Why larger than 1? Because feature importance is estimated as a ratio (that's the default in iml).
# If you get a value > 1 it means that the error increases if a given feature is permuted, and therefore that
# feature is important! As an alternative, you can also estimate feature importance as a difference.
# You can also repeat the analysis (i.e. the shuffling of the feature) X times (for example: n.repetitions=5).
# The higher the number of repetitions, the more stable and accurate the results become (plus: you will get a confidence interval as well!)
# However, in our case, to save time we just asked for 1 repetition (see the sketch below for the alternatives).
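# A sketch of the two variations just mentioned (slower, so left commented out): importance computed as a *difference*
# in error rather than a ratio, and 5 repetitions of the shuffling so that you also get a confidence band around
# each estimate. This assumes the "compare" argument of iml's FeatureImp (check ?FeatureImp in your iml version).
# system.time({
#   plan("callr", workers = 6)
#   set.seed(123)
#   imp_diff <- FeatureImp$new(mod, loss = "ce", compare = "difference", n.repetitions = 5)
# })
# plot(imp_diff)  # features with a difference > 0 increase the error when permuted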
imp2
res <- imp2$results[,c(1,3)]
str(res)

##########################################
# OK, we know which features are important now. But what about their relationship with the class-labels?
# let's estimate that for both class labels
##########################################

# let's replicate make.names here so that both train and train2 present the same list of features!
colnames(train) <- make.names(colnames(train))

# let's replicate exactly what we already did in the RF case.
# As in the Random Forest example, let's predict the training-set texts and let's store a new variable with such predictions
# in the docvars of our training-set dfm
predicted_sv <- predict(svm_model, train, type="class")
table(predicted_sv)
Dfm_train@docvars$predSV <- predicted_sv
str(Dfm_train@docvars)

sums <- list()
for (v in 0:1){
  sums[[v+1]] <- colSums(train[Dfm_train@docvars[,"predSV"]==v,])
}
sums <- do.call(cbind, sums)
sign <- apply(sums, 1, which.max)

# get the feature names
names <- dimnames(train)[[2]]
str(names)

df <- data.frame(
  feature = names,
  sign = sign - 1,
  stringsAsFactors = FALSE)
str(df)
str(res)

importance <- merge(res, df, by="feature")
str(importance)

## best predictors
for (v in 0:1){
  cat("\n\n")
  cat("value==", v)
  importance <- importance[order(importance$importance, decreasing=TRUE),]
  print(head(importance[importance$sign==v,], n=10))
  cat("\n")
  cat(paste(unique(head(importance$feature[importance$sign==v], n=10)), collapse=", "))
}

str(importance)
importanceSV <- importance

# let's draw a graph with our results!
importance$perm <- ifelse(importance$sign>0, importance$importance, importance$importance*-1)
str(importance)

# the top 10 relevant words
importance2 <- top_n(importance, 10, perm)
importance2
# the top 10 irrelevant words
importance3 <- top_n(importance, -10, perm)
importance3
importance_new <- rbind(importance2, importance3)
str(importance_new)

# reorder the features
importance_new <- mutate(importance_new, feature = reorder(feature, perm))
importance_new$sign2 <- ifelse(importance_new$sign>0, "Relevant", "Not Relevant")

ggplot(importance_new, aes(feature, perm, fill = sign2)) +
  geom_bar(stat="identity", fill= "white") +
  theme_classic() +
  geom_col(show.legend = TRUE) +
  theme(legend.position="bottom") +
  coord_flip() +
  ylab("Permutation Coefficient") +
  scale_fill_manual(values = c("#000033", "#006699")) +
  labs(title = "Disaster tweets",
       subtitle = "Irrelevant (-) versus Relevant (+) words - SVM Model",
       fill = "Disaster")

# let's store the graph
SVM_graph <- ggplot(importance_new, aes(feature, perm, fill = sign2)) +
  geom_bar(stat="identity", fill= "white") +
  theme_classic() +
  geom_col(show.legend = TRUE) +
  theme(legend.position="bottom") +
  coord_flip() +
  ylab("Permutation Coefficient") +
  scale_fill_manual(values = c("#000033", "#006699")) +
  labs(title = "Disaster tweets",
       subtitle = "Irrelevant (-) versus Relevant (+) words - SVM Model",
       fill = "Disaster")

# let's FINALLY predict the test-set
system.time(predicted_svm <- predict(SV, test))
table(predicted_svm)
prop.table(table(predicted_svm))

######################################################
######################################################
# Let's compare the out-of-sample results we got via SVM & RF
######################################################
######################################################

prop.table(table(predicted_svm))
prop.table(table(predicted_rf))

results <- as.data.frame(rbind(prop.table(table(predicted_rf)), prop.table(table(predicted_svm))))
str(results)
results$algorithm <- c("RF", "SVM")
str(results)

# Let's plot the results!
library(reshape2)
df.long <- melt(results, id.vars=c("algorithm"))
str(df.long)

ggplot(df.long, aes(algorithm, value, fill = variable)) +
  geom_bar(position="dodge", stat="identity") +
  theme(axis.text.x = element_text(color="#993333", size=10, angle=90)) +
  coord_flip() +
  ylab(label="Tweet class in the test-set") +
  xlab("algorithm") +
  scale_fill_discrete(name = "Prediction", labels = c("Irrelevant", "Relevant"))

# Similar, but not exactly the same, results for the prediction!
# And so, which result should we trust more? To answer that, we need to introduce the Cross-Validation procedure...

#################################################
# let's compare feature importance across models
#################################################
library(gridExtra)
grid.arrange(RF_graph, SVM_graph, nrow=2)  # plot everything together

# let's check some correlation
str(importanceRF)
str(importanceSV)
names(importanceSV)[1] <- "Feature"
str(importanceSV)

# let's merge the two data frames.
# Here the total number of words decreases, given that for the SVM words you have applied (remember!) the make.names command.
# Therefore we are not comparing the results of the two models on an exactly equal footing
words <- merge(importanceRF, importanceSV, by="Feature")
str(words)

library(PerformanceAnalytics)
chart.Correlation(words[c(3,6)])
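# A minimal preview of the k-fold cross-validation idea mentioned above (covered properly in the next session).
# This is just a sketch, assuming caret's train() interface, with 5 folds, a single value of mtry and few trees,
# simply to keep the runtime low.
set.seed(123)
cv_ctrl <- trainControl(method = "cv", number = 5)
rf_cv <- caret::train(x = train, y = as.factor(Dfm_train@docvars$choose_one),
                      method = "rf", trControl = cv_ctrl,
                      tuneGrid = data.frame(mtry = floor(sqrt(ncol(train)))),
                      ntree = 100)
rf_cv  # the cross-validated accuracy gives a more honest estimate than in-sample predictions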