rm(list=ls(all=TRUE))
setwd("C:/Users/luigi/Dropbox/TOPIC MODEL")
getwd()
library(quanteda)
library(readtext)
library(caTools)
library(e1071)
library(randomForest)
library(caret)
library(naivebayes)
library(car)
library(ggplot2)
library(dplyr)
library(reshape2)
library(iml)
library(future)
library(future.callr)
library(gridExtra)

# We will use once again the English social media disaster set of tweets discussed in the previous LAB

#####################################################
# FIRST STEP: let's create the DfM for the training-set
#####################################################

x <- read.csv("train_disaster.csv", stringsAsFactors=FALSE)
myCorpusTwitterTrain <- corpus(x)
tok2 <- tokens(myCorpusTwitterTrain , remove_punct = TRUE, remove_numbers=TRUE, remove_symbols = TRUE, split_hyphens = TRUE, remove_separators = TRUE)
tok2 <- tokens_remove(tok2, stopwords("en"))
tok2 <- tokens_remove(tok2, c("0*"))
tok2 <- tokens_wordstem (tok2)
Dfm_train <- dfm(tok2)
Dfm_train <- dfm_trim(Dfm_train , min_docfreq = 2, verbose=TRUE)
Dfm_train  <- dfm_remove(Dfm_train , min_nchar = 2)

#####################################################
# SECOND STEP: let's create the DfM for the test-set
#####################################################

x10 <- read.csv("test_disaster.csv", stringsAsFactors=FALSE)
myCorpusTwitterTest <- corpus(x10)
tok <- tokens(myCorpusTwitterTest , remove_punct = TRUE, remove_numbers=TRUE, remove_symbols = TRUE, split_hyphens = TRUE, remove_separators = TRUE)
tok <- tokens_remove(tok, stopwords("en"))
tok <- tokens_remove(tok, c("0*"))
tok <- tokens_wordstem (tok)
Dfm_test <- dfm(tok)
Dfm_test<- dfm_trim(Dfm_test, min_docfreq = 2, verbose=TRUE)
Dfm_test<- dfm_remove(Dfm_test, min_nchar = 2)

# Here problems with 11 texts!
Dfm_test[ntoken(Dfm_test) == 0,]
# So let's delete such texts!
Dfm_test <- Dfm_test[ntoken(Dfm_test) != 0,]
Dfm_test[ntoken(Dfm_test) == 0,]

#####################################################
# THIRD STEP: Let's make the features identical between train and test-set by passing Dfm_train to dfm_match() as a pattern.
#####################################################

setequal(featnames(Dfm_train), featnames(Dfm_test)) 
nfeat(Dfm_test)
nfeat(Dfm_train)
test_dfm  <- dfm_match(Dfm_test, features = featnames(Dfm_train))
nfeat(test_dfm) 
setequal(featnames(Dfm_train), featnames(test_dfm ))

#####################################################
# FOURTH STEP: Let's convert the two DfMs into matrices for the ML algorithms to work
#####################################################

train <- as.matrix(Dfm_train)
test <- as.matrix(test_dfm)

#####################################################
# let's estimate a SVM
#####################################################

# Note that here I select a linear kernel (in my experience a linear kernel is doing fine with texts data)
# and, as hyperameter, a specific value for the cost C(=1)- A SVM has also other hyperparameters that we can tune. More on this later on. 
# By default, the svm command scales data internally (both x and y variables) to zero mean and unit variance. Why?
# SVM is based on distance. Hence, if we do not standardize our variables to comparable ranges, the variable with the largest range will 
# completely dominate in the computation of the kernel matrix. For example, we have two variables - X1 and X2. 
# Values of variable X1 lies between 0 and 100 whereas values of X2 lies in range of 100 and 10000. 
# In this case, variable X2 would dominate variable X1. If you wanto to avoid the scaling, you should add in the command below: "scale=FALSE".
# Finally note that we can also compute our SVM working with a data frame, rather than with a matrix.

system.time(SV <- svm(y= as.factor(Dfm_train@docvars$choose_one), x=train, kernel='linear', cost = 1))

# let's predict the test-set 
system.time(predicted_svm <- predict(SV , test))
table(predicted_svm )
prop.table(table(predicted_svm ))

# how many supporting vectors are in our training-set?
length(SV$index) 
nrow(train) # 243 out of 400 tweets in the training-set data-frame 

# The coefficients that you're pulling out are the weights for the support vectors. 
# Looking at the estimated coefficients is not however so informative because they only tell us what support vectors were estimated in the model
# i.e., which are the texts with the largest weight in our estimation
head(SV$coefs)

# let's read however such texts
vectors <- x[SV$index,]
nrow(vectors)
str(vectors) # you see that you do not have tweets from 2 to 4 cause they are not supporting vectors!
vectors$coefs <- SV$coefs[,1]
str(vectors)

# tweets about incidents
vectors<- vectors[order(vectors$coef),]
head(vectors[,c("coefs", "text", "choose_one")], n=10)

# tweets not about incidents
vectors<- vectors[order(vectors$coef, decreasing=TRUE),]
head(vectors[,c("coefs", "text", "choose_one")], n=10)

# So now let's do some proper Global Interpretation for our SVM.
# Also SVM does not come with a model-specific VI. Therefore let'apply our usual permutational approach via iml package

# Remember: the iml package requires:
# 1) a data frame, not a matrix, on which computing the prediction of the model (but you have run the previous model with a matrix)
# 2) as a result of 1), a specific function that allows you to move from the original matrix to a data frame
# 3) iml does NOT want features that begin with @, #, etc. when you use a data.frame (as it is the case with the social media of our corpus)
# 4) as a result of 2), you also need to recompute the SVM results with the converted data frame created at point 3). Of course the outcome
# of the model under 4) is going to be exactly the same as your original SVM above (after all you just replace the @ in front of a word 
# with a X!)

# Let's convert the dfm of the training set into a data frame and: a) let's apply to it make.names; b) let's remove its first columns
# (that refers  to doc_id not to a feature!)

trainDF <- convert(Dfm_train, to="data.frame")
head(colnames(trainDF))
trainDF <- trainDF[,-c(1)]
head(colnames(trainDF))
colnames(trainDF) <- make.names(colnames(trainDF), unique = TRUE)

# let's create a "cleaned" matrix out of this "cleaned" data.matrix
train2 <- as.matrix(trainDF)

# let's re-estimate our SVM with this "cleaned" matrix
system.time(SV2 <- svm(y= as.factor(Dfm_train@docvars$choose_one), x=train2, kernel='linear', cost = 1))

# Once again, we can focus either on the predicted class or on the predicted probabilities below (same results of course!).
# If you want to focus on the class, you have to employ the below predictive function:

head(predict(SV2, train2))

# let's define our predictive function
predSV<-function(model, newdata){
newData_x <- data.matrix(newdata)
results <-predict(model, newData_x)
return(results)
}

head(predSV(SV2, train2))

# let's apply this function to our SVM by creating an object that holds the model and the data  
modSV <- Predictor$new(SV2, data = trainDF, y =as.factor(Dfm_train@docvars$choose_one), predict.fun = predSV)
modSV $predict(trainDF[1:5, ])

# let's compute VI for our SVM model
system.time({
  plan("callr", workers = 6)
 set.seed(123)
impSV <- FeatureImp$new(modSV , loss = "ce", n.repetitions=1)
}) 

head(impSV $results[,c(1:4)])
res <- impSV $results[,c(1,3)]
str(res)
# let's rename the column "feature" into "Feature"
colnames(res)[1] <- "Feature"
str(res)

# Let's compute some Partial Dependence Plots (PDP). Remember: 
# It serves as a depiction of the relationship between a predictor of interest and the
# dependent variable, while basically “controlling” for all other effects of other predictors.

plot(FeatureEffect$new(modSV , "collaps", method = "pdp"))
plot(FeatureEffect$new(modSV , "bomb", method = "pdp"))
plot(FeatureEffect$new(modSV , "famili", method = "pdp"))
plot(FeatureEffect$new(modSV , "let", method = "pdp"))
plot(FeatureEffect$new(modSV , "help", method = "pdp"))
plot(FeatureEffect$new(modSV , "X.youtub", method = "pdp"))

##########################################
# Now we can follow exactly the same route followed with the other ML algorithms discussed so far!
##########################################

# Let's predict the training-set texts (employing the svm model trained on the data frame above) 
# and let's store a new variable in our dfm of the training-set with such predictions

predicted_sv <- predict(SV2, train2)
table(predicted_sv )

Dfm_train@docvars$predSV <- predicted_sv
str(Dfm_train@docvars)

sums <- list()
for (v in 0:1){
    sums[[v+1]] <- colSums(train[Dfm_train@docvars[,"predSV"]==v,])
}

sums <- do.call(cbind, sums)
sign <- apply(sums, 1, which.max)
names <-  dimnames(train2)[[2]]

df <- data.frame(Feature = names, sign=sign)
str(df)
df $sign2<-factor(df $sign,labels = c("irrelevant","relevant"))
str(df )
str(res)

importanceSV <- merge(res, df, by="Feature")
str(importanceSV )

## best predictors
clist <- c("irrelevant", "relevant")
for (v in clist){
    cat("\n\n")
    cat("value==", v)
    importanceSV <- importanceSV [order(importanceSV $ importance, decreasing=TRUE),]
    print(head(importanceSV [importanceSV $sign2==v,], n=10))
   }

# the top 10 words for the disaster tweets
importance2SV <- head(importanceSV [importanceSV $sign2=="relevant",], n=10)
importance2SV 
# the top 20 words for tweets not related to disasters
importance3SV <-head(importanceSV [importanceSV $sign2=="irrelevant",], n=10)
importance3SV 
importance_newSV <- rbind(importance2SV , importance3SV )
str(importance_newSV )

# reorder the features
importance_newSV$Feature <- with(importance_newSV, factor(Feature,levels=Feature[order(ave(importance,sign2,FUN=max),importance)]))

ggplot(importance_newSV, aes(Feature, importance, fill = sign2)) +
geom_bar(stat="identity", fill= "white") + theme_classic() +
geom_col(show.legend = T) + theme(legend.position="bottom")+
coord_flip() +
ylab("Permutation Coefficients") +
labs(title= "Relevant to accidents versus Irrelevant to accidents features - SVM Model",
fill= "Disaster") + scale_fill_manual(values = c("#000033", "#006699"))