rm(list=ls(all=TRUE))
setwd("C:/Users/luigi/Dropbox/TOPIC MODEL")
getwd()
library(quanteda)
library(readtext)
library(caTools)
library(e1071)
library(randomForest)
library(caret)
library(naivebayes)
library(car)
library(ggplot2)
library(dplyr)
library(reshape2)
library(iml)
library(future)
library(future.callr)
library(gridExtra)
# We will use once again the English social-media disaster tweets dataset discussed in the previous Lab
#####################################################
# FIRST STEP: let's create the DfM for the training-set
#####################################################
x <- read.csv("train_disaster.csv", stringsAsFactors=FALSE)
myCorpusTwitterTrain <- corpus(x)
tok2 <- tokens(myCorpusTwitterTrain , remove_punct = TRUE, remove_numbers=TRUE, remove_symbols = TRUE, split_hyphens = TRUE, remove_separators = TRUE)
tok2 <- tokens_remove(tok2, stopwords("en"))
tok2 <- tokens_remove(tok2, c("0*")) # remove tokens beginning with "0" (glob pattern)
tok2 <- tokens_wordstem(tok2)
Dfm_train <- dfm(tok2)
Dfm_train <- dfm_trim(Dfm_train , min_docfreq = 2, verbose=TRUE)
Dfm_train <- dfm_remove(Dfm_train , min_nchar = 2)
topfeatures(Dfm_train , 20) # 20 top words
#####################################################
# SECOND STEP: let's create the DfM for the test-set
#####################################################
x10 <- read.csv("test_disaster.csv", stringsAsFactors=FALSE)
myCorpusTwitterTest <- corpus(x10)
tok <- tokens(myCorpusTwitterTest , remove_punct = TRUE, remove_numbers=TRUE, remove_symbols = TRUE, split_hyphens = TRUE, remove_separators = TRUE)
tok <- tokens_remove(tok, stopwords("en"))
tok <- tokens_remove(tok, c("0*")) # remove tokens beginning with "0" (glob pattern)
tok <- tokens_wordstem(tok)
Dfm_test <- dfm(tok)
Dfm_test<- dfm_trim(Dfm_test, min_docfreq = 2, verbose=TRUE)
Dfm_test<- dfm_remove(Dfm_test, min_nchar = 2)
topfeatures(Dfm_test , 20) # 20 top words
#####################################################
# THIRD STEP: let's make the features identical between training and test-set by passing featnames(Dfm_train) to dfm_match().
# after this step, we can "predict" the test-set by employing only the features included in the training-set
#####################################################
setequal(featnames(Dfm_train), featnames(Dfm_test))
length(featnames(Dfm_test))
length(featnames(Dfm_train))
test_dfm <- dfm_match(Dfm_test, features = featnames(Dfm_train))
length(featnames(test_dfm))
setequal(featnames(Dfm_train), featnames(test_dfm ))
#####################################################
# FOURTH STEP: Let's convert the two DfMs into matrices for the ML algorithms to work
#####################################################
train <- as.matrix(Dfm_train)
test <- as.matrix(test_dfm)
#####################################################
# let's estimate a SVM
#####################################################
# note that here I select a linear kernel (in my experience a linear kernel does fine with text data)
# and, as hyperparameter, a specific value for the cost C (=1). A SVM also has other hyperparameters. More on this later on
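# as a teaser of what "tuning" that cost means, e1071::tune() can grid-search it via cross-validation.
# A minimal sketch (commented out here to save time; the grid values below are just an example, not a recommendation):
# set.seed(123)
# tuned <- tune(svm, train.x = train, train.y = as.factor(Dfm_train@docvars$choose_one),
#               kernel = 'linear', ranges = list(cost = c(0.1, 1, 10)))
# tuned$best.parameters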
system.time(SV <- svm(y= as.factor(Dfm_train@docvars$choose_one), x=train, kernel='linear', cost = 1))
# how many support vectors?
length(SV$index)
nrow(train) # 243 out of the 400 tweets in the training set are support vectors
# The coefficients you can extract are the weights of the support vectors.
# Looking at the estimated coefficients is not, however, very informative per se: they only tell us which support vectors were estimated by the model,
# i.e., which texts carry the largest weight in our estimation
head(SV$coefs)
# let's read those texts, however
vectors <- x[SV$index,]
nrow(vectors)
str(vectors) # note, for example, that tweets 2 to 4 are missing because they are not support vectors!
vectors$coefs <- SV$coefs[,1]
str(vectors)
vectors <- vectors[order(vectors$coefs),]
head(vectors[,c("coefs", "text", "choose_one")], n=10)
vectors <- vectors[order(vectors$coefs, decreasing=TRUE),]
head(vectors[,c("coefs", "text", "choose_one")], n=10)
# So now let's do some more proper Global Interpretation for our SVM!
# In particular, let's now focus on identifying the most relevant features for the SVM. To do that we will rely on a "permutational" approach.
# What do we mean by permutation? Basically, we randomly permute the values of each single feature
# in our dfm. A feature is "important" if permuting its values increases the model error, because this means the model relied on that feature for
# the prediction. That is: if we randomly permute the values of an important feature in the training data, the training performance
# degrades (since permuting the values of a feature effectively destroys any relationship between that feature and the target variable).
# This gives us a nice interpretation: feature importance is the increase in model error when the feature's information is destroyed.
# Conversely, a feature is "unimportant" if permuting its values leaves the model error unchanged, because the model ignored that
# feature for the prediction.
# Note that permutation feature importance does not require retraining the model. Some other methods suggest deleting a feature, retraining the model,
# and then comparing the model errors. Since retraining a machine learning model can take a long time, "only" permuting a feature can save a lot
# of time. Moreover, we are interested in the feature importance of a fixed model:
# re-training on a reduced dataset creates each time a different model from the one we are interested in.
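# To fix ideas, here is a minimal hand-rolled sketch of this logic for a single feature
# (iml below does this for all features at once; picking the most frequent feature is arbitrary, just for illustration):
pred_base <- predict(SV, train)
err_base <- mean(pred_base != Dfm_train@docvars$choose_one) # model error on the original data
f <- names(sort(colSums(train), decreasing = TRUE))[1] # the most frequent feature
train_perm <- train
set.seed(123)
train_perm[, f] <- sample(train_perm[, f]) # shuffle its values, destroying the feature-target relationship
err_perm <- mean(predict(SV, train_perm) != Dfm_train@docvars$choose_one)
err_perm / err_base # a ratio > 1 means the error grew once the feature was shuffled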
# Permuting a feature and measuring the increase in loss is not the only way to measure the importance of a feature.
# An alternative to permutation feature importance is represented by variance-based measures. Here you focus not on the error of the model,
# but on how much the model's output varies for a feature, without considering what that means for performance.
# This definition of importance therefore differs from the loss-based definition used by permutation feature importance.
# The difference is evident in cases where a model overfits.
# If a model overfits and uses a feature that is unrelated to the output, then permutation feature importance
# would assign an importance of zero, because this feature does not contribute to producing correct predictions.
# A variance-based importance measure, on the other hand, might assign the feature a high importance, as the prediction can change a lot
# when the feature is changed. Model variance (explained by the features) and feature importance correlate strongly
# when the model generalizes well (i.e., it does not overfit). If you are interested in variance-based measures, you can take a look
# at the shapper and fastshap packages.
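# just to give you a flavor, a minimal (NOT run) sketch with fastshap; everything here is an assumption:
# it presumes fastshap is installed, that we refit the SVM with probability = TRUE (e1071 needs that
# at fit time to return class probabilities), and that "1" is the probability column we care about:
# SV_prob <- svm(y = as.factor(Dfm_train@docvars$choose_one), x = train,
#                kernel = 'linear', cost = 1, probability = TRUE)
# pred_fun <- function(object, newdata) {
#   attr(predict(object, newdata, probability = TRUE), "probabilities")[, "1"]
# }
# set.seed(123)
# shap_values <- fastshap::explain(SV_prob, X = as.data.frame(train),
#                                  pred_wrapper = pred_fun, nsim = 5)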
# To run a permutation, you first need to convert the Dfm into a data frame, not into a matrix (the iml package requires that!)
# Moreover, you also need to take care of your DV if it is a string. For example, if you have two class labels such as "relevant" and "not-relevant",
# you need to change the latter to something like "nonrelevant" (i.e., no space, "-", or anything else) before transforming it into a factor.
# Otherwise the iml package will crash. I discovered it once, after spending 2 hours on iml!!!
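# a minimal illustration with hypothetical labels (NOT the ones of this dataset),
# just to show one way to strip the problematic characters:
# y <- c("relevant", "not-relevant", "relevant")
# y <- gsub("[^[:alnum:]]", "", y) # "not-relevant" becomes "notrelevant"
# y <- as.factor(y)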
# Not our problem here, anyway: our labels are simply 0 and 1, as you can check
as.factor(Dfm_train@docvars$choose_one)
# the iml package requires that you estimate a model with a data.frame, not with a matrix! So let's convert our Dfm into a data.frame
train2 <- convert(Dfm_train, to = "data.frame")
# iml, exactly like randomForest, does not want features that begin with @, #, etc. when you use a data.frame.
# So let's use once again the make.names function already discussed above
str(train2)
colnames(train2 ) <- make.names(colnames(train2 ), unique = TRUE)
colnames(train2)
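# as a quick sanity check, which feature names did make.names() actually change?
head(setdiff(colnames(train2), c("doc_id", featnames(Dfm_train))))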
# let's rerun the SVM with the data frame (NOT with the original matrix)
svm_model <- svm(as.factor(Dfm_train@docvars$choose_one) ~ ., data = train2[-1], kernel='linear', cost = 1)
# why train2[-1]? Because the first column in train2 refers to doc_id, not to a feature!
head(colnames(train2))
# let's create an object that holds the model and the data
mod <- Predictor$new(svm_model, data = train2[-1], y = as.factor(Dfm_train@docvars$choose_one), type = "prob")
mod$predict(train2[50:55, ]) # predicted class probabilities for documents 50 to 55
# plan("callr", workers = 6) allows you to run the permutation by taking advantages of 6 cores in your computer
# If you have 4 cores write workers=4, etc.
# This is a so called "parallelization". It means that the computation per feature is distributed among several cores of your machine.
# This can save you LOTS of time!
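# if you do not know how many cores your machine has, you can check with (from the future package):
availableCores()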
system.time({
plan("callr", workers = 6)
set.seed(123)
imp2 <- FeatureImp$new(mod, loss = "ce", n.repetitions=1)
}) # around 25 secs on my laptop
# any feature with a value > 1 implies that it is important for the model.
# why larger than 1? Because feature importance is estimated as a ratio (that's the default in iml):
# the model error after permutation divided by the original model error. If you get a value > 1, the error increases when that
# feature is permuted, therefore the feature is important! As an alternative, you can also estimate feature importance as a difference (see below).
# you can also repeat the analysis (i.e., the shuffling of each feature) X times (for example: n.repetitions=5).
# The higher the number of repetitions, the more stable and accurate the results become (plus: you will get a confidence interval as well!)
# However, in our case, to save time we just asked for 1 repetition
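# the difference-based alternative mentioned above would look like this (not run here, to save time):
# imp_diff <- FeatureImp$new(mod, loss = "ce", compare = "difference", n.repetitions = 1)
# in that case, any feature with a value > 0 (rather than > 1) is an important one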
imp2
res <- imp2$results[, c("feature", "importance")] # keep only the feature names and their importance scores
str(res)
##########################################
# OK we know which features are important now. But what about their relationship with the class-labels?
# let's estimate that for both class labels
##########################################
# let's replicate make.names here so that both train and train2 present the same list of features!
colnames(train) <- make.names(colnames(train))
# let's replicate exactly what we already did for the RF case
# As in the Random forest example, let's predict the training-set texts and let's store a new variable in our dfm of the training-set with such predictions
predicted_sv <- predict(svm_model, train, type="class")
table(predicted_sv )
Dfm_train@docvars$predSV <- predicted_sv
str(Dfm_train@docvars)
# for each predicted class (0/1), sum the frequency of every feature across the corresponding texts
sums <- list()
for (v in 0:1){
  sums[[v+1]] <- colSums(train[Dfm_train@docvars[,"predSV"]==v,])
}
sums <- do.call(cbind, sums)
# for each feature, identify the predicted class in which it appears most often
sign <- apply(sums, 1, which.max)
# get the feature names
names <- dimnames(train)[[2]]
str(names)
df <- data.frame(
feature = names,
sign = sign-1,
stringsAsFactors=F)
str(df)
str(res)
importance <- merge(res, df, by="feature")
str(importance)
## best predictors
for (v in 0:1){
cat("\n\n")
cat("value==", v)
importance <- importance[order(importance$importance, decreasing=TRUE),]
print(head(importance[importance$sign==v,], n=10))
cat("\n")
cat(paste(unique(head(importance$feature[importance$sign==v], n=10)), collapse=", "))
}
str(importance)
importanceSV <- importance
# let's draw a graph with our results and store it in an object for later use!
importance$perm <- ifelse(importance$sign>0,importance$importance,importance$importance*-1)
str(importance)
# the top 10 relevant words
importance2 <- top_n(importance, 10, perm )
importance2
# the top 10 irrelevant words
importance3 <- top_n(importance, -10, perm )
importance3
importance_new <- rbind(importance2, importance3)
str(importance_new)
# create a readable label for the sign and reorder the features for plotting
importance_new$sign2 <- ifelse(importance_new$sign>0,"Relevant","Not Relevant")
importance_new <- mutate(importance_new, feature = reorder(feature, perm))
SVM_graph <- ggplot(importance_new, aes(feature, perm, fill = sign2)) +
geom_bar(stat="identity", fill= "white") + theme_classic()+
geom_col(show.legend = T) + theme(legend.position="bottom")+
coord_flip() +
ylab("Permutation Coefficient") +
scale_fill_manual(values = c("#000033", "#006699")) +
labs(title = "Disaster tweets",
subtitle = "Irrelevant (-) versus Relevant (+) words - SVM Model",
fill = "Disaster")
# printing the stored object displays the plot
SVM_graph
# let's FINALLY predict the test-set
system.time(predicted_svm <- predict(SV , test))
table(predicted_svm )
prop.table(table(predicted_svm ))
#### let's do the same thing while working with a data.frame rather than a matrix!
train2 <- convert(Dfm_train, to="data.frame")
class(train2)
# here we do not need to use the "make.names" function given that svm also accepts features that begin with @, #, etc.
# when using a data.frame
SV2 <- svm(as.factor(Dfm_train@docvars$choose_one) ~ ., data = train2[-1], kernel='linear', cost = 1)
# why train2[-1]? Because the first column in train2 refers to doc_id, not to a feature!
head(colnames(train2))
# alternatively
# let's add the DV to the data.frame
train2$choose_one <- Dfm_train@docvars$choose_one
SV2 <- svm(as.factor(choose_one) ~ ., data = train2[-1], kernel='linear', cost = 1)
# here we can directly predict using the matrix test
predicted_svm2 <- predict(SV2, test)
# same result as when working with a matrix, of course!
prop.table(table(predicted_svm2 ))
prop.table(table(predicted_svm ))
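# a quick cross-tab to verify that the two sets of predictions line up one-to-one
table(matrix_model = predicted_svm, dataframe_model = predicted_svm2)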