rm(list=ls(all=TRUE))
setwd("C:/Users/luigi/Dropbox/TOPIC MODEL")
getwd()

library(quanteda)
library(readtext)
library(caTools)
library(e1071)
library(randomForest)
library(caret)
library(naivebayes)
library(car)
library(ggplot2)
library(dplyr)
library(reshape2)
library(iml)
library(future)
library(future.callr)
library(gridExtra)

# We will once again use the English social media disaster tweets discussed in the previous LAB

#####################################################
# FIRST STEP: let's create the DfM for the training-set
#####################################################

x <- read.csv("train_disaster.csv", stringsAsFactors=FALSE)
myCorpusTwitterTrain <- corpus(x)
tok2 <- tokens(myCorpusTwitterTrain, remove_punct = TRUE, remove_numbers = TRUE,
               remove_symbols = TRUE, split_hyphens = TRUE, remove_separators = TRUE)
tok2 <- tokens_remove(tok2, stopwords("en"))
tok2 <- tokens_remove(tok2, c("0*"))   # remove tokens starting with "0" (glob pattern)
tok2 <- tokens_wordstem(tok2)
Dfm_train <- dfm(tok2)
Dfm_train <- dfm_trim(Dfm_train, min_docfreq = 2, verbose = TRUE)
Dfm_train <- dfm_remove(Dfm_train, min_nchar = 2)
topfeatures(Dfm_train, 20)   # 20 top words

#####################################################
# SECOND STEP: let's create the DfM for the test-set
#####################################################

x10 <- read.csv("test_disaster.csv", stringsAsFactors=FALSE)
myCorpusTwitterTest <- corpus(x10)
tok <- tokens(myCorpusTwitterTest, remove_punct = TRUE, remove_numbers = TRUE,
              remove_symbols = TRUE, split_hyphens = TRUE, remove_separators = TRUE)
tok <- tokens_remove(tok, stopwords("en"))
tok <- tokens_remove(tok, c("0*"))
tok <- tokens_wordstem(tok)
Dfm_test <- dfm(tok)
Dfm_test <- dfm_trim(Dfm_test, min_docfreq = 2, verbose = TRUE)
Dfm_test <- dfm_remove(Dfm_test, min_nchar = 2)
topfeatures(Dfm_test, 20)   # 20 top words

#####################################################
# THIRD STEP: let's make the features identical between train and test-set by passing Dfm_train
# to dfm_match() as a pattern. After this step, we can "predict" by employing only the features
# included in the training-set
#####################################################

setequal(featnames(Dfm_train), featnames(Dfm_test))
length(Dfm_test@Dimnames$features)
length(Dfm_train@Dimnames$features)
test_dfm <- dfm_match(Dfm_test, features = featnames(Dfm_train))
length(test_dfm@Dimnames$features)
setequal(featnames(Dfm_train), featnames(test_dfm))

#####################################################
# FOURTH STEP: let's convert the two DfMs into matrices for the ML algorithms to work
#####################################################

train <- as.matrix(Dfm_train)
test <- as.matrix(test_dfm)

#####################################################
# Let's estimate a SVM
#####################################################

# Note that here I select a linear kernel (in my experience a linear kernel does fine with text data)
# and, as hyperparameter, a specific value for the cost C (=1). A SVM also has other hyperparameters.
# More on this later on
system.time(SV <- svm(y = as.factor(Dfm_train@docvars$choose_one), x = train, kernel = 'linear', cost = 1))

# how many support vectors?
length(SV$index)
nrow(train)   # 243 out of 400 tweets in the training-set data-frame

# The coefficients that you're pulling out are the weights for the support vectors.
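# (As an aside, not in the original script: with a linear kernel one can also recover approximate
# feature-level weights by multiplying the dual coefficients with the support-vector matrix.
# This is only a sketch; keep in mind that svm() scales the inputs by default, so these weights
# live in the scaled feature space.)
w <- t(SV$coefs) %*% SV$SV                  # 1 x n_features vector of feature weights
head(sort(w[1,], decreasing = TRUE), 10)    # features pulling most strongly towards one class
head(sort(w[1,]), 10)                       # features pulling most strongly towards the other class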
# Looking at the estimated coefficients is not, however, so informative, because they only tell us
# which support vectors were estimated in the model, i.e., which are the texts with the largest
# weight in our estimation
head(SV$coefs)

# let's read such texts, however
vectors <- x[SV$index,]
nrow(vectors)
str(vectors)   # you can see that tweets 2 to 4 are missing, because they are not support vectors!
vectors$coefs <- SV$coefs[,1]
str(vectors)
vectors <- vectors[order(vectors$coefs),]
head(vectors[,c("coefs", "text", "choose_one")], n=10)
vectors <- vectors[order(vectors$coefs, decreasing=TRUE),]
head(vectors[,c("coefs", "text", "choose_one")], n=10)

# So now let's do some more proper Global Interpretation for our SVM!
# In particular, let's now focus on identifying the most relevant features for the SVM. For doing that,
# we will rely on a "permutational" approach.
# What do we mean by permutation? Basically, we do the following: we randomly permute the real values
# of each single feature in our dfm. A feature is "important" if permuting its values increases the
# model error, because the model relied on that feature for the prediction. That is: if we randomly
# permute the values of an important feature in the training data, the training performance degrades
# (since permuting the values of a feature effectively destroys any relationship between that feature
# and the target variable).
# This gives us a nice interpretation: feature importance is the increase in model error when the
# feature's information is destroyed.
# A feature is "unimportant" if permuting its values leaves the model error unchanged, because the
# model ignored that feature for the prediction.
# Note that permutation feature importance does not require retraining the model. Other methods suggest
# deleting a feature, retraining the model, and then comparing the model errors. Since retraining a
# machine learning model can take a long time, "only" permuting a feature can save a lot of time.
# Moreover, we are interested in the feature importance of a fixed model: retraining with a reduced
# dataset creates each time a different model than the one we are interested in.

# Permuting a feature and measuring the increase in loss is not the only way to measure the importance
# of a feature. An alternative to permutation feature importance are variance-based measures. Here you
# focus not on the error of the model, but on how much the model's output varies for a feature, without
# considering what that means for performance.
# This definition of importance differs from the loss-based definition used in permutation feature
# importance. This is evident in cases where a model overfits. If a model overfits and uses a feature
# that is unrelated to the output, then permutation feature importance would assign an importance of
# zero, because this feature does not contribute to producing correct predictions. A variance-based
# importance measure, on the other hand, might assign the feature a high importance, as the prediction
# can change a lot when the feature is changed. Model variance (explained by the features) and feature
# importance correlate strongly when the model generalizes well (i.e., it does not overfit). If you are
# interested in variance-based measures, you can take a look at the shapper and fastshap packages.
# (A minimal hand-rolled sketch of the permutation idea follows right below; the iml package will then
# do this properly for every feature.)
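# A minimal hand-rolled sketch of the permutation idea (an illustration only, added to the original
# script; we permute just the first feature of the training matrix as an example):
y_true <- as.character(Dfm_train@docvars$choose_one)
baseline_error <- mean(as.character(predict(SV, train)) != y_true)   # training-set classification error
train_perm <- train
set.seed(123)
feat <- colnames(train_perm)[1]                     # take the first feature as an example
train_perm[, feat] <- sample(train_perm[, feat])    # destroy its link with the DV
permuted_error <- mean(as.character(predict(SV, train_perm)) != y_true)
permuted_error - baseline_error    # importance as a difference
permuted_error / baseline_error    # importance as a ratio (iml's default, see below): > 1 means "important"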
# For running a permutation, you first need to convert the Dfm into a data frame, not into a matrix
# (the iml package requires that!). Moreover, you also need to take care of your DV if it is a string.
# For example, if you have two class labels such as "relevant" and "not-relevant", you need to change
# the latter to "nonrelevant" (i.e., no space, no "-" or anything else) before transforming it into a
# factor. Otherwise the iml package will crash. I discovered it once, after spending 2 hours on iml!!!
# Not our problem here!
as.factor(Dfm_train@docvars$choose_one)

# the iml package requires that you estimate a model with a data.frame, not with a matrix!
# So let's convert our Dfm into a data.frame
train2 <- convert(Dfm_train, to = "data.frame")

# iml, exactly like randomForest, does not want features that begin with @, #, etc. when you use a
# data.frame. So let's use once again the make.names function already discussed above
str(train2)
colnames(train2) <- make.names(colnames(train2), unique = TRUE)
colnames(train2)

# let's rerun the SVM with the data frame (NOT with the original matrix)
svm_model <- svm(as.factor(Dfm_train@docvars$choose_one) ~ ., data = train2[-1], kernel = 'linear', cost = 1)
# why train2[-1]? Because the first column in train2 refers to doc_id, not to a feature!
head(colnames(train2))

# let's create an object that holds the model and the data
mod <- Predictor$new(svm_model, data = train2[-1], y = as.factor(Dfm_train@docvars$choose_one), type = "prob")
mod$predict(train2[50:55, ])

# plan("callr", workers = 6) allows you to run the permutation by taking advantage of 6 cores on your
# computer. If you have 4 cores, write workers=4, etc.
# This is so-called "parallelization": the computation per feature is distributed among several cores
# of your machine. This can save you LOTS of time!
system.time({
  plan("callr", workers = 6)
  set.seed(123)
  imp2 <- FeatureImp$new(mod, loss = "ce", n.repetitions = 1)
})
# around 25 secs on my laptop

# Any feature with a value > 1 is important for estimating the model.
# Why larger than 1? Because feature importance is estimated as a ratio (that's the default in iml).
# If you get a value > 1, it means that the error increases when a given feature is permuted, and
# therefore that feature is important! As an alternative, you can also estimate feature importance as
# a difference.
# You can also repeat the analysis (i.e., the shuffling of each feature) X times (for example:
# n.repetitions=5). The higher the number of repetitions, the more stable and accurate the results
# become (plus: you will get a c.i. as well!). However, in our case, to save time we just asked for
# 1 repetition
imp2
res <- imp2$results[,c(1,3)]
str(res)
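# (A small addition to the original script: iml also ships a plot method for FeatureImp objects, and,
# as mentioned above, importance can be computed as a difference rather than a ratio. The "compare"
# argument below is an assumption about your iml version; check ?FeatureImp if it errors.)
plot(imp2)   # quick ggplot of the permutation importances computed above
# imp_diff <- FeatureImp$new(mod, loss = "ce", compare = "difference", n.repetitions = 1)  # difference-based
# imp_diff$results[1:10,]   # commented out to avoid doubling the runtime of the lab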
##########################################
# OK, we know which features are important now. But what about their relationship with the class labels?
# Let's estimate that for both class labels
##########################################

# let's replicate make.names here so that both train and train2 present the same list of features!
colnames(train) <- make.names(colnames(train))

# let's replicate exactly what we already did for the RF case
# As in the Random Forest example, let's predict the training-set texts and store a new variable with
# such predictions in the dfm of the training-set
predicted_sv <- predict(svm_model, train, type = "class")
table(predicted_sv)
Dfm_train@docvars$predSV <- predicted_sv
str(Dfm_train@docvars)

sums <- list()
for (v in 0:1){
  sums[[v+1]] <- colSums(train[Dfm_train@docvars[,"predSV"]==v,])
}
sums <- do.call(cbind, sums)
sign <- apply(sums, 1, which.max)

# get the feature names
names <- dimnames(train)[[2]]
str(names)
df <- data.frame(feature = names, sign = sign - 1, stringsAsFactors = F)
str(df)
str(res)
importance <- merge(res, df, by = "feature")
str(importance)

## best predictors
for (v in 0:1){
  cat("\n\n")
  cat("value==", v)
  importance <- importance[order(importance$importance, decreasing = TRUE),]
  print(head(importance[importance$sign==v,], n=10))
  cat("\n")
  cat(paste(unique(head(importance$feature[importance$sign==v], n=10)), collapse=", "))
}
str(importance)
importanceSV <- importance

# let's draw a graph with our results!
importance$perm <- ifelse(importance$sign>0, importance$importance, importance$importance*-1)
str(importance)

# the top 10 relevant words
importance2 <- top_n(importance, 10, perm)
importance2
# the top 10 irrelevant words
importance3 <- top_n(importance, -10, perm)
importance3
importance_new <- rbind(importance2, importance3)
str(importance_new)

# reorder the features
importance_new$sign2 <- ifelse(importance_new$sign>0, "Relevant", "Not Relevant")
importance_new <- mutate(importance_new, feature = reorder(feature, perm))

# let's draw the graph and store it for later use
SVM_graph <- ggplot(importance_new, aes(feature, perm, fill = sign2)) +
  geom_bar(stat = "identity", fill = "white") +
  theme_classic() +
  geom_col(show.legend = T) +
  theme(legend.position = "bottom") +
  coord_flip() +
  ylab("Permutation Coefficient") +
  scale_fill_manual(values = c("#000033", "#006699")) +
  labs(title = "Disaster tweets",
       subtitle = "Irrelevant (-) versus Relevant (+) words - SVM Model",
       fill = "Disaster")
SVM_graph
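# (Optional, not in the original script: since we stored SVM_graph, we could later place it side by
# side with the equivalent plot from another model, e.g. the Random Forest graph from the previous lab.
# "RF_graph" below is a hypothetical object name, assuming you saved such a plot earlier; gridExtra is
# already loaded above.)
# grid.arrange(RF_graph, SVM_graph, nrow = 2)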
# let's FINALLY predict the test-set
system.time(predicted_svm <- predict(SV, test))
table(predicted_svm)
prop.table(table(predicted_svm))

#### let's do the same thing while working with a data.frame rather than a matrix!
train2 <- convert(Dfm_train, to = "data.frame")
class(train2)
# here we do not need to use the "make.names" function, given that svm also accepts features that
# begin with @, #, etc. when using a data.frame
SV2 <- svm(as.factor(Dfm_train@docvars$choose_one) ~ ., data = train2[-1], kernel = 'linear', cost = 1)
# why train2[-1]? Because the first column in train2 refers to doc_id, not to a feature!
head(colnames(train2))

# alternatively, let's add the DV (as a factor) to the data.frame and use it directly in the formula
train2$choose_one <- as.factor(Dfm_train@docvars$choose_one)
SV2 <- svm(choose_one ~ ., data = train2[-1], kernel = 'linear', cost = 1)

# here we can directly predict using the matrix test
predicted_svm2 <- predict(SV2, test)
# same result as when working with a matrix, of course!
prop.table(table(predicted_svm2))
prop.table(table(predicted_svm))
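# (A further check, added to the original script: cross-tabulate the two prediction vectors.
# If the matrix-based and data.frame-based approaches are indeed equivalent, everything should fall
# on the diagonal.)
table(matrix_based = predicted_svm, dataframe_based = predicted_svm2)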