rm(list=ls(all=TRUE)) setwd("C:/Users/luigi/Dropbox/TOPIC MODEL") getwd() library(quanteda) library(readtext) library(caTools) library(e1071) library(randomForest) library(caret) library(naivebayes) library(car) library(ggplot2) library(dplyr) library(reshape2) library(iml) library(future) library(future.callr) library(gridExtra) # We will use once again the English social media disaster set of tweets discussed in the previous LAB ##################################################### # FIRST STEP: let's create the DfM for the training-set ##################################################### x <- read.csv("train_disaster.csv", stringsAsFactors=FALSE) myCorpusTwitterTrain <- corpus(x) tok2 <- tokens(myCorpusTwitterTrain , remove_punct = TRUE, remove_numbers=TRUE, remove_symbols = TRUE, split_hyphens = TRUE, remove_separators = TRUE) tok2 <- tokens_remove(tok2, stopwords("en")) tok2 <- tokens_remove(tok2, c("0*")) tok2 <- tokens_wordstem (tok2) Dfm_train <- dfm(tok2) Dfm_train <- dfm_trim(Dfm_train , min_docfreq = 2, verbose=TRUE) Dfm_train <- dfm_remove(Dfm_train , min_nchar = 2) ##################################################### # SECOND STEP: let's create the DfM for the test-set ##################################################### x10 <- read.csv("test_disaster.csv", stringsAsFactors=FALSE) myCorpusTwitterTest <- corpus(x10) tok <- tokens(myCorpusTwitterTest , remove_punct = TRUE, remove_numbers=TRUE, remove_symbols = TRUE, split_hyphens = TRUE, remove_separators = TRUE) tok <- tokens_remove(tok, stopwords("en")) tok <- tokens_remove(tok, c("0*")) tok <- tokens_wordstem (tok) Dfm_test <- dfm(tok) Dfm_test<- dfm_trim(Dfm_test, min_docfreq = 2, verbose=TRUE) Dfm_test<- dfm_remove(Dfm_test, min_nchar = 2) # Here problems with 11 texts! Dfm_test[ntoken(Dfm_test) == 0,] # So let's delete such texts! Dfm_test <- Dfm_test[ntoken(Dfm_test) != 0,] Dfm_test[ntoken(Dfm_test) == 0,] ##################################################### # THIRD STEP: Let's make the features identical between train and test-set by passing Dfm_train to dfm_match() as a pattern. ##################################################### setequal(featnames(Dfm_train), featnames(Dfm_test)) nfeat(Dfm_test) nfeat(Dfm_train) test_dfm <- dfm_match(Dfm_test, features = featnames(Dfm_train)) nfeat(test_dfm) setequal(featnames(Dfm_train), featnames(test_dfm )) ##################################################### # FOURTH STEP: Let's convert the two DfMs into matrices for the ML algorithms to work ##################################################### train <- as.matrix(Dfm_train) test <- as.matrix(test_dfm) ##################################################### # let's estimate a SVM ##################################################### # Note that here I select a linear kernel (in my experience a linear kernel is doing fine with texts data) # and, as hyperameter, a specific value for the cost C(=1)- A SVM has also other hyperparameters that we can tune. More on this later on. # By default, the svm command scales data internally (both x and y variables) to zero mean and unit variance. Why? # SVM is based on distance. Hence, if we do not standardize our variables to comparable ranges, the variable with the largest range will # completely dominate in the computation of the kernel matrix. For example, we have two variables - X1 and X2. # Values of variable X1 lies between 0 and 100 whereas values of X2 lies in range of 100 and 10000. # In this case, variable X2 would dominate variable X1. If you wanto to avoid the scaling, you should add in the command below: "scale=FALSE". # Finally note that we can also compute our SVM working with a data frame, rather than with a matrix. system.time(SV <- svm(y= as.factor(Dfm_train@docvars$choose_one), x=train, kernel='linear', cost = 1)) # let's predict the test-set system.time(predicted_svm <- predict(SV , test)) table(predicted_svm ) prop.table(table(predicted_svm )) # how many supporting vectors are in our training-set? length(SV$index) nrow(train) # 243 out of 400 tweets in the training-set data-frame # The coefficients that you're pulling out are the weights for the support vectors. # Looking at the estimated coefficients is not however so informative because they only tell us what support vectors were estimated in the model # i.e., which are the texts with the largest weight in our estimation head(SV$coefs) # let's read however such texts vectors <- x[SV$index,] nrow(vectors) str(vectors) # you see that you do not have tweets from 2 to 4 cause they are not supporting vectors! vectors$coefs <- SV$coefs[,1] str(vectors) # tweets about incidents vectors<- vectors[order(vectors$coef),] head(vectors[,c("coefs", "text", "choose_one")], n=10) # tweets not about incidents vectors<- vectors[order(vectors$coef, decreasing=TRUE),] head(vectors[,c("coefs", "text", "choose_one")], n=10) # So now let's do some proper Global Interpretation for our SVM. # Also SVM does not come with a model-specific VI. Therefore let'apply our usual permutational approach via iml package # Remember: the iml package requires: # 1) a data frame, not a matrix, on which computing the prediction of the model (but you have run the previous model with a matrix) # 2) as a result of 1), a specific function that allows you to move from the original matrix to a data frame # 3) iml does NOT want features that begin with @, #, etc. when you use a data.frame (as it is the case with the social media of our corpus) # 4) as a result of 2), you also need to recompute the SVM results with the converted data frame created at point 3). Of course the outcome # of the model under 4) is going to be exactly the same as your original SVM above (after all you just replace the @ in front of a word # with a X!) # Let's convert the dfm of the training set into a data frame and: a) let's apply to it make.names; b) let's remove its first columns # (that refers to doc_id not to a feature!) trainDF <- convert(Dfm_train, to="data.frame") head(colnames(trainDF)) trainDF <- trainDF[,-c(1)] head(colnames(trainDF)) colnames(trainDF) <- make.names(colnames(trainDF), unique = TRUE) # let's create a "cleaned" matrix out of this "cleaned" data.matrix train2 <- as.matrix(trainDF) # let's re-estimate our SVM with this "cleaned" matrix system.time(SV2 <- svm(y= as.factor(Dfm_train@docvars$choose_one), x=train2, kernel='linear', cost = 1)) # Once again, we can focus either on the predicted class or on the predicted probabilities below (same results of course!). # If you want to focus on the class, you have to employ the below predictive function: head(predict(SV2, train2)) # let's define our predictive function predSV<-function(model, newdata){ newData_x <- data.matrix(newdata) results <-predict(model, newData_x) return(results) } head(predSV(SV2, train2)) # let's apply this function to our SVM by creating an object that holds the model and the data modSV <- Predictor$new(SV2, data = trainDF, y =as.factor(Dfm_train@docvars$choose_one), predict.fun = predSV) modSV $predict(trainDF[1:5, ]) # let's compute VI for our SVM model system.time({ plan("callr", workers = 6) set.seed(123) impSV <- FeatureImp$new(modSV , loss = "ce", n.repetitions=1) }) head(impSV $results[,c(1:4)]) res <- impSV $results[,c(1,3)] str(res) # let's rename the column "feature" into "Feature" colnames(res)[1] <- "Feature" str(res) # Let's compute some Partial Dependence Plots (PDP). Remember: # It serves as a depiction of the relationship between a predictor of interest and the # dependent variable, while basically “controlling” for all other effects of other predictors. plot(FeatureEffect$new(modSV , "collaps", method = "pdp")) plot(FeatureEffect$new(modSV , "bomb", method = "pdp")) plot(FeatureEffect$new(modSV , "famili", method = "pdp")) plot(FeatureEffect$new(modSV , "let", method = "pdp")) plot(FeatureEffect$new(modSV , "help", method = "pdp")) plot(FeatureEffect$new(modSV , "X.youtub", method = "pdp")) ########################################## # Now we can follow exactly the same route followed with the other ML algorithms discussed so far! ########################################## # Let's predict the training-set texts (employing the svm model trained on the data frame above) # and let's store a new variable in our dfm of the training-set with such predictions predicted_sv <- predict(SV2, train2) table(predicted_sv ) Dfm_train@docvars$predSV <- predicted_sv str(Dfm_train@docvars) sums <- list() for (v in 0:1){ sums[[v+1]] <- colSums(train[Dfm_train@docvars[,"predSV"]==v,]) } sums <- do.call(cbind, sums) sign <- apply(sums, 1, which.max) names <- dimnames(train2)[[2]] df <- data.frame(Feature = names, sign=sign) str(df) df $sign2<-factor(df $sign,labels = c("irrelevant","relevant")) str(df ) str(res) importanceSV <- merge(res, df, by="Feature") str(importanceSV ) ## best predictors clist <- c("irrelevant", "relevant") for (v in clist){ cat("\n\n") cat("value==", v) importanceSV <- importanceSV [order(importanceSV $ importance, decreasing=TRUE),] print(head(importanceSV [importanceSV $sign2==v,], n=10)) } # the top 10 words for the disaster tweets importance2SV <- head(importanceSV [importanceSV $sign2=="relevant",], n=10) importance2SV # the top 20 words for tweets not related to disasters importance3SV <-head(importanceSV [importanceSV $sign2=="irrelevant",], n=10) importance3SV importance_newSV <- rbind(importance2SV , importance3SV ) str(importance_newSV ) # reorder the features importance_newSV$Feature <- with(importance_newSV, factor(Feature,levels=Feature[order(ave(importance,sign2,FUN=max),importance)])) ggplot(importance_newSV, aes(Feature, importance, fill = sign2)) + geom_bar(stat="identity", fill= "white") + theme_classic() + geom_col(show.legend = T) + theme(legend.position="bottom")+ coord_flip() + ylab("Permutation Coefficients") + labs(title= "Relevant to accidents versus Irrelevant to accidents features - SVM Model", fill= "Disaster") + scale_fill_manual(values = c("#000033", "#006699"))