rm(list=ls(all=TRUE))
setwd("C:/Users/luigi/Dropbox/TOPIC MODEL")
getwd()

library(quanteda)
library(readtext)
library(caTools)
library(e1071)
library(randomForest)
library(caret)
library(naivebayes)
library(car)
library(ggplot2)
library(dplyr)
library(reshape2)
library(iml)
library(future)
library(future.callr)
library(gridExtra)

# The data we will be using are some English social media disaster tweets discussed in
# this article: https://arxiv.org/pdf/1705.02009.pdf
# It consists of a number of tweets regarding accidents mixed in with a selection of control tweets (not about accidents)

#####################################################
# FIRST STEP: let's create the DfM for the training-set
#####################################################

x <- read.csv("train_disaster.csv", stringsAsFactors=FALSE)
str(x)
# class-label variable: choose_one (0=tweets not relevant; 1=relevant tweets)
table(x$choose_one)
prop.table(table(x$choose_one))
nrow(x)

myCorpusTwitterTrain <- corpus(x)
tok2 <- tokens(myCorpusTwitterTrain, remove_punct = TRUE, remove_numbers = TRUE,
               remove_symbols = TRUE, split_hyphens = TRUE,
               remove_separators = TRUE, remove_url = TRUE)
tok2 <- tokens_remove(tok2, stopwords("en"))
# let's also remove the unicode symbols
tok2 <- tokens_remove(tok2, c("0*"))
tok2 <- tokens_wordstem(tok2)
Dfm_train <- dfm(tok2)

# Let's trim the dfm in order to keep only features that appear in 2 or more tweets (tweets are short texts!)
# and let's keep only features with at least 2 characters
Dfm_train <- dfm_trim(Dfm_train, min_docfreq = 2, verbose = TRUE)
Dfm_train <- dfm_remove(Dfm_train, min_nchar = 2)
topfeatures(Dfm_train, 20)  # 20 top words

#####################################################
# SECOND STEP: let's create the DfM for the test-set
#####################################################

x10 <- read.csv("test_disaster.csv", stringsAsFactors=FALSE)
str(x10)
nrow(x10)

myCorpusTwitterTest <- corpus(x10)
tok <- tokens(myCorpusTwitterTest, remove_punct = TRUE, remove_numbers = TRUE,
              remove_symbols = TRUE, split_hyphens = TRUE,
              remove_separators = TRUE, remove_url = TRUE)
tok <- tokens_remove(tok, stopwords("en"))
tok <- tokens_remove(tok, c("0*"))
tok <- tokens_wordstem(tok)
Dfm_test <- dfm(tok)
Dfm_test <- dfm_trim(Dfm_test, min_docfreq = 2, verbose = TRUE)
Dfm_test <- dfm_remove(Dfm_test, min_nchar = 2)
topfeatures(Dfm_test, 20)  # 20 top words
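# Side note (just a sketch, not required for what follows): since the training-set and the test-set must go through
# exactly the same preprocessing, you could also wrap the shared steps above in a small helper function and apply it
# to both corpora. "preprocess_tokens" is a hypothetical name; the steps are the same ones already used above.
preprocess_tokens <- function(corp) {
  tk <- tokens(corp, remove_punct = TRUE, remove_numbers = TRUE, remove_symbols = TRUE,
               split_hyphens = TRUE, remove_separators = TRUE, remove_url = TRUE)
  tk <- tokens_remove(tk, stopwords("en"))
  tk <- tokens_remove(tk, c("0*"))  # unicode symbols, as above
  tokens_wordstem(tk)
}
# e.g.: dfm_remove(dfm_trim(dfm(preprocess_tokens(myCorpusTwitterTest)), min_docfreq = 2, verbose = TRUE), min_nchar = 2)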
#####################################################
# THIRD STEP: let's make the features identical between train and test-set
# by passing Dfm_train to dfm_match() as a pattern.
# After this step, we can "predict" by employing only the features included in the training-set
#####################################################

setequal(featnames(Dfm_train), featnames(Dfm_test))
length(Dfm_test@Dimnames$features)
length(Dfm_train@Dimnames$features)
test_dfm <- dfm_match(Dfm_test, features = featnames(Dfm_train))
length(test_dfm@Dimnames$features)
setequal(featnames(Dfm_train), featnames(test_dfm))

#####################################################
# FOURTH STEP: let's convert the two DfMs into matrices for the ML algorithms to work
#####################################################

train <- as.matrix(Dfm_train)
test <- as.matrix(test_dfm)

#####################################################
# FIFTH STEP: let's estimate a ML model
#####################################################

#####################################################
# let's run a Random Forest
#####################################################

# we will use the randomForest package. If you are employing compressed (sparse) matrices (not our case here), please use the ranger package.
# The ranger package is also usually faster than the randomForest one.
# Note that as hyperparameter (or tuning-parameter) I select a specific value for the number of trees. A RF also has other hyperparameters.
# More on this next week!

set.seed(123)  # define a set.seed to be able to replicate the results!
system.time(RF <- randomForest(y = as.factor(Dfm_train@docvars$choose_one), x = train,
                               importance = TRUE, do.trace = TRUE, ntree = 500))
RF

# As discussed, a natural benefit of the bootstrap resampling process is that random forests have an out-of-bag (OOB) sample
# that provides a reasonable approximation of the test error. This gives you a built-in validation set without any extra work on your part.
# However, REMEMBER: this is less thorough than tuning several hyperparameters via k-fold cross-validation, as we will see later!

str(RF$err.rate)
RF$err.rate[nrow(RF$err.rate),]  # our final error (overall, and for each of the two classes: irrelevant and relevant tweets)

plot(RF, type = "l", col = c("red", "black", "springgreen4"),
     main = "Class errors in Random Forest Model / OOB")
legend("topright", horiz = F, cex = 0.7,
       fill = c("black", "red", "springgreen4"),
       c("No-disaster tweets", "Average error", "Disaster tweets"))

error <- as.data.frame(RF$err.rate)
# number of trees with the lowest OOB error
which.min(error$OOB)  # 123

set.seed(123)
system.time(RF2 <- randomForest(y = as.factor(Dfm_train@docvars$choose_one), x = train,
                                importance = TRUE, ntree = 123, do.trace = TRUE))
RF$err.rate[nrow(RF$err.rate),]   # our final error (overall, and for each of the two classes)
RF2$err.rate[nrow(RF2$err.rate),] # our final error (overall, and for each of the two classes)

# Now let's do some Global Interpretation for our RF!
# What about the importance of each feature for our trained model?
# Variable importance is a standard metric in machine learning: it indicates the amount of information a variable provides
# to the model for predicting the outcome

head(RF$importance[,3:4])  # let's grab the results
varImpPlot(RF)

# Each feature's importance is assessed based on two criteria (see also the small sketch right after this list):
# -MeanDecreaseAccuracy: gives a rough estimate of the loss in prediction performance when the information of that particular variable
#  is removed from the training set. Caveat: if two variables are somewhat redundant, then removing one of them may not lead to a massive
#  drop in prediction performance, but it would make the second variable look more important.
# -MeanDecreaseGini: GINI is a measure of node impurity. Think of it like this: if you use this feature to split the data,
#  how pure will the resulting nodes be? Highest purity means that each node contains only elements of a single class.
#  The larger the total decrease in GINI produced by the splits on a feature, the more important that feature is to split the data correctly.
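# To get an intuition for "node impurity", here is the Gini impurity formula (1 minus the sum of the squared
# class proportions) computed by hand for two hypothetical nodes (just an illustration, not part of the model above):
gini <- function(p) 1 - sum(p^2)
gini(c(0.5, 0.5))  # 0.5  -> maximally impure node: the two classes are perfectly mixed
gini(c(0.9, 0.1))  # 0.18 -> much purer node: almost all tweets in the node belong to one class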
# let's extract the matrix for GINI and Accuracy
importance_RF <- as.data.frame(RF$importance[,3:4])
str(importance_RF)
importance_RF$Feature <- row.names(importance_RF)
str(importance_RF)

# same words we get with varImpPlot(RF)
print(head(importance_RF[order(importance_RF$MeanDecreaseGini, decreasing=TRUE),]))

# Note that these measures are used to rank variables in terms of importance and, thus, their absolute values can be disregarded.
# As a result, we just get, for example, the GINI statistics overall, without differentiating the words most important for specific classes.
# Which are, however, the most important words for the relevant label? And for the irrelevant one?

# let's predict the training-set texts and let's store a new variable with such predictions in the docvars of our training-set dfm.
# Why that? Because we will use such a variable to identify the "sign" of the most important features
predicted_rf <- predict(RF, train, type="class")
table(predicted_rf)
Dfm_train@docvars$predRF <- predicted_rf
str(Dfm_train@docvars)

# let's assign a feature to the relevant/irrelevant (0/1) class according to its frequency (if, for example, the word "collapse" appears
# 10 times among predicted relevant tweets and 5 times among predicted irrelevant tweets,
# we classify it as pertaining to the "relevant tweets" world; and so on)

# let's estimate the number of times a word appears in those tweets that we have classified as related to accidents or otherwise
sums <- list()
for (v in 0:1){
  sums[[v+1]] <- colSums(train[Dfm_train@docvars[,"predRF"]==v,])
}
sums <- do.call(cbind, sums)
sums

# let's assign the sign (either 1 or 2) according to whether a word appears more often in tweets classified as not related to accidents (1) or related to them (2)
sign <- apply(sums, 1, which.max)

# get the feature names
names <- dimnames(train)[[2]]
str(names)

df <- data.frame(
  Feature = names,
  sign = sign - 1,  # let's recode the sign between 0 and 1
  stringsAsFactors = FALSE)
str(df)

importanceRF <- merge(importance_RF, df, by="Feature")
str(importanceRF)

## best predictors
for (v in 0:1){
  cat("\n\n")
  cat("value==", v)
  importanceRF <- importanceRF[order(importanceRF$MeanDecreaseGini, decreasing=TRUE),]
  print(head(importanceRF[importanceRF$sign==v,], n=10))
  cat("\n")
  cat(paste(unique(head(importanceRF$Feature[importanceRF$sign==v], n=10)), collapse=", "))
}

# let's draw a graph with our results focusing on MeanDecreaseGini
importanceRF$Gini <- ifelse(importanceRF$sign>0, importanceRF$MeanDecreaseGini, importanceRF$MeanDecreaseGini*-1)

# the top 20 relevant words
importance2RF <- top_n(importanceRF, 20, Gini)
importance2RF
# the top 20 irrelevant words
importance3RF <- top_n(importanceRF, -20, Gini)
importance3RF
importance_newRF <- rbind(importance2RF, importance3RF)
str(importance_newRF)

# reorder the features
importance_newRF <- mutate(importance_newRF, Feature = reorder(Feature, Gini))
importance_newRF$sign2 <- ifelse(importance_newRF$sign>0, "relevant", "irrelevant")
ggplot(importance_newRF, aes(Feature, Gini, fill = sign2)) +
  geom_bar(stat="identity", fill= "white") +
  theme_classic() +
  geom_col(show.legend = TRUE) +
  theme(legend.position="bottom") +
  coord_flip() +
  ylab("Mean Decrease Gini coefficient") +
  scale_fill_manual(values = c("#000033", "#006699")) +
  labs(title = "Disaster tweets",
       subtitle = "Irrelevant (-) versus Relevant (+) words - Random Forest Model",
       fill = "Disaster")

# let's store the graph
RF_graph <- ggplot(importance_newRF, aes(Feature, Gini, fill = sign2)) +
  geom_bar(stat="identity", fill= "white") +
  theme_classic() +
  geom_col(show.legend = TRUE) +
  theme(legend.position="bottom") +
  coord_flip() +
  ylab("Mean Decrease Gini coefficient") +
  scale_fill_manual(values = c("#000033", "#006699")) +
  labs(title = "Disaster tweets",
       subtitle = "Irrelevant (-) versus Relevant (+) words - Random Forest Model",
       fill = "Disaster")

# let's FINALLY predict the test-set
system.time(predicted_rf <- predict(RF, test, type="class"))
table(predicted_rf)
prop.table(table(predicted_rf))

#####################################################
# let's run a SVM
#####################################################

# note that here I select a linear kernel (in my experience a linear kernel does fine with text data)
# and, as hyperparameter, a specific value for the cost C (=1). A SVM also has other hyperparameters. More on this later on
system.time(SV <- svm(y = as.factor(Dfm_train@docvars$choose_one), x = train, kernel = 'linear', cost = 1))

# how many support vectors?
length(SV$index)
nrow(train)  # 244 out of 400 tweets in the training-set data-frame

# The coefficients that you pull out are the weights for the support vectors.
# Looking at the estimated coefficients is, however, not that informative, because they only tell us which support vectors were estimated in the model,
# i.e., which are the texts with the largest weight in our estimation
head(SV$coefs)

# let's read such texts, however
vectors <- x[SV$index,]
nrow(vectors)
str(vectors)  # you see, for example, that tweets 2 to 4 are missing because they are not support vectors!
vectors$coefs <- SV$coefs[,1]
str(vectors)
vectors <- vectors[order(vectors$coefs),]
head(vectors[,c("coefs", "text")], n=10)
vectors <- vectors[order(vectors$coefs, decreasing=TRUE),]
head(vectors[,c("coefs", "text")], n=10)

# So now let's do some more proper Global Interpretation for our SVM!
# In particular, let's now focus on identifying the most relevant features for the SVM. For doing that we will rely on a "permutational" approach.
# What do we mean by permutation? Basically, what we do is the following: you randomly permute the real values of each single feature
# in your dfm. A feature is "important" if permuting its values increases the model error, because the model relied on the feature for
# the prediction. That is: if we randomly permute the values of an important feature in the training data, the training performance
# would degrade (since permuting the values of a feature effectively destroys any relationship between that feature and the target variable).
# This gives us a nice interpretation: feature importance is the increase in model error when the feature's information is destroyed.
# A feature is "unimportant" if permuting its values leaves the model error unchanged, because the model ignored the
# feature for the prediction.
# Note that permutation feature importance does not require retraining the model. Some other methods suggest deleting a feature, retraining the model
# and then comparing the model errors. Since retraining a machine learning model can take a long time, "only" permuting a feature can save a lot
# of time. Moreover, we are interested in the feature importance of a fixed model:
# re-training with a reduced dataset creates each time a different model than the one we are interested in.
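# Before turning to the iml package below, here is a minimal hand-rolled sketch of the permutation idea just described,
# applied to the SVM we already trained (SV): shuffle one column of the training matrix, re-predict, and check how much
# the in-sample accuracy drops. The feature chosen here is simply the most frequent one, purely for illustration.
baseline_acc <- mean(predict(SV, train) == Dfm_train@docvars$choose_one)
f <- names(topfeatures(Dfm_train, 1))        # any feature would do
train_perm <- train
set.seed(123)
train_perm[, f] <- sample(train_perm[, f])   # destroy the relationship between this feature and the target
perm_acc <- mean(predict(SV, train_perm) == Dfm_train@docvars$choose_one)
baseline_acc - perm_acc  # a positive drop suggests the model was relying on this feature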
# Permuting a feature and measuring the increase in loss is not the only way to measure the importance of a feature.
# An alternative to permutation feature importance are variance-based measures. Here you focus not on the error of the model,
# but on how much the model's output varies for a feature, without considering what that means for performance.
# This definition of importance differs from the loss-based definition used for permutation feature importance.
# This is evident in cases where a model has overfit.
# If a model overfits and uses a feature that is unrelated to the output, then permutation feature importance
# would assign an importance of zero, because this feature does not contribute to producing correct predictions.
# A variance-based importance measure, on the other hand, might assign the feature high importance, as the prediction can change a lot
# when the feature is changed. Model variance (explained by the features) and feature importance correlate strongly
# when the model generalizes well (i.e. it does not overfit). If you are interested in variance-based measures, you can take a look
# at the shapper and fastshap packages.

# For running a permutation, you first need to convert the DfM into a data frame, not into a matrix (the iml package requires that!)
# Moreover, you also need to pay attention to your DV if it is a string. For example, if you have two class labels such as "relevant" and "not-relevant",
# you need to change the latter to "nonrelevant" (i.e., no space, "-" or anything else) before transforming it into a factor.
# Otherwise the iml package will crash. I discovered it once, after spending 2 hours on iml!!!
# Not our problem here!
as.factor(Dfm_train@docvars$choose_one)

train2 <- convert(Dfm_train, to = "data.frame")

# iml does not want features that begin with @, #, etc.
# the command make.names allows us to automatically place the character "X" at the beginning of a feature in such cases,
# while all "invalid" characters are translated to "."
# By adding unique=TRUE, the two words @UKIP and #UKIP will be treated differently!
str(train2)
colnames(train2) <- make.names(colnames(train2), unique = TRUE)
colnames(train2)
str(train2)

# let's rerun the SVM with the data frame (NOT with the original matrix)
svm_model <- svm(as.factor(Dfm_train@docvars$choose_one) ~ ., data = train2[-1], kernel = 'linear', cost = 1)
# why train2[-1]? Because the first column in train2 refers to doc_id, not to a feature!
head(colnames(train2))

# let's create an object that holds the model and the data
mod <- Predictor$new(svm_model, data = train2[-1], y = as.factor(Dfm_train@docvars$choose_one), type = "prob")

# plan("callr", workers = 6) allows you to run the permutation by taking advantage of 6 cores in your computer.
# If you have 4 cores write workers=4, etc.
# This is so-called "parallelization". It means that the computation per feature is distributed among several cores of your machine.
# This can save you LOTS of time!
system.time({
  plan("callr", workers = 6)
  set.seed(123)
  imp2 <- FeatureImp$new(mod, loss = "ce", n.repetitions = 1)
})  # around 25 secs on my laptop

# any feature with a value > 1 is important for the model.
# Why larger than 1? Because feature importance is estimated as a ratio (that's the default in iml).
# If you get a value > 1 it means that the error increases if a given feature is permuted, and therefore that
# feature is important! As an alternative, you can also estimate feature importance as a difference.
# You can also repeat the analysis (i.e. the shuffling of the feature) X times (for example: n.repetitions=5).
# The higher the number of repetitions, the more stable and accurate the results become (plus: you will get a confidence interval as well!)
# However, in our case, to save time we just asked for 1 repetition (see the sketch below for the alternatives).
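# A sketch of the two variations just mentioned (slower, so left commented out): importance computed as a *difference*
# in error rather than a ratio, and 5 repetitions of the shuffling so that you also get a confidence band around
# each estimate. This assumes the "compare" argument of iml's FeatureImp (check ?FeatureImp in your iml version).
# system.time({
#   plan("callr", workers = 6)
#   set.seed(123)
#   imp_diff <- FeatureImp$new(mod, loss = "ce", compare = "difference", n.repetitions = 5)
# })
# plot(imp_diff)  # features with a difference > 0 increase the error when permuted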
imp2
res <- imp2$results[,c(1,3)]
str(res)

##########################################
# OK, we know which features are important now. But what about their relationship with the class-labels?
# let's estimate that for both class labels
##########################################

# let's replicate make.names here so that both train and train2 present the same list of features!
colnames(train) <- make.names(colnames(train))

# let's replicate exactly what we already did in the RF case.
# As in the Random Forest example, let's predict the training-set texts and let's store a new variable with such predictions
# in the docvars of our training-set dfm
predicted_sv <- predict(svm_model, train, type="class")
table(predicted_sv)
Dfm_train@docvars$predSV <- predicted_sv
str(Dfm_train@docvars)

sums <- list()
for (v in 0:1){
  sums[[v+1]] <- colSums(train[Dfm_train@docvars[,"predSV"]==v,])
}
sums <- do.call(cbind, sums)
sign <- apply(sums, 1, which.max)

# get the feature names
names <- dimnames(train)[[2]]
str(names)

df <- data.frame(
  feature = names,
  sign = sign - 1,
  stringsAsFactors = FALSE)
str(df)
str(res)

importance <- merge(res, df, by="feature")
str(importance)

## best predictors
for (v in 0:1){
  cat("\n\n")
  cat("value==", v)
  importance <- importance[order(importance$importance, decreasing=TRUE),]
  print(head(importance[importance$sign==v,], n=10))
  cat("\n")
  cat(paste(unique(head(importance$feature[importance$sign==v], n=10)), collapse=", "))
}

str(importance)
importanceSV <- importance

# let's draw a graph with our results!
importance$perm <- ifelse(importance$sign>0, importance$importance, importance$importance*-1)
str(importance)

# the top 10 relevant words
importance2 <- top_n(importance, 10, perm)
importance2
# the top 10 irrelevant words
importance3 <- top_n(importance, -10, perm)
importance3
importance_new <- rbind(importance2, importance3)
str(importance_new)

# reorder the features
importance_new <- mutate(importance_new, feature = reorder(feature, perm))
importance_new$sign2 <- ifelse(importance_new$sign>0, "Relevant", "Not Relevant")

ggplot(importance_new, aes(feature, perm, fill = sign2)) +
  geom_bar(stat="identity", fill= "white") +
  theme_classic() +
  geom_col(show.legend = TRUE) +
  theme(legend.position="bottom") +
  coord_flip() +
  ylab("Permutation Coefficient") +
  scale_fill_manual(values = c("#000033", "#006699")) +
  labs(title = "Disaster tweets",
       subtitle = "Irrelevant (-) versus Relevant (+) words - SVM Model",
       fill = "Disaster")

# let's store the graph
SVM_graph <- ggplot(importance_new, aes(feature, perm, fill = sign2)) +
  geom_bar(stat="identity", fill= "white") +
  theme_classic() +
  geom_col(show.legend = TRUE) +
  theme(legend.position="bottom") +
  coord_flip() +
  ylab("Permutation Coefficient") +
  scale_fill_manual(values = c("#000033", "#006699")) +
  labs(title = "Disaster tweets",
       subtitle = "Irrelevant (-) versus Relevant (+) words - SVM Model",
       fill = "Disaster")

# let's FINALLY predict the test-set
system.time(predicted_svm <- predict(SV, test))
table(predicted_svm)
prop.table(table(predicted_svm))

######################################################
######################################################
# Let's compare the out-of-sample results we got via SVM & RF
######################################################
######################################################

prop.table(table(predicted_svm))
prop.table(table(predicted_rf))

results <- as.data.frame(rbind(prop.table(table(predicted_rf)), prop.table(table(predicted_svm))))
str(results)
results$algorithm <- c("RF", "SVM")
str(results)

# Let's plot the results!
library(reshape2)
df.long <- melt(results, id.vars=c("algorithm"))
str(df.long)

ggplot(df.long, aes(algorithm, value, fill = variable)) +
  geom_bar(position="dodge", stat="identity") +
  theme(axis.text.x = element_text(color="#993333", size=10, angle=90)) +
  coord_flip() +
  ylab(label="Tweet class in the test-set") +
  xlab("algorithm") +
  scale_fill_discrete(name = "Prediction", labels = c("Irrelevant", "Relevant"))

# Similar, but not exactly the same, results for the prediction!
# And so, which result should we trust more? To answer that, we need to introduce the Cross-Validation procedure...

#################################################
# let's compare feature importance across models
#################################################
library(gridExtra)
grid.arrange(RF_graph, SVM_graph, nrow=2)  # plot everything together

# let's check some correlation
str(importanceRF)
str(importanceSV)
names(importanceSV)[1] <- "Feature"
str(importanceSV)

# let's merge the two data frames.
# Here the total number of words decreases, given that for the SVM words you have applied (remember!) the make.names command.
# Therefore we are not comparing the results of the two models on an exactly equal footing
words <- merge(importanceRF, importanceSV, by="Feature")
str(words)

library(PerformanceAnalytics)
chart.Correlation(words[c(3,6)])
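# A minimal preview of the k-fold cross-validation idea mentioned above (covered properly in the next session).
# This is just a sketch, assuming caret's train() interface, with 5 folds, a single value of mtry and few trees,
# simply to keep the runtime low.
set.seed(123)
cv_ctrl <- trainControl(method = "cv", number = 5)
rf_cv <- caret::train(x = train, y = as.factor(Dfm_train@docvars$choose_one),
                      method = "rf", trControl = cv_ctrl,
                      tuneGrid = data.frame(mtry = floor(sqrt(ncol(train)))),
                      ntree = 100)
rf_cv  # the cross-validated accuracy gives a more honest estimate than in-sample predictions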