rm(list=ls(all=TRUE))
setwd("C:/Users/luigi/Dropbox/TOPIC MODEL")
getwd()
library(quanteda)
library(readtext)
library(caTools)
library(e1071)
library(randomForest)
library(caret)
library(naivebayes)
library(car)
library(ggplot2)
library(dplyr)
library(reshape2)
library(iml)
library(future)
library(future.callr)
library(gridExtra)

# The data we will be using are English social media disaster tweets discussed in
# this article: https://arxiv.org/pdf/1705.02009.pdf
# The dataset consists of a number of tweets about accidents mixed in with a selection of control tweets (not about accidents)

#####################################################
# FIRST STEP: let's create the DfM for the training-set
#####################################################
x <- read.csv("train_disaster.csv", stringsAsFactors=FALSE)
str(x)
# class-label variable: choose_one (0=tweets not relevant; 1=relevant tweets)
table(x$choose_one)
prop.table(table(x$choose_one))
nrow(x)
myCorpusTwitterTrain <- corpus(x)
tok2 <- tokens(myCorpusTwitterTrain, remove_punct = TRUE, remove_numbers=TRUE,
               remove_symbols = TRUE, split_hyphens = TRUE, remove_separators = TRUE)
tok2 <- tokens_remove(tok2, stopwords("en"))
# let's also remove the unicode symbols
tok2 <- tokens_remove(tok2, c("0*"))
tok2 <- tokens_wordstem(tok2)
Dfm_train <- dfm(tok2)
# Let's trim the dfm in order to keep only tokens that appear in 2 or more tweets (tweets are short texts!)
# and let's keep only features with at least 2 characters
Dfm_train <- dfm_trim(Dfm_train, min_docfreq = 2, verbose=TRUE)
Dfm_train <- dfm_remove(Dfm_train, min_nchar = 2)
topfeatures(Dfm_train, 20) # 20 top words

#####################################################
# SECOND STEP: let's create the DfM for the test-set
#####################################################
x10 <- read.csv("test_disaster.csv", stringsAsFactors=FALSE)
str(x10)
nrow(x10)
myCorpusTwitterTest <- corpus(x10)
tok <- tokens(myCorpusTwitterTest, remove_punct = TRUE, remove_numbers=TRUE,
              remove_symbols = TRUE, split_hyphens = TRUE, remove_separators = TRUE)
tok <- tokens_remove(tok, stopwords("en"))
tok <- tokens_remove(tok, c("0*"))
tok <- tokens_wordstem(tok)
Dfm_test <- dfm(tok)
Dfm_test <- dfm_trim(Dfm_test, min_docfreq = 2, verbose=TRUE)
Dfm_test <- dfm_remove(Dfm_test, min_nchar = 2)
topfeatures(Dfm_test, 20) # 20 top words

#####################################################
# THIRD STEP: let's make the features identical between train and test-set by passing Dfm_train to dfm_match() as a pattern.
# After this step, we can "predict" by employing only the features included in the training-set
#####################################################
setequal(featnames(Dfm_train), featnames(Dfm_test))
length(Dfm_test@Dimnames$features)
length(Dfm_train@Dimnames$features)
test_dfm <- dfm_match(Dfm_test, features = featnames(Dfm_train))
length(test_dfm@Dimnames$features)
setequal(featnames(Dfm_train), featnames(test_dfm))

#####################################################
# FOURTH STEP: let's convert the two DfMs into matrices for the ML algorithms to work
#####################################################
train <- as.matrix(Dfm_train) # dense matrix
object.size(train)
# a compressed sparse matrix!
trainSP <- as(Dfm_train, "dgCMatrix") # compressed matrix
object.size(trainSP)
object.size(train)/object.size(trainSP)
str(train)
str(trainSP)
# this is going to make a HUGE difference when you have a large matrix as test and train!
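# A quick hedged check of why the compressed format is so much smaller: the vast majority of cells
# in a text DfM are zeros. A minimal sketch computed from the dense matrix we just created
# (quanteda's sparsity() applied to Dfm_train should give the same figure):
sum(train == 0) / length(train) # share of zero entries in the training matrix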
# we will mainly use a normal (i.e., not compressed) matrix in our class, given that we will always use small datasets.
# But keep this in mind!
test <- as.matrix(test_dfm)

#####################################################
# FIFTH STEP: let's estimate a ML Model
#####################################################

#####################################################
# Let's start with a Naive Bayes Model
#####################################################
# we will use the naivebayes package. Another possible package you can consider is the fastNaiveBayes package
# given our training-set, we have to use a Multinomial rather than a Bernoulli (binomial) distribution, given that our
# features can take values other than just 0/1, i.e., a one-hot encoding. Indeed:
table(Dfm_train@x)
# to run a Bernoulli model with naivebayes just replace "multinomial_naive_bayes" with "bernoulli_naive_bayes" in the command below
# Which is our DV?
str(Dfm_train@docvars$choose_one)
# that's an integer variable. Not good for a classification! It should always be a factor variable
# So we will use as.factor to transform it into a factor variable
# Note that instead of y=as.factor(Dfm_train@docvars$choose_one) in the formula of the NB we could have used
# y=as.factor(x$choose_one) of course!
system.time(NB <- multinomial_naive_bayes(x=train, y=as.factor(Dfm_train@docvars$choose_one), laplace = 1))
summary(NB)
prop.table(table(Dfm_train@docvars$choose_one)) # prior probabilities
# let's see the association between words and probabilities (i.e., the matrix with the class-conditional parameter estimates).
# take a look at "casualti" and "cream". The former increases the probability of a relevant tweet (=1) compared to the latter one (=0)
head(NB$params)
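# A hedged sanity check on where these numbers come from, assuming the package uses the standard
# Laplace-smoothed multinomial estimator, i.e.
# p(feature j | class c) = (count of j in class c + 1) / (total feature count in class c + number of features):
f <- colnames(train)[1] # pick any feature you like
counts_1 <- colSums(train[Dfm_train@docvars$choose_one == 1, , drop = FALSE])
(counts_1[f] + 1) / (sum(counts_1) + ncol(train)) # manual estimate for the "relevant" class
NB$params[f, 2] # should (roughly) match the class=1 column of NB$params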
# Let's investigate this issue a bit more. Why should we care about it?
# Often, ML models are considered "black boxes" due to their complex inner workings. However, because of their complexity,
# they are typically more accurate for predicting nonlinear or rare phenomena. Unfortunately, more accuracy often comes at the
# expense of interpretability, and interpretability is crucial. It is not enough to identify a ML model that optimizes
# predictive performance; understanding and trusting model results is a hallmark of good (social and political) science!
# Luckily, several advancements have been made over the years to aid in interpreting ML models.
# Interpreting ML models is an emerging field that has become known as "Interpretable Machine Learning" (IML).
# Approaches to model interpretability can be broadly categorized as providing global or local explanations.
# Global interpretations help us understand the inputs and their entire modeled relationship with the prediction target.
# Local interpretations help us understand model predictions for a single row of data or a group of similar rows.
# Global interpretability in particular is about understanding how the model makes predictions, based on a holistic view of
# its features and how they influence the underlying model structure.
# It answers questions regarding which features are relatively influential, as well as how these features influence the response variable.
# In this latter case, we assess the magnitude of a variable's relationship with the response as compared to other variables used in the model.
# In text analytics (given that we are dealing with huge DfMs wherein the features per se are not the main focus of our interest)
# we are mainly interested in global interpretations; however, if you use ML for other aims, local interpretations become VERY important!!!

# So let's do some Global Interpretation for our NB!
NB_prob <- as.data.frame(NB$params)
NB_prob$Feature <- row.names(NB_prob)
str(NB_prob)
# let's estimate the features that most change the difference between the relevant and irrelevant conditional probabilities
NB_prob$diff <- NB_prob[,2]-NB_prob[,1]
str(NB_prob)
# the features that present the highest absolute value in this difference can also be considered as the most important in
# affecting the overall performance of the algorithm
NB_prob$imp <- abs(NB_prob[,2]-NB_prob[,1])
str(NB_prob)
print(head(NB_prob[order(NB_prob$imp, decreasing=TRUE),], 15)) # most relevant words overall
# clearly, the features that present the highest positive values according to NB_prob$diff are the words
# most relevant for the "disaster" tweets (=1); those with the highest negative values are the words
# most relevant for the "irrelevant" tweets (=0)
print(head(NB_prob[order(NB_prob$diff, decreasing=TRUE),], 15)) # most relevant words for "disaster" tweets
print(head(NB_prob[order(NB_prob$diff, decreasing=FALSE),], 15)) # most relevant words for the other (irrelevant) tweets
NB_prob$sign <- ifelse(NB_prob$diff>0,"relevant","irrelevant")
str(NB_prob)
# let's extract the 20 most relevant and the 20 most irrelevant contributing features
NB_prob2 <- top_n(NB_prob, 20, diff)
NB_prob2
NB_prob3 <- top_n(NB_prob, -20, diff)
NB_prob3
NB_prob_new <- rbind(NB_prob2, NB_prob3)
# reorder the features
NB_prob_new <- mutate(NB_prob_new, Feature= reorder(Feature, diff))
ggplot(NB_prob_new, aes(Feature, diff, fill = sign)) +
  geom_bar(stat="identity", fill= "white") +
  theme_classic()+
  geom_col(show.legend = T) +
  theme(legend.position="bottom")+
  coord_flip() +
  ylab("Difference in the conditional probabilities") +
  scale_fill_manual(values = c("#000033", "#006699")) +
  labs(title = "Disaster tweets",
       subtitle = "Irrelevant (-) versus Relevant (+) words - Naive Bayes Model",
       fill = "Disaster")
# let's store the graph
NB_graph <- ggplot(NB_prob_new, aes(Feature, diff, fill = sign)) +
  geom_bar(stat="identity", fill= "white") +
  theme_classic()+
  geom_col(show.legend = T) +
  theme(legend.position="bottom")+
  coord_flip() +
  ylab("Difference in the conditional probabilities") +
  scale_fill_manual(values = c("#000033", "#006699")) +
  labs(title = "Disaster tweets",
       subtitle = "Irrelevant (-) versus Relevant (+) words - Naive Bayes Model",
       fill = "Disaster")
# let's FINALLY predict the test-set
predicted_nb <- predict(NB, test)
table(predicted_nb)
prop.table(table(predicted_nb))
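# A small hedged extra: besides the hard class labels, the naivebayes predict() method can also
# return posterior probabilities (type = "prob"), which is useful to see how confident the model
# is about each test tweet. A minimal sketch:
nb_post <- predict(NB, test, type = "prob")
head(nb_post) # posterior probabilities for class 0 and class 1
summary(nb_post[, 2]) # distribution of the probability of being a "relevant" tweet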
#####################################################
# let's run a Random Forest
#####################################################
# we will use the randomForest package. If you are employing compressed matrices, please use the ranger package.
# the ranger package is also usually faster than the randomForest one
# note that as a hyperparameter (or tuning parameter), I select a specific value for the number of trees. A RF also has other hyperparameters.
# More on this next week!
set.seed(123) # define a set.seed to be able to replicate the results!
system.time(RF <- randomForest(y= as.factor(Dfm_train@docvars$choose_one), x=train,
                               importance=TRUE, do.trace=TRUE, ntree=500))
# As discussed yesterday, a natural benefit of the bootstrap resampling process is that random forests have an out-of-bag (OOB) sample
# that provides a reasonable approximation of the test error. This provides a built-in validation set without any extra work on your part.
# However, REMEMBER: this is less efficient than tuning several hyperparameters and doing a k-fold cross-validation, as we will see in the next weeks!
str(RF$err.rate)
RF$err.rate[nrow(RF$err.rate),] # our final error (overall, and for each of the two classes: irrelevant and relevant tweets)
plot(RF) # the errors for the two classes (red=irrelevant; green=relevant: we make a much greater error for the latter class); in black the average error
error <- as.data.frame(RF$err.rate)
# number of trees with the lowest OOB error
which.min(error$OOB) # 441
set.seed(123)
system.time(RF2 <- randomForest(y= as.factor(Dfm_train@docvars$choose_one), x=train,
                                importance=TRUE, ntree=441, do.trace=TRUE))
RF$err.rate[nrow(RF$err.rate),] # our final error (overall, and for each of the two classes)
RF2$err.rate[nrow(RF2$err.rate),] # our final error (overall, and for each of the two classes)
# Now let's do some Global Interpretation for our RF!
# What about the importance of each feature for our trained model?
# Variable importance is a standard metric in machine learning, which indicates the amount of information a variable provides
# to the model for predicting the outcome
head(RF$importance[,3:4])
# let's grab the results
varImpPlot(RF)
# Each feature's importance is assessed based on two criteria:
# -MeanDecreaseAccuracy: gives a rough estimate of the loss in prediction performance when that particular variable is omitted from the training set.
#  Caveat: if two variables are somewhat redundant, then omitting one of them may not lead to massive gains in prediction performance,
#  but would make the second variable more important.
# -MeanDecreaseGini: GINI is a measure of node impurity. Think of it like this: if you use this feature to split the data, how pure will the nodes be?
#  Highest purity means that each node contains only elements of a single class.
#  Assessing the decrease in GINI when that feature is omitted leads to an understanding of how important that feature is to split the data correctly.
# Please note that these measures are used to rank variables in terms of importance and, thus, their absolute values can be disregarded.
# The problem is that we just get, for example, the GINI statistic overall, w/o differentiating the words most important for specific classes.
# Which are, however, the most important words for the relevant label? And for the irrelevant one?
# let's extract the matrix for GINI and Accuracy
importance_RF <- as.data.frame(RF$importance[,3:4])
str(importance_RF)
importance_RF$Feature <- row.names(importance_RF)
str(importance_RF)
# same words we get with varImpPlot(RF)
print(head(importance_RF[order(importance_RF$MeanDecreaseGini, decreasing=TRUE),]))
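# Before turning to the "sign" of the features, a brief hedged aside on the OOB idea discussed above:
# the randomForest object stores its out-of-bag predictions in RF$predicted, so we can inspect a full
# OOB confusion matrix (caret is already loaded) rather than just the error rates.
confusionMatrix(RF$predicted, as.factor(Dfm_train@docvars$choose_one))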
# let's predict the training-set texts and store a new variable in our training-set dfm with such predictions.
# Why that? Because we will use this variable to identify the "sign" of the most important features
predicted_rf <- predict(RF, train, type="class")
table(predicted_rf)
Dfm_train@docvars$predRF <- predicted_rf
str(Dfm_train@docvars)
# let's assign a feature to the relevant/irrelevant class according to its frequency (if, for example, the word "collapse" appears
# 10 times among predicted relevant tweets and 5 times among predicted irrelevant tweets,
# we classify it as pertaining to the "relevant tweets" world; and so on)
# let's estimate the number of times a word appears in those tweets that we have classified as related to accidents or otherwise
sums <- list()
for (v in 0:1){
  sums[[v+1]] <- colSums(train[Dfm_train@docvars[,"predRF"]==v,])
}
sums <- do.call(cbind, sums)
sums
# let's apply the sign (either 1 or 2) according to whether a word appears more in the tweets classified as not related to accidents or related to them
sign <- apply(sums, 1, which.max)
# get the feature
names <- dimnames(train)[[2]]
str(names)
df <- data.frame(
  Feature = names,
  sign = sign-1, # let's recode the sign between 0 and 1
  stringsAsFactors=F)
str(df)
importanceRF <- merge(importance_RF, df, by="Feature")
str(importanceRF)
## best predictors
for (v in 0:1){
  cat("\n\n")
  cat("value==", v)
  importanceRF <- importanceRF[order(importanceRF$MeanDecreaseGini, decreasing=TRUE),]
  print(head(importanceRF[importanceRF$sign==v,], n=10))
  cat("\n")
  cat(paste(unique(head(importanceRF$Feature[importanceRF$sign==v], n=10)), collapse=", "))
}
# let's draw a graph with our results focusing on MeanDecreaseGini
importanceRF$Gini <- ifelse(importanceRF$sign>0, importanceRF$MeanDecreaseGini, importanceRF$MeanDecreaseGini*-1)
# the top 20 relevant words
importance2RF <- top_n(importanceRF, 20, Gini)
importance2RF
# the top 20 irrelevant words
importance3RF <- top_n(importanceRF, -20, Gini)
importance3RF
importance_newRF <- rbind(importance2RF, importance3RF)
str(importance_newRF)
# reorder the features
importance_newRF <- mutate(importance_newRF, Feature= reorder(Feature, Gini))
importance_newRF$sign2 <- ifelse(importance_newRF$sign>0,"relevant","irrelevant")
ggplot(importance_newRF, aes(Feature, Gini, fill = sign2)) +
  geom_bar(stat="identity", fill= "white") +
  theme_classic()+
  geom_col(show.legend = T) +
  theme(legend.position="bottom")+
  coord_flip() +
  ylab("Mean Decrease Gini coefficient") +
  scale_fill_manual(values = c("#000033", "#006699")) +
  labs(title = "Disaster tweets",
       subtitle = "Irrelevant (-) versus Relevant (+) words - Random Forest Model",
       fill = "Disaster")
# let's store the graph
RF_graph <- ggplot(importance_newRF, aes(Feature, Gini, fill = sign2)) +
  geom_bar(stat="identity", fill= "white") +
  theme_classic()+
  geom_col(show.legend = T) +
  theme(legend.position="bottom")+
  coord_flip() +
  ylab("Mean Decrease Gini coefficient") +
  scale_fill_manual(values = c("#000033", "#006699")) +
  labs(title = "Disaster tweets",
       subtitle = "Irrelevant (-) versus Relevant (+) words - Random Forest Model",
       fill = "Disaster")
# let's FINALLY predict the test-set
system.time(predicted_rf <- predict(RF, test, type="class"))
table(predicted_rf)
prop.table(table(predicted_rf))
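# As with Naive Bayes above, we can also ask the forest for class probabilities rather than hard labels
# (a hedged sketch: randomForest's predict() supports type = "prob", where each value is the share of
# trees voting for that class).
rf_post <- predict(RF, test, type = "prob")
head(rf_post) # per-tweet probabilities for class 0 and class 1
summary(rf_post[, 2]) # distribution of the probability of being a "relevant" tweet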
#####################################################
# let's run a SVM
#####################################################
# note that here I select a linear kernel (in my experience a linear kernel does fine with text data)
# and, as a hyperparameter, a specific value for the cost C(=1). A SVM also has other hyperparameters.
# More on this later on
system.time(SV <- svm(y= as.factor(Dfm_train@docvars$choose_one), x=train, kernel='linear', cost = 1))
# how many support vectors?
length(SV$index)
nrow(train) # 243 out of 400 tweets in the training-set data frame
# The coefficients that you're pulling out are the weights for the support vectors.
# Looking at the estimated coefficients is, however, not very informative, because they only tell us which support vectors were estimated in the model
head(SV$coefs)
# So now let's do some more proper Global Interpretation for our SVM!
# In particular, let's now focus on identifying the most relevant features for the SVM. To do that we will rely on a "permutation" approach.
# What do we mean by permutation? Basically, what we do is the following: you randomly permute the real values of each single feature
# in your dfm. A feature is "important" if permuting its values increases the model error, because the model relied on the feature for
# the prediction. That is: if we randomly permute the values of an important feature in the training data, the training performance
# would degrade (since permuting the values of a feature effectively destroys any relationship between that feature and the target variable).
# A feature is "unimportant" if permuting its values keeps the model error unchanged, because the model ignored the
# feature for the prediction.
# For running a permutation, you first need to convert the DfM into a data frame, not into a matrix (the iml package requires that)
train2 <- convert(Dfm_train, to = "data.frame")
# iml does not want features that begin with @, #, etc.
# the command make.names creates syntactically valid feature names
# a valid name consists of letters, numbers and the dot or underscore characters, and starts with a letter or a dot not followed
# by a number. Names such as ".2way" are not valid, for example
# The character "X" is prepended if necessary. All invalid characters are translated to "."
# A missing value is translated to "NA".
# By adding unique=TRUE the two words @UKIP and #UKIP will be treated differently!
colnames(train2) <- make.names(colnames(train2), unique = TRUE)
colnames(train2)
# let's rerun the SVM with the data frame (NOT with the original matrix)
set.seed(123)
svm_model <- svm(as.factor(Dfm_train@docvars$choose_one) ~ ., data = train2[-1], kernel='linear', cost = 1)
# why train2[-1]? Because the first column in train2 refers to doc_id, not to a feature!
head(colnames(train2))
# let's create an object that holds the model and the data
mod <- Predictor$new(svm_model, data = train2[-1], y = as.factor(Dfm_train@docvars$choose_one), type = "prob")
# plan("callr", workers = 6) allows you to run the permutation by taking advantage of 6 cores in your computer
# If you have 4 cores write workers=4, etc.
# This is so-called "parallelization". It means that the computation per feature is distributed among several cores of your machine.
# This can save you LOTS of time!
system.time({
  plan("callr", workers = 6)
  set.seed(123)
  imp2 <- FeatureImp$new(mod, loss = "ce", n.repetitions=1)
}) # around 25 secs on my laptop
# any feature with a value > 1 is important for the model.
# Why larger than 1? Because feature importance is estimated as a ratio (that's the default in iml).
# If you get a value > 1 it means that the error increases if a given feature is permuted, therefore that
# feature is important! As an alternative you can also estimate feature importance as a difference.
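# To build some intuition, here is a minimal do-it-yourself sketch of the same idea for a single
# feature (an assumption-laden illustration, not the iml implementation): shuffle one column of the
# training data, re-predict, and compare the classification error before and after.
y_true <- as.factor(Dfm_train@docvars$choose_one)
base_err <- mean(predict(svm_model, train2[-1]) != y_true) # error on the original training data
f <- colnames(train2)[2] # pick one feature (column 1 of train2 is doc_id)
train2_perm <- train2[-1]
set.seed(123)
train2_perm[[f]] <- sample(train2_perm[[f]]) # permute just that one column
perm_err <- mean(predict(svm_model, train2_perm) != y_true) # error after the permutation
c(base_err, perm_err, ratio = perm_err/base_err) # a ratio > 1 suggests the feature mattered (only meaningful if base_err > 0)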
# You can also repeat the analysis (i.e., the shuffling of the features) X times (for example: n.repetitions=5).
# The higher the number of repetitions, the more stable and accurate the results become (plus: you will get a confidence interval as well!)
# However, in our case, to save time we just asked for 1 repetition
imp2
res <- imp2$results[,c(1,3)]
str(res)

##########################################
# OK, we now know which features are important. But what about their relationship with the class labels?
# let's estimate that for both class labels
##########################################
# let's replicate make.names here so that both train and train2 present the same list of features!
colnames(train) <- make.names(colnames(train))
# let's replicate exactly what we already did for the RF case
# As in the Random Forest example, let's predict the training-set texts and store a new variable in our training-set dfm with such predictions
predicted_sv <- predict(svm_model, train, type="class")
table(predicted_sv)
Dfm_train@docvars$predSV <- predicted_sv
str(Dfm_train@docvars)
sums <- list()
for (v in 0:1){
  sums[[v+1]] <- colSums(train[Dfm_train@docvars[,"predSV"]==v,])
}
sums <- do.call(cbind, sums)
sign <- apply(sums, 1, which.max)
# get the feature
names <- dimnames(train)[[2]]
str(names)
df <- data.frame(
  feature = names,
  sign = sign-1,
  stringsAsFactors=F)
str(df)
str(res)
importance <- merge(res, df, by="feature")
str(importance)
## best predictors
for (v in 0:1){
  cat("\n\n")
  cat("value==", v)
  importance <- importance[order(importance$importance, decreasing=TRUE),]
  print(head(importance[importance$sign==v,], n=10))
  cat("\n")
  cat(paste(unique(head(importance$feature[importance$sign==v], n=10)), collapse=", "))
}
str(importance)
importanceSV <- importance
# let's draw a graph with our results!
importance$perm <- ifelse(importance$sign>0, importance$importance, importance$importance*-1)
str(importance)
# the top 10 relevant words
importance2 <- top_n(importance, 10, perm)
importance2
# the top 10 irrelevant words
importance3 <- top_n(importance, -10, perm)
importance3
importance_new <- rbind(importance2, importance3)
str(importance_new)
# reorder the features
importance_new <- mutate(importance_new, feature= reorder(feature, perm))
importance_new$sign2 <- ifelse(importance_new$sign>0,"Relevant","Not Relevant")
ggplot(importance_new, aes(feature, perm, fill = sign2)) +
  geom_bar(stat="identity", fill= "white") +
  theme_classic()+
  geom_col(show.legend = T) +
  theme(legend.position="bottom")+
  coord_flip() +
  ylab("Permutation Coefficient") +
  scale_fill_manual(values = c("#000033", "#006699")) +
  labs(title = "Disaster tweets",
       subtitle = "Irrelevant (-) versus Relevant (+) words - SVM Model",
       fill = "Disaster")
# let's store the graph
SVM_graph <- ggplot(importance_new, aes(feature, perm, fill = sign2)) +
  geom_bar(stat="identity", fill= "white") +
  theme_classic()+
  geom_col(show.legend = T) +
  theme(legend.position="bottom")+
  coord_flip() +
  ylab("Permutation Coefficient") +
  scale_fill_manual(values = c("#000033", "#006699")) +
  labs(title = "Disaster tweets",
       subtitle = "Irrelevant (-) versus Relevant (+) words - SVM Model",
       fill = "Disaster")
# let's FINALLY predict the test-set
system.time(predicted_svm <- predict(SV, test))
table(predicted_svm)
prop.table(table(predicted_svm))

######################################################
######################################################
# Let's compare the out-of-sample results we got via Naive Bayes, SVM & RF
######################################################
######################################################
prop.table(table(predicted_nb))
prop.table(table(predicted_svm))
prop.table(table(predicted_rf))
results <- as.data.frame(rbind(prop.table(table(predicted_nb)),
                               prop.table(table(predicted_rf)),
                               prop.table(table(predicted_svm))))
str(results)
results$algorithm <- c("NB", "RF", "SVM")
str(results)
# Let's plot the results!
library(reshape2)
df.long <- melt(results, id.vars=c("algorithm"))
str(df.long)
ggplot(df.long, aes(algorithm, value, fill=variable)) +
  geom_bar(position="dodge", stat="identity") +
  theme(axis.text.x = element_text(color="#993333", size=10, angle=90)) +
  coord_flip() +
  ylab(label="Tweet class in the test-set") +
  xlab("algorithm") +
  scale_fill_discrete(name = "Prediction", labels = c("Irrelevant", "Relevant"))
# And so, which result should we trust more? To answer that, we need to introduce the cross-validation procedure... next week!

#################################################
# let's compare feature importance across models
#################################################
library(gridExtra)
grid.arrange(NB_graph, RF_graph, SVM_graph, nrow=2) # plot everything together
# let's check some correlations
str(NB_prob)
str(importanceRF)
str(importanceSV)
names(importanceSV)[1] <- "Feature"
str(importanceSV)
# let's replace "X." with the original "@" (fixed = TRUE so that only the literal "X." prefix is replaced)
importanceSV$Feature <- gsub("X.", "@", importanceSV$Feature, fixed = TRUE)
str(importanceSV)
# let's merge the data frames (w/o XGB for the moment)
words <- merge(NB_prob, importanceRF, by="Feature")
str(words)
# here the total number of words decreases, given that for the SVM features we applied (remember!) the make.names command
words <- merge(words, importanceSV, by="Feature")
str(words)
library(PerformanceAnalytics)
chart.Correlation(words[c(5, 7:8, 11)])
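# A final hedged sketch: besides comparing marginal distributions and feature-importance rankings,
# we can also check how often the three algorithms agree on the very same test tweets (simple
# agreement rates computed on the predicted labels).
mean(as.character(predicted_nb) == as.character(predicted_rf)) # NB vs RF
mean(as.character(predicted_nb) == as.character(predicted_svm)) # NB vs SVM
mean(as.character(predicted_rf) == as.character(predicted_svm)) # RF vs SVM
mean(as.character(predicted_nb) == as.character(predicted_rf) &
       as.character(predicted_rf) == as.character(predicted_svm)) # all three agree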