rm(list=ls(all=TRUE))
setwd("C:/Users/luigi/Dropbox/TOPIC MODEL")
getwd()

library(quanteda)
library(readtext)
library(caTools)
library(randomForest)
library(caret)
library(naivebayes)
library(car)
library(ggplot2)
library(dplyr)
library(reshape2)
library(gridExtra)
library(PerformanceAnalytics)
library(plotly)
library(iml)
library(future)
library(future.callr)

# The data we will be using are English social media disaster tweets discussed in
# this article: https://arxiv.org/pdf/1705.02009.pdf
# The dataset consists of a number of tweets regarding accidents mixed in with a selection of control tweets (not about accidents)

#####################################################
# FIRST STEP: let's create the DfM for the training-set
#####################################################

x <- read.csv("train_disaster.csv", stringsAsFactors=FALSE)
str(x)
# class-label variable: choose_one (0=tweets not relevant for accidents; 1=tweets relevant for accidents)
table(x$choose_one)
prop.table(table(x$choose_one))
nrow(x)

myCorpusTwitterTrain <- corpus(x)
tok2 <- tokens(myCorpusTwitterTrain, remove_punct = TRUE, remove_numbers = TRUE,
               remove_symbols = TRUE, split_hyphens = TRUE,
               remove_separators = TRUE, remove_url = TRUE)
tok2 <- tokens_remove(tok2, stopwords("en"))
# let's also remove the unicode symbols
tok2 <- tokens_remove(tok2, c("0*"))
tok2 <- tokens_wordstem(tok2)

Dfm_train <- dfm(tok2)
# Let's trim the dfm in order to keep only tokens that appear in 2 or more tweets (tweets are short texts!)
# and let's keep only features with at least 2 characters
Dfm_train <- dfm_trim(Dfm_train, min_docfreq = 2, verbose = TRUE)
Dfm_train <- dfm_remove(Dfm_train, min_nchar = 2)
topfeatures(Dfm_train, 20)  # 20 top words

#####################################################
# SECOND STEP: let's create the DfM for the test-set
#####################################################

x10 <- read.csv("test_disaster.csv", stringsAsFactors=FALSE)
str(x10)
nrow(x10)

myCorpusTwitterTest <- corpus(x10)
tok <- tokens(myCorpusTwitterTest, remove_punct = TRUE, remove_numbers = TRUE,
              remove_symbols = TRUE, split_hyphens = TRUE,
              remove_separators = TRUE, remove_url = TRUE)
tok <- tokens_remove(tok, stopwords("en"))
tok <- tokens_remove(tok, c("0*"))
tok <- tokens_wordstem(tok)

Dfm_test <- dfm(tok)
Dfm_test <- dfm_trim(Dfm_test, min_docfreq = 2, verbose = TRUE)
Dfm_test <- dfm_remove(Dfm_test, min_nchar = 2)

# Always check whether, after trimming your test-set, you are left with texts containing only 0s! Why? Because how could you predict the class of a test-set text
# whose row in the DfM contains only 0s? Such a prediction would be unreliable by definition
Dfm_test[ntoken(Dfm_test) == 0,]
# So let's delete such texts!
Dfm_test <- Dfm_test[ntoken(Dfm_test) != 0,]
Dfm_test[ntoken(Dfm_test) == 0,]
topfeatures(Dfm_test, 20)  # 20 top words

#####################################################
# THIRD STEP: let's make the features identical between training-set and test-set by passing Dfm_train to dfm_match() as a pattern.
# After this step, we can "predict" the test-set by employing only the features included in the training-set.
# Skipping dfm_match would generate a problem with, for example, the RandomForest predictions on the test-set.
# That is, without dfm_match you would get the following error:
# "Error in predict.randomForest: variables in the training data missing in newdata"
#####################################################

setequal(featnames(Dfm_train), featnames(Dfm_test))
nfeat(Dfm_test)
nfeat(Dfm_train)
test_dfm <- dfm_match(Dfm_test, features = featnames(Dfm_train))
nfeat(test_dfm)
setequal(featnames(Dfm_train), featnames(test_dfm))

#####################################################
# FOURTH STEP: let's convert the two DfMs into matrices for the ML algorithms to work
#####################################################

train <- as.matrix(Dfm_train)  # dense matrix
str(train)
object.size(train)

# Note that we could have saved the dfm as a compressed sparse matrix!
# What's the meaning of that? Take a look at the EXTRA slides on the home-page!
trainSP <- as(Dfm_train, "dgCMatrix")  # compressed matrix
object.size(trainSP)
object.size(train)/object.size(trainSP)
str(train)
str(trainSP)
# The difference between a dense and a compressed matrix makes a HUGE difference when your training and test matrices are large!!!
# We will mainly use dense (i.e., not compressed) matrices in our class, given that we will always use small datasets.
# But keep this in mind!

test <- as.matrix(test_dfm)

# As an alternative, we could have saved the dfm as a data.frame. That choice raises two problems: a) not all the packages that implement ML algorithms accept a data.frame (but almost all of them
# accept a matrix); b) when you convert a dfm into a data.frame, some packages implementing a ML algorithm (for example the package randomForest) can have problems reading those columns that
# correspond to features beginning with a number, with @ or with # (as is quite typical when dealing with social media texts).
# There are ways to deal with this problem, however (see below).

#####################################################
# FIFTH STEP: let's estimate a ML Model
#####################################################

#####################################################
#####################################################
# Let's start with a Naive Bayes Model
#####################################################
#####################################################

# We will use the naivebayes package. Another possible package you can consider is the fastNaiveBayes package
# The naivebayes package requires a matrix, not a data frame!
# Given our training-set, we have to use a Multinomial rather than a Bernoulli distribution, given that our
# features can assume values other than just 0/1 (i.e., a one-hot encoding). Indeed:
table(Dfm_train@x)
# To run a Bernoulli model with naivebayes, just replace "multinomial_naive_bayes" with "bernoulli_naive_bayes" in the command below
# (a minimal sketch of that swap is shown right before the Multinomial estimation)

# Which is our DV?
str(Dfm_train@docvars$choose_one)
# that's an integer variable. Not good for a classification! It should always be a factor (or at least a string,
# if using the naivebayes package)
# So we will use as.factor to transform it into a factor variable. As Laplace smoother, let's select 1.
# Remember, however, that this is a tuning parameter! More on this later on
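# As anticipated above, a minimal sketch of the Bernoulli alternative (an illustration only, not needed for the
# rest of the script). It assumes you first binarize the counts, e.g. via dfm_weight(scheme = "boolean");
# the object names train_bin and NB_bin are used only for this example
train_bin <- as.matrix(dfm_weight(Dfm_train, scheme = "boolean"))
NB_bin <- bernoulli_naive_bayes(x = train_bin, y = as.factor(Dfm_train@docvars$choose_one), laplace = 1)
summary(NB_bin)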
system.time(NB <- multinomial_naive_bayes(x=train, y=as.factor(Dfm_train@docvars$choose_one), laplace = 1))
summary(NB)
prop.table(table(Dfm_train@docvars$choose_one))  # prior probabilities

# let's predict the test-set
predicted_nb <- predict(NB, test)
table(predicted_nb)
prop.table(table(predicted_nb))

# so basically, to train and predict you need just two lines of commands!
# NB <- multinomial_naive_bayes(x=train, y=as.factor(Dfm_train@docvars$choose_one), laplace = 1)
# predicted_nb <- predict(NB, test)

# However, this is not the end of the story! Never blindly trust the results of your ML algorithm!
# So let's do some Global Interpretation of our NB!

# Let's see, for example, the association between words and probabilities (i.e., the matrix with the class-conditional parameter estimates - i.e., the likelihood!).
# Take a look at "casualti" and "cream". The likelihood of the former is higher for tweets discussing a disaster (i.e., Dfm_train@docvars$choose_one=1) than
# for irrelevant tweets (i.e., Dfm_train@docvars$choose_one=0), i.e., p(casualti|1) > p(casualti|0); the opposite happens for the word "cream".
head(NB$params)

# But now let's take a step forward. The package naivebayes does not come with a model-specific VI.
# Here we could either 1) focus on the likelihoods above, i.e., argue that the features with the highest absolute value
# of the difference between likelihoods can also be considered the most important ones for the overall performance of the algorithm
# (note that this can become messy when you deal with a multi-class DV; a short sketch of this option appears right before the data-frame conversion below),
# or 2) apply a permutational approach via the iml package.
# Let's follow this latter option!
# Remember: in both cases, we are assessing feature importance based on the training data
# (given that we do not have any validation data here, i.e., data that have been coded but not employed to train the algorithm).
# The procedure of randomly shuffling a single feature value breaks the relationship between the feature and the target,
# thus the drop in the model score is indicative of how much the model depends on the feature.
# REMEMBER: permutation importance does not reflect the intrinsic predictive value of a feature by itself, but how important
# this feature is for a particular model.

# For running a permutation, you first need to check your DV if it is a string. For example, if you have two class labels such as "relevant" and "not-relevant",
# you need to change the latter to "nonrelevant" (i.e., no space, "-" or anything else) before transforming it into a factor.
# Otherwise the iml package will crash. I discovered it once, after spending 2 hours on iml!!!
# Not our problem here!
as.factor(Dfm_train@docvars$choose_one)

# The iml package requires:
# 1) a data frame, not a matrix, on which to compute the predictions of the model (note that you ran the previous model with a matrix!)
# 2) as a result of 1), a specific function that allows you to move from the original matrix to a data frame when computing the class (or
# the probabilities) of each instance/text
# 3) iml does NOT want features that begin with @, #, etc. when you use a data.frame (as is the case with the social media texts of our corpus).
# So you have to solve this problem!
# 4) as a result of what was just underlined, you also need to recompute the NB results with the converted data frame created at point 3). Of course, the outcome
# of the model under 4) is going to be exactly the same as your original NB above (after all, you just replace the @ in front of a word
# with an X - as we will see!)
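# As anticipated, a minimal sketch of option 1) above. It assumes that NB$params stores the class-conditional
# likelihoods with the features in rows and the two class labels ("0" and "1") in columns, as suggested by head(NB$params):
# rank the features by the absolute difference between the two likelihoods
lik <- as.data.frame(NB$params)
lik$Feature <- rownames(lik)
lik$abs_diff <- abs(lik[, 1] - lik[, 2])
head(lik[order(lik$abs_diff, decreasing = TRUE), ], 10)  # the most "discriminating" features according to this criterion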
# Let's convert the dfm of the training-set into a data frame and let's remove its first column
# (which refers to the doc_id, not to a feature!)
trainDF <- convert(Dfm_train, to="data.frame")
head(colnames(trainDF))
trainDF <- trainDF[,-c(1)]
head(colnames(trainDF))

# As discussed above, iml does NOT want features that begin with @, #, etc. when you use a data.frame.
# The command make.names allows us to automatically place the character "X" at the beginning of such features,
# while all "invalid" characters are translated to "."
# By adding unique=TRUE, the two words @best and #best will be treated differently!
# focus on obs [537] and [538]
colnames(trainDF)
colnames(trainDF) <- make.names(colnames(trainDF), unique = TRUE)
colnames(trainDF)

# let's create a "cleaned" matrix out of this "cleaned" data.frame
train2 <- as.matrix(trainDF)

# let's re-estimate our NB with this "cleaned" matrix
system.time(NB2 <- multinomial_naive_bayes(x=train2, y=as.factor(Dfm_train@docvars$choose_one), laplace = 1))
# of course, same likelihoods as above!
head(NB2$params)
head(NB$params)

# We can focus either on the predicted class or on the predicted probabilities below (same results of course!).
# Let's focus on the former one. Here we want to predict our training-set!
head(predict(NB2, train2))

# let's define our predictive function
predNB <- function(model, newdata){
  newData_x <- as.matrix(newdata)
  results <- predict(model, newData_x)
  return(results)
}
head(predNB(NB2, train2))

# let's apply this function to our NB by creating an object that holds the model and the data
modNB <- Predictor$new(NB2, data = trainDF, y = as.factor(Dfm_train@docvars$choose_one), predict.fun = predNB)
modNB$predict(trainDF[1:5, ])

# let's define the performance metric for the trained model we want to use to compute our model error
library(help = "Metrics")
# here we select "ce"

# plan("callr", workers = 6) below allows you to run the permutation by taking advantage of 6 cores of your computer.
# If you have 4 cores write workers=4, etc.
# This is so-called "parallelization". It means that the computation per feature is distributed among several cores of your machine.
# This can save you LOTS of time!
# You can also replicate the analysis (i.e., the shuffling of each feature) X times (for example: n.repetitions=5).
# The higher the number of repetitions, the more stable and accurate the results become (plus: you will also have
# the opportunity to measure their variability).
# However, in our case, to save time we just asked for 1 repetition! (a short sketch of the repeated version follows right after the plot below)

# let's compute VI for our NB model
system.time({
  plan("callr", workers = 6)
  set.seed(123)
  impNB <- FeatureImp$new(modNB, loss = "ce", n.repetitions=1)
})

# remember: feature importance is estimated as a ratio (permuted error / original model error). Any value larger than 1 therefore identifies an important feature.
# If your ML model overfits (i.e., perfectly predicts) the training-set, by default iml switches from computing feature importance as a ratio to a difference.
# In this latter case, any value larger than 0 identifies an important feature
plot(impNB)
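# As anticipated, a minimal (commented-out) sketch of the more stable version with 5 repetitions; it is not run
# here only to save time. With n.repetitions > 1, the results table should also report the variability of the
# importance scores across the shuffles (the importance.05 / importance.95 columns in recent iml versions);
# impNB5 is just an illustrative name
# system.time({
#   plan("callr", workers = 6)
#   set.seed(123)
#   impNB5 <- FeatureImp$new(modNB, loss = "ce", n.repetitions = 5)
# })
# head(impNB5$results)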
# let's add some interactivity!
graph <- plot(impNB)
ggplotly(graph)
head(impNB$results[,c(1:4)])

res <- impNB$results[,c(1,3)]
str(res)
# let's rename the column "feature" into "Feature"
colnames(res)[1] <- "Feature"
str(res)

######## OK, we know which features are important now. But what about their relationship with the class-labels?
# The nice thing with iml is that you can compute Partial Dependence Plots (PDP) that tell you exactly that.
# The PDP displays the average change in the predicted DV as we vary a given feature value while holding all other variables constant.
# It serves as a depiction of the relationship between a predictor of interest and the
# dependent variable, while basically "controlling" for all the effects of the other predictors.
# The intuitive explanation here is that we hold all control variables, Xc, at their actual value for each observation i.
# We then set Xs to a single value for all observations (e.g., fix Xs = 1 for all observations, where Xs can be for example the feature "fear").
# Next, this fixed value of Xs (as well as the corresponding Xc values specific to each observation) is run through the prediction function
# in order to obtain a prediction for every observation i.
# We then fix Xs to the next value (e.g., Xs = 2), calculate new predictions, and continue this process of iterating across all values of Xs for which
# we want to create the PDP. As a final step, we average over all resulting predictions in order to obtain the average predicted value of the outcome,
# given each value of Xs. This forms the PDP.
plot(FeatureEffect$new(modNB, "fear", method = "pdp"))
plot(FeatureEffect$new(modNB, "kill", method = "pdp"))
plot(FeatureEffect$new(modNB, "collid", method = "pdp"))
plot(FeatureEffect$new(modNB, "bag", method = "pdp"))
plot(FeatureEffect$new(modNB, "need", method = "pdp"))
plot(FeatureEffect$new(modNB, "fire", method = "pdp"))

# The problem here is that we need to produce a plot for each single feature. What about trying to find a more straightforward approach?
# This option is the SAME one we will always employ with any ML algorithm we will discuss. It has the advantage of being more flexible
# than the previous one, in particular when you have more than 2 class-labels as your DV.

# First of all, let's predict the training-set texts and let's store a new variable in our dfm of the training-set with such predictions.
# Why that? Because we will use this variable to identify the "sign" (i.e., the class-label) of the most important features
predicted_nb <- predict(NB2, train2)
table(predicted_nb)
Dfm_train@docvars$predNB <- predicted_nb
str(Dfm_train@docvars)

# First, let's count the number of times a word appears in the tweets that we have classified as related to accidents or not.
# Remember that in this case predNB assumes two values: 0 or 1.
str(Dfm_train@docvars$predNB)
# For other situations, you should change the "for (v in 0:1)" part.
# For example, if predNB assumes the values "1" and "2", you should write: "for (v in 1:2)".
# Moreover, I would suggest transforming your predicted DV saved in your dfm into an integer if it is a string!
# Below, I add a +1 to v in "sums[[v+1]]" because you cannot store an object at position 0 of a list!
# Otherwise, you would get an error. Try it yourself!
sums <- list()
for (v in 0:1){
  sums[[v+1]] <- colSums(train[Dfm_train@docvars[,"predNB"]==v,])
}
str(sums)
head(sums[[1]])
head(sums[[2]])

# do.call allows us to apply cbind to a list to create a matrix (i.e., a collection of objects, as in the case of sums)
sums <- do.call(cbind, sums)
class(sums)
str(sums)
print(sums[2:7,])

# Let's assign a sign (either 1 or 2) according to whether a word appears more often in tweets classified as not related to accidents or related to them.
# Note for the command below: when you write "1", the apply function works over rows; with "2", over columns. Given that we want to identify the highest
# frequency within each row (i.e., the class label in which each word appears most often), we specify "1"
sign <- apply(sums, 1, which.max)
# you get a value of 1 (or 2) if a word appears more often in the first class (i.e., Dfm_train@docvars$choose_one=0) or in the second class (i.e., Dfm_train@docvars$choose_one=1), respectively
print(sign[2:7])

# What if you have 3 class-labels as your DV? The only thing to change above (as long as you use the as.factor option in the estimation of the Naive Bayes model)
# would be, once again, the "for (v in 0:1)" part. It should become, for example, "for (v in 0:2)".
# Of course, in this latter case you would also get three values (1, 2 and 3)

# get the feature names
names <- dimnames(train2)[[2]]
str(names)
df <- data.frame(Feature = names, sign=sign)
str(df)
df$sign2 <- factor(df$sign, labels = c("irrelevant","relevant"))
str(df)

# let's merge the data frame we just created with the data frame that includes the VI permutation scores
importanceNB <- merge(res, df, by="Feature")
str(importanceNB)

clist <- c("irrelevant", "relevant")
for (v in clist){
  cat("\n\n")
  cat("value==", v)
  importanceNB <- importanceNB[order(importanceNB$importance, decreasing=TRUE),]
  print(head(importanceNB[importanceNB$sign2==v,], n=10))
}

# the top 10 words for the disaster tweets (i.e., Dfm_train@docvars$choose_one=1)
importance2NB <- head(importanceNB[importanceNB$sign2=="relevant",], n=10)
importance2NB
# the top 10 words for tweets not related to disasters (i.e., Dfm_train@docvars$choose_one=0)
importance3NB <- head(importanceNB[importanceNB$sign2=="irrelevant",], n=10)
importance3NB

importance_newNB <- rbind(importance2NB, importance3NB)
str(importance_newNB)
# let's reorder the features
importance_newNB$Feature <- with(importance_newNB, factor(Feature, levels=Feature[order(ave(importance, sign2, FUN=max), importance)]))
str(importance_newNB)

ggplot(importance_newNB, aes(Feature, importance, fill = sign2)) +
  geom_bar(stat="identity", fill= "white") + theme_classic() + geom_col(show.legend = T) +
  theme(legend.position="bottom") + coord_flip() + ylab("Permutation Coefficients") +
  labs(title= "Relevant to accidents versus Irrelevant to accidents features - NB Model", fill= "Disaster") +
  scale_fill_manual(values = c("#000033", "#006699"))

#####################################################
#####################################################
# Let's now run a Random Forest
#####################################################
#####################################################

# We will use the randomForest package. If you are employing compressed matrices, please use the ranger package
# (see the short sketch just below). The ranger package is also usually faster than randomForest.
# The randomForest package also works with a data frame, rather than with a matrix. But remember the issue about the @ and # discussed above!
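# A minimal (commented-out) sketch of that ranger alternative, run directly on the compressed matrix trainSP
# created above; it assumes the ranger package is installed (it is not loaded at the top of this script)
# and that your ranger version accepts a dgCMatrix via the x argument, as documented
# library(ranger)
# set.seed(123)
# RF_sparse <- ranger(x = trainSP, y = as.factor(Dfm_train@docvars$choose_one),
#                     num.trees = 500, importance = "permutation")
# RF_sparse$prediction.error  # OOB prediction error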
# Note that as hyperparameter (or tuning-parameter) I select a specific value for the number of trees.
# A RF also has other hyperparameters. More on this tomorrow!
set.seed(123)  # define a set.seed to be able to replicate the results!
system.time(RF <- randomForest(y= as.factor(Dfm_train@docvars$choose_one), x=train, importance=TRUE, do.trace=TRUE, ntree=500))
RF

# let's predict the test-set. We define the seed to always get the same prediction. Otherwise, you can get slightly
# different results each time, according to how a given document is randomly classified when there is a draw across decision trees
# (i.e., say 250 trees say "positive" and 250 trees say "negative")
set.seed(123)
system.time(predicted_rf <- predict(RF, test))
table(predicted_rf)
set.seed(14)
system.time(predicted_rf <- predict(RF, test))
table(predicted_rf)

# note that if you add "predict.all=TRUE", then the returned object is a list
# of 2 components: aggregate, which is the vector of values predicted by the forest (as above),
# and individual, which is a matrix where each column contains the predictions by one tree in the forest.
set.seed(123)
system.time(predicted_rf2 <- predict(RF, test, predict.all=TRUE))
str(predicted_rf2)
table(predicted_rf2$aggregate)
nrow(test)
head(predicted_rf2$individual)

# As discussed, a natural benefit of the bootstrap resampling process is that random forests have an out-of-bag (OOB) sample
# that provides a reasonable approximation of the test error. This provides a built-in validation set without any extra work on your part.
# However, REMEMBER: this is less efficient (especially with medium to large training-sets) compared with
# doing a k-fold cross-validation, as we will see tomorrow!
str(RF$err.rate)
RF$err.rate[nrow(RF$err.rate),]  # our final error (overall, and for each of the two classes: irrelevant and relevant tweets)

plot(RF, type = "l", col = c("red", "black","springgreen4"), main = "Class errors in Random Forest Model / OOB")
legend("topright", horiz = F, cex = 0.7, fill = c( "black", "red", "springgreen4"),
       c( "No-disaster tweets","Average error", "Disaster tweets"))

error <- as.data.frame(RF$err.rate)
# number of trees with the lowest OOB error
which.min(error$OOB)  # 455
set.seed(123)
system.time(RF2 <- randomForest(y= as.factor(Dfm_train@docvars$choose_one), x=train, importance=TRUE, ntree=455, do.trace=TRUE))
RF$err.rate[nrow(RF$err.rate),]   # our final error (overall, and for each of the two classes)
RF2$err.rate[nrow(RF2$err.rate),] # our final error (overall, and for each of the two classes)

# Now let's do some Global Interpretation also for our RF! We could once again have employed a permutational approach as above, of course.
# However, the randomForest package has its own way of quantifying the importance of each feature - i.e., it produces a model-specific VI
head(RF$importance[,3:4])
# The importance of each feature is assessed based on two criteria:
# -MeanDecreaseAccuracy: gives a rough estimate of the loss in prediction performance of a particular variable.
# This measure is computed by permuting the OOB data: for each tree, the prediction error on the OOB portion of the data is recorded
# (the error rate for classification). Then the same is done after permuting a given predictor variable.
# The differences between the two are then averaged over all trees, and normalized by the standard deviation of the differences
# (i.e., we are assessing feature importance based on the validation - OOB - data!)
varImpPlot(RF)

# From the graph we can conclude that randomly permuting the "fire" variable increases the avg. error rate by around
# 19 percent relative to using the actual variable values. Thus, we would conclude that including "fire" is
# important if we are trying to predict our DV in the OOB data.
# Caveat: if two variables are somewhat redundant, then omitting one of them may not lead to massive losses in prediction performance,
# but it would make the second variable more important.

# -MeanDecreaseGini: GINI importance measures the average gain of purity by splits on a given variable. Highest purity means that each node
# contains only elements of a single class. Think of it like this: if you use this feature to split the data, how pure will the nodes be?
# As a result, permuting a useful variable tends to give a relatively large decrease in mean Gini-gain.
# On the contrary, splitting on a permuted, uninformative variable tends neither to increase nor decrease node purities.
# Typically, MeanDecreaseAccuracy tends to be more reliable than MeanDecreaseGini. Therefore we focus here on the former one
# (a quick side-by-side comparison of the two rankings appears at the end of this block)

# let's extract the matrix for GINI and Accuracy
importance_RF <- as.data.frame(RF$importance[,3:4])
str(importance_RF)
importance_RF$Feature <- row.names(importance_RF)
str(importance_RF)
print(head(importance_RF[order(importance_RF$MeanDecreaseAccuracy, decreasing=TRUE),]))

# why don't we get the same results we got from here?
varImpPlot(RF)
# Because the graph above lines up all predictors from most to least important, where each value shows the average decrease in accuracy
# SCALED by its standard deviation in order to normalize the measures! And where is this standard deviation?
# there are three values for the SD (for class 0, class 1 and the average)
str(RF$importanceSD)
# focus on "fire"
importance_RF[row.names(importance_RF)=="fire", ]
importance_RF$mdaNORM <- importance_RF$MeanDecreaseAccuracy/RF$importanceSD[,3]
importance_RF[row.names(importance_RF)=="fire", ]
# now same results!
varImpPlot(RF)

# Note that these measures are used to rank variables in terms of importance.
# As a result, we just get, for example, the MeanDecreaseAccuracy statistics overall, without differentiating the words most important for each specific class.
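# As anticipated, a quick side-by-side sketch (just an illustration) of the two rankings:
# top features according to the scaled MeanDecreaseAccuracy...
head(importance_RF[order(importance_RF$mdaNORM, decreasing = TRUE), c("Feature", "mdaNORM")], 10)
# ...and according to MeanDecreaseGini
head(importance_RF[order(importance_RF$MeanDecreaseGini, decreasing = TRUE), c("Feature", "MeanDecreaseGini")], 10)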
# Which are, however, the most important words for the relevant label? And for the irrelevant one?
# Let's do exactly what we did above for the Naive Bayes algorithm... that is:
# let's first predict the training-set texts and store a new variable in our dfm of the training-set with such predictions
predicted_rf <- predict(RF, train)
table(predicted_rf)
Dfm_train@docvars$predRF <- predicted_rf
str(Dfm_train@docvars)

# let's now assign each feature to the relevant/irrelevant class according to its frequency
sums <- list()
for (v in 0:1){
  sums[[v+1]] <- colSums(train[Dfm_train@docvars[,"predRF"]==v,])
}
sums <- do.call(cbind, sums)
# let's assign a sign (either 1 or 2) according to whether a word appears more often in the tweets classified as not related to accidents or related to them
sign <- apply(sums, 1, which.max)

names <- dimnames(train)[[2]]
dfF <- data.frame(Feature = names, sign=sign, stringsAsFactors=F)
str(dfF)
dfF$sign2 <- factor(dfF$sign, labels = c("irrelevant","relevant"))
str(dfF)

importanceRF <- merge(importance_RF, dfF, by="Feature")
str(importanceRF)

clist <- c("irrelevant", "relevant")
for (v in clist){
  cat("\n\n")
  cat("value==", v)
  importanceRF <- importanceRF[order(importanceRF$mdaNORM, decreasing=TRUE),]
  print(head(importanceRF[importanceRF$sign2==v,], n=10))
}

# the top 10 words for the disaster tweets
importance2RF <- head(importanceRF[importanceRF$sign2=="relevant",], n=10)
importance2RF
# the top 10 words for tweets not related to disasters
importance3RF <- head(importanceRF[importanceRF$sign2=="irrelevant",], n=10)
importance3RF

importance_newRF <- rbind(importance2RF, importance3RF)
str(importance_newRF)
# reorder the features
importance_newRF$Feature <- with(importance_newRF, factor(Feature, levels=Feature[order(ave(mdaNORM, sign2, FUN=max), mdaNORM)]))
str(importance_newRF)

ggplot(importance_newRF, aes(Feature, mdaNORM, fill = sign2)) +
  geom_bar(stat="identity", fill= "white") + theme_classic() + geom_col(show.legend = T) +
  theme(legend.position="bottom") + coord_flip() + ylab("Mean Decrease Accuracy coefficient") +
  labs(title= "Relevant to accidents versus Irrelevant to accidents features - Random Forest Model", fill= "Disaster") +
  scale_fill_manual(values = c("#000033", "#006699"))

######################################################
######################################################
# Let's compare the out-of-sample results we got via Naive Bayes & RF
######################################################
######################################################

# Note that predicted_nb and predicted_rf were overwritten above with training-set predictions while we were
# identifying the sign of the features. So let's first re-compute the test-set predictions
predicted_nb <- predict(NB, test)
set.seed(123)
predicted_rf <- predict(RF, test)

prop.table(table(predicted_nb))
prop.table(table(predicted_rf))

results <- as.data.frame(rbind(prop.table(table(predicted_nb)), prop.table(table(predicted_rf))))
str(results)
results$algorithm <- c("NB", "RF")
str(results)

# Let's plot the results!
df.long <- melt(results, id.vars=c("algorithm"))
str(df.long)
ggplot(df.long, aes(algorithm, value, fill=variable)) +
  geom_bar(position="dodge", stat="identity") +
  theme(axis.text.x = element_text(color="#993333", size=10, angle=90)) + coord_flip() +
  ylab(label="Tweet class in the test-set") + xlab("algorithm") +
  scale_fill_discrete(name = "Prediction", labels = c("Irrelevant", "Relevant"))

# Similar but not exactly the same predictions!
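# A quick complementary check (a sketch): on what share of the test-set texts do the two algorithms agree,
# and how do their predictions cross-tabulate?
prop.table(table(as.character(predicted_nb) == as.character(predicted_rf)))
table(NB = predicted_nb, RF = predicted_rf)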
# And so, which result should we trust more?
# To answer that, we need to introduce the Cross-Validation procedure...

#################################################
# let's compare feature importance across models
#################################################

# let's check some correlations. To do that, let's merge the two data frames
words <- merge(importanceNB, importanceRF, by="Feature")
# why 555 and not 571 obs? Because we applied the make.names command to the importanceNB data frame. Do you remember?
str(words)
str(importanceNB)
str(importanceRF)
# therefore, let's apply it also to the importanceRF data frame!
importanceRF$Feature <- make.names(importanceRF$Feature, unique = TRUE)
words <- merge(importanceNB, importanceRF, by="Feature")
# now we are happy!
str(words)
# let's see some correlation (column 2 = NB permutation importance; column 7 = RF scaled MeanDecreaseAccuracy, mdaNORM)
chart.Correlation(words[c(2,7)])
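# A complementary sketch: the same comparison by column name rather than by position, using a
# Spearman rank correlation (importance scores are mostly used as rankings anyway)
cor(words$importance, words$mdaNORM, method = "spearman")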