rm(list=ls(all=TRUE))
setwd("C:/Users/luigi/Dropbox/TOPIC MODEL")
getwd()
library(quanteda)
library(readtext)
library(caTools)
library(randomForest)
library(caret)
library(naivebayes)
library(car)
library(ggplot2)
library(dplyr)
library(reshape2)
library(gridExtra)
library(PerformanceAnalytics)
library(plotly)
library(iml)
library(future)
library(future.callr)
# The data we will be using are some English social media disaster tweets discussed in
# this article: https://arxiv.org/pdf/1705.02009.pdf
# It consists of a number of tweets regarding accidents mixed in with a selection of control tweets (not about accidents)
#####################################################
# FIRST STEP: let's create the DfM for the training-set
#####################################################
x <- read.csv("train_disaster.csv", stringsAsFactors=FALSE)
str(x)
# class-label variable: choose_one (0 = tweets not relevant to accidents; 1 = tweets relevant to accidents)
table(x$choose_one)
prop.table(table(x$choose_one))
nrow(x)
myCorpusTwitterTrain <- corpus(x)
tok2 <- tokens(myCorpusTwitterTrain, remove_punct = TRUE, remove_numbers = TRUE, remove_symbols = TRUE, split_hyphens = TRUE, remove_separators = TRUE, remove_url = TRUE)
tok2 <- tokens_remove(tok2, stopwords("en"))
# let's also remove leftover unicode symbols (tokens starting with "0", e.g. emoji codes)
tok2 <- tokens_remove(tok2, c("0*"))
tok2 <- tokens_wordstem (tok2)
Dfm_train <- dfm(tok2)
# Let's trim the dfm in order to keep only tokens that appear in 2 or more tweets (tweets are short texts!)
# and let's keep only features with at least 2 characters
Dfm_train <- dfm_trim(Dfm_train , min_docfreq = 2, verbose=TRUE)
Dfm_train <- dfm_keep(Dfm_train, min_nchar = 2)
topfeatures(Dfm_train , 20) # 20 top words
#####################################################
# SECOND STEP: let's create the DfM for the test-set
#####################################################
x10 <- read.csv("test_disaster.csv", stringsAsFactors=FALSE)
str(x10)
nrow(x10)
myCorpusTwitterTest <- corpus(x10)
tok <- tokens(myCorpusTwitterTest, remove_punct = TRUE, remove_numbers = TRUE, remove_symbols = TRUE, split_hyphens = TRUE, remove_separators = TRUE, remove_url = TRUE)
tok <- tokens_remove(tok, stopwords("en"))
tok <- tokens_remove(tok, c("0*"))
tok <- tokens_wordstem (tok)
Dfm_test <- dfm(tok)
Dfm_test<- dfm_trim(Dfm_test, min_docfreq = 2, verbose=TRUE)
Dfm_test <- dfm_keep(Dfm_test, min_nchar = 2)
# Always check whether, after trimming your test-set, you are left with texts containing only 0s in their DfM row! Why? Because how could you predict the class of a test-set text
# whose DfM row is all 0s? It would be an unreliable prediction by definition
Dfm_test[ntoken(Dfm_test) == 0,]
# So let's delete such texts!
Dfm_test <- Dfm_test[ntoken(Dfm_test) != 0,]
Dfm_test[ntoken(Dfm_test) == 0,]
topfeatures(Dfm_test , 20) # 20 top words
#####################################################
# THIRD STEP: Let's make the features identical between train and test-set by passing the training-set feature names (featnames(Dfm_train)) to dfm_match().
# After this step, we can "predict" the test-set by employing only the features included in the training-set.
# Skipping dfm_match would create a problem with, for example, the randomForest predictions on the test-set.
# That is, w/o dfm_match you would get the following error: Error in predict.randomForest - variables in the training data missing in newdata
#####################################################
setequal(featnames(Dfm_train), featnames(Dfm_test))
nfeat(Dfm_test)
nfeat(Dfm_train)
test_dfm <- dfm_match(Dfm_test, features = featnames(Dfm_train))
nfeat(test_dfm)
setequal(featnames(Dfm_train), featnames(test_dfm ))
#####################################################
# FOURTH STEP: Let's convert the two DfMs into matrices for the ML algorithms to work
#####################################################
train <- as.matrix(Dfm_train) # dense matrix
str(train)
object.size(train)
# Note that we could have saved the dfm as a compressed sparse matrix!
# What's the meaning of that? Take a look at the EXTRA slides on the home-page!
trainSP <- as(Dfm_train, "dgCMatrix") # compressed matrix
object.size(trainSP )
object.size(train)/object.size(trainSP )
str(train)
str(trainSP)
# The difference between a dense and a compressed matrix is going to make a HUGE difference when your training and test matrices are large!!!
# We will mainly use dense (i.e., not compressed) matrices in our class, given that we will always use small datasets.
# But keep this in mind!
test <- as.matrix(test_dfm)
# As an alternative, we could have saved the dfm as a data.frame. The problem with that is twofold: a) not all the packages that implement ML algorithms accept a data.frame (but almost all of them
# accept a matrix); b) when you convert a dfm into a data.frame, some packages that implement a ML algorithm (for example the package randomForest) can have problems reading those columns that
# correspond to features beginning either with a number, or with @ or # (as is quite typical when dealing with social media texts).
# There are ways to deal with this problem, however (see below).
#####################################################
# FIFTH STEP: let's estimate a ML Model
#####################################################
#####################################################
#####################################################
# Let's start with a Naive Bayes Model
#####################################################
#####################################################
# We will use the naivebayes package. Another possible package you can consider is the fastNaiveBayes package
# The naivebayes package requires a matrix, not a data frame!
# Given our training-set, we have to use a Multinomial rather than a Bernoulli distribution, given that our
# features can take values other than just 0/1 (i.e., a one-hot encoding). Indeed:
table(Dfm_train@x )
# to run a Bernoulli model with naivebayes just replace "multinomial_naive_bayes" with "bernoulli_naive_bayes" in the command below (a short sketch follows after the priors)
# Which is our DV?
str(Dfm_train@docvars$choose_one) # that's an integer variable. Not good for classification! It should always be a factor (or at least a string,
# if using the naivebayes package)
# So we will use as.factor to transform it into a factor variable. As Laplace smoother let's select 1.
# Remember, however, that this is a tuning parameter! More on this later on
system.time(NB <- multinomial_naive_bayes(x=train, y=as.factor(Dfm_train@docvars$choose_one), laplace = 1))
summary(NB)
prop.table(table(Dfm_train@docvars$choose_one)) # prior probabilities
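# As anticipated above, here is a minimal (optional, not run in class) sketch of the Bernoulli variant.
# An assumption of this sketch: we first recode both dfms to 0/1 presence/absence with quanteda's dfm_weight()
# train01 <- as.matrix(dfm_weight(Dfm_train, scheme = "boolean"))
# test01  <- as.matrix(dfm_weight(test_dfm, scheme = "boolean"))
# NB_bern <- bernoulli_naive_bayes(x = train01, y = as.factor(Dfm_train@docvars$choose_one), laplace = 1)
# table(predict(NB_bern, test01))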
# let's predict the test-set
predicted_nb <- predict(NB ,test )
table(predicted_nb )
prop.table(table(predicted_nb ))
# so basically, to train and predict you just need two lines of code!
# NB <- multinomial_naive_bayes(x=train, y=as.factor(Dfm_train@docvars$choose_one), laplace = 1)
# predicted_nb <- predict(NB ,test )
# However, this is not the end of the story! Never blindly trust the results of your ML algorithm!
# So let's do some Global Interpretation of our NB!
# let's see for example the association between words and probabilities (i.e., matrix with class conditional parameter estimates - i.e., the likelihood!).
# Take a look at "casualti" and "cream". The likelihood of the former is higher for tweets discussing a disaster (i.e., Dfm_train@docvars$choose_one=1) than
# for irrelevant tweets (i.e., Dfm_train@docvars$choose_one=0), i.e., p(casualti|1) > p(casualti|0); the opposite happens for the word "cream".
head(NB$params)
# But now let's take a step forward. The package naivebayes does not come with a model-specific VI.
# Here we could either 1) focus on the likelihoods above, i.e., argue that the features with the highest absolute value
# of the difference between the class-conditional likelihoods can also be considered the most important ones for the overall performance of the algorithm
# (note that this can become messy when you deal with a multi-class DV; a minimal sketch is shown right below),
# or 2) we can apply a permutational approach via the iml package.
# Let's follow this latter option!
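# The promised sketch of option 1): with a binary DV we can simply rank the features by the absolute difference
# between the two class-conditional likelihoods stored in NB$params (a rough heuristic, not a substitute
# for the permutation approach we follow below)
lik_diff <- abs(NB$params[, 1] - NB$params[, 2])
head(sort(lik_diff, decreasing = TRUE), 10) # the 10 most "discriminating" features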
# Remember: in both cases, we are assessing feature importance based on the training data
# (given that here we do not have any validation data, i.e., data that have been coded but not employed to train the algorithm).
# The procedure of randomly shuffling a single feature value breaks the relationship between the feature and the target,
# thus the drop in the model score is indicative of how much the model depends on the feature.
# REMEMBER: Permutation importance does not reflect the intrinsic predictive value of a feature by itself, but how important
# this feature is for a particular model.
# Before running a permutation, you first need to check your DV if it is a string. For example, if you have two class labels such as "relevant" and "not-relevant",
# you need to change the latter to "nonrelevant" (i.e., no space, no "-" or anything else) before transforming it into a factor.
# Otherwise the iml package will crash. I discovered it once, after spending 2 hours on iml!!!
# Not our problem here!
as.factor(Dfm_train@docvars$choose_one)
# The iml package requires:
# 1) a data frame, not a matrix, on which computing the prediction of the model (note that you have run the previous model with a matrix!)
# 2) as a result of 1), a specific function that allows you to move from the original matrix to a data frame when computing the class (or
# the probabilities) of each instance/text
# 3) iml does NOT want features that begin with @, #, etc. when you use a data.frame (as is the case with the social media texts in our corpus).
# So you have to solve this problem!
# 4) as a result of the point just underlined, you also need to re-estimate the NB model with the converted data frame created at point 3). Of course the outcome
# of the model under 4) is going to be exactly the same as your original NB above (after all, you just replace the @ in front of a word
# with an X - as we will see!)
# Let's convert the dfm of the training set into a data frame and let's remove its first column
# (which refers to doc_id, not to a feature!)
trainDF <- convert(Dfm_train, to="data.frame")
head(colnames(trainDF))
trainDF <- trainDF[,-c(1)]
head(colnames(trainDF))
# As discussed above, iml does NOT want features that begin with @, #, etc. when you use a data.frame.
# The command make.names automatically places the character "X" at the beginning of such features,
# while all "invalid" characters are translated to "."
# By adding unique=TRUE, the two words @best and #best will be treated as different features!
# focus on obs [537] and [538]
colnames(trainDF)
colnames(trainDF) <- make.names(colnames(trainDF), unique = TRUE)
colnames(trainDF)
# let's create a "cleaned" matrix out of this "cleaned" data.frame
train2 <- as.matrix(trainDF)
# let's re-estimate our NB with this "cleaned" matrix
system.time(NB2 <- multinomial_naive_bayes(x=train2, y=as.factor(Dfm_train@docvars$choose_one), laplace = 1))
# of course, same likelihoods as above!
head(NB2$params)
head(NB$params)
# We can focus either on the predicted class or on the predicted probabilities below (same results of course!).
# Let's focus on the former one (the predicted class). Here we want to predict our training-set!
head(predict(NB2, train2))
# let's define our predictive function
predNB <-function(model, newdata){
newData_x <- as.matrix(newdata)
results<-predict(model, newData_x)
return(results)
}
head(predNB (NB2, train2))
# let's apply this function to our NB by creating an object that holds the model and the data
modNB <- Predictor$new(NB2, data = trainDF, y =as.factor(Dfm_train@docvars$choose_one), predict.fun = predNB)
modNB $predict(trainDF[1:5, ])
# let's define the performance metric we want to use to compute the error of our trained model
library(help = "Metrics")
# here we select "ce"
# plan("callr", workers = 6) below allows you to run the permutation by taking advantage of 6 cores of your computer
# If you have 4 cores write workers=4, etc.
# This is so-called "parallelization". It means that the computation per feature is distributed among several cores of your machine.
# This can save you LOTS of time!
# You can also replicate the analysis (i.e., the shuffling of each feature) X times (for example: n.repetitions=5).
# The higher the number of repetitions, the more stable and accurate the results become (plus: you will also have
# the opportunity to measure their variability).
# However, in our case, to save time we ask for just 1 repetition!
# let's compute VI for our NB model
system.time({
plan("callr", workers = 6)
set.seed(123)
impNB <- FeatureImp$new(modNB , loss = "ce", n.repetitions=1)
})
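# A minimal sketch (not run in class, to save time) with 5 repetitions, which would also give you a variability
# band around each importance score; the workers value is an assumption, adapt it to your machine.
# The compare argument ("ratio" vs "difference") discussed just below can also be set explicitly here.
# plan("callr", workers = 6)
# set.seed(123)
# impNB5 <- FeatureImp$new(modNB, loss = "ce", n.repetitions = 5)
# head(impNB5$results) # importance.05 / importance / importance.95 columns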
# remember: feature importance is estimated as a ratio (permuted error / original model error). Any value larger than 1 therefore identifies an important feature.
# If your ML model overfits (i.e., perfectly predicts) the training-set, by default iml switches from computing feature importance as a ratio to a difference.
# In this latter case, any value larger than 0 identifies an important feature
plot(impNB)
# let's add some interactivity!
graph <- plot(impNB)
ggplotly(graph )
head(impNB $results[,c(1:4)])
res <- impNB $results[,c(1,3)]
str(res)
# let's rename the column "feature" into "Feature"
colnames(res)[1] <- "Feature"
str(res)
######## OK, we now know which features are important. But what about their relationship with the class-labels?
# The nice thing with iml is that you can compute Partial Dependence Plots (PDPs) that tell you exactly that.
# A PDP displays the average change in the predicted DV as we vary a given feature value while holding all other variables constant.
# It serves as a depiction of the relationship between a predictor of interest and the
# dependent variable, basically "controlling" for the effects of all other predictors.
# The intuitive explanation here is that we hold all control variables, Xc, at their actual value for each observation i.
# We then set Xs to a single value of Xs for all observations (e.g., fix Xs = 1 for all observations, where Xs can be for example the feature "fear").
# Next, this fixed value of Xs (as well as the corresponding Xc values specific to each observation) is run through the prediction function
# in order to obtain a prediction for every observation i.
# We then fix Xs to the next value (e.g., Xs = 2), calculate new predictions, and continue this process iterating across all values of Xs for which
# we want to create the PDP. As a final step, for each value of Xs we average over all resulting predictions in order to obtain the average predicted value of the outcome,
# given that value of Xs. This forms the PDP.
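# To make this procedure concrete, here is a minimal "by hand" sketch for the stemmed feature "fear"
# (an assumption of this sketch: we track the average predicted probability of class "1" from NB2
# for a few plausible counts of "fear" in a tweet; iml does all of this for us below)
pdp_fear <- sapply(0:3, function(val){
  tmp <- train2
  tmp[, "fear"] <- val # fix Xs = val for ALL observations
  mean(predict(NB2, tmp, type = "prob")[, "1"]) # average prediction over the training-set
})
pdp_fear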
plot(FeatureEffect$new(modNB , "fear", method = "pdp"))
plot(FeatureEffect$new(modNB , "kill", method = "pdp"))
plot(FeatureEffect$new(modNB , "collid", method = "pdp"))
plot(FeatureEffect$new(modNB , "bag", method = "pdp"))
plot(FeatureEffect$new(modNB , "need", method = "pdp"))
plot(FeatureEffect$new(modNB , "fire", method = "pdp"))
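# As an aside: iml can also compute the effect of several features in one call via its FeatureEffects class
# (a sketch, commented out; the feature names are the ones plotted above)
# effs <- FeatureEffects$new(modNB, features = c("fear", "kill", "collid"), method = "pdp")
# plot(effs)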
# Still, we need to produce (and inspect) a plot for each single feature. What about a more straightforward approach?
# The option below is the SAME one we will always employ with any ML algorithm we will discuss. It has the advantage of being more flexible
# than the previous one, in particular when you have more than 2 class-labels as your DV
# First of all, let's predict the training-set texts and store a new variable in our training-set dfm with those predictions.
# Why? Because we will use this variable to identify the "sign" (i.e., the class-label) of the most important features.
# Note: we store these in-sample predictions under a new name, so that predicted_nb (the test-set predictions computed above) stays available for the out-of-sample comparison at the end of the script.
predicted_nb_train <- predict(NB2, train2)
table(predicted_nb_train)
Dfm_train@docvars$predNB <- predicted_nb_train
str(Dfm_train@docvars)
# First, let's count the number of times a word appears in the tweets that we have classified as related to accidents or otherwise.
# Remember that in this case predNB assumes two values: 0 or 1.
str(Dfm_train@docvars$predNB)
# For other situations, you should change the "for (v in 0:1)" part.
# For example, if predNB assumes the values "1" and "2", you should write: "for (v in 1:2)".
# Moreover, I suggest you transform the predicted DV saved in your dfm into an integer if it is a string!
# Below, I add +1 to v in "sums[[v+1]]" because you cannot add an object to a list at position 0!
# Otherwise, you would get an error. Try it yourself!
sums <- list()
for (v in 0:1){
sums[[v+1]] <- colSums(train[Dfm_train@docvars[,"predNB"]==v,])
}
str(sums)
head(sums[[1]])
head(sums[[2]])
# do.call allows us to apply cbind to a list (i.e., a collection of objects of possibly different types, as in the case of sums) to create a matrix
sums <- do.call(cbind, sums)
class(sums)
str(sums)
print(sums[2:7,])
# Let's assign to each word a "sign" value of 1 or 2 according to whether it appears more often in tweets classified as not related to accidents (1) or related to them (2).
# Note for the command below: when you write "1", the apply function works over rows; with "2", over columns. Given that we want, for each word (i.e., each row),
# the class label with the highest frequency, we specify "1"
sign <- apply(sums, 1, which.max)
# you get a value of 1 (or 2) if a word appears more often in the first class (i.e., Dfm_train@docvars$choose_one=0) than in the second class (i.e., Dfm_train@docvars$choose_one=1), or vice versa
print(sign[2:7])
# What if your DV has 3 class-labels? The only thing to change above (as long as you use the as.factor option in the estimation of the naive bayes model)
# would once again be the "for (v in 0:1)" part. It must now become, for example, "for (v in 0:2)".
# Of course, in this latter case you would also get three values (1, 2 and 3) - see the hypothetical sketch below
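# A purely hypothetical sketch for a DV coded 0/1/2 (our predNB only takes the values 0/1, so this is not run):
# sums3 <- list()
# for (v in 0:2){
#   sums3[[v+1]] <- colSums(train[Dfm_train@docvars[,"predNB"]==v,])
# }
# sums3 <- do.call(cbind, sums3)
# sign3 <- apply(sums3, 1, which.max) # now takes the values 1, 2 or 3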
# get the feature names
names <- dimnames(train2)[[2]]
str(names)
df <- data.frame(Feature = names, sign=sign)
str(df)
df$sign2<-factor(df$sign,labels = c("irrelevant","relevant"))
str(df)
# let's merge the dataframe we just created with the dataframe that includes the VI permutation scores
importanceNB <- merge(res, df, by="Feature")
str(importanceNB )
clist <- c("irrelevant", "relevant")
for (v in clist){
cat("\n\n")
cat("value==", v)
importanceNB <- importanceNB [order(importanceNB $ importance, decreasing=TRUE),]
print(head(importanceNB [importanceNB $sign2==v,], n=10))
}
# the top 10 words for the disaster tweets (i.e., Dfm_train@docvars$choose_one=1)
importance2NB <- head(importanceNB [importanceNB $sign2=="relevant",], n=10)
importance2NB
# the top 10 words for tweets not related to disasters (i.e., Dfm_train@docvars$choose_one=0)
importance3NB <-head(importanceNB [importanceNB $sign2=="irrelevant",], n=10)
importance3NB
importance_newNB <- rbind(importance2NB , importance3NB )
str(importance_newNB )
# let's reorder the features
importance_newNB$Feature <- with(importance_newNB , factor(Feature,levels=Feature[order(ave(importance,sign2,FUN=max),importance)]))
str(importance_newNB)
ggplot(importance_newNB, aes(Feature, importance, fill = sign2)) +
geom_bar(stat="identity", fill= "white") + theme_classic() +
geom_col(show.legend = T) + theme(legend.position="bottom")+
coord_flip() +
ylab("Permutation Coefficients") +
labs(title= "Relevant to accidents versus Irrelevant to accidents features - NB Model",
fill= "Disaster") + scale_fill_manual(values = c("#000033", "#006699"))
#####################################################
#####################################################
# Let's now run a Random Forest
#####################################################
#####################################################
# We will use the randomForest package. If you are employing compressed matrices, please use the ranger package.
# The ranger package is also usually faster than the randomForest one (a short sketch follows below).
# The randomForest package can also work with a data frame rather than a matrix. But remember the issue about @ and # discussed above!
# Note that as hyperparameter (or tuning-parameter), I select a specific value for the number of trees.
# A RF also has other hyperparameters. More on this tomorrow!
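# The promised ranger sketch (an assumption: the ranger package is installed; it is not loaded at the top of this script).
# According to its documentation, ranger can also be fed a compressed dgCMatrix such as trainSP created above.
# library(ranger)
# set.seed(123)
# RF_ranger <- ranger(x = train, y = as.factor(Dfm_train@docvars$choose_one),
#                     num.trees = 500, importance = "permutation")
# RF_ranger$prediction.error # OOB prediction error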
set.seed(123) # (define a set.seed for being able to replicate the results!)
system.time(RF <- randomForest(y= as.factor(Dfm_train@docvars$choose_one), x=train, importance=TRUE, do.trace=TRUE, ntree=500))
RF
# let's predict the test-set. We set the seed to always get the same prediction. Otherwise, you can get slightly
# different results each time, according to how a given document is randomly classified when there is a tie across decision trees
# (i.e., say 250 trees say "positive" and 250 trees say "negative")
set.seed(123)
system.time(predicted_rf <- predict(RF, test))
table(predicted_rf )
set.seed(14)
system.time(predicted_rf <- predict(RF, test))
table(predicted_rf )
# note that if you add "predict.all=TRUE", then the returned object is a list
# of 2 components: aggregate, which is the vector of values predicted by the forest (as above),
# and individual, which is a matrix where each column contains the predictions of one tree in the forest.
set.seed(123)
system.time(predicted_rf2 <- predict(RF, test, predict.all=TRUE))
str(predicted_rf2)
table(predicted_rf2$aggregate )
nrow(test)
head(predicted_rf2$individual)
# As discussed, a natural benefit of the bootstrap resampling process is that random forests have an out-of-bag (OOB) sample
# that provides a reasonable approximation of the test error. This provides a built-in validation set without any extra work on your part.
# However, REMEMBER: this is less efficient (especially with medium to large training-sets) compared with
# doing a k-fold cross-validation, as we will see tomorrow!
str(RF$err.rate)
RF$err.rate[nrow(RF$err.rate),] # our final error (overall, and for each of the two classes: irrelevant and relevant tweets)
plot(RF, type = "l", col = c("red", "black","springgreen4"), main = "Class errors in Random Forest Model / OOB")
legend("topright", horiz = F, cex = 0.7, fill = c( "black", "red", "springgreen4"),
c( "No-disaster tweets","Average error", "Disaster tweets"))
error <- as.data.frame(RF$err.rate)
# number of trees with the lowest OOB error
which.min(error$OOB) # 455
set.seed(123)
system.time(RF2 <- randomForest(y= as.factor(Dfm_train@docvars$choose_one), x=train, importance=TRUE, ntree=455, do.trace=TRUE))
RF$err.rate[nrow(RF$err.rate),] # our final error (overall, and for each of the two classes)
RF2$err.rate[nrow(RF2$err.rate),] # our final error (overall, and for each of the two classes)
# Now let's do some Global Interpretation also for our RF! We could of course have employed once again a permutational approach as above.
# However, the randomForest package has its own way of quantifying the importance of each feature - i.e., it produces a model-specific VI
head(RF$importance[,3:4])
# The importance of each feature is assessed based on two criteria:
# -MeanDecreaseAccuracy: gives a rough estimate of the loss in prediction performance due to a particular variable.
# This measure is computed from permuting OOB data: for each tree, the prediction error on the OOB portion of the data is recorded
# (error rate for classification). Then the same is done after permuting a given predictor variable.
# The differences between the two are then averaged over all trees and normalized by the standard deviation of the differences.
# (i.e., we are assessing feature importance based on the validation - OOB - data!)
varImpPlot(RF )
# From the graph we can conclude that randomly permuting the "fire" variable increases the avg. error rate by around
# 19 percent relative to using the actual variable values. Thus, we would conclude that including "fire" is
# important if we are trying to predict our DV in the OOB data.
# Caveat: if two variables are somewhat redundant, then omitting one of them may not lead to massive gains in prediction performance,
# but would make the second variable more important.
# -MeanDecreaseGini: GINI importance measures the average gain of purity by splits of a given variable. Highest purity means that each node
# contains only elements of a single class. Think of it like this: if you use this feature to split the data, how pure will the nodes be?
# As a result, permuting a useful variable tends to give a relatively large decrease in mean Gini gain.
# On the contrary, splitting on a permuted, non-useful variable tends neither to increase nor to decrease node purities.
# Typically, MeanDecreaseAccuracy tends to be more reliable than MeanDecreaseGini. Therefore we focus here on the former one
# let's extract the matrix for GINI and Accuracy
importance_RF <- as.data.frame(RF$importance[,3:4])
str(importance_RF)
importance_RF$Feature<- row.names(importance_RF)
str(importance_RF)
print(head(importance_RF[order(importance_RF$MeanDecreaseAccuracy, decreasing=TRUE),]))
# why don't we get the same ordering and values as in the following graph?
varImpPlot(RF )
# Because the above graph lines up all predictors from most to least important, where each value shows the average decrease in accuracy
# SCALED by its standard deviation in order to normalize the measures! And where is this standard deviation?
# there are three columns of SDs (for class 0, class 1 and the average)
str(RF$importanceSD)
# focus on "fire"
importance_RF[row.names(importance_RF)=="fire", ]
importance_RF$mdaNORM <- importance_RF$MeanDecreaseAccuracy/RF$importanceSD[,3]
importance_RF[row.names(importance_RF)=="fire", ]
# now same results!
varImpPlot(RF )
# Note that these measures are used to rank variables in terms of importance.
# As a result, we just get, for example, the MeanDecreaseAccuracy statistic overall, w/o differentiating which words matter most for each specific class.
# Which are however the most important words for the relevant label? And for the irrelevant one?
# Let's do exactly what we did above for the Naive Bayes algorithm...that is:
# let's first predict the training-set texts and store a new variable in our training-set dfm with those predictions
# (again under a new name, so that predicted_rf - the test-set predictions computed above - stays available for the out-of-sample comparison below)
predicted_rf_train <- predict(RF, train)
table(predicted_rf_train)
Dfm_train@docvars$predRF <- predicted_rf_train
str(Dfm_train@docvars)
# let's now assign each feature to the relevant/irrelevant class according to its frequency
sums <- list()
for (v in 0:1){
sums[[v+1]] <- colSums(train[Dfm_train@docvars[,"predRF"]==v,])
}
sums <- do.call(cbind, sums)
# let's assign the sign (either 1 or 2) according to whether a word appears more often in tweets classified as not related to accidents (1) or related to them (2)
sign <- apply(sums, 1, which.max)
names <- dimnames(train)[[2]]
dfF <- data.frame(Feature = names, sign=sign, stringsAsFactors=F)
str(dfF )
dfF $sign2<-factor(dfF $sign,labels = c("irrelevant","relevant"))
str(dfF )
importanceRF <- merge(importance_RF, dfF, by="Feature")
str(importanceRF)
clist <- c("irrelevant", "relevant")
for (v in clist){
cat("\n\n")
cat("value==", v)
importanceRF <- importanceRF [order(importanceRF $ mdaNORM , decreasing=TRUE),]
print(head(importanceRF [importanceRF $sign2==v,], n=10))
}
# the top 10 words for the disaster tweets
importance2RF <- head(importanceRF [importanceRF $sign2=="relevant",], n=10)
importance2RF
# the top 10 words for tweets not related to disasters
importance3RF <-head(importanceRF [importanceRF $sign2=="irrelevant",], n=10)
importance3RF
importance_newRF <- rbind(importance2RF, importance3RF)
str(importance_newRF)
# reorder the features
importance_newRF$Feature <- with(importance_newRF, factor(Feature,levels=Feature[order(ave(mdaNORM ,sign2,FUN=max),mdaNORM )]))
str(importance_newRF)
ggplot(importance_newRF, aes(Feature, mdaNORM , fill = sign2)) +
geom_bar(stat="identity", fill= "white") + theme_classic() +
geom_col(show.legend = T) + theme(legend.position="bottom")+
coord_flip() +
ylab("Mean Decrease Accuracy coefficient") +
labs(title= "Relevant to accidents versus Irrelevant to accidents features - Random Forest Model",
fill= "Disaster") + scale_fill_manual(values = c("#000033", "#006699"))
######################################################
######################################################
# Let's compare the out-of-sample results we got via Naive Bayes & RF
######################################################
######################################################
prop.table(table(predicted_nb ))
prop.table(table(predicted_rf ))
results <- as.data.frame(rbind(prop.table(table(predicted_nb )), prop.table(table(predicted_rf ))))
str(results)
results$algorithm <- c("NB", "RF")
str(results)
# Let's plot the results!
df.long<-melt(results,id.vars=c("algorithm"))
str(df.long)
ggplot(df.long,aes(algorithm,value,fill=variable))+
geom_bar(position="dodge",stat="identity") + theme(axis.text.x = element_text(color="#993333", size=10, angle=90)) + coord_flip() +
ylab(label="Tweet class in the test-set") + xlab("algorithm") + scale_fill_discrete(name = "Prediction", labels = c("Irrelevant", "Relevant"))
# Similar, but not exactly the same, predictions!
# So which result should we trust more?
# To answer that, we need to introduce the Cross-Validation procedure...
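# Before that, a quick descriptive check (a minimal sketch using only objects already created above):
# on what share of the test-set tweets do the two algorithms agree?
mean(as.character(predicted_nb) == as.character(predicted_rf))
table(NB = predicted_nb, RF = predicted_rf)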
#################################################
# let's compare feature importance across models
#################################################
# let's check the correlation between the two importance measures. To do that, let's merge the two dataframes
words <- merge(importanceNB, importanceRF, by="Feature")
# why 555 and not 571 obs? Because in the importanceNB data frame we applied the make.names command. Do you remember?
str(words)
str(importanceNB)
str(importanceRF)
# therefore let's apply it also to the importanceRF data frame!
importanceRF$Feature <- make.names(importanceRF$Feature, unique = TRUE)
words <- merge(importanceNB, importanceRF, by="Feature")
# now we are happy!
str(words)
# let's see some correlation
chart.Correlation(words[c(2,7)])