rm(list=ls(all=TRUE))
setwd("C:/Users/luigi/Dropbox/TOPIC MODEL")
getwd()
library(quanteda)
library(readtext)
library(caTools)
library(e1071)
library(randomForest)
library(caret)
library(naivebayes)
library(car)
library(ggplot2)
library(dplyr)
library(reshape2)
library(iml)
library(future)
library(future.callr)
library(gridExtra)
# The data we will be using are some English social media disaster tweets discussed in
# this article: https://arxiv.org/pdf/1705.02009.pdf
# It consist of a number of tweets regarding accidents mixed in with a selection control tweets (not about accidents)
#####################################################
# FIRST STEP: let's create the DfM for the training-set
#####################################################
x <- read.csv("train_disaster.csv", stringsAsFactors=FALSE)
str(x)
# class-label variable: choose_one (0=tweets not relevant; 1=relevant tweets)
table(x$choose_one)
prop.table(table(x$choose_one))
nrow(x)
myCorpusTwitterTrain <- corpus(x)
tok2 <- tokens(myCorpusTwitterTrain , remove_punct = TRUE, remove_numbers=TRUE, remove_symbols = TRUE, split_hyphens = TRUE, remove_separators = TRUE)
tok2 <- tokens_remove(tok2, stopwords("en"))
# let's also remove the unicode symbols
tok2 <- tokens_remove(tok2, c("0*"))
tok2 <- tokens_wordstem (tok2)
Dfm_train <- dfm(tok2)
# Let's trim the dfm in order to keep only tokens that appear in 2 or more tweets (tweets are short texts!)
# and let's keep only features with at least 2 characters
Dfm_train <- dfm_trim(Dfm_train , min_docfreq = 2, verbose=TRUE)
Dfm_train <- dfm_remove(Dfm_train , min_nchar = 2)
topfeatures(Dfm_train , 20) # 20 top words
#####################################################
# SECOND STEP: let's create the DfM for the test-set
#####################################################
x10 <- read.csv("test_disaster.csv", stringsAsFactors=FALSE)
str(x10)
nrow(x10)
myCorpusTwitterTest <- corpus(x10)
tok <- tokens(myCorpusTwitterTest , remove_punct = TRUE, remove_numbers=TRUE, remove_symbols = TRUE, split_hyphens = TRUE, remove_separators = TRUE)
tok <- tokens_remove(tok, stopwords("en"))
tok <- tokens_remove(tok, c("0*"))
tok <- tokens_wordstem (tok)
Dfm_test <- dfm(tok)
Dfm_test<- dfm_trim(Dfm_test, min_docfreq = 2, verbose=TRUE)
Dfm_test<- dfm_remove(Dfm_test, min_nchar = 2)
topfeatures(Dfm_test , 20) # 20 top words
#####################################################
# THIRD STEP: Let's make the features identical between train and test-set by passing Dfm_train to dfm_match() as a pattern.
# after this step, we can "predict" by employing only the features included in the training-set
#####################################################
setequal(featnames(Dfm_train), featnames(Dfm_test))
length(Dfm_test@Dimnames$features)
length(Dfm_train@Dimnames$features)
test_dfm <- dfm_match(Dfm_test, features = featnames(Dfm_train))
length(test_dfm@Dimnames$features)
setequal(featnames(Dfm_train), featnames(test_dfm ))
#####################################################
# FOURTH STEP: Let's convert the two DfMs into matrices for the ML algorithms to work
#####################################################
train <- as.matrix(Dfm_train) # dense matrix
object.size(train)
# a compressed sparse matrix! What's the meaning of that? Take a look at the EXTRA slides on the home-page
# of the course!
trainSP <- as(Dfm_train, "dgCMatrix") # compressed matrix
object.size(trainSP )
object.size(train)/object.size(trainSP )
str(train)
str(trainSP)
# this is going to make a HUGE difference when you have a large matrix as test and train!!!
# we will mainly use normal (i.e., not compressed) matrix in our class given that we will always use small dataset.
# But keeps this in mind!
test <- as.matrix(test_dfm)
#####################################################
# FIFHT STEP: let's estimate a ML Model
#####################################################
#####################################################
# Let's start with a Naive Bayes Model
#####################################################
# we will use the naivebayes package. Another possibile package you can consider is the fastNaiveBayes package
# given our training-set, we have to use a Multinomial rather rather than a Binomial distribution given that our
# features can assume a value different than just 0/1, i.e., a one-hot-encoding. Indeed:
table(Dfm_train@x )
# to run a Binomial model with naivebayes just replace "multinomial_naive_bayes" with "bernoulli_naive_bayes" in the below command
# Which is our DV?
str(Dfm_train@docvars$choose_one) # that's an integer variable. Not good for a classification! It should always be a factor variable
# So we will use as.factor to transform it into a factor variable
# Note that instead of y=as.factor(Dfm_train@docvars$choose_one) in the formula of the NB we could have used
# y=as.factor(x$choose_one) of course! However I would always suggest you to extract the DV from the docvars of the DfM,
# especially if you have trimmed it...
system.time(NB <- multinomial_naive_bayes(x=train, y=as.factor(Dfm_train@docvars$choose_one), laplace = 1))
summary(NB)
prop.table(table(Dfm_train@docvars$choose_one)) # prior probabilities
# let's see the association between words and probabilities (i.e., matrix with class conditional parameter estimates - i.e., the likelihood!).
# take a look at "casualti" and "cream". The likelihood for the former is higher for a tweet discussing about a disaster (=1) compared
# to irrelavant tweets (=0), i.e., p(casualti|1)> p(casualti|0); the opposite happens for the word cream
head(NB$params)
# Let's investigate about this issue a bit more. Therefore, let's do some Global Interpretation of our NB!
NB_prob <- as.data.frame(NB$params)
NB_prob$Feature <- row.names(NB_prob)
str(NB_prob)
# let's estimate the features that change the most the difference between the relevant and irrelevant conditional probabilities
NB_prob$diff <- NB_prob[,2]-NB_prob[,1]
str(NB_prob)
# the features that present the highest absolute value in this difference can be also considered as the most important in
# affecting the overall performance of the algorithm
NB_prob$imp <- abs(NB_prob[,2]-NB_prob[,1])
str(NB_prob)
print(head(NB_prob[order(NB_prob$imp , decreasing=TRUE),], 15)) # most relevant words overall
# clearly, the features the present the highest positive values according to NB_prob$diff are the words
# most relevant for the "disaster" tweets (=1); those with the highest negative values are the words
# most relevant for the "irrelevant" tweets (=0)
print(head(NB_prob[order(NB_prob$diff , decreasing=TRUE),], 15)) # most relevant words for "disaster" tweets
print(head(NB_prob[order(NB_prob$diff , decreasing=FALSE),], 15)) # irrelevant words for the other tweets
NB_prob$sign <- ifelse(NB_prob$diff>0,"relevant","irrelevant")
str(NB_prob)
# let's extract the top 20-most irrelevant and the 20-most relevant contributing features
NB_prob2 <- top_n(NB_prob, 20, diff )
NB_prob2
NB_prob3 <- top_n(NB_prob, -20, diff )
NB_prob3
NB_prob_new <- rbind(NB_prob2, NB_prob3)
# reorder the features
NB_prob_new <- mutate(NB_prob_new, Feature= reorder(Feature, diff))
ggplot(NB_prob_new, aes(Feature, diff, fill = sign)) +
geom_bar(stat="identity", fill= "white") + theme_classic()+
geom_col(show.legend = T) + theme(legend.position="bottom")+
coord_flip() +
ylab("Difference in the conditional probabilities") +
scale_fill_manual(values = c("#000033", "#006699")) +
labs(title = "Disaster tweets",
subtitle = "Irrelevant (-) versus Relevant (+) words - Naive Bayes Model",
fill = "Disaster")
# let's store the graph
NB_graph <- ggplot(NB_prob_new, aes(Feature, diff, fill = sign)) +
geom_bar(stat="identity", fill= "white") + theme_classic()+
geom_col(show.legend = T) + theme(legend.position="bottom")+
coord_flip() +
ylab("Difference in the conditional probabilities") +
scale_fill_manual(values = c("#000033", "#006699")) +
labs(title = "Disaster tweets",
subtitle = "Irrelevant (-) versus Relevant (+) words - Naive Bayes Model",
fill = "Disaster")
# let's FINALLY predict the test-set
predicted_nb <- predict(NB ,test )
table(predicted_nb )
prop.table(table(predicted_nb ))
#####################################################
# let's run a Random Forest
#####################################################
# we will use the randomForest package. If you are employing compressed matrices, plz use the ranger package.
# the ranger package is also usually faster than the randomForest one
# note that as hyperameter (or tuning-parameter), I select a specific value for the number of trees. A RF has also other hyperparameters.
# More on this next week!
set.seed(123) # (define a set.seed for being able to replicate the results!)
system.time(RF <- randomForest(y= as.factor(Dfm_train@docvars$choose_one), x=train, importance=TRUE, do.trace=TRUE, ntree=500))
RF
# As discussed, a natural benefit of the bootstrap resampling process is that random forests have an out-of-bag (OOB) sample
# that provides a reasonable approximation of the test error. This provides a built-in validation set without any extra work on your part.
# However, REMEMBER: this is less efficient than changing several hyperparameters and doing a k-fold cross-validation as we will see in the next weeks!
str(RF$err.rate)
RF$err.rate[nrow(RF$err.rate),] # our final error (overall, and for each of the two classes: irrelevant and relevant tweets)
plot(RF, type = "l", col = c("red", "black","springgreen4"), main = "Class errors in Random Forest Model / OOB")
legend("topright", horiz = F, cex = 0.7, fill = c( "black", "red", "springgreen4"),
c( "No-disaster tweets","Average error", "Disaster tweets"))
error <- as.data.frame(RF$err.rate)
# number of trees with the lowest OOB error
which.min(error$OOB) # 441
set.seed(123)
system.time(RF2 <- randomForest(y= as.factor(Dfm_train@docvars$choose_one), x=train, importance=TRUE, ntree=441, do.trace=TRUE))
RF$err.rate[nrow(RF$err.rate),] # our final error (overall, and for each of the two classes)
RF2$err.rate[nrow(RF2$err.rate),] # our final error (overall, and for each of the two classes)
# Now let's do some Global Interpretation for our RF!
# What about the importance of each feature for our trained model?
# Variable importance is a standard metric in machine learning, which indicates the amount of information a variable provides
# to the model for predicting the outcome
head(RF$importance[,3:4])
# let's grap the result
varImpPlot(RF )
# Each features’s importance is assessed based on two criteria:
# -MeanDecreaseAccuracy: gives a rough estimate of the loss in prediction performance when that particular variable is omitted from the training set.
# Caveat: if two variables are somewhat redundant, then omitting one of them may not lead to massive gains in prediction performance,
# but would make the second variable more important.
# -MeanDecreaseGini: GINI is a measure of node impurity. Think of it like this: if you use this feature to split the data, how pure will the nodes be?
# Highest purity means that each node contains only elements of a single class.
# Assessing the decrease in GINI when that feature is omitted leads to an understanding of how important that feature is to split the data correctly.
# let's extract the matrix for GINI and Accuracy
importance_RF <- as.data.frame(RF$importance[,3:4])
str(importance_RF)
importance_RF$Feature<- row.names(importance_RF)
str(importance_RF)
# same words we get with
varImpPlot(RF )
print(head(importance_RF[order(importance_RF$MeanDecreaseGini, decreasing=TRUE),]))
# Note that these measures are used to rank variables in terms of importance and, thus, their absolute values could be disregarded.
# As a result, we just get for example the GINI statistics overall w/o differentiating the words most important for specific classes.
# Which are however the most important words for the relevant label? And for the irrelevant one?
# let's predict the training-set texts and let's store a new variable in our dfm of the training-set with such predictions.
# Why that? Cause we will use such variable to identify the "sign" of the most important features
predicted_rf <- predict(RF, train, type="class")
table(predicted_rf )
Dfm_train@docvars$predRF <- predicted_rf
str(Dfm_train@docvars)
# let's assign a feature to the relevant/irrelevant class according to its frequency (if, for example, the word "collapse" appears
# 10 times among predicted relevant tweets and 5 times among predicted irrelevant tweets,
# we classify it as pertaining to the "relevant tweets" world; and so on)
# let's estimate the number of times a word appear in those tweets that we have classified as related to accidents or otherwise
sums <- list()
for (v in 0:1){
sums[[v+1]] <- colSums(train[Dfm_train@docvars[,"predRF"]==v,])
}
str(sums)
sums <- do.call(cbind, sums) # do.call allows to use cbind on a list (i.e., collection of objects of different types as in the case of sums)
sums
# let's apply the sign (either 1 or 2) if, respectively, a word appears more in the tweets classified as not related to accidents or related to them
sign <- apply(sums, 1, which.max)
# get the feature
names <- dimnames(train)[[2]]
str(names)
df <- data.frame(
Feature = names,
sign = sign-1, # let's recode the sign between 0 and 1
stringsAsFactors=F)
str(df)
importanceRF <- merge(importance_RF, df, by="Feature")
str(importanceRF)
## best predictors
for (v in 0:1){
cat("\n\n")
cat("value==", v)
importanceRF <- importanceRF [order(importanceRF $MeanDecreaseGini, decreasing=TRUE),]
print(head(importanceRF [importanceRF $sign==v,], n=10))
cat("\n")
cat(paste(unique(head(importanceRF $Features[importanceRF $sign==v], n=10)), collapse=", "))
}
# let's draw a graph with our results focusing on MeanDecreaseGini
importanceRF$Gini <- ifelse(importanceRF $sign>0,importanceRF $MeanDecreaseGini,importanceRF $MeanDecreaseGini*-1)
# the twop 20 relevant words
importance2RF <- top_n(importanceRF, 20, Gini )
importance2RF
# the twop 20 irrelevant words
importance3RF <- top_n(importanceRF, -20, Gini )
importance3RF
importance_newRF <- rbind(importance2RF, importance3RF)
str(importance_newRF)
# reorder the features
importance_newRF <- mutate(importance_newRF, Feature= reorder(Feature, Gini ))
importance_newRF$sign2<- ifelse(importance_newRF$sign>0,"relevant","irrelevant")
ggplot(importance_newRF, aes(Feature, Gini, fill = sign2)) +
geom_bar(stat="identity", fill= "white") + theme_classic()+
geom_col(show.legend = T) + theme(legend.position="bottom")+
coord_flip() +
ylab("Mean Decrease Gini coefficient") +
scale_fill_manual(values = c("#000033", "#006699")) +
labs(title = "Disaster tweets",
subtitle = "Irrelevant (-) versus Relevant (+) words - Random Forest Model",
fill = "Disaster")
# let's store the graph
RF_graph <- ggplot(importance_newRF, aes(Feature, Gini, fill = sign2)) +
geom_bar(stat="identity", fill= "white") + theme_classic()+
geom_col(show.legend = T) + theme(legend.position="bottom")+
coord_flip() +
ylab("Mean Decrease Gini coefficient") +
scale_fill_manual(values = c("#000033", "#006699")) +
labs(title = "Disaster tweets",
subtitle = "Irrelevant (-) versus Relevant (+) words - Random Forest Model",
fill = "Disaster")
# let's FINALLY predict the test-set
system.time(predicted_rf <- predict(RF, test,type="class"))
table(predicted_rf )
prop.table(table(predicted_rf))
#### let's do the same thing while working with a data.frame rather than a matrix as above!
train2 <- convert(Dfm_train, to="data.frame")
class(train2)
test2 <- convert(test_dfm, to="data.frame")
class(test2)
# As a data frame we have to write below train[-1] cause the first column in our data frame refers now to doc_id, not to a feature!
head(colnames(train2))
# when dealing with matrices we write - as above:
# set.seed(123)
# system.time(RF <- randomForest(y= as.factor(Dfm_train@docvars$choose_one), x=train, importance=TRUE, do.trace=TRUE, ntree=500))
# when dealing with data.frame we write:
set.seed(123)
RF2 <- randomForest(as.factor(Dfm_train@docvars$choose_one) ~ ., data = train2[-1], importance=TRUE, do.trace=TRUE, ntree=500)
# why we get an error? cause the package randomForest does not want features that begin with @, #, etc. when you use a data.frame
# (while it is fine when using a matrix...)
# the command make.names allows us to automatically place the character "X" in such cases at the beginning of a feature
# while all "invalid" characters are translated to "."
# By adding unique=TRUE the two words @UKIP and #UKIP will be treated differently!
colnames(train2) <- make.names(colnames(train2), unique = TRUE)
colnames(train2)
colnames(test2) <- make.names(colnames(test2), unique = TRUE)
colnames(test2)
set.seed(123)
RF2 <- randomForest(as.factor(Dfm_train@docvars$choose_one) ~ ., data = train2[-1], importance=TRUE, do.trace=TRUE, ntree=500)
predicted_rf2 <- predict(RF2 , test2, type="class")
# basically the same result (at the third decimal values) as when working with a matrix of course!
round(prop.table(table(predicted_rf2 )),3)
round(prop.table(table(predicted_rf )),3)
######################################################
######################################################
# Let's compare the results out-of-sample we got via Naive Bayes & RF
######################################################
######################################################
prop.table(table(predicted_nb ))
prop.table(table(predicted_rf ))
results <- as.data.frame(rbind(prop.table(table(predicted_nb )), prop.table(table(predicted_rf ))))
str(results)
results$algorithm <- c("NB", "RF")
str(results)
# Let's plot the results!
library(reshape2)
df.long<-melt(results,id.vars=c("algorithm"))
str(df.long)
ggplot(df.long,aes(algorithm,value,fill=variable))+
geom_bar(position="dodge",stat="identity") + theme(axis.text.x = element_text(color="#993333", size=10, angle=90)) + coord_flip() +
ylab(label="Review class in the test-set") + xlab("algorithm") + scale_fill_discrete(name = "Prediction", labels = c("Irrelevant", "Relevant"))
# Similar but not exactly the same results for the prediction!
# And so, which result is to trust more than the other one?
# For answering to that, we need to introduce the Cross-Validation procedure...
#################################################
# let's compare feature importance across models
#################################################
library(gridExtra)
grid.arrange(NB_graph, RF_graph, nrow=2) # Plot everything together
# let's check some correlation
# let's merge the dataframe
words <- merge(NB_prob, importanceRF, by="Feature")
str(words)
library(PerformanceAnalytics)
chart.Correlation(words[c(5, 7:8)])