rm(list=ls(all=TRUE))
setwd("C:/Users/luigi/Dropbox/TOPIC MODEL")
getwd()
library(quanteda)
library(readtext)
library(caTools)
library(randomForest)
library(caret)
library(naivebayes)
library(car)
library(ggplot2)
library(dplyr)
library(reshape2)
# The data we will be using are some English social media disaster tweets discussed in
# this article: https://arxiv.org/pdf/1705.02009.pdf
# It consist of a number of tweets regarding accidents mixed in with a selection control tweets (not about accidents)
#####################################################
# FIRST STEP: let's create the DfM for the training-set
#####################################################
x <- read.csv("train_disaster.csv", stringsAsFactors=FALSE)
str(x)
# class-label variable: choose_one (0=tweets not relevant; 1=relevant tweets)
table(x$choose_one)
prop.table(table(x$choose_one))
nrow(x)
myCorpusTwitterTrain <- corpus(x)
tok2 <- tokens(myCorpusTwitterTrain , remove_punct = TRUE, remove_numbers=TRUE, remove_symbols = TRUE, split_hyphens = TRUE, remove_separators = TRUE)
tok2 <- tokens_remove(tok2, stopwords("en"))
# let's also remove the unicode symbols
tok2 <- tokens_remove(tok2, c("0*"))
tok2 <- tokens_wordstem (tok2)
Dfm_train <- dfm(tok2)
# Let's trim the dfm in order to keep only tokens that appear in 2 or more tweets (tweets are short texts!)
# and let's keep only features with at least 2 characters
Dfm_train <- dfm_trim(Dfm_train , min_docfreq = 2, verbose=TRUE)
Dfm_train <- dfm_remove(Dfm_train , min_nchar = 2)
topfeatures(Dfm_train , 20) # 20 top words
#####################################################
# SECOND STEP: let's create the DfM for the test-set
#####################################################
x10 <- read.csv("test_disaster.csv", stringsAsFactors=FALSE)
str(x10)
nrow(x10)
myCorpusTwitterTest <- corpus(x10)
tok <- tokens(myCorpusTwitterTest , remove_punct = TRUE, remove_numbers=TRUE, remove_symbols = TRUE, split_hyphens = TRUE, remove_separators = TRUE)
tok <- tokens_remove(tok, stopwords("en"))
tok <- tokens_remove(tok, c("0*"))
tok <- tokens_wordstem (tok)
Dfm_test <- dfm(tok)
Dfm_test<- dfm_trim(Dfm_test, min_docfreq = 2, verbose=TRUE)
Dfm_test<- dfm_remove(Dfm_test, min_nchar = 2)
topfeatures(Dfm_test , 20) # 20 top words
#####################################################
# THIRD STEP: Let's make the features identical between train and test-set by passing Dfm_train to dfm_match() as a pattern.
# after this step, we can "predict" by employing only the features included in the training-set
#####################################################
setequal(featnames(Dfm_train), featnames(Dfm_test))
length(Dfm_test@Dimnames$features)
length(Dfm_train@Dimnames$features)
test_dfm <- dfm_match(Dfm_test, features = featnames(Dfm_train))
length(test_dfm@Dimnames$features)
setequal(featnames(Dfm_train), featnames(test_dfm ))
#####################################################
# FOURTH STEP: Let's convert the two DfMs into matrices for the ML algorithms to work
#####################################################
train <- as.matrix(Dfm_train)
test <- as.matrix(test_dfm)
#####################################################
# FIFHT STEP: let's estimate a ML Model
#####################################################
#####################################################
# Let's start with a Naive Bayes Model
#####################################################
# we will use the naivebayes package. Another possibile package you can consider is the fastNaiveBayes package
# given our training-set, we have to use a Multinomial rather rather than a Binomial distribution given that our
# features can assume a value different than just 0/1, i.e., a one-hot-encoding. Indeed:
table(Dfm_train@x )
# to run a Binomial model with naivebayes just replace "multinomial_naive_bayes" with "bernoulli_naive_bayes" in the below command
# Which is our DV?
str(Dfm_train@docvars$choose_one) # that's an integer variable. Not good for a classification! It should always be a factor variable
# So we will use as.factor to transform it into a factor variable
# Note that instead of y=as.factor(Dfm_train@docvars$choose_one) in the formula of the NB we could have used
# y=as.factor(x$choose_one) of course!
system.time(NB <- multinomial_naive_bayes(x=train, y=as.factor(Dfm_train@docvars$choose_one), laplace = 1))
summary(NB)
prop.table(table(Dfm_train@docvars$choose_one)) # prior probabilities
# let's see the association between words and probabilities (i.e., matrix with class conditional parameter estimates).
# take a look at "casualti" and "cream". The former increases the probability of a relevant tweet (=1) compared to the latter one (=0)
head(NB$params)
# Let's investigate about this issue a bit more. Why should we care about it?
# Often, ML models are considered “black boxes” due to their complex inner-workings. However, because of their complexity,
# they are typically more accurate for predicting nonlinear or rare phenomena. Unfortunately, more accuracy often comes at the
# expense of interpretability, and interpretability is crucial. It is not enough to identify a ML model that optimizes
# predictive performance; understanding and trusting model results is a hallmark of good (social and political) science!
# Luckily, several advancements have been made to aid in interpreting ML models over the years.
# Interpreting ML models is an emerging field that has become known as "Interpretable Machine Learning" (IML).
# Approaches to model interpretability can be broadly categorized as providing global or local explanations.
# Global interpretations help us understand the inputs and their entire modeled relationship with the prediction target.
# Local interpretations help us understand model predictions for a single row of data or a group of similar rows.
# Global interpretability in particular is about understanding how the model makes predictions, based on a holistic view of
# its features and how they influence the underlying model structure.
# It answers questions regarding which features are relatively influential as well as how these features influence the response variable.
# In this latter case, we assess the magnitude of a variable’s relationship with the response as compared to other variables used in the model.
# In text analytics (given that we are dealining with huge DfM wherein the features per-se are not the main focus of our interest)
# we are mainly interested in global interpretations; however if you use ML for other aims, local interpretations become VERY important!!!
# So let's do some Global Interpretation for our NB!
NB_prob <- as.data.frame(NB$params)
NB_prob$Feature <- row.names(NB_prob)
str(NB_prob)
# let's estimate the features that change the most the difference between the relevant and irrelevant conditional probabilities
NB_prob$diff <- NB_prob[,2]-NB_prob[,1]
str(NB_prob)
# the features that present the highest absolute value in this difference can be also considered as the most important in
# affecting the overall performance of the algorithm
NB_prob$imp <- abs(NB_prob[,2]-NB_prob[,1])
str(NB_prob)
print(head(NB_prob[order(NB_prob$imp , decreasing=TRUE),], 15)) # most relevant words overall
# clearly, the features the present the highest positive values according to NB_prob$diff are the words
# most relevant for the "disaster" tweets (=1); those with the highest negative values are the words
# most relevant for the "irrelevant" tweets (=0)
print(head(NB_prob[order(NB_prob$diff , decreasing=TRUE),], 15)) # most relevant words for "disaster" tweets
print(head(NB_prob[order(NB_prob$diff , decreasing=FALSE),], 15)) # irrelevant words for the other tweets
NB_prob$sign <- ifelse(NB_prob$diff>0,"relevant","irrelevant")
str(NB_prob)
# let's extract the top 20-most irrelevant and the 20-most relevant contributing features
NB_prob2 <- top_n(NB_prob, 20, diff )
NB_prob2
NB_prob3 <- top_n(NB_prob, -20, diff )
NB_prob3
NB_prob_new <- rbind(NB_prob2, NB_prob3)
# reorder the features
NB_prob_new <- mutate(NB_prob_new, Feature= reorder(Feature, diff))
ggplot(NB_prob_new, aes(Feature, diff, fill = sign)) +
geom_bar(stat="identity", fill= "white") + theme_classic()+
geom_col(show.legend = T) + theme(legend.position="bottom")+
coord_flip() +
ylab("Difference in the conditional probabilities") +
scale_fill_manual(values = c("#000033", "#006699")) +
labs(title = "Disaster tweets",
subtitle = "Irrelevant (-) versus Relevant (+) words - Naive Bayes Model",
fill = "Disaster")
# let's store the graph
NB_graph <- ggplot(NB_prob_new, aes(Feature, diff, fill = sign)) +
geom_bar(stat="identity", fill= "white") + theme_classic()+
geom_col(show.legend = T) + theme(legend.position="bottom")+
coord_flip() +
ylab("Difference in the conditional probabilities") +
scale_fill_manual(values = c("#000033", "#006699")) +
labs(title = "Disaster tweets",
subtitle = "Irrelevant (-) versus Relevant (+) words - Naive Bayes Model",
fill = "Disaster")
# let's FINALLY predict the test-set
predicted_nb <- predict(NB ,test )
table(predicted_nb )
prop.table(table(predicted_nb ))
#####################################################
# let's run a Random Forest
#####################################################
# we will use the randomForest package. Another interesting pacakge you could explore for running a RF is the ranger package.
# note that as hyperameter (or tuning-parameter), I select a specific value for the number of trees. A RF has also other hyperparameters.
# More on this next week!
set.seed(123) # (define a set.seed for being able to replicate the results!)
system.time(RF <- randomForest(y= as.factor(Dfm_train@docvars$choose_one), x=train, importance=TRUE, do.trace=TRUE, ntree=500))
# As discussed yesterday, a natural benefit of the bootstrap resampling process is that random forests have an out-of-bag (OOB) sample
# that provides a reasonable approximation of the test error. This provides a built-in validation set without any extra work on your part.
# However, REMEMBER: this is less efficient than changing several hyperparameters and doing a k-fold cross-validation as we will see in the next weeks!
str(RF$err.rate)
RF$err.rate[nrow(RF$err.rate),] # our final error (overall, and for each of the two classes: irrelevant and relevant tweets)
plot(RF) # the errors for the two classes (red=irrelevant; green=relevant). We make a much greater error for the latter class); in black the average
error <- as.data.frame(RF$err.rate)
# number of trees with the lowest OOB error
which.min(error$OOB) # 441
set.seed(123)
system.time(RF2 <- randomForest(y= as.factor(Dfm_train@docvars$choose_one), x=train, importance=TRUE, ntree=441, do.trace=TRUE))
RF$err.rate[nrow(RF$err.rate),] # our final error (overall, and for each of the two classes)
RF2$err.rate[nrow(RF2$err.rate),] # our final error (overall, and for each of the two classes)
# Now let's do some Global Interpretation for our RF!
# What about the importance of each feature for our trained model?
# Variable importance is a standard metric in machine learning, which indicates the amount of information a variable provides
# to the model for predicting the outcome
head(RF$importance[,3:4])
# let's grap the result
varImpPlot(RF )
# Each features’s importance is assessed based on two criteria:
# -MeanDecreaseAccuracy: gives a rough estimate of the loss in prediction performance when that particular variable is omitted from the training set.
# Caveat: if two variables are somewhat redundant, then omitting one of them may not lead to massive gains in prediction performance,
# but would make the second variable more important.
# -MeanDecreaseGini: GINI is a measure of node impurity. Think of it like this: if you use this feature to split the data, how pure will the nodes be?
# Highest purity means that each node contains only elements of a single class.
# Assessing the decrease in GINI when that feature is omitted leads to an understanding of how important that feature is to split the data correctly.
# Plz note that these measures are used to rank variables in terms of importance and, thus, their absolute values could be disregarded.
# The problem is that we just get for example the GINI statistics overall w/o differentiating the words most important for specific classes.
# Which are however the most important words for the relevant label? And for the irrelevant one?
# let's extract the matrix for GINI and Accuracy
importance_RF <- as.data.frame(RF$importance[,3:4])
str(importance_RF)
importance_RF$Feature<- row.names(importance_RF)
str(importance_RF)
# same words we get with
varImpPlot(RF )
print(head(importance_RF[order(importance_RF$MeanDecreaseGini, decreasing=TRUE),]))
# let's predict the training-set texts and let's store a new variable in our dfm of the training-set with such predictions.
# Why that? Cause we will use such variable to identify the "sign" of the most important features
predicted_rf <- predict(RF, train, type="class")
table(predicted_rf )
Dfm_train@docvars$predRF <- predicted_rf
str(Dfm_train@docvars)
# let's assign a feature to the relevant/irrelevant class according to its frequency (if, for example, the word "collapse" appears
# 10 times among predicted relevant tweets and 5 times among predicted irrelevant tweets,
# we classify it as pertaining to the "relevant tweets" world; and so on)
# let's estimate the number of times a word appear in those tweets that we have classified as related to accidents or otherwise
sums <- list()
for (v in 0:1){
sums[[v+1]] <- colSums(train[Dfm_train@docvars[,"predRF"]==v,])
}
sums <- do.call(cbind, sums)
sums
# let's apply the sign (either 1 or 2) if, respectively, a word appears more in the tweets classified as not related to accidents or related to them
sign <- apply(sums, 1, which.max)
# get the feature
names <- dimnames(train)[[2]]
str(names)
df <- data.frame(
Feature = names,
sign = sign-1, # let's recode the sign between 0 and 1
stringsAsFactors=F)
str(df)
importanceRF <- merge(importance_RF, df, by="Feature")
str(importanceRF)
## best predictors
for (v in 0:1){
cat("\n\n")
cat("value==", v)
importanceRF <- importanceRF [order(importanceRF $MeanDecreaseGini, decreasing=TRUE),]
print(head(importanceRF [importanceRF $sign==v,], n=10))
cat("\n")
cat(paste(unique(head(importanceRF $Features[importanceRF $sign==v], n=10)), collapse=", "))
}
# let's draw a graph with our results focusing on MeanDecreaseGini
importanceRF$Gini <- ifelse(importanceRF $sign>0,importanceRF $MeanDecreaseGini,importanceRF $MeanDecreaseGini*-1)
# the twop 20 relevant words
importance2RF <- top_n(importanceRF, 20, Gini )
importance2RF
# the twop 20 irrelevant words
importance3RF <- top_n(importanceRF, -20, Gini )
importance3RF
importance_newRF <- rbind(importance2RF, importance3RF)
str(importance_newRF)
# reorder the features
importance_newRF <- mutate(importance_newRF, Feature= reorder(Feature, Gini ))
importance_newRF$sign2<- ifelse(importance_newRF$sign>0,"relevant","irrelevant")
ggplot(importance_newRF, aes(Feature, Gini, fill = sign2)) +
geom_bar(stat="identity", fill= "white") + theme_classic()+
geom_col(show.legend = T) + theme(legend.position="bottom")+
coord_flip() +
ylab("Mean Decrease Gini coefficient") +
scale_fill_manual(values = c("#000033", "#006699")) +
labs(title = "Disaster tweets",
subtitle = "Irrelevant (-) versus Relevant (+) words - Random Forest Model",
fill = "Disaster")
# let's store the graph
RF_graph <- ggplot(importance_newRF, aes(Feature, Gini, fill = sign2)) +
geom_bar(stat="identity", fill= "white") + theme_classic()+
geom_col(show.legend = T) + theme(legend.position="bottom")+
coord_flip() +
ylab("Mean Decrease Gini coefficient") +
scale_fill_manual(values = c("#000033", "#006699")) +
labs(title = "Disaster tweets",
subtitle = "Irrelevant (-) versus Relevant (+) words - Random Forest Model",
fill = "Disaster")
# let's FINALLY predict the test-set
system.time(predicted_rf <- predict(RF, test,type="class"))
table(predicted_rf )
prop.table(table(predicted_rf))
######################################################
######################################################
# Let's compare the results out-of-sample we got via Naive Bayes & RF
######################################################
######################################################
prop.table(table(predicted_nb ))
prop.table(table(predicted_rf ))
results <- as.data.frame(rbind(prop.table(table(predicted_nb )), prop.table(table(predicted_rf ))))
str(results)
results$algorithm <- c("NB", "RF")
str(results)
# Let's plot the results!
library(reshape2)
df.long<-melt(results,id.vars=c("algorithm"))
str(df.long)
ggplot(df.long,aes(algorithm,value,fill=variable))+
geom_bar(position="dodge",stat="identity") + theme(axis.text.x = element_text(color="#993333", size=10, angle=90)) + coord_flip() +
ylab(label="Review class in the test-set") + xlab("algorithm") + scale_fill_discrete(name = "Prediction", labels = c("Irrelevant", "Relevant"))
# and so, which result is to trust more than the other one? For answering to that, we need to introduce the Cross-Validation procedure...next week!
#################################################
# let's compare feature importance across models
#################################################
library(gridExtra)
grid.arrange(NB_graph, RF_graph, nrow=2) # Plot everything together
# let's check some correlation
str(NB_prob)
str(importanceRF)
# let's merge the dataframe (w/o XGB for the moment)
words <- merge(NB_prob, importanceRF, by="Feature")
str(words)
library(PerformanceAnalytics)
chart.Correlation(words[c(5, 7:8)])