rm(list=ls(all=TRUE))
setwd("C:/Users/luigi/Dropbox/TOPIC MODEL")
getwd()
library(quanteda)
library(readtext)
library(caTools)
library(caret)
library(naivebayes)
library(car)
library(ggplot2)
library(dplyr)
library(reshape2)
# The data we will be using are some English social media disaster tweets discussed in
# this article: https://arxiv.org/pdf/1705.02009.pdf
# The dataset consists of a number of tweets regarding accidents, mixed in with a selection of control tweets (not about accidents)
#####################################################
# FIRST STEP: let's create the DfM for the training-set
#####################################################
x <- read.csv("train_disaster.csv", stringsAsFactors=FALSE)
str(x)
# class-label variable: choose_one (0=tweets not relevant; 1=relevant tweets)
table(x$choose_one)
prop.table(table(x$choose_one))
nrow(x)
myCorpusTwitterTrain <- corpus(x)
tok2 <- tokens(myCorpusTwitterTrain, remove_punct = TRUE, remove_numbers = TRUE, remove_symbols = TRUE,
               split_hyphens = TRUE, remove_separators = TRUE, remove_url = TRUE)
tok2 <- tokens_remove(tok2, stopwords("en"))
# let's also remove the unicode symbols ("0*" is a glob pattern catching the leftover emoji/unicode codes, which all begin with "0")
tok2 <- tokens_remove(tok2, c("0*"))
tok2 <- tokens_wordstem(tok2)
Dfm_train <- dfm(tok2)
# Let's trim the dfm in order to keep only features that appear in 2 or more tweets (tweets are short texts!)
# and let's keep only features with at least 2 characters
Dfm_train <- dfm_trim(Dfm_train , min_docfreq = 2, verbose=TRUE)
Dfm_train <- dfm_select(Dfm_train, min_nchar = 2) # dfm_select() with min_nchar keeps the features with at least 2 characters
topfeatures(Dfm_train , 20) # 20 top words
# Always check whether, after trimming, you are left with texts containing only 0s!
# 6 texts in this case
Dfm_train[ntoken(Dfm_train) == 0,]
Dfm_train <- Dfm_train[ntoken(Dfm_train) != 0,]
Dfm_train[ntoken(Dfm_train) == 0,]
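# a quick sanity check (an addition to the original script): subsetting the dfm also subsets its docvars,
# so the class labels stay aligned with the remaining documents
ndoc(Dfm_train)
table(Dfm_train@docvars$choose_one)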
#####################################################
# SECOND STEP: let's create the DfM for the test-set
#####################################################
x10 <- read.csv("test_disaster.csv", stringsAsFactors=FALSE)
str(x10)
nrow(x10)
myCorpusTwitterTest <- corpus(x10)
tok <- tokens(myCorpusTwitterTest, remove_punct = TRUE, remove_numbers = TRUE, remove_symbols = TRUE,
              split_hyphens = TRUE, remove_separators = TRUE, remove_url = TRUE)
tok <- tokens_remove(tok, stopwords("en"))
tok <- tokens_remove(tok, c("0*"))
tok <- tokens_wordstem(tok)
Dfm_test <- dfm(tok)
Dfm_test <- dfm_trim(Dfm_test, min_docfreq = 2, verbose = TRUE)
Dfm_test <- dfm_select(Dfm_test, min_nchar = 2)
topfeatures(Dfm_test , 20) # 20 top words
# Always check whether, after trimming, you are left with texts containing only 0s!
# 11 texts in this case
Dfm_test[ntoken(Dfm_test) == 0,]
Dfm_test <- Dfm_test[ntoken(Dfm_test) != 0,]
Dfm_test[ntoken(Dfm_test) == 0,]
#####################################################
# THIRD STEP: let's make the features identical between the training and the test-set by passing Dfm_train to dfm_match() as a pattern.
# After this step, we can "predict" by employing only the features included in the training-set. This step is always advisable
# whenever we have two separate datasets for the training and the test-set
#####################################################
setequal(featnames(Dfm_train), featnames(Dfm_test))
length(featnames(Dfm_test))
length(featnames(Dfm_train))
test_dfm <- dfm_match(Dfm_test, features = featnames(Dfm_train))
length(featnames(test_dfm))
setequal(featnames(Dfm_train), featnames(test_dfm ))
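# a toy illustration of what dfm_match() just did (hypothetical mini-corpora, not our data):
toy_train <- dfm(tokens(c("fire in the city", "flood hits town")))
toy_test <- dfm(tokens(c("city fire alarm", "sunny day")))
dfm_match(toy_test, features = featnames(toy_train))
# the matched dfm has exactly the training features: unseen test words ("alarm", "sunny", "day")
# are dropped, and training words absent from the test texts get columns of 0s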
#####################################################
# FOURTH STEP: let's convert the two DfMs into matrices, as the ML algorithms below expect matrix input
#####################################################
train <- as.matrix(Dfm_train)
test <- as.matrix(test_dfm)
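# a side note (a sketch, not needed for this small dataset): with much larger DfMs, densifying can exhaust memory.
# The specialized naivebayes functions also accept sparse dgCMatrix objects, so you could keep everything sparse, e.g.:
# train_sparse <- as(Dfm_train, "dgCMatrix")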
#####################################################
# FIFTH STEP: let's estimate a ML model
#####################################################
#####################################################
# Let's start with a Naive Bayes Model
#####################################################
# we will use the naivebayes package. Another possible package you can consider is the fastNaiveBayes package
# given our training-set, we have to use a multinomial rather than a Bernoulli distribution, given that our
# features can assume values other than just 0/1 (i.e., they are counts, not a one-hot encoding). Indeed:
table(Dfm_train@x )
# to run a Bernoulli model with naivebayes, just replace "multinomial_naive_bayes" with "bernoulli_naive_bayes" in the command below
# (after binarizing the features first)
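# a minimal sketch of that Bernoulli alternative (an aside, not used in the rest of the script): we first
# binarize the dfm with dfm_weight(scheme = "boolean") so that every feature is 0/1, as the Bernoulli model requires
train01 <- as.matrix(dfm_weight(Dfm_train, scheme = "boolean"))
NB_bern <- bernoulli_naive_bayes(x = train01, y = as.factor(Dfm_train@docvars$choose_one), laplace = 1)
summary(NB_bern)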
# Which is our DV?
str(Dfm_train@docvars$choose_one) # that's an integer variable. Not good for a classification! It should always be a factor variable
# So we will use as.factor to transform it into a factor variable
# Note that instead of y=as.factor(Dfm_train@docvars$choose_one) in the NB call we could have used
# y=as.factor(x$choose_one) of course! However, I would always suggest extracting the DV from the docvars of the DfM,
# especially if you have trimmed it: above we dropped 6 empty documents from Dfm_train, so x$choose_one
# would no longer align with the rows of the dfm
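# a quick check of the transformation: the DV becomes a factor with the two levels "0" and "1"
levels(as.factor(Dfm_train@docvars$choose_one))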
system.time(NB <- multinomial_naive_bayes(x=train, y=as.factor(Dfm_train@docvars$choose_one), laplace = 1))
summary(NB)
prop.table(table(Dfm_train@docvars$choose_one)) # prior probabilities
# let's see the association between words and probabilities (i.e., the matrix of class-conditional parameter estimates - the likelihoods!).
# Take a look at "casualti" and "cream". The likelihood of the former is higher for tweets discussing a disaster (=1) than
# for irrelevant tweets (=0), i.e., p(casualti|1) > p(casualti|0); the opposite happens for the word "cream"
head(NB$params)
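# you can also look those two words up directly (assuming both survived trimming and stemming):
NB$params[rownames(NB$params) %in% c("casualti", "cream"), ]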
# Let's investigate this issue a bit more. Why should we care about it?
# Often, ML models are considered "black boxes" due to their complex inner workings. However, because of their complexity,
# they are typically more accurate for predicting nonlinear or rare phenomena. Unfortunately, more accuracy often comes at the
# expense of interpretability, and interpretability is crucial. It is not enough to identify a ML model that optimizes
# predictive performance; understanding and trusting model results is a hallmark of good (social and political) science!
# Luckily, several advancements have been made over the years to aid in interpreting ML models.
# Interpreting ML models is an emerging field that has become known as "Interpretable Machine Learning" (IML).
# Approaches to model interpretability can be broadly categorized as providing global or local explanations.
# Global interpretations help us understand the inputs and their entire modeled relationship with the prediction target.
# Local interpretations help us understand model predictions for a single row of data or a group of similar rows.
# Global interpretability in particular is about understanding how the model makes predictions, based on a holistic view of
# its features and of how they influence the underlying model structure.
# It answers questions about which features are relatively influential, as well as how these features influence the response variable.
# In the latter case, we assess the magnitude of a variable's relationship with the response as compared to the other variables in the model.
# In text analytics (given that we are dealing with huge DfMs, in which the features per se are not the main focus of our interest)
# we are mainly interested in global interpretations; however, if you use ML for other aims, local interpretations become VERY important!!!
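# a tiny "local" illustration first (a sketch, not part of the original pipeline): the per-class posterior
# probabilities that our already-trained NB assigns to a single test tweet
predict(NB, test[1, , drop = FALSE], type = "prob")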
# So let's do some Global Interpretation for our NB!
NB_prob <- as.data.frame(NB$params)
NB_prob$Feature <- row.names(NB_prob)
str(NB_prob)
# let's identify the features with the largest difference between the relevant and irrelevant conditional probabilities
NB_prob$diff <- NB_prob[,2]-NB_prob[,1]
str(NB_prob)
# the features that present the highest absolute value of this difference can also be considered the most important in
# affecting the overall performance of the algorithm
NB_prob$imp <- abs(NB_prob[,2]-NB_prob[,1])
str(NB_prob)
print(head(NB_prob[order(NB_prob$imp , decreasing=TRUE),], 15)) # most relevant words overall
# clearly, the features that present the highest positive values of NB_prob$diff are the words
# most relevant for the "disaster" tweets (=1); those with the highest negative values are the words
# most relevant for the "irrelevant" tweets (=0)
print(head(NB_prob[order(NB_prob$diff , decreasing=TRUE),], 15)) # most relevant words for "disaster" tweets
print(head(NB_prob[order(NB_prob$diff , decreasing=FALSE),], 15)) # most relevant words for "irrelevant" tweets
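# an alternative importance measure you may encounter (a sketch, not part of the original analysis): the log of the
# likelihood ratio, which weighs relative rather than absolute differences between the two class-conditional probabilities
NB_prob$logratio <- log(NB_prob[,2]/NB_prob[,1])
print(head(NB_prob[order(abs(NB_prob$logratio), decreasing=TRUE),], 15))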
NB_prob$sign <- ifelse(NB_prob$diff>0,"relevant","irrelevant")
str(NB_prob)
# let's extract the 20 most relevant and the 20 most irrelevant contributing features
NB_prob2 <- top_n(NB_prob, 20, diff )
NB_prob2
NB_prob3 <- top_n(NB_prob, -20, diff )
NB_prob3
NB_prob_new <- rbind(NB_prob2, NB_prob3)
# reorder the features
NB_prob_new <- mutate(NB_prob_new, Feature= reorder(Feature, diff))
ggplot(NB_prob_new, aes(Feature, diff, fill = sign)) +
  geom_col(show.legend = TRUE) +
  theme_classic() +
  theme(legend.position = "bottom") +
  coord_flip() +
  ylab("Difference in the conditional probabilities") +
  scale_fill_manual(values = c("#000033", "#006699")) +
  labs(title = "Disaster tweets",
       subtitle = "Irrelevant (-) versus Relevant (+) words - Naive Bayes Model",
       fill = "Disaster")
# let's FINALLY predict the test-set
predicted_nb <- predict(NB, test)
table(predicted_nb)
prop.table(table(predicted_nb))
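# if your test file also carries gold labels in choose_one (an assumption: check your csv!), you can evaluate
# the predictions with caret, which we loaded above. Remember that test_dfm kept its docvars, and that we dropped
# the empty test documents before matching, so labels and predictions stay aligned
confusionMatrix(predicted_nb, as.factor(test_dfm@docvars$choose_one), positive = "1")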