rm(list=ls(all=TRUE))
setwd("C:/Users/luigi/Dropbox/TOPIC MODEL")
getwd()

library(quanteda)
library(readtext)
library(caTools)
library(caret)
library(naivebayes)
library(car)
library(ggplot2)
library(dplyr)
library(reshape2)

# The data we will be using are English social media disaster tweets discussed in
# this article: https://arxiv.org/pdf/1705.02009.pdf
# It consists of a number of tweets about accidents mixed with a selection of control tweets (not about accidents)

#####################################################
# FIRST STEP: let's create the DfM for the training-set
#####################################################

x <- read.csv("train_disaster.csv", stringsAsFactors = FALSE)
str(x)

# class-label variable: choose_one (0 = tweets not relevant; 1 = relevant tweets)
table(x$choose_one)
prop.table(table(x$choose_one))
nrow(x)

myCorpusTwitterTrain <- corpus(x)
tok2 <- tokens(myCorpusTwitterTrain, remove_punct = TRUE, remove_numbers = TRUE,
               remove_symbols = TRUE, split_hyphens = TRUE,
               remove_separators = TRUE, remove_url = TRUE)
tok2 <- tokens_remove(tok2, stopwords("en"))
# let's also remove the unicode symbols
tok2 <- tokens_remove(tok2, c("0*"))
tok2 <- tokens_wordstem(tok2)
Dfm_train <- dfm(tok2)

# Let's trim the DfM so as to keep only features that appear in 2 or more tweets (tweets are short texts!)
# and let's keep only features with at least 2 characters
Dfm_train <- dfm_trim(Dfm_train, min_docfreq = 2, verbose = TRUE)
Dfm_train <- dfm_remove(Dfm_train, min_nchar = 2)
topfeatures(Dfm_train, 20)  # 20 top words

# Always check whether, after trimming, some texts are left with just 0s!
# 6 in this case
Dfm_train[ntoken(Dfm_train) == 0, ]
Dfm_train <- Dfm_train[ntoken(Dfm_train) != 0, ]
Dfm_train[ntoken(Dfm_train) == 0, ]

#####################################################
# SECOND STEP: let's create the DfM for the test-set
#####################################################

x10 <- read.csv("test_disaster.csv", stringsAsFactors = FALSE)
str(x10)
nrow(x10)

myCorpusTwitterTest <- corpus(x10)
tok <- tokens(myCorpusTwitterTest, remove_punct = TRUE, remove_numbers = TRUE,
              remove_symbols = TRUE, split_hyphens = TRUE,
              remove_separators = TRUE, remove_url = TRUE)
tok <- tokens_remove(tok, stopwords("en"))
tok <- tokens_remove(tok, c("0*"))
tok <- tokens_wordstem(tok)
Dfm_test <- dfm(tok)
Dfm_test <- dfm_trim(Dfm_test, min_docfreq = 2, verbose = TRUE)
Dfm_test <- dfm_remove(Dfm_test, min_nchar = 2)
topfeatures(Dfm_test, 20)  # 20 top words

# Always check whether, after trimming, some texts are left with just 0s!
# 11 in this case
Dfm_test[ntoken(Dfm_test) == 0, ]
Dfm_test <- Dfm_test[ntoken(Dfm_test) != 0, ]
Dfm_test[ntoken(Dfm_test) == 0, ]
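# OPTIONAL (not in the original workflow): since the train and test pipelines above are identical,
# a hypothetical helper such as build_dfm() would keep the two preprocessing chains from drifting apart.
# This is just a sketch using the same settings as above, not a replacement for the code you just ran.
build_dfm <- function(csv_file) {
  dat <- read.csv(csv_file, stringsAsFactors = FALSE)
  toks <- tokens(corpus(dat), remove_punct = TRUE, remove_numbers = TRUE,
                 remove_symbols = TRUE, split_hyphens = TRUE,
                 remove_separators = TRUE, remove_url = TRUE)
  toks <- tokens_remove(toks, stopwords("en"))
  toks <- tokens_remove(toks, c("0*"))   # unicode leftovers, as above
  toks <- tokens_wordstem(toks)
  out <- dfm(toks)
  out <- dfm_trim(out, min_docfreq = 2, verbose = TRUE)
  out <- dfm_remove(out, min_nchar = 2)
  out[ntoken(out) != 0, ]                # drop texts left with only 0s
}
# usage (equivalent to the two steps above):
# Dfm_train <- build_dfm("train_disaster.csv"); Dfm_test <- build_dfm("test_disaster.csv")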
#####################################################
# THIRD STEP: let's make the features identical between train and test-set by passing Dfm_train
# to dfm_match() as a pattern. After this step, we can "predict" by employing only the features
# included in the training-set. This step is always advisable whenever the training-set and the
# test-set come from two separate datasets
#####################################################

setequal(featnames(Dfm_train), featnames(Dfm_test))
length(Dfm_test@Dimnames$features)
length(Dfm_train@Dimnames$features)
test_dfm <- dfm_match(Dfm_test, features = featnames(Dfm_train))
length(test_dfm@Dimnames$features)
setequal(featnames(Dfm_train), featnames(test_dfm))

#####################################################
# FOURTH STEP: let's convert the two DfMs into matrices for the ML algorithms to work
#####################################################

train <- as.matrix(Dfm_train)
test <- as.matrix(test_dfm)

#####################################################
# FIFTH STEP: let's estimate a ML model
#####################################################

#####################################################
# Let's start with a Naive Bayes model
#####################################################

# We will use the naivebayes package. Another possible package you can consider is the fastNaiveBayes package.
# Given our training-set, we have to use a Multinomial rather than a Bernoulli distribution, given that our
# features can take values other than just 0/1 (i.e., they are raw counts, not a one-hot encoding). Indeed:
table(Dfm_train@x)
# To run a Bernoulli model with naivebayes, just replace "multinomial_naive_bayes" with "bernoulli_naive_bayes" in the command below

# Which is our DV?
str(Dfm_train@docvars$choose_one)
# That's an integer variable. Not good for a classification! It should always be a factor variable,
# so we will use as.factor to transform it into a factor variable.
# Note that instead of y=as.factor(Dfm_train@docvars$choose_one) in the NB call we could of course have used
# y=as.factor(x$choose_one)! However, I would always suggest extracting the DV from the docvars of the DfM,
# especially if you have trimmed it...
system.time(NB <- multinomial_naive_bayes(x = train, y = as.factor(Dfm_train@docvars$choose_one), laplace = 1))
summary(NB)
prop.table(table(Dfm_train@docvars$choose_one))  # prior probabilities

# Let's look at the association between words and probabilities (i.e., the matrix of class-conditional
# parameter estimates - i.e., the likelihood!).
# Take a look at "casualti" and "cream". The likelihood of the former is higher for a tweet discussing a disaster (=1)
# than for an irrelevant tweet (=0), i.e., p(casualti|1) > p(casualti|0); the opposite happens for the word "cream"
head(NB$params)
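# To make the mechanics of the Multinomial NB concrete, here is a minimal sketch (not part of the original
# workflow) that scores a single test tweet by hand: the class score is the log prior plus the word counts
# multiplied by the log class-conditional probabilities stored in NB$params (rows = features, columns = classes).
# It uses the first row of the test matrix; the column order of test matches the row order of NB$params
# because test_dfm was matched to the training features.
prior <- prop.table(table(Dfm_train@docvars$choose_one))   # p(class), from the training labels (ordered "0", "1")
log_scores <- test[1, , drop = FALSE] %*% log(NB$params)   # 1 x 2 matrix: sum of count * log p(word | class)
log_scores <- log_scores + as.numeric(log(prior))          # add the log prior (same class order as the columns)
log_scores
colnames(log_scores)[which.max(log_scores)]                # manually predicted class
predict(NB, test[1, , drop = FALSE])                       # should agree with the package's own prediction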
# Let's investigate this issue a bit more. Why should we care about it?
# Often, ML models are considered "black boxes" due to their complex inner workings. However, because of their complexity,
# they are typically more accurate for predicting nonlinear or rare phenomena. Unfortunately, more accuracy often comes at the
# expense of interpretability, and interpretability is crucial. It is not enough to identify a ML model that optimizes
# predictive performance; understanding and trusting model results is a hallmark of good (social and political) science!
# Luckily, several advancements have been made over the years to aid in interpreting ML models.
# Interpreting ML models is an emerging field that has become known as "Interpretable Machine Learning" (IML).
# Approaches to model interpretability can be broadly categorized as providing global or local explanations.
# Global interpretations help us understand the inputs and their entire modeled relationship with the prediction target.
# Local interpretations help us understand model predictions for a single row of data or a group of similar rows.
# Global interpretability in particular is about understanding how the model makes predictions, based on a holistic view of
# its features and how they influence the underlying model structure.
# It answers questions about which features are relatively influential, as well as how these features influence the response variable.
# In this latter case, we assess the magnitude of a variable's relationship with the response as compared to other variables used in the model.
# In text analytics (given that we are dealing with huge DfMs wherein the features per se are not the main focus of our interest)
# we are mainly interested in global interpretations; however, if you use ML for other aims, local interpretations become VERY important!!!

# So let's do some global interpretation for our NB!
NB_prob <- as.data.frame(NB$params)
NB_prob$Feature <- row.names(NB_prob)
str(NB_prob)

# let's compute, for each feature, the difference between the relevant and irrelevant conditional probabilities
NB_prob$diff <- NB_prob[,2] - NB_prob[,1]
str(NB_prob)
# the features that present the highest absolute value of this difference can also be considered the most important in
# affecting the overall performance of the algorithm
NB_prob$imp <- abs(NB_prob[,2] - NB_prob[,1])
str(NB_prob)
print(head(NB_prob[order(NB_prob$imp, decreasing=TRUE),], 15))    # most relevant words overall

# clearly, the features with the highest positive values of NB_prob$diff are the words
# most relevant for the "disaster" tweets (=1); those with the highest negative values are the words
# most relevant for the "irrelevant" tweets (=0)
print(head(NB_prob[order(NB_prob$diff, decreasing=TRUE),], 15))   # most relevant words for "disaster" tweets
print(head(NB_prob[order(NB_prob$diff, decreasing=FALSE),], 15))  # most relevant words for the other (irrelevant) tweets

NB_prob$sign <- ifelse(NB_prob$diff > 0, "relevant", "irrelevant")
str(NB_prob)

# let's extract the 20 most relevant and the 20 most irrelevant contributing features
NB_prob2 <- top_n(NB_prob, 20, diff)
NB_prob2
NB_prob3 <- top_n(NB_prob, -20, diff)
NB_prob3
NB_prob_new <- rbind(NB_prob2, NB_prob3)
# reorder the features
NB_prob_new <- mutate(NB_prob_new, Feature = reorder(Feature, diff))

ggplot(NB_prob_new, aes(Feature, diff, fill = sign)) +
  geom_col(show.legend = TRUE) +
  theme_classic() +
  theme(legend.position = "bottom") +
  coord_flip() +
  ylab("Difference in the conditional probabilities") +
  scale_fill_manual(values = c("#000033", "#006699")) +
  labs(title = "Disaster tweets",
       subtitle = "Irrelevant (-) versus Relevant (+) words - Naive Bayes Model",
       fill = "Disaster")

# let's FINALLY predict the test-set
predicted_nb <- predict(NB, test)
table(predicted_nb)
prop.table(table(predicted_nb))
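# OPTIONAL (a sketch, not part of the original script): if test_disaster.csv also carries the gold
# "choose_one" label (an assumption; it may not), you can evaluate the predictions with caret.
# The reference labels must be taken from the docvars of test_dfm, because the tweets left empty after
# trimming were dropped, so the original x10 data frame is no longer aligned with the predictions.
if ("choose_one" %in% names(docvars(test_dfm))) {
  truth <- as.factor(docvars(test_dfm)$choose_one)
  print(confusionMatrix(predicted_nb, truth, positive = "1"))
}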