rm(list=ls(all=TRUE))
setwd("C:/Users/luigi/Dropbox/TOPIC MODEL")
getwd()
library(quanteda)
library(readtext)
library(caTools)
library(randomForest)
library(caret)
library(naivebayes)
library(car)
library(ggplot2)
library(dplyr)
library(reshape2)

# The data we will be using are some English social media disaster tweets discussed in
# this article: https://arxiv.org/pdf/1705.02009.pdf
# It consists of a number of tweets regarding accidents mixed in with a selection of control tweets (not about accidents)

#####################################################
# FIRST STEP: let's create the DfM for the training-set
#####################################################

x <- read.csv("train_disaster.csv", stringsAsFactors=FALSE)
str(x)
# class-label variable: choose_one (0=tweets not relevant; 1=relevant tweets)
table(x$choose_one)
prop.table(table(x$choose_one))
nrow(x)
myCorpusTwitterTrain <- corpus(x)
tok2 <- tokens(myCorpusTwitterTrain, remove_punct = TRUE, remove_numbers = TRUE,
               remove_symbols = TRUE, split_hyphens = TRUE, remove_separators = TRUE)
tok2 <- tokens_remove(tok2, stopwords("en"))
# let's also remove the unicode symbols
tok2 <- tokens_remove(tok2, c("0*"))
tok2 <- tokens_wordstem(tok2)
Dfm_train <- dfm(tok2)
# Let's trim the dfm in order to keep only tokens that appear in 2 or more tweets (tweets are short texts!)
# and let's keep only features with at least 2 characters
Dfm_train <- dfm_trim(Dfm_train, min_docfreq = 2, verbose = TRUE)
Dfm_train <- dfm_remove(Dfm_train, min_nchar = 2)
topfeatures(Dfm_train, 20)  # 20 top words

#####################################################
# SECOND STEP: let's create the DfM for the test-set
#####################################################

x10 <- read.csv("test_disaster.csv", stringsAsFactors=FALSE)
str(x10)
nrow(x10)
myCorpusTwitterTest <- corpus(x10)
tok <- tokens(myCorpusTwitterTest, remove_punct = TRUE, remove_numbers = TRUE,
              remove_symbols = TRUE, split_hyphens = TRUE, remove_separators = TRUE)
tok <- tokens_remove(tok, stopwords("en"))
tok <- tokens_remove(tok, c("0*"))
tok <- tokens_wordstem(tok)
Dfm_test <- dfm(tok)
Dfm_test <- dfm_trim(Dfm_test, min_docfreq = 2, verbose = TRUE)
Dfm_test <- dfm_remove(Dfm_test, min_nchar = 2)
topfeatures(Dfm_test, 20)  # 20 top words

#####################################################
# THIRD STEP: let's make the features identical between train and test-set by passing Dfm_train
# to dfm_match() as a pattern.
# After this step, we can "predict" by employing only the features included in the training-set
#####################################################

setequal(featnames(Dfm_train), featnames(Dfm_test))
length(Dfm_test@Dimnames$features)
length(Dfm_train@Dimnames$features)
test_dfm <- dfm_match(Dfm_test, features = featnames(Dfm_train))
length(test_dfm@Dimnames$features)
setequal(featnames(Dfm_train), featnames(test_dfm))

#####################################################
# FOURTH STEP: let's convert the two DfMs into matrices for the ML algorithms to work
#####################################################

train <- as.matrix(Dfm_train)
test <- as.matrix(test_dfm)
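# OPTIONAL ASIDE (a hedged sketch, not part of the original workflow): since the training-set and test-set
# must go through exactly the same pre-processing, one could wrap the steps above in a small helper function
# and call it on both corpora. The function name "prep_dfm" is purely illustrative.
prep_dfm <- function(corp) {
  tk <- tokens(corp, remove_punct = TRUE, remove_numbers = TRUE,
               remove_symbols = TRUE, split_hyphens = TRUE, remove_separators = TRUE)
  tk <- tokens_remove(tk, stopwords("en"))
  tk <- tokens_remove(tk, c("0*"))   # unicode symbols
  tk <- tokens_wordstem(tk)
  out <- dfm(tk)
  out <- dfm_trim(out, min_docfreq = 2, verbose = TRUE)
  dfm_remove(out, min_nchar = 2)
}
# e.g.: Dfm_train <- prep_dfm(myCorpusTwitterTrain); Dfm_test <- prep_dfm(myCorpusTwitterTest)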
#####################################################
# FIFTH STEP: let's estimate a ML Model
#####################################################

#####################################################
# Let's start with a Naive Bayes Model
#####################################################

# we will use the naivebayes package. Another possible package you can consider is the fastNaiveBayes package
# given our training-set, we have to use a Multinomial rather than a Binomial (Bernoulli) distribution, given that our
# features can assume values other than just 0/1, i.e., a one-hot encoding. Indeed:
table(Dfm_train@x)
# to run a Binomial model with naivebayes just replace "multinomial_naive_bayes" with "bernoulli_naive_bayes"
# in the command below (see also the optional sketch after this block)

# Which is our DV?
str(Dfm_train@docvars$choose_one)
# that's an integer variable. Not good for a classification! It should always be a factor variable,
# so we will use as.factor to transform it into a factor variable.
# Note that instead of y=as.factor(Dfm_train@docvars$choose_one) in the formula of the NB we could have used
# y=as.factor(x$choose_one) of course!
system.time(NB <- multinomial_naive_bayes(x=train, y=as.factor(Dfm_train@docvars$choose_one), laplace = 1))
summary(NB)
prop.table(table(Dfm_train@docvars$choose_one))  # prior probabilities

# let's see the association between words and probabilities (i.e., the matrix with the class-conditional parameter estimates).
# Take a look at "casualti" and "cream": the former increases the probability of a relevant tweet (=1),
# the latter that of an irrelevant one (=0)
head(NB$params)
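# OPTIONAL ASIDE (a hedged sketch, not run in the original script): as noted above, the Bernoulli (Binomial)
# variant needs a 0/1 matrix. One way to get it is quanteda's dfm_weight(scheme = "boolean");
# the object names "train_bin" and "NB_bern" are illustrative.
train_bin <- as.matrix(dfm_weight(Dfm_train, scheme = "boolean"))
NB_bern <- bernoulli_naive_bayes(x = train_bin, y = as.factor(Dfm_train@docvars$choose_one), laplace = 1)
summary(NB_bern)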
# Let's investigate this issue a bit more. Why should we care about it?
# Often, ML models are considered "black boxes" due to their complex inner workings. However, because of their complexity,
# they are typically more accurate for predicting nonlinear or rare phenomena. Unfortunately, more accuracy often comes at the
# expense of interpretability, and interpretability is crucial. It is not enough to identify a ML model that optimizes
# predictive performance; understanding and trusting model results is a hallmark of good (social and political) science!
# Luckily, several advancements have been made over the years to aid in interpreting ML models.
# Interpreting ML models is an emerging field that has become known as "Interpretable Machine Learning" (IML).
# Approaches to model interpretability can be broadly categorized as providing global or local explanations.
# Global interpretations help us understand the inputs and their entire modeled relationship with the prediction target.
# Local interpretations help us understand model predictions for a single row of data or a group of similar rows.
# Global interpretability in particular is about understanding how the model makes predictions, based on a holistic view of
# its features and how they influence the underlying model structure.
# It answers questions regarding which features are relatively influential, as well as how these features influence the response variable.
# In this latter case, we assess the magnitude of a variable's relationship with the response as compared to other variables used in the model.
# In text analytics (given that we are dealing with huge DfMs wherein the features per se are not the main focus of our interest)
# we are mainly interested in global interpretations; however, if you use ML for other aims, local interpretations become VERY important!!!

# So let's do some Global Interpretation for our NB!
NB_prob <- as.data.frame(NB$params)
NB_prob$Feature <- row.names(NB_prob)
str(NB_prob)
# let's estimate the features that most change the difference between the relevant and irrelevant conditional probabilities
NB_prob$diff <- NB_prob[,2]-NB_prob[,1]
str(NB_prob)
# the features that present the highest absolute value in this difference can also be considered as the most important in
# affecting the overall performance of the algorithm
NB_prob$imp <- abs(NB_prob[,2]-NB_prob[,1])
str(NB_prob)
print(head(NB_prob[order(NB_prob$imp, decreasing=TRUE),], 15))   # most relevant words overall
# clearly, the features that present the highest positive values of NB_prob$diff are the words
# most relevant for the "disaster" tweets (=1); those with the most negative values are the words
# most relevant for the "irrelevant" tweets (=0)
print(head(NB_prob[order(NB_prob$diff, decreasing=TRUE),], 15))   # most relevant words for "disaster" tweets
print(head(NB_prob[order(NB_prob$diff, decreasing=FALSE),], 15))  # most relevant words for the "irrelevant" tweets
NB_prob$sign <- ifelse(NB_prob$diff>0, "relevant", "irrelevant")
str(NB_prob)
# let's extract the 20 most irrelevant and the 20 most relevant contributing features
NB_prob2 <- top_n(NB_prob, 20, diff)
NB_prob2
NB_prob3 <- top_n(NB_prob, -20, diff)
NB_prob3
NB_prob_new <- rbind(NB_prob2, NB_prob3)
# reorder the features
NB_prob_new <- mutate(NB_prob_new, Feature = reorder(Feature, diff))
ggplot(NB_prob_new, aes(Feature, diff, fill = sign)) +
  geom_bar(stat="identity", fill= "white") + theme_classic() +
  geom_col(show.legend = T) + theme(legend.position="bottom") + coord_flip() +
  ylab("Difference in the conditional probabilities") +
  scale_fill_manual(values = c("#000033", "#006699")) +
  labs(title = "Disaster tweets",
       subtitle = "Irrelevant (-) versus Relevant (+) words - Naive Bayes Model",
       fill = "Disaster")
# let's store the graph
NB_graph <- ggplot(NB_prob_new, aes(Feature, diff, fill = sign)) +
  geom_bar(stat="identity", fill= "white") + theme_classic() +
  geom_col(show.legend = T) + theme(legend.position="bottom") + coord_flip() +
  ylab("Difference in the conditional probabilities") +
  scale_fill_manual(values = c("#000033", "#006699")) +
  labs(title = "Disaster tweets",
       subtitle = "Irrelevant (-) versus Relevant (+) words - Naive Bayes Model",
       fill = "Disaster")

# let's FINALLY predict the test-set
predicted_nb <- predict(NB, test)
table(predicted_nb)
prop.table(table(predicted_nb))

#####################################################
# let's run a Random Forest
#####################################################

# we will use the randomForest package. Another interesting package you could explore for running a RF is the ranger package.
# note that as hyperparameter (or tuning-parameter) I select a specific value for the number of trees. A RF also has other hyperparameters.
# More on this next week!
set.seed(123)  # define a set.seed for being able to replicate the results!
system.time(RF <- randomForest(y= as.factor(Dfm_train@docvars$choose_one), x=train,
                               importance=TRUE, do.trace=TRUE, ntree=500))
# As discussed yesterday, a natural benefit of the bootstrap resampling process is that random forests have an out-of-bag (OOB) sample
# that provides a reasonable approximation of the test error. This provides a built-in validation set without any extra work on your part.
# However, REMEMBER: this is less efficient than tuning several hyperparameters and doing a k-fold cross-validation, as we will see in the next weeks!
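# OPTIONAL ASIDE (a hedged sketch, not in the original script): the randomForest object already stores the
# OOB-based predictions in RF$predicted, so a quick OOB confusion matrix can be obtained via caret
# (caret was loaded at the top of the script).
confusionMatrix(RF$predicted, as.factor(Dfm_train@docvars$choose_one))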
str(RF$err.rate)
RF$err.rate[nrow(RF$err.rate),]  # our final error (overall, and for each of the two classes: irrelevant and relevant tweets)
plot(RF)  # the errors for the two classes (red=irrelevant; green=relevant). We make a much greater error for the latter class;
# in black the average error
error <- as.data.frame(RF$err.rate)
# number of trees with the lowest OOB error
which.min(error$OOB)  # 441
set.seed(123)
system.time(RF2 <- randomForest(y= as.factor(Dfm_train@docvars$choose_one), x=train,
                                importance=TRUE, ntree=441, do.trace=TRUE))
RF$err.rate[nrow(RF$err.rate),]    # our final error (overall, and for each of the two classes)
RF2$err.rate[nrow(RF2$err.rate),]  # our final error (overall, and for each of the two classes)

# Now let's do some Global Interpretation for our RF!
# What about the importance of each feature for our trained model?
# Variable importance is a standard metric in machine learning, which indicates the amount of information a variable provides
# to the model for predicting the outcome
head(RF$importance[,3:4])
# let's graph the result
varImpPlot(RF)
# Each feature's importance is assessed based on two criteria:
# -MeanDecreaseAccuracy: gives a rough estimate of the loss in prediction performance when that particular variable is omitted from the training set.
# Caveat: if two variables are somewhat redundant, then omitting one of them may not lead to massive gains in prediction performance,
# but would make the second variable more important.
# -MeanDecreaseGini: GINI is a measure of node impurity. Think of it like this: if you use this feature to split the data, how pure will the nodes be?
# Highest purity means that each node contains only elements of a single class.
# Assessing the decrease in GINI when that feature is omitted leads to an understanding of how important that feature is to split the data correctly.
# Please note that these measures are used to rank variables in terms of importance and, thus, their absolute values could be disregarded.
# The problem is that we just get, for example, the GINI statistics overall, w/o differentiating the words most important for specific classes.
# Which are, however, the most important words for the relevant label? And for the irrelevant one?

# let's extract the matrix for GINI and Accuracy
importance_RF <- as.data.frame(RF$importance[,3:4])
str(importance_RF)
importance_RF$Feature <- row.names(importance_RF)
str(importance_RF)
# same words we get with varImpPlot(RF)
print(head(importance_RF[order(importance_RF$MeanDecreaseGini, decreasing=TRUE),]))
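# OPTIONAL ASIDE (a hedged sketch, not in the original script): the same ranking can be drawn with ggplot2,
# here for MeanDecreaseAccuracy; the object name "acc_top" is illustrative.
acc_top <- head(importance_RF[order(importance_RF$MeanDecreaseAccuracy, decreasing = TRUE), ], 20)
ggplot(acc_top, aes(x = reorder(Feature, MeanDecreaseAccuracy), y = MeanDecreaseAccuracy)) +
  geom_point() + coord_flip() + theme_classic() +
  xlab("Feature") + ylab("Mean Decrease in Accuracy")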
# let's predict the training-set texts and store such predictions as a new variable in the dfm of the training-set.
# Why that? Because we will use this variable to identify the "sign" of the most important features
predicted_rf <- predict(RF, train, type="class")
table(predicted_rf)
Dfm_train@docvars$predRF <- predicted_rf
str(Dfm_train@docvars)
# let's assign a feature to the relevant/irrelevant class according to its frequency (if, for example, the word "collapse" appears
# 10 times among predicted relevant tweets and 5 times among predicted irrelevant tweets,
# we classify it as pertaining to the "relevant tweets" world; and so on)
# let's estimate the number of times a word appears in those tweets that we have classified as related to accidents or otherwise
sums <- list()
for (v in 0:1){
  sums[[v+1]] <- colSums(train[Dfm_train@docvars[,"predRF"]==v,])
}
sums <- do.call(cbind, sums)
sums
# let's apply the sign (either 1 or 2) according to whether a word appears more in the tweets classified as not related to accidents or related to them
sign <- apply(sums, 1, which.max)
# get the feature names
names <- dimnames(train)[[2]]
str(names)
df <- data.frame(
  Feature = names,
  sign = sign-1,  # let's recode the sign between 0 and 1
  stringsAsFactors=F)
str(df)
importanceRF <- merge(importance_RF, df, by="Feature")
str(importanceRF)
## best predictors
for (v in 0:1){
  cat("\n\n")
  cat("value==", v)
  importanceRF <- importanceRF[order(importanceRF$MeanDecreaseGini, decreasing=TRUE),]
  print(head(importanceRF[importanceRF$sign==v,], n=10))
  cat("\n")
  cat(paste(unique(head(importanceRF$Feature[importanceRF$sign==v], n=10)), collapse=", "))
}
# let's draw a graph with our results, focusing on MeanDecreaseGini
importanceRF$Gini <- ifelse(importanceRF$sign>0, importanceRF$MeanDecreaseGini, importanceRF$MeanDecreaseGini*-1)
# the top 20 relevant words
importance2RF <- top_n(importanceRF, 20, Gini)
importance2RF
# the top 20 irrelevant words
importance3RF <- top_n(importanceRF, -20, Gini)
importance3RF
importance_newRF <- rbind(importance2RF, importance3RF)
str(importance_newRF)
# reorder the features
importance_newRF <- mutate(importance_newRF, Feature = reorder(Feature, Gini))
importance_newRF$sign2 <- ifelse(importance_newRF$sign>0, "relevant", "irrelevant")
ggplot(importance_newRF, aes(Feature, Gini, fill = sign2)) +
  geom_bar(stat="identity", fill= "white") + theme_classic() +
  geom_col(show.legend = T) + theme(legend.position="bottom") + coord_flip() +
  ylab("Mean Decrease Gini coefficient") +
  scale_fill_manual(values = c("#000033", "#006699")) +
  labs(title = "Disaster tweets",
       subtitle = "Irrelevant (-) versus Relevant (+) words - Random Forest Model",
       fill = "Disaster")
# let's store the graph
RF_graph <- ggplot(importance_newRF, aes(Feature, Gini, fill = sign2)) +
  geom_bar(stat="identity", fill= "white") + theme_classic() +
  geom_col(show.legend = T) + theme(legend.position="bottom") + coord_flip() +
  ylab("Mean Decrease Gini coefficient") +
  scale_fill_manual(values = c("#000033", "#006699")) +
  labs(title = "Disaster tweets",
       subtitle = "Irrelevant (-) versus Relevant (+) words - Random Forest Model",
       fill = "Disaster")

# let's FINALLY predict the test-set
system.time(predicted_rf <- predict(RF, test, type="class"))
table(predicted_rf)
prop.table(table(predicted_rf))

######################################################
######################################################
# Let's compare the results out-of-sample we got via Naive Bayes & RF
######################################################
######################################################
prop.table(table(predicted_nb))
prop.table(table(predicted_rf))
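# OPTIONAL ASIDE (a hedged sketch, not in the original script): beyond the marginal distributions,
# one can also check how often the two classifiers agree on the very same test tweets.
table(NB = predicted_nb, RF = predicted_rf)                               # cross-tabulation of the two predictions
prop.table(table(as.character(predicted_nb) == as.character(predicted_rf)))  # share of test tweets with identical labels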
results <- as.data.frame(rbind(prop.table(table(predicted_nb)), prop.table(table(predicted_rf))))
str(results)
results$algorithm <- c("NB", "RF")
str(results)

# Let's plot the results!
library(reshape2)
df.long <- melt(results, id.vars=c("algorithm"))
str(df.long)
ggplot(df.long, aes(algorithm, value, fill = variable)) +
  geom_bar(position="dodge", stat="identity") +
  theme(axis.text.x = element_text(color="#993333", size=10, angle=90)) +
  coord_flip() +
  ylab(label="Tweet class in the test-set") + xlab("algorithm") +
  scale_fill_discrete(name = "Prediction", labels = c("Irrelevant", "Relevant"))
# and so, which result should we trust more? To answer that, we need to introduce the cross-validation procedure... next week!

#################################################
# let's compare feature importance across models
#################################################
library(gridExtra)
grid.arrange(NB_graph, RF_graph, nrow=2)  # plot everything together

# let's check some correlations
str(NB_prob)
str(importanceRF)
# let's merge the dataframes (w/o XGB for the moment)
words <- merge(NB_prob, importanceRF, by="Feature")
str(words)
library(PerformanceAnalytics)
chart.Correlation(words[c(5, 7:8)])
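# OPTIONAL ASIDE (a hedged sketch, not in the original script): the same information as single rank correlations,
# which are less sensitive to the very different scales of the NB and RF importance measures.
cor(words$imp, words$MeanDecreaseGini, method = "spearman")
cor(words$imp, words$MeanDecreaseAccuracy, method = "spearman")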