rm(list=ls(all=TRUE))
setwd("C:/Users/luigi/Dropbox/TOPIC MODEL")
getwd()
library(quanteda)
library(readtext)
library(caTools)
library(randomForest)
library(caret)
library(naivebayes)
library(car)
library(ggplot2)
library(dplyr)
library(reshape2)
library(gridExtra)

# The data we will be using are English social media disaster tweets discussed in
# this article: https://arxiv.org/pdf/1705.02009.pdf
# It consists of a number of tweets regarding accidents mixed in with a selection of control tweets (not about accidents)

#####################################################
# FIRST STEP: let's create the DfM for the training-set
#####################################################

x <- read.csv("train_disaster.csv", stringsAsFactors=FALSE)
str(x)
# class-label variable: choose_one (0=tweets not relevant for accidents; 1=tweets relevant for accidents)
table(x$choose_one)
prop.table(table(x$choose_one))
nrow(x)

myCorpusTwitterTrain <- corpus(x)
tok2 <- tokens(myCorpusTwitterTrain, remove_punct = TRUE, remove_numbers = TRUE, remove_symbols = TRUE,
               split_hyphens = TRUE, remove_separators = TRUE, remove_url = TRUE)
tok2 <- tokens_remove(tok2, stopwords("en"))
# let's also remove the unicode symbols (i.e., tokens starting with "0", such as leftover emoji codes)
tok2 <- tokens_remove(tok2, c("0*"))
tok2 <- tokens_wordstem(tok2)
Dfm_train <- dfm(tok2)

# Let's trim the dfm in order to keep only tokens that appear in 2 or more tweets (tweets are short texts!)
# and let's keep only features with at least 2 characters
Dfm_train <- dfm_trim(Dfm_train, min_docfreq = 2, verbose = TRUE)
Dfm_train <- dfm_remove(Dfm_train, min_nchar = 2)
topfeatures(Dfm_train, 20) # 20 top words

# Always check whether, after trimming your dfm, you have some texts with just 0s!
Dfm_train[ntoken(Dfm_train) == 0,]
# Here we have a problem with 6 texts! So let's delete such texts
Dfm_train <- Dfm_train[ntoken(Dfm_train) != 0,]
table(ntoken(Dfm_train) == 0)

#####################################################
# SECOND STEP: let's create the DfM for the test-set
#####################################################

x10 <- read.csv("test_disaster.csv", stringsAsFactors=FALSE)
str(x10)
nrow(x10)

myCorpusTwitterTest <- corpus(x10)
tok <- tokens(myCorpusTwitterTest, remove_punct = TRUE, remove_numbers = TRUE, remove_symbols = TRUE,
              split_hyphens = TRUE, remove_separators = TRUE, remove_url = TRUE)
tok <- tokens_remove(tok, stopwords("en"))
tok <- tokens_remove(tok, c("0*"))
tok <- tokens_wordstem(tok)
Dfm_test <- dfm(tok)
Dfm_test <- dfm_trim(Dfm_test, min_docfreq = 2, verbose = TRUE)
Dfm_test <- dfm_remove(Dfm_test, min_nchar = 2)

# Here we have a problem with 11 texts!
Dfm_test[ntoken(Dfm_test) == 0,]
# So let's delete such texts
Dfm_test <- Dfm_test[ntoken(Dfm_test) != 0,]
Dfm_test[ntoken(Dfm_test) == 0,]
topfeatures(Dfm_test, 20) # 20 top words

#####################################################
# THIRD STEP: let's make the features identical between the train and the test-set by passing Dfm_train
# to dfm_match() as a pattern. After this step, we can "predict" the test-set by employing only the
# features included in the training-set.
# Skipping dfm_match would generate a problem with randomForest predictions on the test-set.
# That is, without dfm_match you would get the following error:
# "Error in predict.randomForest: variables in the training data missing in newdata"
#####################################################

setequal(featnames(Dfm_train), featnames(Dfm_test))
nfeat(Dfm_test)
nfeat(Dfm_train)
test_dfm <- dfm_match(Dfm_test, features = featnames(Dfm_train))
nfeat(test_dfm)
setequal(featnames(Dfm_train), featnames(test_dfm))

# Of course, if you have a unique dataset including both the training-set (i.e., the human-coded documents)
# and the test-set, you could avoid this THIRD STEP, given that you would generate a unique DfM with the same
# number of columns for all the documents, irrespective of their status (i.e., being part of the training
# or the test-set). See below for an example.

#####################################################
# FOURTH STEP: let's convert the two DfMs into matrices for the ML algorithms to work
#####################################################

train <- as.matrix(Dfm_train) # dense matrix
object.size(train)
# Alternatively, we could have saved the dfm as a compressed sparse matrix!
# What's the meaning of that? Take a look at the EXTRA slides on the home-page!
trainSP <- as(Dfm_train, "dgCMatrix") # compressed matrix
object.size(trainSP)
object.size(train)/object.size(trainSP)
str(train)
str(trainSP)
# The difference between a dense and a compressed matrix is going to make a HUGE difference when you have
# large train and test matrices! We will mainly use dense (i.e., not compressed) matrices in our class,
# given that we will always use small datasets. But keep this in mind!
test <- as.matrix(test_dfm)

#####################################################
# FIFTH STEP: let's estimate a ML model
#####################################################

#####################################################
# Let's start with a Naive Bayes model
#####################################################

# We will use the naivebayes package. Another possible package you can consider is the fastNaiveBayes package.
# Given our training-set, we have to use a Multinomial rather than a Bernoulli (Binomial) distribution, given that
# our features can assume values other than just 0/1, i.e., a one-hot encoding. Indeed:
table(Dfm_train@x)
# To run a Bernoulli model with naivebayes, just replace "multinomial_naive_bayes" with "bernoulli_naive_bayes"
# in the command below (keeping in mind that bernoulli_naive_bayes expects 0/1 features).

# Which is our DV?
str(Dfm_train@docvars$choose_one)
# That's an integer variable. Not good for a classification! It should always be a factor variable,
# so we will use as.factor to transform it into a factor variable
system.time(NB <- multinomial_naive_bayes(x=train, y=as.factor(Dfm_train@docvars$choose_one), laplace = 1))

# Note that instead of y=as.factor(Dfm_train@docvars$choose_one) in the call above we could have written
# y=as.factor(x$choose_one). However, I would always suggest extracting the DV from the docvars of the DfM,
# especially if you have trimmed it and deleted some rows as we did (here, x$choose_one still contains the
# 6 texts we dropped, so its length would not even match the rows of the matrix)
summary(NB)
prop.table(table(Dfm_train@docvars$choose_one)) # prior probabilities

# Let's see the association between words and probabilities (i.e., the matrix of class-conditional parameter
# estimates - the likelihoods!). Take a look at "casualti" and "cream": the likelihood of the former is higher
# for a tweet discussing a disaster (=1) than for an irrelevant tweet (=0), i.e., p(casualti|1) > p(casualti|0);
# the opposite happens for the word "cream"
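# A quick check of the claim above (a minimal sketch: it assumes that the stems "casualti" and "cream"
# survived the trimming of the training-set dfm; if they did not, pick any two rows of NB$params instead)
NB$params[rownames(NB$params) %in% c("casualti", "cream"), ]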
head(NB$params)

# Let's investigate this issue a bit more and do some Global Interpretation of our NB!
NB_prob <- as.data.frame(NB$params)
NB_prob$Feature <- row.names(NB_prob)
str(NB_prob)
# let's compute, for each feature, the difference between the relevant and the irrelevant conditional probabilities
NB_prob$diff <- NB_prob[,2]-NB_prob[,1]
str(NB_prob)
# the features with the highest absolute value of this difference can also be considered as the most important
# in affecting the overall performance of the algorithm
NB_prob$imp <- abs(NB_prob[,2]-NB_prob[,1])
str(NB_prob)
print(head(NB_prob[order(NB_prob$imp, decreasing=TRUE),], 15)) # most relevant words overall
# clearly, the features that present the highest positive values of NB_prob$diff are the words
# most relevant for the "disaster" tweets (=1); those with the highest negative values are the words
# most relevant for the "irrelevant" tweets (=0)
print(head(NB_prob[order(NB_prob$diff, decreasing=TRUE),], 15))  # most relevant words for the "disaster" tweets
print(head(NB_prob[order(NB_prob$diff, decreasing=FALSE),], 15)) # most relevant words for the "irrelevant" tweets
NB_prob$sign <- ifelse(NB_prob$diff>0, "relevant", "irrelevant")
str(NB_prob)

# let's extract the 20 most relevant and the 20 most irrelevant contributing features
NB_prob2 <- top_n(NB_prob, 20, diff)
NB_prob2
NB_prob3 <- top_n(NB_prob, -20, diff)
NB_prob3
NB_prob_new <- rbind(NB_prob2, NB_prob3)
# reorder the features
NB_prob_new <- mutate(NB_prob_new, Feature = reorder(Feature, diff))

ggplot(NB_prob_new, aes(Feature, diff, fill = sign)) +
  geom_bar(stat="identity", fill= "white") +
  theme_classic() +
  geom_col(show.legend = T) +
  theme(legend.position="bottom") +
  coord_flip() +
  ylab("Difference in the likelihoods") +
  scale_fill_manual(values = c("#000033", "#006699")) +
  labs(title = "Disaster tweets",
       subtitle = "Irrelevant (-) versus Relevant (+) words - Naive Bayes Model",
       fill = "Disaster")

# let's store the graph
NB_graph <- ggplot(NB_prob_new, aes(Feature, diff, fill = sign)) +
  geom_bar(stat="identity", fill= "white") +
  theme_classic() +
  geom_col(show.legend = T) +
  theme(legend.position="bottom") +
  coord_flip() +
  ylab("Difference in the likelihoods") +
  scale_fill_manual(values = c("#000033", "#006699")) +
  labs(title = "Disaster tweets",
       subtitle = "Irrelevant (-) versus Relevant (+) words - Naive Bayes Model",
       fill = "Disaster")

# let's FINALLY predict the test-set
predicted_nb <- predict(NB, test)
table(predicted_nb)
prop.table(table(predicted_nb))

##########################
# ALTERNATIVE APPROACH
# let's create a unique dataset out of the training and the test-set, and let's estimate our ML algorithm on it
##########################

x$type <- "train"
str(x)
x10$type <- "test"
str(x10)
x_tot <- rbind(x, x10)
str(x_tot)

myCorpusTwitterTrain <- corpus(x_tot)
tok2 <- tokens(myCorpusTwitterTrain, remove_punct = TRUE, remove_numbers = TRUE, remove_symbols = TRUE,
               split_hyphens = TRUE, remove_separators = TRUE, remove_url = TRUE)
tok2 <- tokens_remove(tok2, stopwords("en"))
# let's also remove the unicode symbols
tok2 <- tokens_remove(tok2, c("0*"))
tok2 <- tokens_wordstem(tok2)
Dfm_tot <- dfm(tok2)
Dfm_tot <- dfm_trim(Dfm_tot, min_docfreq = 2, verbose = TRUE)
Dfm_tot <- dfm_remove(Dfm_tot, min_nchar = 2)
# again, check for texts with just 0s
Dfm_tot[ntoken(Dfm_tot) == 0,]
# So let's delete such texts
Dfm_tot <- Dfm_tot[ntoken(Dfm_tot) != 0,]
head(docvars(Dfm_tot))
table(Dfm_tot@docvars$type)

# let's separate the texts included in our original training-set [the "x" dataframe] from the test-set.
# number of texts in our training-set:
train_index <- ndoc(Dfm_tot[Dfm_tot@docvars$type=="train",])
train_index
training <- c(1:train_index)
str(training)
testSet <- c((train_index+1):ndoc(Dfm_tot))
str(testSet)
db_tot <- as.matrix(Dfm_tot)

system.time(NB2 <- multinomial_naive_bayes(x=db_tot[training,], y=as.factor(Dfm_tot@docvars$choose_one[training]), laplace = 1))
predicted_nb2 <- predict(NB2, db_tot[testSet,])
table(predicted_nb2)
round(prop.table(table(predicted_nb2)),3)
# slightly different values than above. Why?
# It is related to the impact of dfm_trim applied to a smaller vs. a larger dfm (i.e., Dfm_train vs. Dfm_tot)
round(prop.table(table(predicted_nb)),3)

#####################################################
# let's run a Random Forest
#####################################################

# We will use the randomForest package. If you are employing compressed matrices, please use the ranger package.
# The ranger package is also usually faster than the randomForest one.
# Note that, as hyperparameter (or tuning-parameter), I select a specific value for the number of trees.
# An RF also has other hyperparameters. More on this next week!
set.seed(123) # define a seed to be able to replicate the results!
system.time(RF <- randomForest(y= as.factor(Dfm_train@docvars$choose_one), x=train, importance=TRUE, do.trace=TRUE, ntree=500))
RF

# As discussed, a natural benefit of the bootstrap resampling process is that random forests have an out-of-bag (OOB)
# sample that provides a reasonable approximation of the test error. This gives you a built-in validation set
# without any extra work on your part. However, REMEMBER: this is less efficient (especially with medium to large
# training-sets) compared with doing a k-fold cross-validation, as we will see in the next weeks!
str(RF$err.rate)
RF$err.rate[nrow(RF$err.rate),] # our final error (overall, and for each of the two classes: irrelevant and relevant tweets)

plot(RF, type = "l", col = c("red", "black", "springgreen4"), main = "Class errors in Random Forest Model / OOB")
legend("topright", horiz = F, cex = 0.7, fill = c("black", "red", "springgreen4"),
       c("No-disaster tweets", "Average error", "Disaster tweets"))

error <- as.data.frame(RF$err.rate)
# number of trees with the lowest OOB error
which.min(error$OOB) # 455
set.seed(123)
system.time(RF2 <- randomForest(y= as.factor(Dfm_train@docvars$choose_one), x=train, importance=TRUE, ntree=455, do.trace=TRUE))
RF$err.rate[nrow(RF$err.rate),]   # our final error (overall, and for each of the two classes)
RF2$err.rate[nrow(RF2$err.rate),] # our final error (overall, and for each of the two classes)

# Now let's do some Global Interpretation for our RF!
# What about the importance of each feature for our trained model?
# Variable importance is a standard metric in machine learning, which indicates the amount of information a variable
# provides to the model for predicting the outcome
head(RF$importance[,3:4])
# let's plot the result
varImpPlot(RF)
# The importance of each feature is assessed based on two criteria:
# - MeanDecreaseAccuracy: gives a rough estimate of the loss in prediction performance when that particular variable
#   is omitted from the training-set.
#   Caveat: if two variables are somewhat redundant, then omitting one of them may not lead to massive gains in
#   prediction performance, but it would make the second variable more important.
# - MeanDecreaseGini: GINI is a measure of node impurity. Think of it like this: if you use this feature to split
#   the data, how pure will the nodes be? Highest purity means that each node contains only elements of a single class.
#   Assessing the decrease in GINI when that feature is omitted leads to an understanding of how important that
#   feature is to split the data correctly.
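# A small optional check (a sketch, not part of the original pipeline): randomForest's importance() helper
# returns the same measures used by varImpPlot(); with type = 1 and scale = TRUE the permutation (accuracy-based)
# importance is divided by its standard error, which makes features easier to compare
head(importance(RF, type = 1, scale = TRUE)) # mean decrease in accuracy (scaled)
head(importance(RF, type = 2))               # mean decrease in node impurity (Gini)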
# let's extract the matrix for GINI and Accuracy
importance_RF <- as.data.frame(RF$importance[,3:4])
str(importance_RF)
importance_RF$Feature <- row.names(importance_RF)
str(importance_RF)
# same words we get with varImpPlot(RF)
print(head(importance_RF[order(importance_RF$MeanDecreaseGini, decreasing=TRUE),]))

# Note that these measures are used to rank variables in terms of importance; their absolute values can be disregarded.
# Moreover, we only get the GINI (and Accuracy) statistics overall, without differentiating the words most important
# for each specific class. Which are, however, the most important words for the relevant label? And for the irrelevant one?
# Let's predict the training-set texts and store a new variable with such predictions in the docvars of the training-set dfm.
# Why? Because we will use this variable to identify the "sign" of the most important features
predicted_rf <- predict(RF, train, type="class")
table(predicted_rf)
Dfm_train@docvars$predRF <- predicted_rf
str(Dfm_train@docvars)

# let's now assign each feature to the relevant/irrelevant class according to its frequency
# (if, for example, the word "collapse" appears 10 times among predicted relevant tweets and 5 times among predicted
# irrelevant tweets, we are going to classify it as pertaining to the "relevant tweets" world; and so on)
# First, let's count the number of times a word appears in the tweets that we have classified as related to accidents or otherwise
sums <- list()
for (v in 0:1){
  sums[[v+1]] <- colSums(train[Dfm_train@docvars[,"predRF"]==v,])
}
str(sums)
sums <- do.call(cbind, sums) # do.call allows us to use cbind on a list (i.e., a collection of objects, as in the case of sums)
sums
# let's assign the sign (either 1 or 2) according to whether a word appears more often in the tweets classified
# as not related to accidents (1) or related to them (2)
sign <- apply(sums, 1, which.max)
# get the feature names
names <- dimnames(train)[[2]]
str(names)
df <- data.frame(
  Feature = names,
  sign = sign - 1, # let's recode the sign from 1/2 to 0/1 (0=no accidents; 1=accidents)
  stringsAsFactors=F)
str(df)
importanceRF <- merge(importance_RF, df, by="Feature")
str(importanceRF)

## best predictors
for (v in 0:1){
  cat("\n\n")
  cat("value==", v)
  importanceRF <- importanceRF[order(importanceRF$MeanDecreaseGini, decreasing=TRUE),]
  print(head(importanceRF[importanceRF$sign==v,], n=10))
  cat("\n")
  cat(paste(unique(head(importanceRF$Feature[importanceRF$sign==v], n=10)), collapse=", "))
}

# let's draw a graph with our results, focusing on MeanDecreaseGini
importanceRF$Gini <- ifelse(importanceRF$sign==1, importanceRF$MeanDecreaseGini, importanceRF$MeanDecreaseGini*-1)
# the top 20 words for the disaster tweets
importance2RF <- top_n(importanceRF, 20, Gini)
importance2RF
# the top 20 words for tweets not related to disasters
importance3RF <- top_n(importanceRF, -20, Gini)
importance3RF
importance_newRF <- rbind(importance2RF, importance3RF)
str(importance_newRF)
# reorder the features
importance_newRF <- mutate(importance_newRF, Feature = reorder(Feature, Gini))
# let's finally create a new string variable according to the value of sign
importance_newRF$sign2 <- ifelse(importance_newRF$sign==1, "relevant", "irrelevant")
str(importance_newRF)

ggplot(importance_newRF, aes(Feature, Gini, fill = sign2)) +
  geom_bar(stat="identity", fill= "white") +
  theme_classic() +
  geom_col(show.legend = T) +
  theme(legend.position="bottom") +
  coord_flip() +
  ylab("Mean Decrease Gini coefficient") +
  scale_fill_manual(values = c("#000033", "#006699")) +
  labs(title = "Disaster tweets",
       subtitle = "Irrelevant (-) versus Relevant (+) words - Random Forest Model",
       fill = "Disaster")

# let's store the graph
RF_graph <- ggplot(importance_newRF, aes(Feature, Gini, fill = sign2)) +
  geom_bar(stat="identity", fill= "white") +
  theme_classic() +
  geom_col(show.legend = T) +
  theme(legend.position="bottom") +
  coord_flip() +
  ylab("Mean Decrease Gini coefficient") +
  scale_fill_manual(values = c("#000033", "#006699")) +
  labs(title = "Disaster tweets",
       subtitle = "Irrelevant (-) versus Relevant (+) words - Random Forest Model",
       fill = "Disaster")

# let's FINALLY predict the test-set. We define the seed to always get the same prediction. Otherwise, you can get
# slightly different results each time, depending on how a given document is randomly classified when there is a tie
# across decision trees (i.e., say 250 trees vote "positive" and 250 trees vote "negative")
set.seed(123)
system.time(predicted_rf <- predict(RF, test))
table(predicted_rf)
set.seed(14)
system.time(predicted_rf <- predict(RF, test))
table(predicted_rf)

# Note that if you add "predict.all=TRUE", then the returned object is a list of 2 components: aggregate,
# which is the vector of predicted values by the forest (as above), and individual, which is a matrix where
# each column contains the predictions of one tree in the forest
set.seed(123)
system.time(predicted_rf2 <- predict(RF, test, predict.all=TRUE))
str(predicted_rf2)
table(predicted_rf2$aggregate)
nrow(test)
head(predicted_rf2$individual)

######################################################
######################################################
# Let's compare the out-of-sample results we got via Naive Bayes & RF
######################################################
######################################################

prop.table(table(predicted_nb))
prop.table(table(predicted_rf))
results <- as.data.frame(rbind(prop.table(table(predicted_nb)), prop.table(table(predicted_rf))))
str(results)
results$algorithm <- c("NB", "RF")
str(results)

# Let's plot the results!
library(reshape2)
df.long <- melt(results, id.vars=c("algorithm"))
str(df.long)
ggplot(df.long, aes(algorithm, value, fill = variable)) +
  geom_bar(position="dodge", stat="identity") +
  theme(axis.text.x = element_text(color="#993333", size=10, angle=90)) +
  coord_flip() +
  ylab(label="Tweet class in the test-set") +
  xlab("algorithm") +
  scale_fill_discrete(name = "Prediction", labels = c("Irrelevant", "Relevant"))

# Similar but not exactly the same predictions!
# So which result should we trust more?
# To answer that, we need to introduce the cross-validation procedure (see the sketch below for a first intuition)...
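#####################################################
# A first intuition of k-fold cross-validation (a minimal sketch, NOT the full procedure we will see in the
# next weeks): split the TRAINING-set into k folds, train on k-1 folds, predict the held-out fold, and average
# the out-of-fold accuracy. The number of folds (5) and the metric (accuracy) are illustrative assumptions,
# not part of the original script
#####################################################
set.seed(123)
k <- 5
folds <- sample(rep(1:k, length.out = nrow(train))) # random fold assignment for each training document
acc_nb <- acc_rf <- numeric(k)
for (i in 1:k) {
  train_x <- train[folds != i, ]
  held_x  <- train[folds == i, ]
  train_y <- as.factor(Dfm_train@docvars$choose_one[folds != i])
  held_y  <- Dfm_train@docvars$choose_one[folds == i]
  nb_i <- multinomial_naive_bayes(x = train_x, y = train_y, laplace = 1)
  rf_i <- randomForest(y = train_y, x = train_x, ntree = 500)
  acc_nb[i] <- mean(as.character(predict(nb_i, held_x)) == as.character(held_y))
  acc_rf[i] <- mean(as.character(predict(rf_i, held_x)) == as.character(held_y))
}
mean(acc_nb) # average out-of-fold accuracy of the Naive Bayes model
mean(acc_rf) # average out-of-fold accuracy of the Random Forest model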
#################################################
# let's compare feature importance across models
#################################################
library(gridExtra)
grid.arrange(NB_graph, RF_graph, nrow=2) # plot everything together

# let's check some correlations
# let's merge the two data frames
words <- merge(NB_prob, importanceRF, by="Feature")
str(words)
library(PerformanceAnalytics)
# columns 5, 7 and 8 of "words" are, respectively, the NB absolute difference in likelihoods (imp),
# MeanDecreaseAccuracy and MeanDecreaseGini
chart.Correlation(words[c(5, 7:8)])
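# A minimal follow-up sketch (assuming the merged columns are named as above: "imp", "MeanDecreaseAccuracy"
# and "MeanDecreaseGini"): Spearman rank correlations between the NB and RF importance measures, since only
# the ranking of the features matters here
cor(words$imp, words$MeanDecreaseGini, method = "spearman")
cor(words$imp, words$MeanDecreaseAccuracy, method = "spearman")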