rm(list=ls(all=TRUE))
setwd("C:/Users/luigi/Dropbox/TOPIC MODEL")
getwd()

library(quanteda)
library(readtext)
library(caTools)
library(randomForest)
library(caret)
library(naivebayes)
library(car)
library(reshape2)
library(gridExtra)
library(cvTools)

#####################################################
# FIRST STEP: let's prepare the training-set
#####################################################

# let's focus on our familiar database on tweets about disasters
x <- read.csv("train_disaster.csv", stringsAsFactors=FALSE)
str(x)
myCorpusTwitterTrain <- corpus(x)
tok2 <- tokens(myCorpusTwitterTrain, remove_punct = TRUE, remove_numbers = TRUE,
               remove_symbols = TRUE, split_hyphens = TRUE, remove_separators = TRUE)
tok2 <- tokens_remove(tok2, stopwords("en"))
# let's also remove the unicode symbols (tokens matching the glob pattern "0*")
tok2 <- tokens_remove(tok2, c("0*"))
tok2 <- tokens_wordstem(tok2)
Dfm_train <- dfm(tok2)

# Let's trim the dfm in order to keep only features that appear in 2 or more tweets (tweets are short texts!)
# and let's keep only features with at least 2 characters
Dfm_train <- dfm_trim(Dfm_train, min_docfreq = 2, verbose = TRUE)
Dfm_train <- dfm_remove(Dfm_train, min_nchar = 2)
topfeatures(Dfm_train, 20)  # 20 top words
train <- as.matrix(Dfm_train)

# our classes
table(Dfm_train@docvars$choose_one)
# our benchmark: accuracy .595
prop.table(table(Dfm_train@docvars$choose_one))

######################################################
######################################################
# Let's run a CROSS-VALIDATION with k-fold = 5 via a loop!
# HERE you have just 2 classes for the outcome (irrelevant/relevant tweets discussing a disaster)
######################################################
######################################################

# Some R packages, such as caret, allow you to run a k-fold cross-validation with a few lines of code.
# But I want you to understand what's going on, so it is better to work with this loop first!
# Then, if you wish, move to caret (see the optional sketch right after the Naive Bayes loop below)

# STEP 1: create the 5 folds
ttrain <- train  # let's rename the original train matrix, given that we will reuse the name "train" inside the loop below
# let's split our training-set into 5 folds
set.seed(123)  # set the seed for replicability
k <- 5  # the number of folds; whichever number of folds you choose here, the procedure below will always work!
folds <- cvFolds(NROW(ttrain), K = k)
str(folds)

######################################################
######################################################
# Let's start with the Naive Bayes LOOP
######################################################
######################################################

# STEP 2: the LOOP
system.time(for(i in 1:k){
  train <- ttrain[folds$subsets[folds$which != i], ]       # Set the training set
  validation <- ttrain[folds$subsets[folds$which == i], ]  # Set the validation set
  newrf <- multinomial_naive_bayes(y = as.factor(Dfm_train[folds$subsets[folds$which != i], ]@docvars$choose_one),
                                   x = train, laplace = 1)
  newpred <- predict(newrf, newdata = validation, type = "class")
  class_table <- table("Predictions" = newpred,
                       "Actual" = Dfm_train[folds$subsets[folds$which == i], ]@docvars$choose_one)
  print(class_table)
  df <- confusionMatrix(class_table, mode = "everything")
  df.name <- paste0("conf.mat.nb", i)  # create the name of the object that will store the confusion matrix for each fold (=5)
  assign(df.name, df)
})
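
######################################################
# OPTIONAL: the "few lines of code" caret alternative mentioned above.
# This is only a rough sketch of the caret mechanics (trainControl + train), NOT an exact replica
# of the manual loop: caret has no interface for multinomial_naive_bayes(), so the "naive_bayes"
# method below fits naivebayes::naive_bayes(), which treats the word counts as continuous features;
# moreover, caret draws its own folds. Expect the numbers to differ from the loop above
######################################################

# caret is already loaded at the top of this script
set.seed(123)
ctrl <- trainControl(method = "cv", number = 5)          # 5-fold cross-validation
nb_caret <- train(x = ttrain,                            # the full document-feature matrix
                  y = as.factor(Dfm_train@docvars$choose_one),
                  method = "naive_bayes",
                  tuneGrid = data.frame(laplace = 1, usekernel = FALSE, adjust = 1),  # mirror laplace = 1, no tuning grid
                  trControl = ctrl)
nb_caret$results  # accuracy (and kappa) averaged over the 5 folds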
# STEP 3: the metrics
ls()  # we have created 5 objects that store the 5 confusion matrices; I can now estimate the performance metrics on these results

# for example:
conf.mat.nb1
str(conf.mat.nb1)
conf.mat.nb1$overall[1]  # overall accuracy: (36+23)/(36+23+8+13)
# note that the F1 value you see is not the one for the overall model, but just the one for the first class (i.e., "irrelevant tweets").
# Therefore we have to estimate the average value of F1 across classes for each k-fold by hand! See below
conf.mat.nb1$byClass[1]  # Recall for irrelevant tweets: (36)/(36+13) - think vertically!
conf.mat.nb1$byClass[3]  # Precision for irrelevant tweets: (36)/(36+8) - think horizontally!
conf.mat.nb1$byClass[2]  # Recall for disaster tweets: (23)/(23+8) - think vertically!
conf.mat.nb1$byClass[4]  # Precision for disaster tweets: (23)/(23+13) - think horizontally!
# F1 for the irrelevant class:
conf.mat.nb1$byClass[7]
# that is:
(2*conf.mat.nb1$byClass[3]*conf.mat.nb1$byClass[1])/(conf.mat.nb1$byClass[3]+conf.mat.nb1$byClass[1])  # F1 for irrelevant tweets
# and for tweets really discussing disasters?
(2*conf.mat.nb1$byClass[2]*conf.mat.nb1$byClass[4])/(conf.mat.nb1$byClass[2]+conf.mat.nb1$byClass[4])  # F1 for relevant tweets

NBPredict <- data.frame(col1=vector(), col2=vector(), col3=vector())

for(i in mget(ls(pattern = "conf.mat.nb"))) {
  Accuracy <- (i)$overall[1]  # save in the matrix the accuracy value
  F1_irrelevant <- (2*(i)$byClass[1]*(i)$byClass[3])/((i)$byClass[1]+(i)$byClass[3])  # save in the matrix the F1 value for irrelevant tweets
  F1_relevant <- (2*(i)$byClass[2]*(i)$byClass[4])/((i)$byClass[2]+(i)$byClass[4])    # save in the matrix the F1 value for relevant tweets
  NBPredict <- rbind(NBPredict, cbind(Accuracy, F1_irrelevant, F1_relevant))
}

NBPredict
NBPredict[is.na(NBPredict)] <- 0
# If you get an NA for the F1 of some category (this happens when BOTH precision and recall for that category are 0),
# you could consider replacing the NA with 0; this usually happens when your training-set is not that big, so that during the k-fold cv
# you end up with a training-set containing few observations for some given class [not our case....]
str(NBPredict)

# Let's compute the average value for accuracy and F1
acc_nb_avg <- mean(NBPredict[, 1])
f1_nb_avg <- mean(colMeans(NBPredict[-1]))
# Comparing the avg. accuracy with the avg. F1 value is always good practice. You can use it as a diagnostic tool:
# if the two values are very different, it implies that you are doing well with some class labels and poorly
# with other class labels. In this case there is no huge difference (the 2 values are very close to each other)
acc_nb_avg
f1_nb_avg
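
######################################################
# A quick illustration of WHY the accuracy vs. F1 comparison is a useful diagnostic.
# The confusion matrix below is a toy, made-up example (NOT from our data), assuming a very
# imbalanced problem in which the classifier almost always predicts "irrelevant"
######################################################

toy <- as.table(matrix(c(90, 8,
                          2, 0), nrow = 2, byrow = TRUE,
                       dimnames = list(Predictions = c("irrelevant", "relevant"),
                                       Actual      = c("irrelevant", "relevant"))))
toy_cm <- confusionMatrix(toy, mode = "everything")
toy_cm$overall[1]  # accuracy = 0.90: looks great...
toy_cm$byClass[7]  # ...and F1 for the majority class ("irrelevant") is high as well (around 0.95)...
(2*toy_cm$byClass[2]*toy_cm$byClass[4])/(toy_cm$byClass[2]+toy_cm$byClass[4])
# ...but F1 for the minority class ("relevant") is undefined (no true positives). Replacing that NA with 0,
# the average F1 drops to around 0.47, far below the 0.90 accuracy: a large gap like this is exactly the
# warning sign discussed above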
######################################################
######################################################
# Random Forest LOOP
######################################################
######################################################

# STEP 2: the LOOP
# here we fix ntree=100 to make things faster!
system.time(for(i in 1:k){
  train <- ttrain[folds$subsets[folds$which != i], ]
  validation <- ttrain[folds$subsets[folds$which == i], ]
  set.seed(123)
  newrf <- randomForest(y = as.factor(Dfm_train[folds$subsets[folds$which != i], ]@docvars$choose_one),
                        x = train, do.trace = TRUE, ntree = 100)
  newpred <- predict(newrf, newdata = validation, type = "class")
  class_table <- table("Predictions" = newpred,
                       "Actual" = Dfm_train[folds$subsets[folds$which == i], ]@docvars$choose_one)
  print(class_table)
  df <- confusionMatrix(class_table, mode = "everything")
  df.name <- paste0("conf.mat.rf", i)
  assign(df.name, df)
})

# STEP 3: the metrics
RFPredict <- data.frame(col1=vector(), col2=vector(), col3=vector())

for(i in mget(ls(pattern = "conf.mat.rf"))) {
  Accuracy <- (i)$overall[1]  # save in the matrix the accuracy value
  F1_irrelevant <- (2*(i)$byClass[1]*(i)$byClass[3])/((i)$byClass[1]+(i)$byClass[3])  # save in the matrix the F1 value for irrelevant tweets
  F1_relevant <- (2*(i)$byClass[2]*(i)$byClass[4])/((i)$byClass[2]+(i)$byClass[4])    # save in the matrix the F1 value for relevant tweets
  RFPredict <- rbind(RFPredict, cbind(Accuracy, F1_irrelevant, F1_relevant))
}

RFPredict
RFPredict[is.na(RFPredict)] <- 0
str(RFPredict)

# Let's compute the average value for accuracy and F1
acc_rf_avg <- mean(RFPredict[, 1])
f1_rf_avg <- mean(colMeans(RFPredict[-1]))
acc_rf_avg
f1_rf_avg

######################################################
######################################################
# Let's compare the results we got via Naive Bayes and RF
######################################################
######################################################

acc_nb_avg
acc_rf_avg
f1_nb_avg
f1_rf_avg

# Let's plot the results!
gb1 <- as.data.frame(acc_nb_avg)
colnames(gb1)[1] <- "Accuracy NB"
gb2 <- as.data.frame(acc_rf_avg)
colnames(gb2)[1] <- "Accuracy RF"
ac_tot <- cbind(gb1, gb2)
ac_tot
str(ac_tot)
df.long_ac_tot <- melt(ac_tot)
str(df.long_ac_tot)
p <- ggplot(df.long_ac_tot, aes(x = variable, y = value, color = variable)) +
  geom_boxplot() +
  xlab("Algorithm") + ylab(label = "Values of Accuracy") +
  labs(title = "Cross-validation with k = 5: values of Accuracy") +
  coord_flip() + theme_bw() + theme(legend.position = "none")

gb1 <- as.data.frame(f1_nb_avg)
colnames(gb1)[1] <- "F1 NB"
gb2 <- as.data.frame(f1_rf_avg)
colnames(gb2)[1] <- "F1 RF"
f1_tot <- cbind(gb1, gb2)
f1_tot
str(f1_tot)
df.long_f1_tot <- melt(f1_tot)
str(df.long_f1_tot)
p2 <- ggplot(df.long_f1_tot, aes(x = variable, y = value, color = variable)) +
  geom_boxplot() +
  xlab("Algorithm") + ylab(label = "Values of F1") +
  labs(title = "Cross-validation with k = 5: values of F1") +
  coord_flip() + theme_bw() + theme(legend.position = "none")

grid.arrange(p, p2, nrow = 2)  # Plot everything together

# So in this particular case, NB appears to be better than RF.
# But remember: you are employing a training-set consisting of just 400 documents,
# and NB usually does well in this scenario. Moreover...
# Wait a minute! In this example we kept the hyperparameters fixed!
# But is there any other hyperparameter configuration for RF and NB that would improve the model?

######################################################
######################################################
# Exercise: try to replicate the previous analysis with k-fold = 10
######################################################
######################################################
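
# A possible starting point for the exercise (just a sketch of the setup, not the full solution):
# the only structural change is the number of folds passed to cvFolds(); the NB and RF loops and the
# metric computations above can then be rerun unchanged, since they all depend only on k and folds
set.seed(123)
k <- 10
folds <- cvFolds(NROW(ttrain), K = k)
str(folds)  # each tweet is now assigned to one of 10 folds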