rm(list=ls(all=TRUE))
setwd("C:/Users/luigi/Dropbox/TOPIC MODEL")
getwd()

library(quanteda)
library(readtext)
library(caTools)
library(randomForest)
library(caret)
library(naivebayes)
library(car)
library(reshape2)
library(ggplot2)
library(gridExtra)
library(cvTools)

#####################################################
# FIRST STEP: let's prepare the training-set
#####################################################

# Let's focus on MOVIE reviews (our DV: either a positive or a negative movie review)
x <- read.csv("train_review2.csv", stringsAsFactors = FALSE)
str(x)

myCorpusTwitterTrain <- corpus(x)
tok2 <- tokens(myCorpusTwitterTrain, remove_punct = TRUE, remove_numbers = TRUE,
               remove_symbols = TRUE, split_hyphens = TRUE, remove_separators = TRUE)
tok2 <- tokens_remove(tok2, stopwords("en"))
tok2 <- tokens_wordstem(tok2)
Dfm_train <- dfm(tok2)

# Let's trim the dfm to keep only the tokens that appear in at least 5% of the reviews
Dfm_train <- dfm_trim(Dfm_train, min_docfreq = 0.05, verbose = TRUE, docfreq_type = "prop")
topfeatures(Dfm_train, 20)  # 20 top words

# Always check whether, after trimming, your training-set contains texts with just 0s!
Dfm_train[ntoken(Dfm_train) == 0, ]

# our classes
table(Dfm_train@docvars$Sentiment)
# our benchmark: accuracy .524
prop.table(table(Dfm_train@docvars$Sentiment))

train <- as.matrix(Dfm_train)

######################################################
######################################################
# Let's run CROSS-VALIDATION with k-fold = 5 with a loop!
# HERE you have just 2 classes for the outcome (negative/positive)
######################################################
######################################################

# STEP 1: create the 5 folds

# Let's rename the original train matrix, since we will reuse the name "train" inside the loop below
ttrain <- train

# Let's split our training-set into 5 folds
set.seed(123)  # set the seed for replicability
k <- 5  # the number of folds; whatever number you choose here, the procedure below will still work!
folds <- cvFolds(NROW(ttrain), K = k)
str(folds)

######################################################
######################################################
# Let's start with a Random Forest loop
######################################################
######################################################

# STEP 2: the LOOP
# here we fix ntree = 100 to make things faster!

system.time(for (i in 1:k) {
  train <- ttrain[folds$subsets[folds$which != i], ]
  validation <- ttrain[folds$subsets[folds$which == i], ]
  set.seed(123)
  newrf <- randomForest(y = as.factor(Dfm_train[folds$subsets[folds$which != i], ]@docvars$Sentiment),
                        x = train, do.trace = TRUE, ntree = 100)
  set.seed(123)
  newpred <- predict(newrf, newdata = validation)
  class_table <- table("Predictions" = newpred,
                       "Actual" = Dfm_train[folds$subsets[folds$which == i], ]@docvars$Sentiment)
  print(class_table)
  df <- confusionMatrix(class_table, mode = "everything")
  df.name <- paste0("conf.mat.rf", i)
  assign(df.name, df)
})

# STEP 3: the metrics
ls()  # we have created 5 objects that store the 5 confusion matrices
# We can now estimate the performance metrics on those results. For example:
conf.mat.rf1
str(conf.mat.rf1)
conf.mat.rf1$overall[1]  # overall accuracy: (32+49)/(32+49+13+6)
# Note that the F1 value you see is not the one for the overall model, but just for the first class (i.e., "negative").
# Therefore we have to compute the per-class F1 values in each fold by hand!
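# As a compact alternative to the step-by-step arithmetic shown below, here is
# a minimal sketch of a helper that computes the macro-averaged F1 from a caret
# confusionMatrix object in a 2-class problem. The function name (macro_f1) is
# our own choice, not part of caret; it simply combines the same byClass slots
# that we extract one by one below.
macro_f1 <- function(cm) {
  recall    <- cm$byClass[1:2]  # Sensitivity (class 1) and Specificity (= recall of class 2)
  precision <- cm$byClass[3:4]  # Pos Pred Value (class 1) and Neg Pred Value (= precision of class 2)
  f1 <- (2 * precision * recall) / (precision + recall)  # per-class F1
  mean(f1)  # unweighted (macro) average over the two classes
}
macro_f1(conf.mat.rf1)  # should match the mean of the two F1 values computed below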
# See below:
conf.mat.rf1$byClass[1]  # Recall for negative: (32)/(32+13) - think vertically!
conf.mat.rf1$byClass[3]  # Precision for negative: (32)/(32+6) - think horizontally!
conf.mat.rf1$byClass[2]  # Recall for positive: (49)/(49+6) - think vertically!
conf.mat.rf1$byClass[4]  # Precision for positive: (49)/(49+13) - think horizontally!

# F1 for the negative class:
(2*conf.mat.rf1$byClass[3]*conf.mat.rf1$byClass[1])/(conf.mat.rf1$byClass[3]+conf.mat.rf1$byClass[1])  # F1 for negative
# and for the positive class?
(2*conf.mat.rf1$byClass[2]*conf.mat.rf1$byClass[4])/(conf.mat.rf1$byClass[2]+conf.mat.rf1$byClass[4])  # F1 for positive

RFPredict <- data.frame(col1 = vector(), col2 = vector(), col3 = vector())

for (i in mget(ls(pattern = "conf.mat.rf"))) {
  Accuracy <- (i)$overall[1]  # save the accuracy value
  F1_negative <- (2*(i)$byClass[1]*(i)$byClass[3])/((i)$byClass[1]+(i)$byClass[3])  # F1 for negative
  F1_positive <- (2*(i)$byClass[2]*(i)$byClass[4])/((i)$byClass[2]+(i)$byClass[4])  # F1 for positive
  RFPredict <- rbind(RFPredict, cbind(Accuracy, F1_negative, F1_positive))
}
str(RFPredict)

# Let's compute the average values of accuracy and F1 across the 5 folds
acc_rf_avg <- mean(RFPredict[, 1])
f1_rf_avg <- mean(colMeans(RFPredict[-1]))
acc_rf_avg
f1_rf_avg
# The fact that the avg. accuracy and the avg. F1 values are close to each other is of course good news!

######################################################
######################################################
# Naive Bayes LOOP
######################################################
######################################################

system.time(for (i in 1:k) {
  train <- ttrain[folds$subsets[folds$which != i], ]
  validation <- ttrain[folds$subsets[folds$which == i], ]
  newnb <- multinomial_naive_bayes(y = as.factor(Dfm_train[folds$subsets[folds$which != i], ]@docvars$Sentiment),
                                   x = train, laplace = 1)
  newpred <- predict(newnb, newdata = validation, type = "class")
  class_table <- table("Predictions" = newpred,
                       "Actual" = Dfm_train[folds$subsets[folds$which == i], ]@docvars$Sentiment)
  print(class_table)
  df <- confusionMatrix(class_table, mode = "everything")
  df.name <- paste0("conf.mat.nb", i)
  assign(df.name, df)
})

# STEP 3: the metrics
NBPredict <- data.frame(col1 = vector(), col2 = vector(), col3 = vector())

for (i in mget(ls(pattern = "conf.mat.nb"))) {
  Accuracy <- (i)$overall[1]  # save the accuracy value
  F1_negative <- (2*(i)$byClass[1]*(i)$byClass[3])/((i)$byClass[1]+(i)$byClass[3])  # F1 for negative
  F1_positive <- (2*(i)$byClass[2]*(i)$byClass[4])/((i)$byClass[2]+(i)$byClass[4])  # F1 for positive
  NBPredict <- rbind(NBPredict, cbind(Accuracy, F1_negative, F1_positive))
}
str(NBPredict)

# Let's compute the average values of accuracy and F1 across the 5 folds
acc_nb_avg <- mean(NBPredict[, 1])
f1_nb_avg <- mean(colMeans(NBPredict[-1]))
acc_nb_avg
f1_nb_avg

######################################################
######################################################
# Let's compare the results we got via Naive Bayes and Random Forest
######################################################
######################################################

acc_rf_avg
acc_nb_avg
f1_rf_avg
f1_nb_avg
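# Averages can hide fold-to-fold variation, so it is also worth looking at the
# paired per-fold differences. A minimal sketch (it assumes that RFPredict and
# NBPredict store the folds in the same order, which holds here because both
# loops ran over the same cvFolds object, fold 1 through fold 5):
fold_diff <- RFPredict[, 1] - NBPredict[, 1]  # per-fold accuracy gap
fold_diff  # positive = RF better on that fold; negative = NB better
summary(fold_diff)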
# Let's plot the results!
# Note: we plot the five per-fold values (rather than a single average), so that
# the boxplots actually show the variability across folds
gb1 <- as.data.frame(NBPredict[, 1])
colnames(gb1)[1] <- "Accuracy NB"
gb2 <- as.data.frame(RFPredict[, 1])
colnames(gb2)[1] <- "Accuracy RF"
ac_tot <- cbind(gb1, gb2)
ac_tot
str(ac_tot)
df.long_ac_tot <- melt(ac_tot)
str(df.long_ac_tot)
p <- ggplot(df.long_ac_tot, aes(x = variable, y = value, color = variable)) +
  geom_boxplot() +
  xlab("Algorithm") +
  ylab(label = "Values of Accuracy") +
  labs(title = "Cross-validation with k = 5: values of Accuracy") +
  coord_flip() +
  theme_bw() +
  theme(legend.position = "none")

gb1 <- as.data.frame(rowMeans(NBPredict[-1]))  # per-fold F1, averaged over the two classes
colnames(gb1)[1] <- "F1 NB"
gb2 <- as.data.frame(rowMeans(RFPredict[-1]))
colnames(gb2)[1] <- "F1 RF"
f1_tot <- cbind(gb1, gb2)
f1_tot
str(f1_tot)
df.long_f1_tot <- melt(f1_tot)
str(df.long_f1_tot)
p2 <- ggplot(df.long_f1_tot, aes(x = variable, y = value, color = variable)) +
  geom_boxplot() +
  xlab("Algorithm") +
  ylab(label = "Values of F1") +
  labs(title = "Cross-validation with k = 5: values of F1") +
  coord_flip() +
  theme_bw() +
  theme(legend.position = "none")

grid.arrange(p, p2, nrow = 2)  # plot everything together

# So in this particular case, NB appears to do better than RF!
# But remember: you are employing a training-set of just 500 documents,
# and NB usually does well in this scenario. Moreover...
# Wait a minute! In this example we kept the tuning parameters fixed!
# But is there any other tuning-parameter configuration for RF or NB that would improve the model?
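# A minimal sketch of how you could start exploring that question for RF:
# re-run the same 5-fold CV for a few candidate values of mtry (the number of
# features sampled at each split; for classification, randomForest defaults to
# sqrt(ncol(x))). The grid below is an arbitrary illustrative choice, and the
# code simply reuses the ttrain, folds, k and Dfm_train objects created above.
# The very same template would also let you tune, e.g., laplace for NB.
mtry_grid <- c(10, 20, 40)  # arbitrary candidates; must not exceed ncol(ttrain)
cv_acc <- numeric(length(mtry_grid))
for (m in seq_along(mtry_grid)) {
  fold_acc <- numeric(k)
  for (i in 1:k) {
    tr  <- ttrain[folds$subsets[folds$which != i], ]
    val <- ttrain[folds$subsets[folds$which == i], ]
    set.seed(123)
    fit <- randomForest(y = as.factor(Dfm_train[folds$subsets[folds$which != i], ]@docvars$Sentiment),
                        x = tr, ntree = 100, mtry = mtry_grid[m])
    pred <- predict(fit, newdata = val)
    fold_acc[i] <- mean(pred == Dfm_train[folds$subsets[folds$which == i], ]@docvars$Sentiment)
  }
  cv_acc[m] <- mean(fold_acc)  # average CV accuracy for this mtry value
}
data.frame(mtry = mtry_grid, cv_accuracy = cv_acc)  # pick the mtry with the highest CV accuracy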