rm(list = ls(all = TRUE))
setwd("C:/Users/luigi/Dropbox/TOPIC MODEL")
getwd()

library(quanteda)
library(readtext)
library(caTools)
library(randomForest)
library(caret)
library(naivebayes)
library(car)
library(cvTools)
library(reshape2)
library(dplyr)

#####################################################
# Let's prepare the training set with 3 categories
# (this script works fine for any number of categories > 2)
#####################################################

uk_train <- read.csv("uk_train2.csv")
str(uk_train)

myCorpusTwitterTrain <- corpus(uk_train)
tok2 <- tokens(myCorpusTwitterTrain, remove_punct = TRUE, remove_numbers = TRUE,
               remove_symbols = TRUE, split_hyphens = TRUE, remove_separators = TRUE)
tok2 <- tokens_remove(tok2, stopwords("en"))
tok2 <- tokens_wordstem(tok2)
Dfm_train <- dfm(tok2)

# Let's trim the dfm in order to keep only tokens that appear in 2 or more tweets
# (tweets are very short texts...)
Dfm_train <- dfm_trim(Dfm_train, min_docfreq = 2, verbose = TRUE)
train <- as.matrix(Dfm_train)

# check which documents are left with zero tokens after trimming, then drop them
Dfm_train[ntoken(Dfm_train) == 0, ]
Dfm_train <- Dfm_train[ntoken(Dfm_train) != 0, ]

# our classes
table(Dfm_train@docvars$Sentiment)
# our benchmark: an accuracy of .46 (the share of the largest class)
prop.table(table(Dfm_train@docvars$Sentiment))

######################################################
######################################################
# What are the main changes compared to the script "Lab 8 part 1"?
# We consider the case of a RF, but the same changes apply to all the other scripts
######################################################
######################################################

# STEP 1: create the 5 folds
train <- as.matrix(Dfm_train)
ttrain <- train
set.seed(1234) # set the seed for replicability
k <- 5
folds <- cvFolds(NROW(ttrain), K = k)
str(folds)

# STEP 2: the LOOP
system.time(for (i in 1:k) {
  train <- ttrain[folds$subsets[folds$which != i], ]
  validation <- ttrain[folds$subsets[folds$which == i], ]
  set.seed(123)
  newrf <- randomForest(y = as.factor(Dfm_train[folds$subsets[folds$which != i], ]@docvars$Sentiment),
                        x = train, do.trace = TRUE, ntree = 100)
  set.seed(123)
  newpred <- predict(newrf, newdata = validation)
  class_table <- table("Predictions" = newpred,
                       "Actual" = Dfm_train[folds$subsets[folds$which == i], ]@docvars$Sentiment)
  print(class_table)
  df <- confusionMatrix(class_table, mode = "everything")
  df.name <- paste0("conf.mat.rf", i) # store one confusion matrix per fold (conf.mat.rf1, ..., conf.mat.rf5)
  assign(df.name, df)
})

# STEP 3: the metrics
RFPredict <- data.frame(col1 = vector(), col2 = vector(), col3 = vector(), col4 = vector())

##### FIRST CHANGE
# Why 4 columns now? 1 for accuracy, and 3 for the F1 values of the classes in Sentiment:
# negative, neutral, positive.
# If your outcome variable has a different number of classes, change the number of columns accordingly!

# loop over the confusion-matrix objects created in STEP 2
for (i in mget(ls(pattern = "conf.mat.rf"))) {
  Accuracy <- i$overall[1] # save the accuracy value for this fold
  ##### SECOND CHANGE: the following 4 lines:
  p <- as.data.frame(i$byClass)
  F1_negative <- p$F1[1] # save the F1 value for the class "negative"
  F1_neutral <- p$F1[2]  # save the F1 value for the class "neutral"
  F1_positive <- p$F1[3] # save the F1 value for the class "positive"
  RFPredict <- rbind(RFPredict, cbind(Accuracy, F1_negative, F1_neutral, F1_positive))
}
str(RFPredict)
# you see that we are not doing that well with the class "positive"

# Let's compute the average value for accuracy and F1 across the 5 folds
acc_RF_avg <- mean(RFPredict[, 1])
f1_RF_avg <- mean(colMeans(RFPredict[-1])) # average of the three per-class F1 scores (i.e., the macro-F1)
acc_RF_avg
f1_RF_avg
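
######################################################
# OPTIONAL: a generalized version of STEP 3
######################################################
# The sketch below is NOT part of the original lab flow: it collects accuracy and the
# per-class F1 values without hard-coding the number of classes, so the same code works
# whatever the number of categories in your Sentiment variable. The object names
# conf_list and fold_metrics are just illustrative; it assumes the conf.mat.rf objects
# created in STEP 2 are still in the workspace.

conf_list <- mget(ls(pattern = "conf.mat.rf"))      # the 5 confusion matrices from STEP 2
fold_metrics <- do.call(rbind, lapply(conf_list, function(cm) {
  by_class <- as.data.frame(cm$byClass)             # one row per class
  f1 <- by_class$F1                                 # per-class F1, in class order
  names(f1) <- paste0("F1_", sub("Class: ", "", rownames(by_class)))
  c(Accuracy = unname(cm$overall["Accuracy"]), f1)  # one row of metrics per fold
}))
fold_metrics <- as.data.frame(fold_metrics)
colMeans(fold_metrics, na.rm = TRUE)                # average accuracy and per-class F1 across folds
                                                    # (na.rm in case a class never appears in a fold)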
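
######################################################
# OPTIONAL: visualize the fold-level metrics
######################################################
# A small sketch (again, not part of the original lab flow) that plots the spread of the
# fold-level accuracy and F1 values stored in RFPredict, using reshape2::melt (already
# loaded above). The variable name RFPredict_long is just illustrative.

RFPredict_long <- melt(RFPredict, measure.vars = colnames(RFPredict),
                       variable.name = "metric", value.name = "value")
boxplot(value ~ metric, data = RFPredict_long,
        main = "RF: accuracy and per-class F1 across the 5 folds")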