# Start from a clean slate: remove every object (names starting with a dot included)
# from the current environment. Later steps look objects up by name, so leftovers
# from a previous run would otherwise be picked up.
rm(list=ls(all=TRUE))
# Hard-coded, machine-specific working directory; the relative file paths used
# further down (e.g. the training CSV) are resolved against it.
setwd("C:/Users/luigi/Dropbox/TOPIC MODEL")
# Show the working directory that is now in effect
getwd()
library(quanteda)
library(readtext)
library(caTools)
library(randomForest) 
library(caret)
library(naivebayes)
library(car)
library(cvTools)
library(reshape2)
library(dplyr)

#####################################################
# let's prepare the training-set with 3 categories (this script works fine for any number of categories>2)
#####################################################

# Read the labelled tweets; the file is resolved relative to the working directory
uk_train <- read.csv("uk_train2.csv")
str(uk_train)

# Corpus -> tokens -> document-feature matrix (dfm).
# Punctuation, numbers, symbols and separators are removed, hyphenated words are split,
# English stopwords are dropped and the remaining tokens are stemmed.
myCorpusTwitterTrain <- corpus(uk_train)
tok2 <- tokens(
  myCorpusTwitterTrain,
  remove_punct = TRUE,
  remove_numbers = TRUE,
  remove_symbols = TRUE,
  split_hyphens = TRUE,
  remove_separators = TRUE
)
tok2 <- tokens_remove(tok2, stopwords("en"))
tok2 <- tokens_wordstem(tok2)
Dfm_train <- dfm(tok2)
# Let's trim the dfm in order to keep only tokens that appear in 2 or more tweets (tweets are very short texts...)
Dfm_train <- dfm_trim(Dfm_train, min_docfreq = 2, verbose = TRUE)

# Trimming can leave some tweets without any feature: show them, then drop them.
# NOTE: the dense matrix is deliberately NOT built here. It is only needed for the
# cross-validation and must be created after the empty documents have been removed,
# otherwise its rows would not line up with the filtered dfm.
Dfm_train[ntoken(Dfm_train) == 0, ]
Dfm_train <- Dfm_train[ntoken(Dfm_train) != 0, ]

# our classes (read through the docvars() accessor rather than the internal @docvars slot)
table(docvars(Dfm_train, "Sentiment"))
# our benchmark: the share of the largest class is the accuracy of an
# "always predict the majority class" rule (reported as .46 for this data)
prop.table(table(docvars(Dfm_train, "Sentiment")))

######################################################
######################################################
# What are the main changes compared to the script "Lab 8 part 1"?
# We consider the case of a RF here, but the same changes apply to all the other scripts
######################################################
######################################################

# STEP 1: create the 5 folds

# Dense document-by-feature matrix of the training dfm; `ttrain` is the full copy
# that is later sliced by fold, `train` starts out as the same matrix.
ttrain <- as.matrix(Dfm_train)
train <- ttrain

# Number of folds
k <- 5

# Randomly assign every row of the matrix to one of the k folds
set.seed(1234) # set the seed for replicability
folds <- cvFolds(nrow(ttrain), K = k)
str(folds)

# STEP 2: the LOOP
#
# For each fold i: fit a random forest on the rows of the other k-1 folds, predict the
# rows of fold i, and store the resulting caret confusion matrix in the workspace under
# the name conf.mat.rf<i> (these objects are collected by name in STEP 3).

# Class labels, extracted once and turned into a factor with a fixed set of levels.
# Rows of `ttrain` are in the same order as the documents of `Dfm_train`, so the same
# row indices can be used for both.
fold_labels <- as.factor(docvars(Dfm_train, "Sentiment"))

system.time(for (i in seq_len(k)) {
  # Row indices of the training part and of the held-out part for this fold
  train_rows <- folds$subsets[folds$which != i]
  valid_rows <- folds$subsets[folds$which == i]

  # drop = FALSE keeps a matrix even if a fold happens to contain a single row
  train <- ttrain[train_rows, , drop = FALSE]
  validation <- ttrain[valid_rows, , drop = FALSE]

  set.seed(123)
  # randomForest refuses empty classes, so unused levels are dropped for fitting only
  newrf <- randomForest(
    y = droplevels(fold_labels[train_rows]),
    x = train,
    do.trace = TRUE,
    ntree = 100
  )

  set.seed(123)
  # Re-express the predictions on the full set of levels so that the confusion table is
  # always square, even when a class is missing from the training or validation part
  newpred <- factor(
    as.character(predict(newrf, newdata = validation)),
    levels = levels(fold_labels)
  )
  class_table <- table("Predictions" = newpred, "Actual" = fold_labels[valid_rows])
  print(class_table)

  df <- confusionMatrix(class_table, mode = "everything")
  df.name <- paste0("conf.mat.rf", i)
  assign(df.name, df)
})

# STEP 3: the metrics

# Turn a list of caret confusionMatrix objects into one row of metrics per fold.
#
# conf_mats: list of confusion matrices; each element must provide `$overall` (with an
#            "Accuracy" entry) and `$byClass` (with an "F1" column, one row per class).
# Returns:   a data.frame with a column `Accuracy` followed by one column `F1_<class>`
#            per class (e.g. F1_negative, F1_neutral, F1_positive), so nothing has to be
#            edited by hand when the number of classes changes. Row names are the names
#            of `conf_mats`, if any. An empty input yields an empty table and a warning.
collect_fold_metrics <- function(conf_mats) {
  if (length(conf_mats) == 0L) {
    warning("no confusion matrices supplied; returning an empty metrics table",
            call. = FALSE)
    return(data.frame(Accuracy = numeric(0)))
  }
  rows <- lapply(conf_mats, function(cm) {
    by_class <- cm$byClass
    if (is.matrix(by_class)) {
      # more than 2 classes: one row per class, row names look like "Class: negative"
      f1 <- by_class[, "F1"]
      names(f1) <- paste0("F1_", sub("^Class: ", "", rownames(by_class)))
    } else {
      # 2 classes: caret reports a single named vector of statistics
      f1 <- c(F1 = by_class[["F1"]])
    }
    c(Accuracy = cm$overall[["Accuracy"]], f1)
  })
  # every fold must report the same set of metrics, otherwise rbind would recycle values
  stopifnot(length(unique(lengths(rows))) == 1L)
  as.data.frame(do.call(rbind, rows))
}

# Collect exactly the objects named conf.mat.rf<number> created in STEP 2 (anchored
# pattern, so unrelated objects with a similar name are ignored) in fold order.
conf_mat_names <- ls(pattern = "^conf\\.mat\\.rf[0-9]+$")
conf_mat_folds <- as.integer(sub("^conf\\.mat\\.rf", "", conf_mat_names))
conf_mat_names <- conf_mat_names[order(conf_mat_folds)]

# One row per fold: accuracy in the first column, then the F1 value of each class
RFPredict <- collect_fold_metrics(mget(conf_mat_names))
str(RFPredict)

# you see that we are not doing that well with the class "positive"

# Let's compare the average value for accuracy and f1

# Average accuracy over the folds (first column of RFPredict)
acc_RF_avg <- mean(RFPredict[[1]])
# Average F1: mean over the classes of the per-class fold averages (all columns but the first)
f1_RF_avg <- mean(colMeans(RFPredict[, -1, drop = FALSE]))

acc_RF_avg
f1_RF_avg