rm(list=ls(all=TRUE))
getwd()
setwd("C:/Users/mw/Dropbox (Personale)/TOPIC MODEL")
getwd()
library(e1071)
library(caTools)
library(randomForest)
library(caret)
library(quanteda)
library(readtext)

### TRAIN-TEST
# This dataset is a sample of tweets mentioning the official account of Donald Trump “@realDonaldTrump”, 
# on dates 7–13 June 2016, written in English and coming from the US. 
# Data have been collected through Twitter API also specifying language and origin of tweets. 
# This dataset include a sample of around 482 tweets that have been manually codified by a group
# of students. The coding stage involved detecting the sentiment towards Trump (negative, positive,
# neutral).

x <- read.csv("Trump-orig3.csv", stringsAsFactors=FALSE)
str(x)
table(x$Sentiment)
x$Sentiment <- as.factor(x$Sentiment)
str(x)
table(x$Sentiment)
prop.table(table(x$Sentiment))
myCorpusTrain <- corpus(x)
summary(myCorpusTrain )

### TEST-SET
### This is a sample of 1000 tweets written in English about Trump and published since 1.17.2018 till 1.19.2018
x10 <- read.csv("Trump_tweets2.csv", stringsAsFactors=FALSE)
str(x10)
x10$X <- NULL
myCorpusTest <- corpus(x10)
summary(myCorpusTest)

###########################################
## Prepare the two DFMs (for traing and test set)
###########################################

# get training set (documents in id_train) and compute the dfm out of it
training_corpus <- myCorpusTrain
summary(training_corpus )
training_dfm <- dfm(training_corpus  , remove = c(stopwords("english"), ("pic.twitter.com"), ("amp"), ("rt") ,("tco"), ("co"), ("u"), ("t"), ("s"), ("ed"), ("https")),
                remove_punct = TRUE, remove_numbers=TRUE, tolower = TRUE, remove_symbols=TRUE, remove_twitter = TRUE, 
remove_separators=TRUE, remove_url = TRUE)
topfeatures(training_dfm , 20)  # 20 top words
# Keep terms that appear at least in the 5% or more of the tweets (tweets are very short texts...)
training_dfm <- dfm_trim(training_dfm , min_docfreq= 0.05)

# get test set (documents in id_train) and compute the dfm out of it
test_corpus <- myCorpusTest
summary(test_corpus )
test_dfm <-  dfm(test_corpus , remove = c(stopwords("english"), ("pic.twitter.com"), ("amp"), ("rt") ,("tco"), ("co"), ("u"), ("t"), ("s"), ("ed"), ("https")),
                remove_punct = TRUE, remove_numbers=TRUE, tolower = TRUE, remove_symbols=TRUE, remove_twitter = TRUE, 
remove_separators=TRUE, remove_url = TRUE)
topfeatures(test_dfm , 20)  # 20 top words
# Keep terms that appear at least in the 5% or more of the tweets (tweets are very short texts...)
test_dfm <- dfm_trim(test_dfm , min_docfreq= 0.05)

# Let's make the features identical by passing training_dfm to dfm_select() as a pattern.
test_dfm <- dfm_select(test_dfm, training_dfm)

###########################################
## Try a Naive Bayes model 
###########################################

# train the naive Bayes classifier using textmodel_nb() and a Multinomial distribution
nb <- textmodel_nb(training_dfm, docvars(training_dfm, "Sentiment"), distribution = c("multinomial"))
summary(nb)

predicted_class <- predict(nb, test_dfm)
table(predicted_class)
str(predicted_class)
nb_results <- prop.table(table(predicted_class ))
nb_results

###########################################
## Prepare the two data-frames out of the two DFMs (for traing and test set)
## this is needed for RF, SVM, etc. that work with data.frames not with DFM!
###########################################

# extract a data frame from the training DFM
train_df <- as.data.frame(as.matrix(training_dfm))
str(train_df)
colnames(train_df) <- make.names(colnames(train_df), unique=TRUE)

# add the column of the Sentiment included in the training-set back to the training-set data frame
table(training_corpus$documents$Sentiment)
train_df$sentiment<- training_corpus$documents$Sentiment
table(train_df$sentiment)
str(train_df$sentiment)

# extract a data frame from the test DFM
test_df <- as.data.frame(as.matrix(test_dfm))
colnames(test_df) <- make.names(colnames(test_df), unique=TRUE)

###########################################
## Try a Random Forest model 
###########################################

# train the Random Forest classifier
set.seed(123)
RF <- randomForest(sentiment~ ., data=train_df, type="classification")

predictRF <- predict(RF, newdata=test_df)
table( predictRF)
rf_results <- prop.table(table(predictRF ))
rf_results

###########################################
## Try a Support Vector Machine model 
###########################################

# train the SVM classifier
set.seed(123)
SVM <- svm(sentiment~ ., data=train_df, method = "C-classification", kernel='linear', cost=1)

predictSVM  <- predict(SVM , newdata=test_df)
table( predictSVM )
svm_results <- prop.table(table(predictSVM  ))
svm_results

#######################################
#### plot NB, RF and SVM results
#######################################

myFrame1 <- as.data.frame(prop.table(table(predictRF )))
str(myFrame1)
colnames(myFrame1)[1] <- "Prediction"
myFrame1$class <- c("RF", "RF", "RF")
str(myFrame1)

myFrame2 <- as.data.frame(prop.table(table(predicted_class )))
str(myFrame2)
colnames(myFrame2)[1] <- "Prediction"
myFrame2$class <- c("NB", "NB", "NB")
str(myFrame2)

myFrame1 <- as.data.frame(prop.table(table(predictRF )))
str(myFrame1)
colnames(myFrame1)[1] <- "Prediction"
myFrame1$class <- c("RF", "RF", "RF")
str(myFrame1)

myFrame3 <- as.data.frame(prop.table(table(predictSVM )))
colnames(myFrame3)[1] <- "Prediction"
myFrame3$class <- c("SVM", "SVM", "SVM")
str(myFrame3)

myFrame_tot <- rbind(myFrame1, myFrame2, myFrame3)
str(myFrame_tot)

library(reshape2)
df.long<-melt(myFrame_tot,id.vars=c("class", "Freq"))
str(df.long)

ggplot(df.long,aes(class,Freq,fill=value))+
 geom_bar(position="dodge",stat="identity") + theme(axis.text.x = element_text(color="#993333", size=10, angle=90)) + 
coord_flip() +  
ylab(label="Frequency") +  xlab("Algorithm") + ggtitle("RF vs. Naive Bayes vs. SVM predictions for the test-set")

# which of the three algorithm to choose? Do some cross-validation!!!

###########################################
# cross-validation with NB with K=4 (each TS of 118)
###########################################

summary( myCorpusTrain)
N <- ndoc(myCorpusTrain)
# Number of desired splits
folds <- 4
# Generate indices of holdout observations
holdout <- split(sample(1:N), 1:folds)
str(holdout)
# Check that each observation appears exactly once in the holdout object:
holdout %>% unlist() %>% length() == N
holdout[[1]]

# create docvar with ID
docvars(myCorpusTrain, "id_numeric") <- 1:ndoc(myCorpusTrain)
summary(myCorpusTrain)

# get training set_other1 (all the documents NOT in holdout[[1]]) and compute the dfm out of it
tr_other1 <- corpus_subset(myCorpusTrain,  !id_numeric %in% holdout[[1]])
tr_other1_dfm <- dfm(tr_other1  , remove = c(stopwords("english"), ("amp"), ("rt") ,("tco"), ("co"), ("u"), ("t"), ("s"), ("ed"), ("https")),
                remove_punct = TRUE, remove_numbers=TRUE, tolower = TRUE, remove_symbols=TRUE, remove_twitter = TRUE, 
remove_separators=TRUE, remove_url = TRUE)
tr_other1_dfm <- dfm_trim(tr_other1_dfm , min_docfreq= 0.05)

# get training set_other2 (all the documents NOT in holdout[[2]]) and compute the dfm out of it
tr_other2 <- corpus_subset(myCorpusTrain,  !id_numeric %in% holdout[[2]])
tr_other2_dfm <- dfm(tr_other2  , remove = c(stopwords("english"), ("amp"), ("rt") ,("tco"), ("co"), ("u"), ("t"), ("s"), ("ed"), ("https")),
                remove_punct = TRUE, remove_numbers=TRUE, tolower = TRUE, remove_symbols=TRUE, remove_twitter = TRUE, 
remove_separators=TRUE, remove_url = TRUE)
tr_other2_dfm <- dfm_trim(tr_other2_dfm , min_docfreq= 0.05)

# get training set_other3 (all the documents NOT in holdout[[3]]) and compute the dfm out of it
tr_other3 <- corpus_subset(myCorpusTrain,  !id_numeric %in% holdout[[3]])
tr_other3_dfm <- dfm(tr_other3  , remove = c(stopwords("english"), ("amp"), ("rt") ,("tco"), ("co"), ("u"), ("t"), ("s"), ("ed"), ("https")),
                remove_punct = TRUE, remove_numbers=TRUE, tolower = TRUE, remove_symbols=TRUE, remove_twitter = TRUE, 
remove_separators=TRUE, remove_url = TRUE)
tr_other3_dfm <- dfm_trim(tr_other3_dfm , min_docfreq= 0.05)

# get training set_other4 (all the documents NOT in holdout[[4]]) and compute the dfm out of it
tr_other4 <- corpus_subset(myCorpusTrain,  !id_numeric %in% holdout[[4]])
tr_other4_dfm <- dfm(tr_other4  , remove = c(stopwords("english"), ("amp"), ("rt") ,("tco"), ("co"), ("u"), ("t"), ("s"), ("ed"), ("https")),
                remove_punct = TRUE, remove_numbers=TRUE, tolower = TRUE, remove_symbols=TRUE, remove_twitter = TRUE, 
remove_separators=TRUE, remove_url = TRUE)
tr_other4_dfm <- dfm_trim(tr_other4_dfm , min_docfreq= 0.05)

# get training set1 (all the documents in holdout[[1]]) and compute the dfm out of it
tr_1 <- corpus_subset(myCorpusTrain,  id_numeric %in% holdout[[1]])
tr_1_dfm <- dfm(tr_1   , remove = c(stopwords("english"), ("amp"), ("rt") ,("tco"), ("co"), ("u"), ("t"), ("s"), ("ed"), ("https")),
                remove_punct = TRUE, remove_numbers=TRUE, tolower = TRUE, remove_symbols=TRUE, remove_twitter = TRUE, 
remove_separators=TRUE, remove_url = TRUE)
tr_1_dfm <- dfm_trim(tr_1_dfm , min_docfreq= 0.05)

# get training set2 (all the documents in holdout[[2]]) and compute the dfm out of it
tr_2 <- corpus_subset(myCorpusTrain,  id_numeric %in% holdout[[2]])
tr_2_dfm <- dfm(tr_2   , remove = c(stopwords("english"), ("amp"), ("rt") ,("tco"), ("co"), ("u"), ("t"), ("s"), ("ed"), ("https")),
                remove_punct = TRUE, remove_numbers=TRUE, tolower = TRUE, remove_symbols=TRUE, remove_twitter = TRUE, 
remove_separators=TRUE, remove_url = TRUE)
tr_2_dfm <- dfm_trim(tr_2_dfm , min_docfreq= 0.05)

# get training set3 (all the documents in holdout[[3]]) and compute the dfm out of it
tr_3 <- corpus_subset(myCorpusTrain,  id_numeric %in% holdout[[3]])
tr_3_dfm <- dfm(tr_3   , remove = c(stopwords("english"), ("amp"), ("rt") ,("tco"), ("co"), ("u"), ("t"), ("s"), ("ed"), ("https")),
                remove_punct = TRUE, remove_numbers=TRUE, tolower = TRUE, remove_symbols=TRUE, remove_twitter = TRUE, 
remove_separators=TRUE, remove_url = TRUE)
tr_3_dfm <- dfm_trim(tr_3_dfm , min_docfreq= 0.05)

# get training set4 (all the documents in holdout[[4]]) and compute the dfm out of it
tr_4 <- corpus_subset(myCorpusTrain,  id_numeric %in% holdout[[4]])
tr_4_dfm <- dfm(tr_4   , remove = c(stopwords("english"), ("amp"), ("rt") ,("tco"), ("co"), ("u"), ("t"), ("s"), ("ed"), ("https")),
                remove_punct = TRUE, remove_numbers=TRUE, tolower = TRUE, remove_symbols=TRUE, remove_twitter = TRUE, 
remove_separators=TRUE, remove_url = TRUE)
tr_4_dfm <- dfm_trim(tr_4_dfm , min_docfreq= 0.05)

#######################################
# K=1 (train 1) NB
#######################################
nb <- textmodel_nb(tr_other1_dfm , docvars(tr_other1_dfm, "Sentiment"), distribution = c("multinomial"))
summary(nb)
# Let's make the features identical by passing training_dfm to dfm_select() as a pattern.
test_dfm <- dfm_select(tr_1_dfm, tr_other1_dfm)
# Let’s inspect how well the classification worked
actual_class1 <- docvars(test_dfm, "Sentiment")
predicted_class1 <- predict(nb, test_dfm)
table(predicted_class1)
prop.table(table(predicted_class1 ))
class_table1 <- table(actual_class1, predicted_class1)
class_table1
confusionMatrix(class_table1, mode = "everything")

#######################################
# K=2 (train 2) NB
#######################################
nb <- textmodel_nb(tr_other2_dfm , docvars(tr_other2_dfm, "Sentiment"), distribution = c("multinomial"))
summary(nb)
# Let's make the features identical by passing training_dfm to dfm_select() as a pattern.
test_dfm <- dfm_select(tr_2_dfm, tr_other2_dfm)
# Let’s inspect how well the classification worked
actual_class2 <- docvars(test_dfm, "Sentiment")
predicted_class2 <- predict(nb, test_dfm)
table(predicted_class2)
prop.table(table(predicted_class2 ))
class_table2 <- table(actual_class2, predicted_class2)
class_table2
confusionMatrix(class_table2, mode = "everything")

#######################################
# K=3 (train 3) NB
#######################################
nb <- textmodel_nb(tr_other3_dfm , docvars(tr_other3_dfm, "Sentiment"), distribution = c("multinomial"))
summary(nb)
# Let's make the features identical by passing training_dfm to dfm_select() as a pattern.
test_dfm <- dfm_select(tr_3_dfm, tr_other3_dfm)
# Let’s inspect how well the classification worked
actual_class3 <- docvars(test_dfm, "Sentiment")
predicted_class3 <- predict(nb, test_dfm)
table(predicted_class3)
prop.table(table(predicted_class3 ))
class_table3 <- table(actual_class3, predicted_class3)
class_table3
confusionMatrix(class_table3, mode = "everything")

#######################################
# K=4 (train 4) NB
#######################################
nb <- textmodel_nb(tr_other4_dfm , docvars(tr_other4_dfm, "Sentiment"), distribution = c("multinomial"))
summary(nb)
# Let's make the features identical by passing training_dfm to dfm_select() as a pattern.
test_dfm <- dfm_select(tr_4_dfm, tr_other4_dfm)
# Let’s inspect how well the classification worked
actual_class4 <- docvars(test_dfm, "Sentiment")
predicted_class4 <- predict(nb, test_dfm)
table(predicted_class4)
prop.table(table(predicted_class4 ))
class_table4 <- table(actual_class4, predicted_class4)
class_table4
confusionMatrix(class_table4, mode = "everything")

#######################################
# NB cross-validation accuracy, precision & recall for each class in Sentiment
#######################################

cm1 <- confusionMatrix(class_table1, mode = "everything")
cm2 <-confusionMatrix(class_table2, mode = "everything")
cm3 <-confusionMatrix(class_table3, mode = "everything")
cm4 <-confusionMatrix(class_table4, mode = "everything")

str(cm1)
str(cm1$overall)
(cm1$overall[1]+cm2$overall[1]+cm3$overall[1]+cm4$overall[1])/4
accuracy_mean_nb <- rbind(cm1$overall[1], cm2$overall[1], cm3$overall[1], cm4$overall[1])
accuracy_mean_nb
accuracy_nb_avg <- mean(accuracy_mean_nb)
accuracy_nb_sd <-sd(accuracy_mean_nb)

# Recall (=Sensitivity; True Positive/Actual Positive or TP/(TP+FN))
# Precision (=Pos Pred Value; True Positive/Predicted Positive or TP/(TP+FP))

confusionMatrix(class_table1, mode = "everything")
str(cm1$byClass)
cm1$byClass[1:3] # recall row
cm1$byClass[7:9] # precision row

precision_nb <-  as.data.frame(cbind(cm1$byClass[7:9], cm2$byClass[7:9], cm3$byClass[7:9], cm4$byClass[7:9]))
precision_nb$sentiment <- c("Negative", "Neutral", "Positive") 
str(precision_nb )
precision_nb [1:4]
precision_nb_avg <- as.data.frame(rowMeans(precision_nb [1:4], na.rm = TRUE))
precision_nb_avg $sentiment <- c("Negative", "Neutral", "Positive") 
str(precision_nb_avg)
colnames(precision_nb_avg)[1] <- "Precision NB"
str(precision_nb_avg)

recall_nb <-  as.data.frame(cbind(cm1$byClass[1:3], cm2$byClass[1:3], cm3$byClass[1:3], cm4$byClass[1:3]))
recall_nb$sentiment <- c("Negative", "Neutral", "Positive") 
str(recall_nb )
recall_nb [1:4]
recall_nb_avg <- as.data.frame(rowMeans(recall_nb [1:4], na.rm = TRUE))
recall_nb_avg $sentiment <- c("Negative", "Neutral", "Positive") 
str(recall_nb_avg)
colnames(recall_nb_avg)[1] <- "Recall NB"
str(recall_nb_avg)

accuracy_nb_avg
accuracy_nb_sd
precision_nb_avg
recall_nb_avg

###########################################
# cross-validation with RF with K=4 (each TS of 118)
###########################################

N <- nrow(train_df)
N
# Number of desired splits
folds <- 4
# Generate indices of holdout observations
holdout <- split(sample(1:N), 1:folds)
str(holdout)
# Check that each observation appears exactly once in the holdout object:
holdout %>% unlist() %>% length() == N
holdout[[1]]

# training set with all the data but the ones included in the sampled train 1
data=train_df[-holdout$`1`,]
nrow(data)
472-118

# training set with only the data included in the sampled train 1
newdata=train_df[holdout$`1`,]
nrow(data)

#######################################
# K=1 (train 1) RF
#######################################

set.seed(123)
system.time(RF <- randomForest(sentiment~ ., data=train_df[-holdout$`1`,], type="classification"))
predictRF1 <- predict(RF, newdata=train_df[holdout$`1`,])
table( predictRF1)
rf_results1 <- prop.table(table(predictRF1 ))
rf_results1
newdata<-train_df[holdout$`1`,]
table("Predictions"= predictRF1, "Actual"=newdata$sentiment)
# Let's use the confusionMatrix command 
conf.rf1 <- confusionMatrix( predictRF1, newdata$sentiment)
conf.rf1 

#######################################
# K=2 (train 2) RF
#######################################

set.seed(123)
system.time(RF <- randomForest(sentiment~ ., data=train_df[-holdout$`2`,], type="classification"))
predictRF2 <- predict(RF, newdata=train_df[holdout$`2`,])
table( predictRF2)
rf_results2 <- prop.table(table(predictRF2 ))
rf_results2
newdata<-train_df[holdout$`2`,]
table("Predictions"= predictRF2, "Actual"=newdata$sentiment)
# Let's use the confusionMatrix command 
conf.rf2 <- confusionMatrix( predictRF2, newdata$sentiment)
conf.rf2 

#######################################
# K=3 (train 3) RF
#######################################

set.seed(123)
system.time(RF <- randomForest(sentiment~ ., data=train_df[-holdout$`3`,], type="classification"))
predictRF3 <- predict(RF, newdata=train_df[holdout$`3`,])
table( predictRF3)
rf_results3 <- prop.table(table(predictRF3 ))
rf_results3
newdata<-train_df[holdout$`3`,]
table("Predictions"= predictRF3, "Actual"=newdata$sentiment)
# Let's use the confusionMatrix command 
conf.rf3 <- confusionMatrix( predictRF3, newdata$sentiment)
conf.rf3 

#######################################
# K=4 (train 4) RF
#######################################

set.seed(123)
system.time(RF <- randomForest(sentiment~ ., data=train_df[-holdout$`4`,], type="classification"))
predictRF4 <- predict(RF, newdata=train_df[holdout$`4`,])
table( predictRF4)
rf_results4 <- prop.table(table(predictRF4 ))
rf_results4
newdata<-train_df[holdout$`4`,]
table("Predictions"= predictRF4, "Actual"=newdata$sentiment)
# Let's use the confusionMatrix command 
conf.rf4 <- confusionMatrix( predictRF4, newdata$sentiment)
conf.rf4

#######################################
# RF cross-validation accuracy, precision & recall for each class in Sentiment
#######################################

(conf.rf1$overall[1] + conf.rf2$overall[1] + conf.rf3$overall[1] + conf.rf4$overall[1])/4
accuracy_mean_rf <- rbind(conf.rf1$overall[1], conf.rf2$overall[1], conf.rf3$overall[1], conf.rf4$overall[1])
accuracy_mean_rf
accuracy_rf_avg <- mean(accuracy_mean_rf)
accuracy_rf_sd <- sd(accuracy_mean_rf) 

recall_rf<-  as.data.frame(cbind(conf.rf1$byClass[1:3], conf.rf2$byClass[1:3], conf.rf3$byClass[1:3], conf.rf4$byClass[1:3]))
recall_rf$sentiment <- c("Negative", "Neutral", "Positive") 
str(recall_rf)
recall_rf[1:4]
recall_rf_avg <- as.data.frame(rowMeans(recall_rf[1:4], na.rm = TRUE))
recall_rf_avg $sentiment <- c("Negative", "Neutral", "Positive") 
colnames(recall_rf_avg)[1] <- "Recall RF"
str(recall_rf_avg)

precision_rf <-  as.data.frame(cbind(conf.rf1$byClass[7:9], conf.rf2$byClass[7:9], conf.rf3$byClass[7:9],  conf.rf4$byClass[7:9]))
precision_rf$sentiment <- c("Negative", "Neutral", "Positive") 
str(precision_rf)
precision_rf[1:4]
precision_rf_avg <- as.data.frame(rowMeans(precision_rf[1:4], na.rm = TRUE))
precision_rf_avg $sentiment <- c("Negative", "Neutral", "Positive") 
colnames(precision_rf_avg)[1] <- "Precision RF"
str(precision_rf_avg)

accuracy_rf_avg
accuracy_rf_sd 
recall_rf_avg
precision_rf_avg

###########################################
# cross-validation with SVM with K=4 (each TS of 118)
###########################################

#######################################
# K=1 (train 1) SVM
#######################################

set.seed(123)
system.time(SVM <- svm(sentiment~ ., data=train_df[-holdout$`1`,], method = "C-classification", kernel='linear', cost=1))
predictSVM1 <- predict(SVM, newdata=train_df[holdout$`1`,])
table( predictSVM1)
svm_results1 <- prop.table(table(predictSVM1 ))
svm_results1
newdata<-train_df[holdout$`1`,]
table("Predictions"= predictSVM1, "Actual"=newdata$sentiment)
# Let's use the confusionMatrix command 
conf.svm1 <- confusionMatrix( predictSVM1, newdata$sentiment)
conf.svm1 

#######################################
# K=2 (train 2) SVM
#######################################

set.seed(123)
system.time(SVM <- svm(sentiment~ ., data=train_df[-holdout$`2`,], method = "C-classification", kernel='linear', cost=1))
predictSVM2 <- predict(SVM, newdata=train_df[holdout$`2`,])
table( predictSVM2)
svm_results2 <- prop.table(table(predictSVM2 ))
svm_results2
newdata<-train_df[holdout$`2`,]
table("Predictions"= predictSVM2, "Actual"=newdata$sentiment)
# Let's use the confusionMatrix command 
conf.svm2 <- confusionMatrix( predictSVM2, newdata$sentiment)
conf.svm2 

#######################################
# K=3 (train 3) SVM
#######################################

set.seed(123)
system.time(SVM <- svm(sentiment~ ., data=train_df[-holdout$`3`,], method = "C-classification", kernel='linear', cost=1))
predictSVM3 <- predict(SVM, newdata=train_df[holdout$`3`,])
table( predictSVM3)
svm_results3 <- prop.table(table(predictSVM3 ))
svm_results3
newdata<-train_df[holdout$`3`,]
table("Predictions"= predictSVM3, "Actual"=newdata$sentiment)
# Let's use the confusionMatrix command 
conf.svm3 <- confusionMatrix( predictSVM3, newdata$sentiment)
conf.svm3 

#######################################
# K=4 (train 4) SVM
#######################################

set.seed(123)
system.time(SVM <- svm(sentiment~ ., data=train_df[-holdout$`4`,], method = "C-classification", kernel='linear', cost=1))
predictSVM4 <- predict(SVM, newdata=train_df[holdout$`4`,])
table( predictSVM4)
svm_results4 <- prop.table(table(predictSVM4 ))
svm_results4
newdata<-train_df[holdout$`4`,]
table("Predictions"= predictSVM4, "Actual"=newdata$sentiment)
# Let's use the confusionMatrix command 
conf.svm4 <- confusionMatrix( predictSVM4, newdata$sentiment)
conf.svm4 

#######################################
# SVM cross-validation accuracy, precision & recall for each class in Sentiment
#######################################

(conf.svm1$overall[1] + conf.svm2$overall[1] + conf.svm3$overall[1] + conf.svm4$overall[1])/4
accuracy_mean_svm <- rbind(conf.svm1$overall[1], conf.svm2$overall[1], conf.svm3$overall[1], conf.svm4$overall[1])
accuracy_mean_svm
accuracy_svm_avg <- mean(accuracy_mean_svm)
accuracy_svm_sd <- sd(accuracy_mean_svm) 

recall_svm<-  as.data.frame(cbind(conf.svm1$byClass[1:3], conf.svm2$byClass[1:3], conf.svm3$byClass[1:3], conf.svm4$byClass[1:3]))
recall_svm$sentiment <- c("Negative", "Neutral", "Positive") 
str(recall_svm)
recall_svm[1:4]
recall_svm_avg <- as.data.frame(rowMeans(recall_svm[1:4], na.rm = TRUE))
recall_svm_avg $sentiment <- c("Negative", "Neutral", "Positive") 
colnames(recall_svm_avg)[1] <- "Recall SVM"
str(recall_svm_avg)

precision_svm <-  as.data.frame(cbind(conf.svm1$byClass[7:9], conf.svm2$byClass[7:9], conf.svm3$byClass[7:9],  conf.svm4$byClass[7:9]))
precision_svm$sentiment <- c("Negative", "Neutral", "Positive") 
str(precision_svm)
precision_svm[1:4]
precision_svm_avg <- as.data.frame(rowMeans(precision_svm[1:4], na.rm = TRUE))
precision_svm_avg $sentiment <- c("Negative", "Neutral", "Positive") 
colnames(precision_svm_avg)[1] <- "Precision SVM"
str(precision_svm_avg)

accuracy_svm_avg
accuracy_svm_sd 
recall_svm_avg
precision_svm_avg

######################
######################
## comparing NB, RF and SVM cross validation and plotting the results
######################
######################

gb1 <- as.data.frame(accuracy_mean_rf )
colnames(gb1)[1] <- "Accuracy RF"
gb2 <- as.data.frame(accuracy_mean_nb )
colnames(gb2)[1] <- "Accuracy NB"
gb3 <- as.data.frame(accuracy_mean_svm )
colnames(gb3)[1] <- "Accuracy SVM"

gb_tot <- cbind(gb1, gb2, gb3)
gb_tot
str(gb_tot)
df.long_gb_tot<-melt(gb_tot)
str(df.long_gb_tot)

ggplot(df.long_gb_tot, aes(x=variable, y=value)) + 
  geom_boxplot() + coord_flip() + xlab("Algorithm") + ylab(label="Value") + 
ggtitle("RF vs. Naive Bayes vs. SVM K-fold cross-validation (K=4): Accuracy")

p2 <- ggplot(df.long_gb_tot, aes(x=variable, y=value)) + 
  geom_boxplot() + coord_flip() + xlab("Algorithm") + ylab(label="Value") + 
ggtitle("RF vs. Naive Bayes K-fold vs. SVM cross-validation (K=4): Accuracy")

cv_rf <- merge(recall_rf_avg, precision_rf_avg, by=c("sentiment")) 
str(cv_rf)
cv_nb <- merge(recall_nb_avg, precision_nb_avg, by=c("sentiment")) 
cv_svm <- merge(recall_svm_avg, precision_svm_avg, by=c("sentiment")) 
cv <- merge(cv_rf, cv_nb, by=c("sentiment")) 
cv <- merge(cv, cv_svm, by=c("sentiment")) 
str(cv)
cv<- cv[c(1,3,5,7,2, 4, 6)]
str(cv)

df.long2<-melt(cv,id.vars=c("sentiment"))
str(df.long2)

p <- ggplot(df.long2,aes(variable,value,fill=sentiment))+
 geom_bar(position="dodge",stat="identity") + theme(axis.text.x = element_text(color="#993333", size=10, angle=90)) + 
coord_flip() +  
ylab(label="Frequency") +  xlab("Algorithm") + ggtitle("RF vs. Naive Bayes vs. SVM K-fold cross-validation (K=4): Precision and Recall")

# Plot everything together
grid.arrange(p2, p,
             nrow=2)

library(gridExtra)
grid.arrange(
  tableGrob(cv, theme=tt1))
grid.table(cv)
tt1 <- ttheme_default()
t <-tableGrob(cv, theme=tt1)

# Plot chart and table into one object
grid.arrange(p, t,
             nrow=2,
             as.table=TRUE,
             heights=c(3,1))

# Plot together chart and table
 ggplot(df.long2,aes(variable,value,fill=sentiment))+
 geom_bar(position="dodge",stat="identity") + theme(axis.text.x = element_text(color="#993333", size=10, angle=90)) + 
coord_flip() +  
ylab(label="Frequency") +  xlab("Algorithm") + ggtitle("RF vs. Naive Bayes K-fold cross-validation (K=4)") +
annotation_custom(tableGrob(cv), xmin=0.5, xmax=1, ymin=0.5, ymax=1)