rm(list=ls(all=TRUE))
getwd()
setwd("C:/Users/mw/Dropbox (VOICES)/TOPIC MODEL")
getwd()
library(quanteda)
library(readtext)
library(caTools)
library(e1071)
library(randomForest)
library(caret)
library(stringr)
library(car)

#### let's repeat all the steps discussed the last time to build the training and the test for NB, RF and SVM

################################################################
# FIRST STEP: create the DFM for the training-set
################################################################

x11 <- read.csv("trainTrump.csv", stringsAsFactors=FALSE)
x11$text <- str_replace_all(x11$text, "[^[:alnum:]]", " ")
myCorpusTwitterTrain <- corpus(x11)
Dfm_train<- dfm(myCorpusTwitterTrain , remove = c(stopwords("english"), ("amp"), ("rt") ,("tco"), ("co"), ("u"), ("t"), ("s"), ("ed"), ("https"), ("ā"),
("com"), ("ly")), remove_punct = TRUE, remove_numbers=TRUE, tolower = TRUE, remove_symbols=TRUE, remove_twitter = TRUE, 
remove_separators=TRUE, remove_url = TRUE)
Dfm_train <- dfm_trim(Dfm_train , min_docfreq = 2, verbose=TRUE)

################################################################
# SECOND STEP: create the DFM for the test-set
################################################################

x10 <- read.csv("testTrump.csv", stringsAsFactors=FALSE)
x10$text <- str_replace_all(x10$text, "[^[:alnum:]]", " ")
myCorpusTwitterTest <- corpus(x10)
Dfm_test<- dfm(myCorpusTwitterTest, remove = c(stopwords("english"), ("amp"), ("rt") ,("tco"), ("co"), ("u"), ("t"), ("s"), ("ed"), ("https"), ("ā"),
("com"), ("ly")),  remove_punct = TRUE, remove_numbers=TRUE, tolower = TRUE, remove_symbols=TRUE, remove_twitter = TRUE, 
remove_separators=TRUE, remove_url = TRUE)
Dfm_test<- dfm_trim(Dfm_test, min_docfreq = 2, verbose=TRUE)

################################################################
# THIRD STEP: Let's make the features identical between train and test-set by passing Dfm_train to dfm_match() as a pattern.
################################################################
test_dfm  <- dfm_match(Dfm_test, features = featnames(Dfm_train))

################################################################
# FOURTH STEP/B
# transform both dfm (train and test) in a data frame
################################################################

train <- as.data.frame(as.matrix(Dfm_train))
test <- as.data.frame(as.matrix(test_dfm))
colnames(train ) <- make.names(colnames(train ))
colnames(test ) <- make.names(colnames(test ))

######################################################
######################################################
# Let's make CROSS-VALIDATION with k-fold=2 as an example
######################################################
######################################################

######################################################
# Let's start with NB
######################################################

######################################################
# STEP 1: divide the training-set randomly according to the number of k (here k=2)
######################################################

N <- ndoc(myCorpusTwitterTrain ) # number of documents in the training-set
k <- 2 # Number of desired folds
# Generate indices of holdout observations
set.seed(123)
holdout <- split(sample(1:N), 1:k)
str(holdout)
holdout[[1]]
holdout[[2]]
# Check that each observation appears exactly once in the holdout object:
holdout %>% unlist() %>% length() == N

######################################################
# STEP 2: create the dfm for BOTH training-sets
######################################################

head(summary(myCorpusTwitterTrain ))
docvars(myCorpusTwitterTrain , "id_numeric") <- 1:ndoc(myCorpusTwitterTrain ) # create docvar with ID
head(summary(myCorpusTwitterTrain ))

# get training set1 (all the documents in holdout[[1]]) and compute the dfm out of it
tr_1_dfm <-  dfm(corpus_subset(myCorpusTwitterTrain , id_numeric %in% holdout[[1]]), remove = c(stopwords("english"), ("amp"), ("rt") ,("tco"), ("co"), ("u"), ("t"), ("s"), ("ed"), ("https"), ("ā"),
("com"), ("ly")),remove_punct = TRUE, remove_numbers=TRUE, tolower = TRUE, remove_symbols=TRUE, remove_twitter = TRUE, 
remove_separators=TRUE, remove_url = TRUE)
tr_1_dfm <- dfm_trim(tr_1_dfm ,min_docfreq= 2, verbose=TRUE)

# get training set_other1 (all the documents NOT in holdout[[1]]) and compute the dfm out of it
tr_other1_dfm <- dfm(corpus_subset(myCorpusTwitterTrain ,  !id_numeric %in% holdout[[1]]), remove = c(stopwords("english"), ("amp"), ("rt") ,("tco"), ("co"), ("u"), ("t"), ("s"), ("ed"), ("https"), ("ā"),
("com"), ("ly")),remove_punct = TRUE, remove_numbers=TRUE, tolower = TRUE, remove_symbols=TRUE, remove_twitter = TRUE, 
remove_separators=TRUE, remove_url = TRUE)
tr_other1_dfm <- dfm_trim(tr_other1_dfm , min_docfreq= 2, verbose=TRUE)

# get training set2 (all the documents in holdout[[2]]) and compute the dfm out of it
tr_2_dfm <-  dfm(corpus_subset(myCorpusTwitterTrain ,  id_numeric %in% holdout[[2]]), remove = c(stopwords("english"), ("amp"), ("rt") ,("tco"), ("co"), ("u"), ("t"), ("s"), ("ed"), ("https"), ("ā"),
("com"), ("ly")),remove_punct = TRUE, remove_numbers=TRUE, tolower = TRUE, remove_symbols=TRUE, remove_twitter = TRUE, 
remove_separators=TRUE, remove_url = TRUE)
tr_2_dfm <- dfm_trim(tr_2_dfm ,min_docfreq= 2, verbose=TRUE)

# get training set_other2 (all the documents NOT in holdout[[2]]) and compute the dfm out of it
tr_other2_dfm <- dfm(corpus_subset(myCorpusTwitterTrain ,  !id_numeric %in% holdout[[2]]), remove = c(stopwords("english"), ("amp"), ("rt") ,("tco"), ("co"), ("u"), ("t"), ("s"), ("ed"), ("https"), ("ā"),
("com"), ("ly")),remove_punct = TRUE, remove_numbers=TRUE, tolower = TRUE, remove_symbols=TRUE, remove_twitter = TRUE, 
remove_separators=TRUE, remove_url = TRUE)
tr_other2_dfm <- dfm_trim(tr_other2_dfm , min_docfreq= 2, verbose=TRUE)

######################################################
# STEP 3: estimate the results with k=2
######################################################

#######################################
# K=1 NB (we use as a training-set all the documents NOT in holdout(1), that is tr_other1_dfm, 
# to predict the documents of the test-set, i.e., the documents in holdout(1), that is tr_1_dfm)
#######################################
nb <- textmodel_nb(tr_other1_dfm , docvars(tr_other1_dfm, "Sentiment"), distribution = c("multinomial"))
summary(nb)
# Let's make the features identical by passing training_dfm to dfm_select() as a pattern.
test_dfm <- dfm_select(tr_1_dfm, tr_other1_dfm)
# Let’s inspect how well the classification worked
actual_class1 <- docvars(test_dfm, "Sentiment")
predicted_class1 <- predict(nb, test_dfm)
table(actual_class1 ) # actual values presented in the test set
table(predicted_class1 ) # the predicted values 
class_table1 <- table(predicted_class1, actual_class1) # let's put together the predicted and actual values
class_table1
confusionMatrix(class_table1, mode = "everything")

#######################################
# K=2 NB (we use as a training-set all the documents NOT in holdout(2), that is tr_other2_dfm, 
# to predict the documents of the test-set, i.e., the documents in holdout(2), that is tr_2_dfm)
#######################################
nb <- textmodel_nb(tr_other2_dfm , docvars(tr_other2_dfm, "Sentiment"), distribution = c("multinomial"))
summary(nb)
# Let's make the features identical by passing training_dfm to dfm_select() as a pattern.
test_dfm <- dfm_select(tr_2_dfm, tr_other2_dfm)
# Let’s inspect how well the classification worked
actual_class2 <- docvars(test_dfm, "Sentiment")
predicted_class2 <- predict(nb, test_dfm)
table(predicted_class2)
prop.table(table(predicted_class2 ))
class_table2 <- table( predicted_class2, actual_class2)
class_table2
confusionMatrix(class_table2, mode = "everything")

#######################################
# Estimating CV performance metrics for NB
#######################################
conf.mat_nb1 <- confusionMatrix(class_table1, mode = "everything")
conf.mat_nb2 <- confusionMatrix(class_table2, mode = "everything")

res1_nb <- as.data.frame(conf.mat_nb1 $byClass)
res2_nb <- as.data.frame(conf.mat_nb2 $byClass)

res_accuracy <- data.frame(
   accuracy1=conf.mat_nb1$overall[1],
  accuracy2=conf.mat_nb2$overall[1]
)

res_precision <- data.frame(
  precision1=res1_nb$Precision,
    precision2=res2_nb$Precision
)

res_recall <- data.frame(
   recall1=res1_nb$Recall,
  recall2=res2_nb$Recall
)

res_f1 <- data.frame(
   f1_1=res1_nb$F1,
  f1_2=res2_nb$F1
)

rownames(res_precision ) <- rownames(res1_nb)
rownames(res_recall) <- rownames(res1_nb)
rownames(res_f1) <- rownames(res1_nb)

res_accuracy
res_precision
res_recall
res_f1

acc_nb <- rowMeans(res_accuracy ) # let's estimate the overall accuracy
f1_nb <- mean(rowMeans(res_f1 ))  # let's estimate the overall F1
acc_nb
f1_nb

######################################################
######################################################
# CROSS-VALIDATION: k-fold=2 for SVM 
######################################################
######################################################
# STEP 1B: divide the training-set randomly according to the number of k

N <- nrow(train ) # we need to work on the data frame "train" rather than on the corpus this time
k <- 2 # Number of desired folds
# Generate indices of holdout observations
set.seed(123)
holdout <- split(sample(1:N), 1:k)
str(holdout)
# Check that each observation appears exactly once in the holdout object:
holdout %>% unlist() %>% length() == N

# training set with all the data but the ones included in the sampled train 1 can be called as:
# head(train [-holdout$`1`,])
# training set with only the data included in the sampled train 1 can be called as:
# head(train[holdout$`1`,])

#######################################
# K=1 (train 1) SVM (we use as a training-set all the documents NOT in holdout(1), 
# to predict the documents of the test-set, i.e., the documents in holdout(1) )
#######################################

system.time(SVM <- svm(y= as.factor(Dfm_train[-holdout$`1`,]@docvars$Sentiment) ,x=train[-holdout$`1`,], 
kernel='linear', cost = 1))
predictSVM1 <- predict(SVM, newdata=train[holdout$`1`,])
class_table1 <- table("Predictions"= predictSVM1, "Actual"=Dfm_train[holdout$`1`,]@docvars$Sentiment)
class_table1

#######################################
# K=2 (train 2) SVM (we use as a training-set all the documents NOT in holdout(2), 
# to predict the documents of the test-set, i.e., the documents in holdout(2) )
#######################################

set.seed(123)
system.time(SVM <- svm(y= as.factor(Dfm_train[holdout$`1`,]@docvars$Sentiment) ,x=train[holdout$`1`,], 
kernel='linear', cost = 1))
predictSVM2 <- predict(SVM, newdata=train[-holdout$`1`,])
class_table2 <- table("Predictions"= predictSVM2, "Actual"=Dfm_train[-holdout$`1`,]@docvars$Sentiment)
class_table2

#######################################
# # Estimating CV performance metrics for SVM
#######################################
conf.mat_svm1 <- confusionMatrix(class_table1, mode = "everything")
conf.mat_svm2 <- confusionMatrix(class_table2, mode = "everything")

res1_svm <- as.data.frame(conf.mat_svm1 $byClass)
res2_svm <- as.data.frame(conf.mat_svm2 $byClass)

res_accuracy <- data.frame(
   accuracy1=conf.mat_svm1$overall[1],
  accuracy2=conf.mat_svm2$overall[1]
)

res_precision <- data.frame(
  precision1=res1_svm$Precision,
    precision2=res2_svm$Precision
)

res_recall <- data.frame(
   recall1=res1_svm$Recall,
  recall2=res2_svm$Recall
)

res_f1 <- data.frame(
   f1_1=res1_svm$F1,
  f1_2=res2_svm$F1
)

rownames(res_precision ) <- rownames(res1_svm)
rownames(res_recall) <- rownames(res1_svm)
rownames(res_f1) <- rownames(res1_svm)

acc_svm <- rowMeans(res_accuracy )
f1_svm <- mean(rowMeans(res_f1 ))

acc_svm
f1_svm

######################################################
######################################################
# CROSS-VALIDATION: k-fold=2 for RF 
######################################################
######################################################

# replicate STEP 1B if you have not still done it; otherwise, skyp STEP 1B

#######################################
# K=1 (train 1) RF (we use as a training-set all the documents NOT in holdout(1), 
# to predict the documents of the test-set, i.e., the documents in holdout(1) )
#######################################

set.seed(123)
system.time(RF<- randomForest(y= as.factor(Dfm_train[-holdout$`1`,]@docvars$Sentiment) ,x=train[-holdout$`1`,], 
 ntree=100, do.trace=TRUE))
predictRF1 <- predict(RF, newdata=train[holdout$`1`,])
class_table1 <- table("Predictions"= predictRF1 , "Actual"=Dfm_train[holdout$`1`,]@docvars$Sentiment)
class_table1

#######################################
# K=2 (train 2) RF (we use as a training-set all the documents NOT in holdout(2), 
# to predict the documents of the test-set, i.e., the documents in holdout(2) )
#######################################

set.seed(123)
system.time(RF2<- randomForest(y= as.factor(Dfm_train[holdout$`1`,]@docvars$Sentiment) ,x=train[holdout$`1`,], 
 ntree=100, do.trace=TRUE))
predictRF2 <- predict(RF2, newdata=train[-holdout$`1`,])
class_table2 <- table("Predictions"= predictRF2, "Actual"=Dfm_train[-holdout$`1`,]@docvars$Sentiment)
class_table2

#######################################
# # Estimating CV performance metrics for RF
#######################################

conf.mat_rf1 <- confusionMatrix(class_table1, mode = "everything")
conf.mat_rf2 <- confusionMatrix(class_table2, mode = "everything")

res1_rf <- as.data.frame(conf.mat_rf1 $byClass)
res2_rf <- as.data.frame(conf.mat_rf2 $byClass)

res_accuracy <- data.frame(
   accuracy1=conf.mat_rf1$overall[1],
  accuracy2=conf.mat_rf2$overall[1]
)

res_precision <- data.frame(
  precision1=res1_rf$Precision,
    precision2=res2_rf$Precision
)

res_recall <- data.frame(
   recall1=res1_rf$Recall,
  recall2=res2_rf$Recall
)

res_f1 <- data.frame(
   f1_1=res1_rf$F1,
  f1_2=res2_rf$F1
)

rownames(res_precision ) <- rownames(res1_rf)
rownames(res_recall) <- rownames(res1_rf)
rownames(res_f1) <- rownames(res1_rf)

acc_rf <- rowMeans(res_accuracy )
f1_rf <- mean(rowMeans(res_f1 ))

acc_rf
f1_rf

######################################################
# let's compare the results we got!
######################################################

acc_nb
acc_svm
acc_rf

f1_nb
f1_svm
f1_rf

######################################################
######################################################
# OF COURSE k should be always higher than 2! Usually is 5 or 10...
# Replicating the analysis in this case could be very boring, isn't it? And then?!?
######################################################
######################################################