rm(list=ls(all=TRUE))
getwd()
setwd("C:/Users/mw/Dropbox (VOICES)/TOPIC MODEL")
getwd()

library(quanteda)
library(readtext)
library(caTools)
library(e1071)
library(randomForest)
library(caret)
library(stringr)
library(cvTools)
library(magicfor)
library(car)
library(reshape2)
library(gridExtra)

#### Let's repeat all the steps discussed last time to build the training- and test-set
#### for NB, RF and SVM, with a new step called FIFTH STEP/B

################################################################
# FIRST STEP: create the DFM for the training-set
################################################################

x11 <- read.csv("trainTrump.csv", stringsAsFactors=FALSE)
# keep only alphanumeric characters in the texts
x11$text <- str_replace_all(x11$text, "[^[:alnum:]]", " ")
myCorpusTwitterTrain <- corpus(x11)
Dfm_train <- dfm(myCorpusTwitterTrain,
                 remove = c(stopwords("english"), "amp", "rt", "tco", "co", "u", "t",
                            "s", "ed", "https", "â", "com", "ly"),
                 remove_punct = TRUE, remove_numbers = TRUE, tolower = TRUE,
                 remove_symbols = TRUE, remove_twitter = TRUE,
                 remove_separators = TRUE, remove_url = TRUE)
# drop features appearing in fewer than 2 documents
Dfm_train <- dfm_trim(Dfm_train, min_docfreq = 2, verbose = TRUE)

################################################################
# SECOND STEP: create the DFM for the test-set
################################################################

x10 <- read.csv("testTrump.csv", stringsAsFactors=FALSE)
x10$text <- str_replace_all(x10$text, "[^[:alnum:]]", " ")
myCorpusTwitterTest <- corpus(x10)
Dfm_test <- dfm(myCorpusTwitterTest,
                remove = c(stopwords("english"), "amp", "rt", "tco", "co", "u", "t",
                           "s", "ed", "https", "â", "com", "ly"),
                remove_punct = TRUE, remove_numbers = TRUE, tolower = TRUE,
                remove_symbols = TRUE, remove_twitter = TRUE,
                remove_separators = TRUE, remove_url = TRUE)
Dfm_test <- dfm_trim(Dfm_test, min_docfreq = 2, verbose = TRUE)

################################################################
# THIRD STEP: make the features identical between the train- and test-set
# by passing Dfm_train to dfm_match() as a pattern
################################################################

test_dfm <- dfm_match(Dfm_test, features = featnames(Dfm_train))

################################################################
# FOURTH STEP/B: transform both DFMs (train and test) into data frames
################################################################

train <- as.data.frame(as.matrix(Dfm_train))
test  <- as.data.frame(as.matrix(test_dfm))
# make sure all column names are syntactically valid R names
colnames(train) <- make.names(colnames(train))
colnames(test)  <- make.names(colnames(test))

################################################################
# FIFTH STEP/B: A NEW STEP!!! Add back to the training set the values of Sentiment
################################################################

train$Sentiment <- as.factor(Dfm_train@docvars$Sentiment)
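# A minimal sanity check (an addition, not part of the original five steps):
# after dfm_match(), the test set should share exactly the features of the
# training set, and train should have just one extra column, Sentiment.
stopifnot(ncol(train) == ncol(test) + 1)
stopifnot(all(colnames(test) %in% colnames(train)))
table(train$Sentiment)   # class distribution of the labels we just added back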
######################################################
######################################################
# Let's start to explore the hyperparameters for the SVM
######################################################
######################################################

# Let's stick with the linear kernel. We can investigate different combinations of values
# for cost (default: C=1) as well as for epsilon, the other main hyperparameter of a linear
# kernel (the epsilon in the insensitive-loss function; default: 0.1).
# For all other kernels (radial, polynomial) you also have gamma (default: 1/(data dimension)) -
# in our case: 1/length(train).
# For the polynomial kernel you also have degree (default: 3) and coef0 (default: 0).

set.seed(123)
# Note that here our specification of the SVM is a bit different from what we did in the previous examples
system.time(fitSVM <- tune(svm, Sentiment ~ ., data = train, kernel = "linear",
                           ranges = list(cost = c(1, 2, 10), epsilon = c(0.1, 3))))
plot(fitSVM)
summary(fitSVM)
# The best model here has cost=1 and epsilon=0.1.
# Of course, changing the grid of values searched for cost and epsilon (for example,
# also trying values of cost > 10) can change the final results.

# Let's explore a radial kernel.
# To make things faster in the Lab, we let only gamma vary, while fixing cost and epsilon
# to the values we got for the linear kernel.
set.seed(123)
system.time(fitSVM_radial <- tune(svm, Sentiment ~ ., data = train, kernel = "radial",
                                  ranges = list(cost = c(1), gamma = c(0.001, 0.01, 0.1, 1),
                                                epsilon = c(0.1))))
summary(fitSVM_radial)
# The best model here has gamma=1 when cost=1 and epsilon=0.1

######################################################
######################################################
# Now let's explore the hyperparameters for the RF
######################################################
######################################################

# The main default hyperparameters in the case of a RF are the following:
# ntree (number of trees to grow; default=500);
# mtry (number of variables randomly sampled as candidates at each split; the default is sqrt(p),
# where p is the number of variables in x - in our case p=length(train)=939, therefore
# sqrt(length(train))=30.64);
# nodesize (minimum size of terminal nodes; setting this number larger causes smaller trees
# to be grown, and thus takes less time; default: nodesize=1).
# Let's fix ntree=100 and nodesize=1 to save time during the Lab class!

set.seed(123)
# Note that here our specification of the RF is a bit different from what we did in the previous examples
system.time(fitRF <- tune.randomForest(Sentiment ~ ., data = train,
                                       mtry = c(28, 30), nodesize = c(1), ntree = c(100)))
# The best model here has mtry=30 when nodesize=1 and ntree=100
summary(fitRF)
fitRF$best.parameters

######################################################
######################################################
# Once you have estimated the best hyperparameter settings for both RF and SVM, you can
# replicate the K-fold analysis to see which is now the best algorithm among NB, RF and SVM
######################################################
######################################################
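# A minimal sketch of that K-fold comparison (an addition, not the original Lab code).
# It assumes the objects created above (train, fitSVM, fitRF) and uses 5 folds and
# accuracy as the metric purely to keep the Lab fast; NB keeps its e1071 defaults,
# since we did not tune it.
set.seed(123)
folds <- createFolds(train$Sentiment, k = 5)   # stratified folds from caret
acc <- matrix(NA, nrow = length(folds), ncol = 3,
              dimnames = list(NULL, c("NB", "RF", "SVM")))
for (i in seq_along(folds)) {
  idx <- folds[[i]]
  tr  <- train[-idx, ]   # k-1 folds for training
  te  <- train[idx, ]    # held-out fold for validation
  predNB  <- predict(naiveBayes(Sentiment ~ ., data = tr), te)
  predRF  <- predict(randomForest(Sentiment ~ ., data = tr,
                                  mtry = fitRF$best.parameters$mtry,
                                  nodesize = 1, ntree = 100), te)
  predSVM <- predict(svm(Sentiment ~ ., data = tr, kernel = "linear",
                         cost = fitSVM$best.parameters$cost,
                         epsilon = fitSVM$best.parameters$epsilon), te)
  acc[i, ] <- c(mean(predNB  == te$Sentiment),
                mean(predRF  == te$Sentiment),
                mean(predSVM == te$Sentiment))
}
colMeans(acc)   # average accuracy per algorithm across the 5 folds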