rm(list=ls(all=TRUE))
getwd()
setwd("C:/Users/mw/Dropbox (VOICES)/TOPIC MODEL")
getwd()

library(quanteda)
library(readtext)
library(caTools)
library(e1071)
library(randomForest)
library(caret)
library(stringr)
library(cvTools)
library(magicfor)
library(car)
library(reshape2)
library(gridExtra)

#### Let's repeat all the steps discussed last time to build the training- and test-set
#### for NB, RF and SVM, with a new step called FIFTH STEP/B

################################################################
# FIRST STEP: create the DFM for the training-set
################################################################

x11 <- read.csv("trainTrump.csv", stringsAsFactors=FALSE)
# keep only alphanumeric characters in the texts
x11$text <- str_replace_all(x11$text, "[^[:alnum:]]", " ")
myCorpusTwitterTrain <- corpus(x11)
Dfm_train <- dfm(myCorpusTwitterTrain,
                 remove = c(stopwords("english"), "amp", "rt", "tco", "co", "u", "t",
                            "s", "ed", "https", "â", "com", "ly"),
                 remove_punct = TRUE, remove_numbers = TRUE, tolower = TRUE,
                 remove_symbols = TRUE, remove_twitter = TRUE,
                 remove_separators = TRUE, remove_url = TRUE)
# drop features appearing in fewer than 2 documents
Dfm_train <- dfm_trim(Dfm_train, min_docfreq = 2, verbose = TRUE)

################################################################
# SECOND STEP: create the DFM for the test-set
################################################################

x10 <- read.csv("testTrump.csv", stringsAsFactors=FALSE)
x10$text <- str_replace_all(x10$text, "[^[:alnum:]]", " ")
myCorpusTwitterTest <- corpus(x10)
Dfm_test <- dfm(myCorpusTwitterTest,
                remove = c(stopwords("english"), "amp", "rt", "tco", "co", "u", "t",
                           "s", "ed", "https", "â", "com", "ly"),
                remove_punct = TRUE, remove_numbers = TRUE, tolower = TRUE,
                remove_symbols = TRUE, remove_twitter = TRUE,
                remove_separators = TRUE, remove_url = TRUE)
Dfm_test <- dfm_trim(Dfm_test, min_docfreq = 2, verbose = TRUE)

################################################################
# THIRD STEP: make the features identical between the train- and test-set
# by passing Dfm_train to dfm_match() as a pattern
################################################################

test_dfm <- dfm_match(Dfm_test, features = featnames(Dfm_train))

################################################################
# FOURTH STEP/B: transform both DFMs (train and test) into data frames
################################################################

train <- as.data.frame(as.matrix(Dfm_train))
test  <- as.data.frame(as.matrix(test_dfm))
# make sure all column names are syntactically valid R names
colnames(train) <- make.names(colnames(train))
colnames(test)  <- make.names(colnames(test))

################################################################
# FIFTH STEP/B: A NEW STEP!!! Add back to the training set the values of Sentiment
################################################################

train$Sentiment <- as.factor(Dfm_train@docvars$Sentiment)
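# A minimal sanity check (an addition, not part of the original five steps):
# after dfm_match(), the test set should share exactly the features of the
# training set, and train should have just one extra column, Sentiment.
stopifnot(ncol(train) == ncol(test) + 1)
stopifnot(all(colnames(test) %in% colnames(train)))
table(train$Sentiment)   # class distribution of the labels we just added back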
######################################################
######################################################
# Let's start to explore the hyperparameters for the SVM
######################################################
######################################################

# Let's stick with the linear kernel. We can investigate different combinations of values
# for cost (default: C=1) as well as for epsilon, the other main hyperparameter of a linear
# kernel (the epsilon in the insensitive-loss function; default: 0.1).
# For all other kernels (radial, polynomial) you also have gamma (default: 1/(data dimension)) -
# in our case: 1/length(train).
# For the polynomial kernel you also have degree (default: 3) and coef0 (default: 0).

set.seed(123)
# Note that here our specification of the SVM is a bit different from what we did in the previous examples
system.time(fitSVM <- tune(svm, Sentiment ~ ., data = train, kernel = "linear",
                           ranges = list(cost = c(1, 2, 10), epsilon = c(0.1, 3))))
plot(fitSVM)
summary(fitSVM)
# The best model here has cost=1 and epsilon=0.1.
# Of course, changing the grid of values searched for cost and epsilon (for example,
# also trying values of cost > 10) can change the final results.

# Let's explore a radial kernel.
# To make things faster in the Lab, we let only gamma vary, while fixing cost and epsilon
# to the values we got for the linear kernel.
set.seed(123)
system.time(fitSVM_radial <- tune(svm, Sentiment ~ ., data = train, kernel = "radial",
                                  ranges = list(cost = c(1), gamma = c(0.001, 0.01, 0.1, 1),
                                                epsilon = c(0.1))))
summary(fitSVM_radial)
# The best model here has gamma=1 when cost=1 and epsilon=0.1

######################################################
######################################################
# Now let's explore the hyperparameters for the RF
######################################################
######################################################

# The main default hyperparameters in the case of a RF are the following:
# ntree (number of trees to grow; default=500);
# mtry (number of variables randomly sampled as candidates at each split; the default is sqrt(p),
# where p is the number of variables in x - in our case p=length(train)=939, therefore
# sqrt(length(train))=30.64);
# nodesize (minimum size of terminal nodes; setting this number larger causes smaller trees
# to be grown, and thus takes less time; default: nodesize=1).
# Let's fix ntree=100 and nodesize=1 to save time during the Lab class!

set.seed(123)
# Note that here our specification of the RF is a bit different from what we did in the previous examples
system.time(fitRF <- tune.randomForest(Sentiment ~ ., data = train,
                                       mtry = c(28, 30), nodesize = c(1), ntree = c(100)))
# The best model here has mtry=30 when nodesize=1 and ntree=100
summary(fitRF)
fitRF$best.parameters

######################################################
######################################################
# Once you have estimated the best hyperparameter settings for both RF and SVM, you can
# replicate the K-fold analysis to see which is now the best algorithm among NB, RF and SVM
######################################################
######################################################
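# A minimal sketch of that K-fold comparison (an addition, not the original Lab code).
# It assumes the objects created above (train, fitSVM, fitRF) and uses 5 folds and
# accuracy as the metric purely to keep the Lab fast; NB keeps its e1071 defaults,
# since we did not tune it.
set.seed(123)
folds <- createFolds(train$Sentiment, k = 5)   # stratified folds from caret
acc <- matrix(NA, nrow = length(folds), ncol = 3,
              dimnames = list(NULL, c("NB", "RF", "SVM")))
for (i in seq_along(folds)) {
  idx <- folds[[i]]
  tr  <- train[-idx, ]   # k-1 folds for training
  te  <- train[idx, ]    # held-out fold for validation
  predNB  <- predict(naiveBayes(Sentiment ~ ., data = tr), te)
  predRF  <- predict(randomForest(Sentiment ~ ., data = tr,
                                  mtry = fitRF$best.parameters$mtry,
                                  nodesize = 1, ntree = 100), te)
  predSVM <- predict(svm(Sentiment ~ ., data = tr, kernel = "linear",
                         cost = fitSVM$best.parameters$cost,
                         epsilon = fitSVM$best.parameters$epsilon), te)
  acc[i, ] <- c(mean(predNB  == te$Sentiment),
                mean(predRF  == te$Sentiment),
                mean(predSVM == te$Sentiment))
}
colMeans(acc)   # average accuracy per algorithm across the 5 folds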