rm(list=ls(all=TRUE))
setwd("C:/Users/luigi/Dropbox/TOPIC MODEL")
getwd()
library(quanteda)
library(readtext)
library(caTools)
library(caret)
library(car)
library(xgboost)
library(reshape2)
library(gridExtra)
library(cvTools)
library(Ckmeans.1d.dp)
library(dplyr)
#####################################################
# FIRST STEP: let's prepare the training-set
#####################################################
# let's focus on MOVIE reviews (either positive or negative)
x <- read.csv("train_review2.csv", stringsAsFactors=FALSE)
str(x)
nrow(x)
table(x$Sentiment)
prop.table(table(x$Sentiment))
myCorpusTwitterTrain <- corpus(x)
tok2 <- tokens(myCorpusTwitterTrain, remove_punct = TRUE, remove_numbers=TRUE, remove_symbols = TRUE, split_hyphens = TRUE, remove_separators = TRUE)
tok2 <- tokens_remove(tok2, stopwords("en"))
tok2 <- tokens_wordstem(tok2)
Dfm_train <- dfm(tok2)
# Let's trim the dfm in order to keep only tokens that appear in at least 5% of the reviews
Dfm_train <- dfm_trim(Dfm_train, min_docfreq = 0.05, verbose = TRUE, docfreq_type = "prop")
topfeatures(Dfm_train, 20) # 20 top words
train <- as.matrix(Dfm_train)
######################################################
######################################################
# Now let's explore (let's tune!) the hyperparameters for the XGB
######################################################
######################################################
# As we discussed, there are several hyperparameters involved in an XGB. Among the most important ones:
# "eta" (learning rate): controls how quickly the algorithm proceeds down the gradient descent.
# Smaller values reduce the chance of overfitting but also increase the time needed to find the optimal fit. Default=0.3
# "nrounds" (number of trees): GBMs often require many trees; however, unlike random forests, GBMs can overfit,
# so the goal is to find the optimal number of trees that minimizes the loss function of interest via cross-validation.
# A good number of trees to begin with: 1000
# "max_depth" (maximum depth of a tree): the number of splits in each tree, which controls the complexity of the boosted
# ensemble. Often "max_depth"=1 works well, in which case each tree is a stump consisting of a single split.
# More commonly, "max_depth" is greater than 1 but almost always <10. Default=6
# "subsample": controls the fraction of the available training observations used to grow each tree. Using less than
# 100% of the training observations means you are implementing stochastic gradient descent! For example, setting it
# to 0.5 means that xgboost randomly samples half of the data instances to grow each tree. This prevents overfitting,
# makes computation faster (less data to analyse), and keeps the algorithm from getting stuck in a local minimum
# or plateau of the loss function gradient. Default=1
# Other possible hyperparameters to consider (see the illustrative grid sketched right below):
# "colsample_bytree" (subsample ratio of columns when constructing each tree): default=1
# "min_child_weight": minimum sum of instance weight needed in a child. If the tree partition step results in a leaf node
# with a sum of instance weight less than min_child_weight, the building process will give up further partitioning.
# The larger it is, the more conservative the algorithm will be. Default=1
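# As a sketch of how all of these could enter the search together (the specific values below
# are illustrative assumptions, not recommendations), a fuller grid might look like this.
# Beware: 3 x 2 x 2 x 2 x 2 = 48 combinations, each one cross-validated, so search time grows fast
hyper_grid_full <- expand.grid(
  eta = c(0.05, 0.1, 0.3),
  max_depth = c(1, 3),
  subsample = c(0.5, 1),
  colsample_bytree = c(0.5, 1),
  min_child_weight = c(1, 5)
)
nrow(hyper_grid_full) # 48 combinations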
# REMEMBER: your DV should always be a numeric variable starting from 0. If it is not (as here), you need to create such a variable
x <- as.factor(Dfm_train@docvars$Sentiment)
x
table(x)
x <- as.numeric(x)
x
x[ x ==1 ] <-0
x[ x ==2 ] <-1
table(x)
Dfm_train@docvars$code <- x
str(Dfm_train)
table(Dfm_train@docvars$code)
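# A more compact alternative yielding the same 0/1 coding (factor levels are sorted alphabetically,
# so always double-check the mapping with table() as above):
# Dfm_train@docvars$code <- as.numeric(as.factor(Dfm_train@docvars$Sentiment)) - 1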
train <- as.matrix(Dfm_train) # dense matrix
# create hyperparameter grid: you can add as many values and hyperparameters as you want. Here just two hyperparameters:
# eta (0.3, 1 and 2) and max_depth (1 and 2)
hyper_grid <- expand.grid(
  eta = c(0.3, 1, 2),
  max_depth = c(1, 2),
  min_error = 0, # a place to dump results
  accuracy = 0 # a place to dump results
)
nrow(hyper_grid)
hyper_grid
# A further nice feature is the early_stopping_rounds option. This tells the function to stop running if the
# cross-validated error does not improve for n consecutive trees.
# For example, the model below will stop if we see no improvement for 200 consecutive trees.
# This feature will help us speed up the tuning process in the next section.
# grid search
for(i in 1:nrow(hyper_grid)) {
  # create parameter list
  params <- list(
    eta = hyper_grid$eta[i],
    max_depth = hyper_grid$max_depth[i]
  )
  # reproducibility
  set.seed(123)
  # train model
  xgb.tune <- xgb.cv(
    params = params,
    data = train,
    label = Dfm_train@docvars$code,
    nrounds = 500,
    nfold = 10,
    objective = "binary:logistic", # for binary
    verbose = 1, # not silent
    nthread = 4,
    metrics = "error", # binary classification error rate
    early_stopping_rounds = 200 # stop if no improvement for 200 consecutive trees
  )
  # store the minimum cross-validated test error in the grid
  hyper_grid$min_error[i] <- min(xgb.tune$evaluation_log$test_error_mean)
  hyper_grid$accuracy[i] <- 1 - hyper_grid$min_error[i]
}
head(arrange(hyper_grid, min_error), 10)
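# Once the search is over, a natural next step is to refit a single model on the winning combination.
# A minimal sketch (assumption: we keep nrounds = 500 as above; ideally you would fix nrounds at the
# best iteration identified by xgb.cv for that combination):
best <- hyper_grid[which.min(hyper_grid$min_error), ]
best
set.seed(123)
xgb.fit <- xgboost(
  data = train,
  label = Dfm_train@docvars$code,
  eta = best$eta,
  max_depth = best$max_depth,
  nrounds = 500,
  objective = "binary:logistic",
  nthread = 4,
  verbose = 0
)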
### let's add another hyperparameter such as subsample
hyper_grid <- expand.grid(
  eta = c(0.3, 1, 2),
  max_depth = c(1, 2),
  subsample = c(.5, 1), # that's new!
  min_error = 0, # a place to dump results
  accuracy = 0 # a place to dump results
)
nrow(hyper_grid)
hyper_grid
# grid search
for(i in 1:nrow(hyper_grid)) {
  # create parameter list
  params <- list(
    eta = hyper_grid$eta[i],
    max_depth = hyper_grid$max_depth[i],
    subsample = hyper_grid$subsample[i] # that's new!
  )
  # reproducibility
  set.seed(123)
  # train model
  xgb.tune2 <- xgb.cv(
    params = params,
    data = train,
    label = Dfm_train@docvars$code,
    nrounds = 500,
    nfold = 10,
    objective = "binary:logistic", # for binary
    verbose = 1, # not silent
    nthread = 4,
    metrics = "error", # binary classification error rate
    early_stopping_rounds = 200 # stop if no improvement for 200 consecutive trees
  )
  # store the minimum cross-validated test error in the grid
  hyper_grid$min_error[i] <- min(xgb.tune2$evaluation_log$test_error_mean)
  hyper_grid$accuracy[i] <- 1 - hyper_grid$min_error[i]
}
head(arrange(hyper_grid, min_error), 10)
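# A small extension (sketch): with early stopping, xgb.cv also records the best iteration, so you
# could keep track of the optimal number of trees per combination by adding a column to the grid
# and one line inside the loop:
# hyper_grid$trees <- 0                            # before the loop
# hyper_grid$trees[i] <- xgb.tune2$best_iteration  # inside the loop, after xgb.cv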
#####################################################
# let's prepare the training-set with 3 categories (this script works fine for any number of categories > 2)
#####################################################
######################################################
######################################################
# what are the main changes when running an XGB?
######################################################
######################################################
uk_train <- read.csv("uk_train.csv")
str(uk_train)
myCorpusTwitterTrain <- corpus(uk_train)
tok2 <- tokens(myCorpusTwitterTrain, remove_punct = TRUE, remove_numbers=TRUE, remove_symbols = TRUE, split_hyphens = TRUE, remove_separators = TRUE)
tok2 <- tokens_remove(tok2, stopwords("en"))
tok2 <- tokens_wordstem(tok2)
Dfm_train <- dfm(tok2)
# Let's trim the dfm in order to keep only tokens that appear in 2 or more tweets (tweets are very short texts...)
Dfm_train <- dfm_trim(Dfm_train, min_docfreq = 2, verbose = TRUE)
topfeatures(Dfm_train, 20) # 20 top words
train <- as.matrix(Dfm_train)
# our classes
table(Dfm_train@docvars$Sentiment)
# our benchmark: the share of the modal category, i.e. accuracy .530 if we always predicted the most frequent class
prop.table(table(Dfm_train@docvars$Sentiment))
# REMEMBER: you always need to add the number of classes to the call when the DV is a multi-categorical variable, like now!
numberOfClasses <- length(unique(Dfm_train@docvars$Sentiment))
numberOfClasses
# your DV should always be a numeric variable starting from 0. If it is not (as here), you need to create such a variable
x <- as.factor(Dfm_train@docvars$Sentiment)
x
table(x)
x <- as.numeric(x)
table(x)
x[ x ==1 ] <-0
x[ x ==2 ] <-1
x[ x ==3 ] <-2 # CHANGE!
table(x)
Dfm_train@docvars$code <- x
str(Dfm_train)
table(Dfm_train@docvars$code)
table(Dfm_train@docvars$Sentiment)
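# quick sanity check: each original label should map to exactly one numeric code
table(Dfm_train@docvars$Sentiment, Dfm_train@docvars$code)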
# create hyperparameter grid: you can add as many values and hyperparameters as you want. Here just two hyperparameters:
# eta (0.3, 1 and 2) and max_depth (1 and 2)
hyper_grid <- expand.grid(
  eta = c(0.3, 1, 2),
  max_depth = c(1, 2),
  min_error = 0, # a place to dump results
  accuracy = 0 # a place to dump results
)
nrow(hyper_grid) # 6 combinations from crossing eta with max_depth
hyper_grid
# grid search
for(i in 1:nrow(hyper_grid)) {
  # create parameter list
  params <- list(
    eta = hyper_grid$eta[i],
    max_depth = hyper_grid$max_depth[i]
  )
  # reproducibility
  set.seed(123)
  # train model
  xgb.tune <- xgb.cv(
    params = params,
    data = train,
    label = Dfm_train@docvars$code,
    nrounds = 500,
    nfold = 10,
    objective = "multi:softmax", # for multi-category # CHANGE!
    num_class = numberOfClasses, # add this line! # CHANGE!
    verbose = 1, # not silent
    metrics = "merror", # exact matching error, used to evaluate multi-class classification # CHANGE!
    nthread = 4,
    early_stopping_rounds = 150 # stop if no improvement for 150 consecutive trees
  )
  # store the minimum cross-validated test error in the grid
  hyper_grid$min_error[i] <- min(xgb.tune$evaluation_log$test_merror_mean) # CHANGE: "test_merror_mean", not "test_error_mean"
  hyper_grid$accuracy[i] <- 1 - hyper_grid$min_error[i]
}
head(arrange(hyper_grid, min_error), 10)
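# As in the binary case, you can refit a single model on the winning combination (a minimal sketch;
# note that num_class must be passed again):
best <- hyper_grid[which.min(hyper_grid$min_error), ]
set.seed(123)
xgb.fit_multi <- xgboost(
  data = train,
  label = Dfm_train@docvars$code,
  eta = best$eta,
  max_depth = best$max_depth,
  nrounds = 500,
  objective = "multi:softmax",
  num_class = numberOfClasses,
  nthread = 4,
  verbose = 0
)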
### also remember: if you use objective = "multi:softmax" you can directly predict classes, with no need to round
### probabilities, given that "multi:softmax" returns the predicted class (not predicted probabilities), contrary
### to objective = "binary:logistic" (see the sketch below)
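# A minimal sketch of that difference, using the xgb.fit / xgb.fit_multi objects sketched above
# (test_matrix is hypothetical here: a dfm built from your test-set, converted with as.matrix()
# and matched to the training features, e.g. via quanteda's dfm_match()):
# pred_prob  <- predict(xgb.fit, test_matrix)        # binary:logistic -> probabilities in [0,1]
# pred_class <- ifelse(pred_prob > 0.5, 1, 0)        # you have to round/threshold them yourself
# pred_multi <- predict(xgb.fit_multi, test_matrix)  # multi:softmax -> predicted classes (0, 1, 2)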