####################################################
# classifiers with a general example
####################################################
rm(list=ls(all=TRUE))
getwd()
setwd("C:/Users/mw/Dropbox (VOICES)/TOPIC MODEL")
getwd()

library(e1071)
library(caTools)
library(randomForest)
library(caret)
library(quanteda)
library(readtext)

data(iris)
str(iris)
# This famous iris data set gives the measurements (in centimeters) of sepal length and width and
# petal length and width for 50 flowers from each of 3 species of iris.
# The species are Iris setosa, versicolor, and virginica.

# let's run a v-fold cross-validation (v=2)
set.seed(101) # set the seed so that the same sample can be reproduced in the future
# now select 50% of the data as a sample from the total 'n' rows of the data
sample <- sample.int(n = nrow(iris), size = floor(.50*nrow(iris)), replace = F)
train <- iris[sample, ]
test <- iris[-sample, ]
# I consider train as V1 and test as V2
nrow(train)
nrow(test)

####################################################
# An example with the Naïve Bayes classifier
####################################################
# first let's use V1 to train the algorithm and then predict V2
set.seed(1234)
system.time(NB <- naiveBayes(Species ~ ., data=train))
predictNB <- predict(NB, newdata=test)
table(predictNB)

# Let's estimate the "confusion matrix"
table("Predictions"= predictNB, "Actual"=test$Species)
# Accuracy (number of documents on the diagonal / total number of documents)
(26+15+28) / nrow(test)
x <- as.matrix(table("Predictions"= predictNB, "Actual"=test$Species))
sum(diag(x)) / nrow(test)
acc <- sum(diag(x)) / sum(x)
acc

# Let's use the confusionMatrix command (same results as above for accuracy!)
conf.mat <- confusionMatrix(predictNB, test$Species)
conf.mat
# Precision for versicolor (=Pos Pred Value; True Positives/Predicted Positives, i.e. TP/(TP+FP))
15/18
# Recall for virginica (=Sensitivity; True Positives/Actual Positives, i.e. TP/(TP+FN))
28/31

# Now let's reverse the cross-validation procedure and use test (V2) to train the algorithm and then predict train (V1)!
set.seed(1234)
NB2 <- naiveBayes(Species ~ ., data=test)
system.time(predictNB2 <- predict(NB2, newdata=train))
# Let's estimate the "confusion matrix"
table("Predictions"= predictNB2, "Actual"=train$Species)
x_alt <- as.matrix(table("Predictions"= predictNB2, "Actual"=train$Species))
sum(diag(x_alt)) / sum(x_alt)
acc_alt <- sum(diag(x_alt)) / sum(x_alt)
# Let's use the confusionMatrix command
conf.mat_alt <- confusionMatrix(predictNB2, train$Species)
conf.mat_alt

# K-fold cross-validation (with k=2)
# Accuracy (average across the two cross-validation runs, i.e., V1 vs. V2 and V2 vs. V1)
(acc_alt+acc)/2
# Precision (=Pos Pred Value) for versicolor
conf.mat
conf.mat_alt
(0.8333+1)/2
# Recall (=Sensitivity) for virginica
(0.9032+1)/2

####### REMEMBER: when you have a (relatively) small training set, doing cross-validation is even more important,
####### because it allows you to check whether your results are driven by the coding (errors? bad coding? outliers?)
####### of a specific subset of documents.
####### This shows up when the accuracy of V1 vs. V2 is dramatically different from the one you get from V2 vs. V1!
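
# A minimal sketch: instead of averaging precision and recall by hand as above, we can pull
# "Pos Pred Value" (precision) and "Sensitivity" (recall) for every class from the two caret
# objects created above (conf.mat and conf.mat_alt) and average them across the two folds.
precision_avg <- (conf.mat$byClass[, "Pos Pred Value"] + conf.mat_alt$byClass[, "Pos Pred Value"]) / 2
recall_avg    <- (conf.mat$byClass[, "Sensitivity"]    + conf.mat_alt$byClass[, "Sensitivity"]) / 2
round(cbind(precision_avg, recall_avg), 4)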
####################################################
# An example with Random Forest (we just use V1 to predict V2 here to save space...)
####################################################
set.seed(123)
# by default we build 500 trees
RF <- randomForest(Species ~ ., data=train, type="classification")
system.time(predictRF <- predict(RF, newdata=test))
table(predictRF)
table(test$Species, predictRF)
# Accuracy (proportion of correctly classified documents)
x2 <- as.matrix(table("Predictions"= predictRF, "Actual"=test$Species))
sum(diag(x2)) / sum(x2)
# Let's use the confusionMatrix command (same results as above for accuracy!)
conf.mat2 <- confusionMatrix(predictRF, test$Species)
conf.mat2

####################################################
# An example with SVM (we just use V1 to predict V2 here to save space...)
####################################################
set.seed(123)
SVM <- svm(Species ~ ., data=train, type="C-classification")
summary(SVM)
system.time(predictSVM <- predict(SVM, newdata=test))
table(predictSVM)
table(test$Species, predictSVM)
# Accuracy (proportion of correctly classified documents)
x3 <- as.matrix(table("Predictions"= predictSVM, "Actual"=test$Species))
sum(diag(x3)) / sum(x3)
# Let's use the confusionMatrix command (same results as above for accuracy!)
conf.mat3 <- confusionMatrix(predictSVM, test$Species)
conf.mat3

####### REMEMBER: you can also use cross-validation to compare the performance of different algorithms!
####### For example: among the three classifiers above, which one performed best according to
####### accuracy, precision and recall?

####################################################
# classifiers with texts
####################################################
# This is a sample of 1000 tweets written in English about Trump, published from 1.17.2018 to 1.19.2018
x10 <- read.csv("Trump_tweets2.csv", stringsAsFactors=FALSE)
str(x10)
myCorpusTwitter <- corpus(x10)

# let's suppose that the sentiment classification via data_dictionary_LSD2015 represents the "true"
# sentiment that a set of human coders would have found
trump3 <- dfm(myCorpusTwitter, remove = c(stopwords("english"), ("rt"), ("t.co")),
              remove_punct = TRUE, remove_numbers=TRUE, tolower = TRUE, remove_twitter = TRUE,
              remove_url = TRUE, dictionary = data_dictionary_LSD2015[1:2])
str(trump3)
df <- data.frame(trump3)
str(df)
head(df)
df$sentiment_diff <- df$positive - df$negative
str(df)
# we assume 0=neutral. Alternatively, we could have forced everything between -1 and +1 to be neutral
df$sentiment2[df$sentiment_diff==0] <- "Neutral"
df$sentiment2[df$sentiment_diff<0] <- "Negative"
df$sentiment2[df$sentiment_diff>0] <- "Positive"
str(df)
table(df$sentiment2)
df$sentiment2 <- as.factor(df$sentiment2)
str(df)
# add the sentiment to the original set of tweets
x10$sentiment <- df$sentiment2
str(x10)
table(x10$sentiment)

# removing all punctuation here as well (it's a mess with tweets!)
library(stringr)
x10$text <- str_replace_all(x10$text, "[^[:alnum:]]", " ")
kwic(x10$text, "Trump's")

# let's re-build the corpus (with the added sentiment variable) and the dfm
myCorpusTwitter <- corpus(x10)
myDfm3 <- dfm(myCorpusTwitter, remove = c(stopwords("english"), ("amp"), ("rt"), ("tco"), ("co"),
                                          ("u"), ("t"), ("s"), ("ed"), ("https")),
              remove_punct = TRUE, remove_numbers=TRUE, tolower = TRUE, remove_symbols=TRUE,
              remove_twitter = TRUE, remove_separators=TRUE, remove_url = TRUE)
topfeatures(myDfm3, 20) # 20 top words
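
# A quick sketch: before trimming, check how many features would survive the 5% document-frequency
# threshold used below. docfreq(), ndoc() and nfeat() are standard quanteda helpers.
df_counts <- docfreq(myDfm3)
sum(df_counts >= 0.05 * ndoc(myDfm3))   # features appearing in at least 5% of the tweets
nfeat(myDfm3)                           # total number of features before trimming, for comparison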
# Keep terms that appear in at least 5% of the tweets (tweets are very short texts...)
trim <- dfm_trim(myDfm3, min_docfreq= 0.05)
# transform the data into a data frame
data <- as.data.frame(as.matrix(trim))
str(data)
# this is important: randomForest can't handle column names that begin with a space, a comma, a number
# or certain punctuation marks. The command below adds a letter in front of a number (if you have any
# numbers left in the dfm). Highly suggested
colnames(data) <- make.names(colnames(data))
summary(x10$sentiment)
data$sentiment <- x10$sentiment
colnames(data)
summary(data$sentiment)

# Build a training and a test set. Given that we are assuming that the sentiment classification via
# data_dictionary_LSD2015 represents a kind of "human coding", we are in effect running a kind of
# v-fold cross-validation procedure (with v=2)!
set.seed(123) # set the seed so that the same sample can be reproduced in the future
# now select 60% of the data as a sample from the total 'n' rows of the data
sample <- sample.int(n = nrow(data), size = floor(.60*nrow(data)), replace = F)
train <- data[sample, ]
test <- data[-sample, ]
nrow(train)
nrow(test)

###########################################
# Try a random forest model
###########################################
set.seed(123)
system.time(RF <- randomForest(sentiment ~ ., data=train, type="classification"))
predictRF <- predict(RF, newdata=test)
table(predictRF)
table(test$sentiment, predictRF)
# Accuracy (proportion of correctly classified documents)
x2 <- as.matrix(table("Predictions"= predictRF, "Actual"=test$sentiment))
sum(diag(x2)) / nrow(test)
sum(diag(x2)) / sum(x2)
# Let's use the confusionMatrix command (same results as above for accuracy!)
conf.mat <- confusionMatrix(predictRF, test$sentiment)
conf.mat
# Accuracy (proportion of correctly classified documents): 0.698
# But look at Precision (=Pos Pred Value) for Negative and, even more, at
# Recall (=Sensitivity) for Neutral and Positive...

###########################################
# Try an SVM model
###########################################
# I would suggest going for a linear kernel when you have a large number of features (>1000, as in our
# case with >2800 words), because the data are then more likely to be linearly separable in a
# high-dimensional space
set.seed(123)
system.time(SV <- svm(sentiment ~ ., data=train, type = "C-classification", kernel='linear'))

# If you ever get this warning message (as is the case now!):
# "In svm.default(x, y, scale = scale, ..., na.action = na.action) :
#  Variable(s) 'XXX' and 'YYY' and 'WWW' constant. Cannot scale data"
# it means that the dfm of the training set contains some words that are always 0 across all texts.
# This is a problem because, by default, data in an SVM are scaled to zero mean and unit variance.
# The center and scale values are returned and used for later predictions. But if you have just a string of 0s...
# So what to do? Either you drop such words and re-run the analysis, or you add "scale=FALSE" to the svm command.
# The first option is always better!

# delete all the words with sum=0
# (a more general version of this filtering, without the hard-coded column range, is sketched right
# after the SVM predictions below)
colnames(train)
colSums(train[,1:3596])
# the sentiment column is the last one, so we explicitly keep it with TRUE
train <- train[, c(colSums(train[,1:3596] != 0) > 0, TRUE)]
set.seed(123)
system.time(SV <- svm(sentiment ~ ., data=train, type = "C-classification", kernel='linear'))
predictSV <- predict(SV, newdata=test)
table(predictSV)
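
# A reusable sketch of the same zero-sum filtering, without the hard-coded column range.
# drop_zero_features() is just an illustrative helper; it assumes that 'sentiment' is the only
# non-feature column in the data frame.
drop_zero_features <- function(d, label = "sentiment") {
  feats <- setdiff(colnames(d), label)      # word-count columns only
  keep  <- feats[colSums(d[, feats]) > 0]   # features that occur at least once
  d[, c(keep, label)]
}
# e.g. train <- drop_zero_features(train) gives the same result as the manual filtering above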
# Let's use the confusionMatrix command (same results as above for accuracy!)
conf.mat2 <- confusionMatrix(predictSV, test$sentiment)
conf.mat2

###########################################
## Try a Naive Bayes model: back to quanteda
###########################################
# Since Naive Bayes evaluates products of probabilities, we need some way of assigning
# non-zero probabilities to words which do not occur in the sample.
# That is, Naive Bayes can only take into consideration features that occur both in the training set
# and in the test set, but we can make the features identical by passing training_dfm to dfm_select() as a pattern.
summary(myCorpusTwitter)

# generate 600 numbers without replacement to treat them as the training set
set.seed(123)
id_train <- sample(1:1000, 600, replace = FALSE)
head(id_train, 10)

# create a docvar with the document ID
docvars(myCorpusTwitter, "id_numeric") <- 1:ndoc(myCorpusTwitter)
summary(myCorpusTwitter)

# get the training set (documents in id_train) and compute the dfm out of it
training_corpus <- corpus_subset(myCorpusTwitter, id_numeric %in% id_train)
summary(training_corpus)
training_dfm <- dfm(training_corpus, remove = c(stopwords("english"), ("amp"), ("rt"), ("tco"), ("co"),
                                                ("u"), ("t"), ("s"), ("ed"), ("https")),
                    remove_punct = TRUE, remove_numbers=TRUE, tolower = TRUE, remove_symbols=TRUE,
                    remove_twitter = TRUE, remove_separators=TRUE, remove_url = TRUE)
# Keep terms that appear in at least 5% of the tweets (tweets are very short texts...)
training_dfm <- dfm_trim(training_dfm, min_docfreq= 0.05)

# get the test set (documents not in id_train) and compute the dfm out of it
test_corpus <- corpus_subset(myCorpusTwitter, !id_numeric %in% id_train)
summary(test_corpus)
test_dfm <- dfm(test_corpus, remove = c(stopwords("english"), ("amp"), ("rt"), ("tco"), ("co"),
                                        ("u"), ("t"), ("s"), ("ed"), ("https")),
                remove_punct = TRUE, remove_numbers=TRUE, tolower = TRUE, remove_symbols=TRUE,
                remove_twitter = TRUE, remove_separators=TRUE, remove_url = TRUE)
# Keep terms that appear in at least 5% of the tweets (tweets are very short texts...)
test_dfm <- dfm_trim(test_dfm, min_docfreq= 0.05)

# train the Naive Bayes classifier using textmodel_nb() and a multinomial distribution (the default)
nb <- textmodel_nb(training_dfm, docvars(training_dfm, "sentiment"), distribution = c("multinomial"))
summary(nb)

# Let's make the features identical by passing training_dfm to dfm_select() as a pattern
test_dfm <- dfm_select(test_dfm, training_dfm)

# Let's inspect how well the classification worked
actual_class <- docvars(test_dfm, "sentiment")
predicted_class <- predict(nb, test_dfm)
table(predicted_class)
prop.table(table(predicted_class))
class_table <- table(actual_class, predicted_class)
class_table
confusionMatrix(class_table, mode = "everything")
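
# A minimal sketch: per-class precision, recall and F1 computed directly from class_table as built
# above (rows = actual, columns = predicted), just to make the arithmetic explicit.
# (note: caret's confusionMatrix() expects a table with predictions in rows, so the orientation of
# its per-class labels may differ from this one)
tp <- diag(class_table)
precision <- tp / colSums(class_table)   # TP / predicted positives
recall    <- tp / rowSums(class_table)   # TP / actual positives
f1 <- 2 * precision * recall / (precision + recall)
round(cbind(precision, recall, f1), 3)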
####################################
#### replicate the entire analysis, but now reduce the training set to 30%.
#### Which are the main changes in terms of accuracy? It decreases for the RF, SVM and Naive Bayes
#### classifiers (less data on which to train the algorithm!)
####################################
trim <- dfm_trim(myDfm3, min_docfreq= 0.05)
data <- as.data.frame(as.matrix(trim))
str(data)
colnames(data)
colnames(data) <- make.names(colnames(data))
colnames(data)
summary(x10$sentiment)
data$sentiment <- x10$sentiment
str(data$sentiment)
summary(data$sentiment)

set.seed(123)
sample <- sample.int(n = nrow(data), size = floor(.30*nrow(data)), replace = F)
train <- data[sample, ]
test <- data[-sample, ]
nrow(train)
nrow(test)

#################################
# Try a random forest model [not in the class!]
#################################
set.seed(123)
tweetRF <- randomForest(sentiment ~ ., data=train, type="classification")
str(tweetRF)
predictRF <- predict(tweetRF, newdata=test)
table(predictRF)
table(test$sentiment)
table(test$sentiment, predictRF)
# Let's use the confusionMatrix command (same results as above for accuracy!)
conf.mat <- confusionMatrix(predictRF, test$sentiment)
conf.mat

#################################
# Try an SVM model
#################################
# delete all the words with sum=0
colnames(train)
colSums(train[,1:1694])
# the sentiment column is the last one, so we explicitly keep it with TRUE
train <- train[, c(colSums(train[,1:1694] != 0) > 0, TRUE)]
set.seed(123)
system.time(SV <- svm(sentiment ~ ., data=train, type = "C-classification", kernel='linear'))
predictSV <- predict(SV, newdata=test)
table(predictSV)
# Let's use the confusionMatrix command (same results as above for accuracy!)
conf.mat2 <- confusionMatrix(predictSV, test$sentiment)
conf.mat2

#################################
## Try a Naive Bayes model
#################################
summary(myCorpusTwitter)
# generate 300 numbers without replacement to treat them as the training set
set.seed(123)
id_train <- sample(1:1000, 300, replace = FALSE)
head(id_train, 10)
# create a docvar with the document ID
docvars(myCorpusTwitter, "id_numeric") <- 1:ndoc(myCorpusTwitter)
summary(myCorpusTwitter)

# get the training set (documents in id_train) and compute the dfm out of it
training_corpus <- corpus_subset(myCorpusTwitter, id_numeric %in% id_train)
summary(training_corpus)
training_dfm <- dfm(training_corpus, remove = c(stopwords("english"), ("amp"), ("rt"), ("tco"), ("co"),
                                                ("u"), ("t"), ("s"), ("ed"), ("https")),
                    remove_punct = TRUE, remove_numbers=TRUE, tolower = TRUE, remove_symbols=TRUE,
                    remove_twitter = TRUE, remove_separators=TRUE, remove_url = TRUE)
# Keep terms that appear in at least 5% of the tweets (tweets are very short texts...)
training_dfm <- dfm_trim(training_dfm, min_docfreq= 0.05)

# get the test set (documents not in id_train) and compute the dfm out of it
test_corpus <- corpus_subset(myCorpusTwitter, !id_numeric %in% id_train)
summary(test_corpus)
test_dfm <- dfm(test_corpus, remove = c(stopwords("english"), ("amp"), ("rt"), ("tco"), ("co"),
                                        ("u"), ("t"), ("s"), ("ed"), ("https")),
                remove_punct = TRUE, remove_numbers=TRUE, tolower = TRUE, remove_symbols=TRUE,
                remove_twitter = TRUE, remove_separators=TRUE, remove_url = TRUE)
# Keep terms that appear in at least 5% of the tweets (tweets are very short texts...)
test_dfm <- dfm_trim(test_dfm, min_docfreq= 0.05)

# train the Naive Bayes classifier using textmodel_nb() and a multinomial distribution (the default)
nb <- textmodel_nb(training_dfm, docvars(training_dfm, "sentiment"), distribution = c("multinomial"))
summary(nb)
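
# A quick sketch: compare how many features the test dfm has before and after being aligned with the
# training features in the next step (object names as defined just above).
nfeat(test_dfm)                             # features in the test dfm before the alignment
nfeat(dfm_select(test_dfm, training_dfm))   # features after aligning with training_dfm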
# Let's make the features identical by passing training_dfm to dfm_select() as a pattern
test_dfm <- dfm_select(test_dfm, training_dfm)

# Let's inspect how well the classification worked
actual_class <- docvars(test_dfm, "sentiment")
predicted_class <- predict(nb, test_dfm)
table(predicted_class)
prop.table(table(predicted_class))
class_table <- table(actual_class, predicted_class)
class_table
confusionMatrix(class_table, mode = "everything")
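
# A closing sketch: collect the overall accuracy of the three classifiers from the 30%-training run
# in a single named vector for a quick comparison (conf.mat = random forest, conf.mat2 = SVM,
# class_table = Naive Bayes, as defined above).
acc_rf  <- unname(conf.mat$overall["Accuracy"])
acc_svm <- unname(conf.mat2$overall["Accuracy"])
acc_nb  <- sum(diag(class_table)) / sum(class_table)
round(c(RandomForest = acc_rf, SVM = acc_svm, NaiveBayes = acc_nb), 3)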