rm(list=ls(all=TRUE))
setwd("C:/Users/luigi/Dropbox/TOPIC MODEL")
getwd()
library(quanteda)
library(readtext)
library(caTools)
library(e1071)
library(randomForest)
library(caret)
library(naivebayes)
library(car)
library(ggplot2)
library(dplyr)
library(reshape2)
library(tm) # needed later on by iSA to build the DfM
library(iSAX)

##############################################
##############################################
#### Let's call the training and test-sets
##############################################
##############################################

# Let's focus on a database related to detecting spam in emails
x <- read.csv("spam.train.csv", stringsAsFactors=FALSE)
str(x)
# a pretty imbalanced training-set: this should hurt the performance of a
# ML algorithm much more than the performance of a proportional algorithm
prop.table(table(x$type))
myCorpusTwitterTrain <- corpus(x)
Dfm_train <- dfm(myCorpusTwitterTrain, remove = c(stopwords("english")),
                 remove_punct = TRUE, remove_numbers = TRUE, tolower = TRUE,
                 remove_symbols = TRUE, remove_separators = TRUE,
                 remove_url = TRUE, split_hyphens = TRUE)
Dfm_train <- dfm_trim(Dfm_train, min_docfreq = 2, verbose = TRUE)
Dfm_train <- dfm_remove(Dfm_train, min_nchar = 2)
topfeatures(Dfm_train, 20) # top 20 words

# in this case we know the "true" value of ham/spam for the texts in our
# test-set. But this is of course just an example!
x10 <- read.csv("spam.test.csv", stringsAsFactors=FALSE)
str(x10)
myCorpusTwitterTest <- corpus(x10)
Dfm_test <- dfm(myCorpusTwitterTest, remove = c(stopwords("english")),
                remove_punct = TRUE, remove_numbers = TRUE, tolower = TRUE,
                remove_symbols = TRUE, remove_separators = TRUE,
                remove_url = TRUE, split_hyphens = TRUE)
# note: we deliberately apply only a light trim here; aggressively trimming
# the test-set would zero-out, after dfm_match() below, features that we do
# observe in the test texts
Dfm_test <- dfm_trim(Dfm_test, min_docfreq = 2, verbose = TRUE)
Dfm_test <- dfm_remove(Dfm_test, min_nchar = 2)
topfeatures(Dfm_test, 20) # top 20 words

# let's align the features of the test-set to those of the training-set
test_dfm <- dfm_match(Dfm_test, features = featnames(Dfm_train))
train <- as.matrix(Dfm_train)
test <- as.matrix(test_dfm)
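
# [Optional aside, not part of the original pipeline] A quick sanity check:
# dfm_match() should leave the test matrix with exactly the same features,
# in the same order, as the training matrix; otherwise predict() below would
# misalign the columns
stopifnot(identical(featnames(Dfm_train), colnames(test)))
dim(train) # documents x features of the training matrix
dim(test)  # the test matrix has the same number of columns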

######################################################
######################################################
# Let's estimate a NB as our benchmark
######################################################
######################################################

set.seed(123) # (define a set.seed for being able to replicate the results!)
system.time(NB22 <- multinomial_naive_bayes(x = train,
                                            y = as.factor(docvars(Dfm_train, "type")),
                                            laplace = 1))

# Let's do some feature analysis
NB_prob <- as.data.frame(NB22$params)
NB_prob$Feature <- row.names(NB_prob)
str(NB_prob)
# the features with the largest difference between the spam and ham
# conditional probabilities
NB_prob$diff <- NB_prob$spam - NB_prob$ham
str(NB_prob)
print(head(NB_prob[order(NB_prob$diff, decreasing=TRUE),], 15))  # spam words
print(head(NB_prob[order(NB_prob$diff, decreasing=FALSE),], 15)) # ham words
NB_prob$sign <- ifelse(NB_prob$diff > 0, "spam", "ham")
str(NB_prob)
# let's extract the 20 most positive and the 20 most negative contributing features
NB_prob2 <- top_n(NB_prob, 20, diff)
NB_prob2
NB_prob3 <- top_n(NB_prob, -20, diff)
NB_prob3
NB_prob_new <- rbind(NB_prob2, NB_prob3)
# reorder the features
NB_prob_new <- mutate(NB_prob_new, Feature = reorder(Feature, diff))
ggplot(NB_prob_new, aes(Feature, diff, fill = sign)) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  ylab("Difference in the conditional probabilities") +
  scale_fill_manual(values = c("orange", "blue")) +
  labs(title = "Spam e-mails", subtitle = "Ham (-) versus Spam (+) words - NB")

######### let's predict the test-set
predicted_nb <- predict(NB22, test)
table(predicted_nb)
prop.table(table(predicted_nb))
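
# [Optional aside, not part of the original pipeline] Since in this example we
# know the true labels of the test-set, we can also check the individual-level
# performance of NB via caret (assuming the factor levels of predicted_nb
# match those of x10$type, i.e., "ham"/"spam")
confusionMatrix(predicted_nb, as.factor(x10$type))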

######################################################
######################################################
# Let's learn how to estimate a proportional model via iSA
######################################################
######################################################

### STEP 1: TRAINING-SET
x <- read.csv("spam.train.csv", stringsAsFactors=FALSE)
str(x)
x$typeFC <- as.factor(x$type) # let's transform the variable "type" into a factor variable
str(x)
prop.table(table(x$typeFC))

### STEP 2: TEST-SET
x10 <- read.csv("spam.test.csv", stringsAsFactors=FALSE)
str(x10)

### STEP 3: let's create a unique dataset including both TEST and TRAINING-SET
x10$typeFC <- NA # for doing that, let's add to the test-set a new column called "typeFC", given that such a column is present in the training-set
str(x10)
documents <- rbind(x10, x) # let's combine the test-set and the training-set
str(documents) # we have 5572 texts (1000 in the training-set and 4572 in the test-set)
prop.table(table(documents$typeFC))

### STEP 4: iSA needs the tm package (not quanteda!) to build the DfM
corpus <- VCorpus(VectorSource(documents$text)) # let's build the corpus
corpus[[1]]$content
corpus <- tm_map(corpus, removeWords, stopwords("english")) # let's remove stopwords
corpus[[1]]$content
ocome <- prep.data(corpus, verbose=TRUE, th=0.995) # let's prepare the data for the iSA algorithm
# This is a pre-processing step which performs stemming and other cleaning
# steps (removing punctuation, numbers, etc.), as well as producing the DfM.
# th=0.995 means that we drop the sparsest features, i.e., those that appear
# in less than 0.5% of the texts (1-0.995)
str(ocome)
# if you want the hexadecimal notation (not usually suggested unless you have a very large training-set):
# ocome2 <- prep.data(corpus, verbose=TRUE, th=0.995, shannon=TRUE)

### STEP 5: let's separate the resulting object "ocome" according to the presence
### or absence of info about the type factor variable (i.e., training vs. test-set)
train <- !is.na(documents$typeFC) # an index that is TRUE for the training-set documents, i.e., those texts with a typeFC different from NA (note: this overwrites the "train" matrix created above, which we no longer need)
train
summary(train)
D <- documents$typeFC[train] # the vector of the values of type in the training-set
str(D)
# Same results indeed!
prop.table(table(D))
prop.table(table(x$typeFC))
Strain <- ocome$S[which(train)] # the vector of stems belonging to the training-set
Stest <- ocome$S[-which(train)] # the vector of stems belonging to the test-set
length(Strain) # 1000!
length(Stest)  # 4572!

### STEP 6: let's run the proportional algorithm
set.seed(123)
system.time(outSent <- iSA(Strain, Stest, D)) # D is the vector of codings belonging to the training-set

### STEP 7: let's predict
round(outSent$btab, 5) # we also get bootstrapped s.e.

###############################################################################
# let's estimate the MAE - mean absolute error (just to have a rough idea of
# the algorithms' performance)
###############################################################################
mae <- as.data.frame(rbind(outSent$btab[, "Estimate"],
                           prop.table(table(predicted_nb)),
                           prop.table(table(x10$type))))
str(mae)
mae$algorithm <- c("iSA", "NB", "TRUE")
mae
# MAE = average absolute difference between the estimated and the true
# category proportions
mae_iSA <- (abs(mae[1,1] - mae[3,1]) + abs(mae[1,2] - mae[3,2]))/2
mae_NB <- (abs(mae[2,1] - mae[3,1]) + abs(mae[2,2] - mae[3,2]))/2
mae_iSA
mae_NB

###############################################################################
# Of course, you can also run a CV with iSA! Try to do that by yourself
# (a possible starting sketch follows below)!
###############################################################################
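
# [A minimal sketch of one possible K-fold CV scheme for iSA; this is an
# illustration I am adding, not part of the original script, and the names
# S_lab, folds and cv_mae are arbitrary.] We re-use the labeled training texts
# only: at each round one fold plays the "test-set" role, and we compare the
# proportions estimated by iSA with the true proportions of that held-out fold
# (assuming, as in the output above, that the rows of btab are named after the
# categories)
set.seed(123)
K <- 5
S_lab <- ocome$S[which(train)] # stems of the labeled (training) texts, as above
folds <- sample(rep(1:K, length.out = length(S_lab))) # random fold assignment
cv_mae <- numeric(K)
for (k in 1:K) {
  held_out <- folds == k
  fit <- iSA(S_lab[!held_out], S_lab[held_out], D[!held_out])
  est <- fit$btab[, "Estimate"]
  true_prop <- prop.table(table(D[held_out]))
  cv_mae[k] <- mean(abs(est[names(true_prop)] - true_prop)) # MAE over the two categories
}
cv_mae
mean(cv_mae) # average MAE across the K folds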