rm(list=ls(all=TRUE)) getwd() setwd("C:/Users/mw/Dropbox (VOICES)/TOPIC MODEL") getwd() library(e1071) library(caTools) library(randomForest) library(caret) library(quanteda) library(readtext) library(iSAX) library(tm) library(stringr) library(glmnet) ################################################## # iSA ################################################## ### TRAINING-TEST # Let's start with our usual training-set about Trump x <- read.csv("trainTrump.csv", stringsAsFactors=FALSE) str(x) # removing all punctuations also here (it is a mess with tweets!). Better doing that directly at this stage x$text <- str_replace_all(x$text, "[^[:alnum:]]", " ") str(x) x$Sentiment <- as.factor(x$Sentiment) # let's transform the variable "Sentiment" in a factor variable str(x) table(x$Sentiment) prop.table(table(x$Sentiment)) ### TEST-SET ### That's our usual test-set: sample of 1000 tweets written in English about Trump and published since 1.17.2018 till 1.19.2018 x10 <- read.csv("testTrump.csv", stringsAsFactors=FALSE) x10$text <- str_replace_all(x10$text, "[^[:alnum:]]", " ") str(x10) ### Let's create a unique dataset including both TEST and TRAINING SET x10$Sentiment <- NA # for doing that let's add a new column called "Sentiment" in the test-set given that such column is presented in the training-set str(x10) documents <- rbind(x10, x) # let's combine the test and the training-set str(documents ) table(documents $Sentiment) prop.table(table(documents $Sentiment)) ### iSA needs the TM package (not Quanteda!) to build the DFM corpus <- VCorpus(VectorSource(documents $text)) length(corpus) # we have 1472 texts (1,000 as test-set and 472 as training-set) str(corpus[[1]]) ocome <- prep.data(corpus,verbose=TRUE, th=0.995) # let's prepares data for iSA algorithm. # This is a pre-processing step which performs stemming and other cleaning steps. # th=0.995 means that we drop those features that appear in less than 5% of the texts str(ocome) ### let's separate the resulting object "ocome" according to the presence or absence of info about the Sentiment ### (i.e., training vs. test-set) train <- !is.na(documents $Sentiment) # I create an index=TRUE for the training-set documents (i.e., those texts with Sentiment # different than NA) train summary(train) D <- documents$Sentiment[train] # I recover the vector of the values for the Sentiment in the training-set str(D) # Same results indeed! prop.table(table(D)) prop.table(table(x$Sentiment)) Strain <- ocome$S[which(train)] # I select out of "ocome" the vector of stems belonging to the training-set Stest <- ocome$S[-which(train)] # I select out of "ocome" the vector of stems belonging to the test-set length(Strain ) # 500! length(Stest) # 500! set.seed(123) system.time(outSent <- iSA(Strain ,Stest , D)) # let's run iSA: D is the vector of codings belonging to the training set # estimation for the test-test round(outSent$btab, 5) # I have also bootstrapped s.e.! # training-set % prop.table(table(D )) ################################################## # How to assess the quality of iSA estimates? ################################################## # For example, you can create two random sets out of your original training-set, and estimate the MAE on them! ################## # iSA - K1 we use train1_ISA to predict the documents of train2_ISA ################## # create two training-sets from the original one k <- 2 n <- length(Strain) n split <- rep(1:k,c(0.5*n,0.5*n)) table(split ) split set.seed(123) split <- sample(split) # I rearrange randomly the order of numbers in split split # note that if you want to generate 4 training-sets from the original one using rep, you should write # k <- 4 # n <- length(Strain) # split <- rep(1:k,c(0.25*n,0.25*n,0.25*n,0.25*n)) # if k <- 5, write after rep 0.2*2, etc. train1_ISA <- subset(Strain, split==1) train2_ISA <- subset(Strain, split==2) length(train1_ISA) length(train2_ISA) D_train1_ISA <-subset(D, split==1) prop.table(table(D_train1_ISA)) D_train2_ISA <-subset(D, split==2) prop.table(table(D_train2_ISA)) set.seed(123) outSent <- iSA(train1_ISA ,train2_ISA , D_train1_ISA) str(outSent) round(outSent$btab, 5) ################## # iSA - K2 we use train2_ISA to predict the documents of train1_ISA ################## set.seed(123) outSent2 <- iSA(train2_ISA ,train1_ISA , D_train2_ISA) str(outSent2) round(outSent2$btab, 5) ################## # Estimating the MAE when training-set1 vs. training-set2 ################## mat_all <-cbind(outSent$btab[,"Estimate"],prop.table(table(D_train2_ISA))) colnames(mat_all) <- c("iSA1", "True1") round(mat_all,3) mae_ISA <- mean(abs(mat_all[,1]-mat_all[,2])) mae_ISA ################## # Estimating the MAE when training-set2 vs. training-set1 ################## mat_all2 <-cbind(outSent2$btab[,"Estimate"],prop.table(table(D_train1_ISA))) colnames(mat_all2) <- c("iSA2", "True2") round(mat_all2,3) mae_ISA2 <- mean(abs(mat_all2[,1]-mat_all2[,2])) mae_ISA2 ####### Avg. MAE (mae_ISA+mae_ISA2)/2 ################## # Is a better or a worst result that what we can get via other Machine Learning Algorithms? Let's compare! ################## ################## # RandomForest - K1 we use train1_RF to predict the documents of train2_RF ################## # REMEMBER: iSAX via the command prep.data creates a DfM with just 0/1 as frequencies. # if you want to run an analysis on the original frequencies DfM, you should use Quanteda on the "Trump_train.csv" database # and creating a DfM out of it before running the RF here dtm <- ocome$dtm # create an object out of the DfM included in the object "ocomo" we created via the prep.data command of iSAX dtm1 <- data.frame(dtm) # this would add a letter in front of a number (if you have any numbers left in the tdm). Highly suggested colnames(dtm1) <- make.names(colnames(dtm1)) dtm1 dtm1$Sentiment<- documents $Sentiment # let's add to the DfM also the Sentiment variable out from the training-set str(dtm1$Sentiment) dtm1$Sentiment<- as.factor(documents $Sentiment) # let's transform the Sentiment variable in a factor! str(dtm1$Sentiment) table(documents $Sentiment) summary(dtm1$Sentiment) train <- dtm1[ which(!is.na(dtm1$Sentiment)), ] # let's extract the part of the DfM that contains values for the "Sentiment" variable # i.e., our original training-set! nrow(train) # let's split the training-set in two train1_RF <- subset(train , split==1) train2_RF <- subset(train , split==2) # Same splitting of the training-set as we did with iSA! prop.table(table(train1_RF$Sentiment)) prop.table(table(D_train1_ISA)) prop.table(table(train2_RF$Sentiment)) prop.table(table(D_train2_ISA)) set.seed(123) system.time(tweetRF1 <- randomForest(Sentiment~ ., data=train1_RF, do.trace=TRUE)) predictRF1 <- predict(tweetRF1, newdata=train2_RF, type="class") prop.table(table( predictRF1)) ################## # RandomForest - K2 we use train2_RF to predict the documents of train1_RF ################## set.seed(123) system.time(tweetRF2 <- randomForest(Sentiment~ ., data=train2_RF, do.trace=TRUE)) predictRF2 <- predict(tweetRF2, newdata=train1_RF, type="class") prop.table(table( predictRF2)) ################## # SVM - K1 we use train1_SV to predict the documents of train2_SV ################## train1_SV <- train1_RF train2_SV <- train2_RF set.seed(123) system.time(tweetSV1 <- svm(Sentiment~ . ,data=train1_SV, kernel='linear', cost = 10)) predictSV1 <- predict(tweetSV1, newdata=train2_SV, type="class") prop.table(table( predictSV1)) ################## # SVM - K2 we use train2_SV to predict the documents of train1_SV ################## train1_SV <- train1_RF train2_SV <- train2_RF set.seed(123) system.time(tweetSV2 <- svm(Sentiment~ . ,data=train2_SV, kernel='linear', cost = 10)) predictSV2 <- predict(tweetSV2, newdata=train1_SV, type="class") prop.table(table( predictSV2)) ################## # Naive Bayes - K1 we use train1_NB to predict the documents of train2_NB ################## str(x) myCorpusTwitter <- corpus(x) summary( myCorpusTwitter ) N <- ndoc(myCorpusTwitter ) # Number of desired splits k <- 2 set.seed(123) holdout <- split(sample(1:N), 1:k) docvars(myCorpusTwitter , "id_numeric") <- 1:ndoc(myCorpusTwitter ) # create train1_NB (documents in id_train) train1_NB <- dfm( corpus_subset(myCorpusTwitter, id_numeric %in% holdout[[1]]) , remove = c(stopwords("english"), ("amp"), ("rt") ,("tco"), ("co"), ("u"), ("t"), ("s"), ("ed"), ("https")), remove_punct = TRUE, remove_numbers=TRUE, tolower = TRUE, remove_symbols=TRUE, remove_twitter = TRUE, remove_separators=TRUE, remove_url = TRUE) # Keep terms that appear at least in the 5% or more of the tweets train1_NB <- dfm_trim(train1_NB , min_docfreq= 0.05) ####################################################################### # let's recode all the values in the DfM as just 0/1 as a matter of compability with the DfM created via iSAX train1_NB@x <- recode(train1_NB@x,"2:hi=1") ####################################################################### # create train2NB (documents NOT in id_train) train2_NB <- dfm(corpus_subset(myCorpusTwitter, id_numeric %in%holdout[[2]]) , remove = c(stopwords("english"), ("amp"), ("rt") ,("tco"), ("co"), ("u"), ("t"), ("s"), ("ed"), ("https")), remove_punct = TRUE, remove_numbers=TRUE, tolower = TRUE, remove_symbols=TRUE, remove_twitter = TRUE, remove_separators=TRUE, remove_url = TRUE) # Keep terms that appear at least in the 5% or more of the tweets train2_NB <- dfm_trim(train2_NB , min_docfreq= 0.05) ####################################################################### # let's recode all the values in the DfM as just 0/1 as a matter of compability with the DfM created via iSAX train2_NB@x <- recode(train2_NB @x,"2:hi=1") ####################################################################### # train the naive Bayes classifier using textmodel_nb() and a Multinomial distribution nb <- textmodel_nb(train1_NB, docvars(train1_NB, "Sentiment"), distribution = c("multinomial")) summary(nb) # Let's make the features identical by passing train1_NB to dfm_select() as a pattern. training_dfm2 <- dfm_select(train2_NB, train1_NB) # Let’s inspect how well the classification worked actual_class <- docvars(training_dfm2, "Sentiment") predicted_classNB1 <- predict(nb, training_dfm2) table(predicted_classNB1) prop.table(table(predicted_classNB1 )) # Here different splitting of the training-set compared to what we did with iSA/RF! prop.table(table(train1_RF$Sentiment)) prop.table(table(D_train1_ISA)) prop.table(table(train1_NB@docvars$Sentiment )) prop.table(table(train2_RF$Sentiment)) prop.table(table(D_train2_ISA)) prop.table(table(train2_NB@docvars$Sentiment )) ################## # Naive Bayes - K2 we use train2_NB to predict the documents of train1_NB ################## # train the naive Bayes classifier using textmodel_nb() and a Multinomial distribution (the default) nb2 <- textmodel_nb(train2_NB , docvars(train2_NB , "Sentiment"), distribution = c("multinomial")) summary(nb2) # Let's make the features identical by passing training_dfm2 to dfm_select() as a pattern. training_dfm1 <- dfm_select(train1_NB, train2_NB) # Let’s inspect how well the classification worked actual_class2 <- docvars(training_dfm1, "Sentiment") predicted_classNB2 <- predict(nb2, training_dfm1) table(predicted_classNB2) prop.table(table(predicted_classNB2 )) ################## # Estimating the MAE when training-set1 vs. training-set2 for all (=4) the algorithms employed ################## mat_all <-cbind(outSent$btab[,"Estimate"], prop.table(table( predictRF1)), prop.table(table( predictSV1)), prop.table(table(predicted_classNB1)), prop.table(table(D_train2_ISA)), prop.table(table(train2_NB@docvars$Sentiment ))) colnames(mat_all) <- c("iSA1", "RF1", "SV1", "BAYES1", "True1", "TrueNB1") round(mat_all,3) mae_ISA1 <- mean(abs(mat_all[,1]-mat_all[,5])) mae_RF1 <- mean(abs(mat_all[,2]-mat_all[,5])) mae_SV1 <- mean(abs(mat_all[,3]-mat_all[,5])) mae_BAYES1 <- mean(abs(mat_all[,4]-mat_all[,6])) ################## # Estimating the MAE when training-set2 vs. training-set1 for all (=4) the algorithms employed ################## mat_all2 <-cbind(outSent2$btab[,"Estimate"], prop.table(table( predictRF2)), prop.table(table( predictSV2)), prop.table(table(predicted_classNB2)), prop.table(table(D_train1_ISA)), prop.table(table(train1_NB@docvars$Sentiment ))) colnames(mat_all2) <- c("iSA2", "RF2", "SV2", "BAYES2", "True2", "TrueNB2") round(mat_all2,3) mae_ISA2 <- mean(abs(mat_all2[,1]-mat_all2[,5])) mae_RF2 <- mean(abs(mat_all2[,2]-mat_all2[,5])) mae_SV2 <- mean(abs(mat_all2[,3]-mat_all2[,5])) mae_BAYES2 <- mean(abs(mat_all2[,4]-mat_all2[,6])) ####### Avg. MAE (mae_ISA1+mae_ISA2)/2 (mae_RF1+mae_RF2)/2 (mae_SV1+mae_SV2)/2 (mae_BAYES1+mae_BAYES2)/2 # Does it mean that iSA is the best ML algorithm here when we focus on proportio? # Let's see some more serious CV! ################## # Let's loop everything with any number of K! For example: K=10 ################## ################## # iSA K=10 ################## k <- 10 n <- length(Strain) n split <- rep(1:k,c(0.1*n,0.1*n,0.1*n,0.1*n,0.1*n,0.1*n,0.1*n,0.1*n,0.1*n,0.1*n)) table(split ) set.seed(123) split <- sample(split) for(i in 1:k){ train <- subset(Strain, split!=(i)) validation <- subset(Strain, split==(i)) D_train_ISA <-subset(D, split!=(i)) D_train_true <-subset(D, split==(i)) set.seed(123) outSent <- iSA(train ,validation , D_train_ISA) df<-outSent df.name<-paste0("iSA",i) assign(df.name,df) pf<-prop.table(table(D_train_true )) pf.name<-paste0("True",i) assign(pf.name,pf) } iSAPredict <- data.frame(col1=vector(), col2=vector(), col3=vector()) # makes a blank data frame with three columns to fill with the predictions for(i in mget(ls(pattern = "iSA")) ) { col1 <-(i)$btab[1] col2 <-(i)$btab[2] col3 <-(i)$btab[3] iSAPredict <- rbind(iSAPredict, cbind(col1, col2, col3)) } colnames(iSAPredict)[1] <- "Negative" colnames(iSAPredict)[2] <- "Neutral" colnames(iSAPredict)[3] <- "Positive" str(iSAPredict) iSAPredict<-as.data.frame(t(iSAPredict)) # transpose the matrix colnames(iSAPredict) <- rep(1:k) iSAPredict TrueValue <- data.frame(col1=vector(), col2=vector(), col3=vector()) # makes a blank data frame with three columns to fill with the true values for(i in mget(ls(pattern = "True")) ) { col1 <-(i)[1] col2 <- (i)[2] col3 <- (i)[3] TrueValue <- rbind(TrueValue , cbind(col1, col2, col3)) } str(TrueValue ) TrueValue <- as.data.frame(t(TrueValue )) # transpose the matrix colnames(TrueValue ) <- rep(1:k) row.names(TrueValue ) <- row.names(iSAPredict) TrueValue iSA_TABLE <- abs(iSAPredict-TrueValue ) MAE_iSA <- mean(colMeans(iSA_TABLE)) MAE_iSA ################## # RF K=10 ################## dtm <- ocome$dtm # create an object out of the DfM included in the object "ocomo" we created via the prep.data command of iSAX dtm1 <- data.frame(dtm) colnames(dtm1) <- make.names(colnames(dtm1)) dtm1$Sentiment<- documents $Sentiment # let's add to the DfM also the Sentiment variable out from the training-set dtm1$Sentiment<- as.factor(documents $Sentiment) # let's transform the Sentiment variable in a factor! train <- dtm1[ which(!is.na(dtm1$Sentiment)), ] # let's extract the part of the DfM that contains values for the "Sentiment" variable nrow(train) ttrain <- train for(i in 1:k){ train <- subset(ttrain , split!=(i)) validation <- subset(ttrain , split==(i)) set.seed(123) newrf <- randomForest(Sentiment~ ., data=train, do.trace=TRUE, ntree=100) # set ntree=100 to make it faster during the Lab class predict <- predict(newrf, newdata=validation, type="class") df<- prop.table(table(predict)) df.name<-paste0("RFpred",i) assign(df.name,df) pf<-prop.table(table(validation$Sentiment)) pf.name<-paste0("TrueRF",i) assign(pf.name,pf) } RFPredict <- data.frame(col1=vector(), col2=vector(), col3=vector()) # makes a blank data frame with three columns to fill with the predictions for(i in mget(ls(pattern = "RFpred")) ) { col1 <-(i)[1] col2 <- (i)[2] col3 <- (i)[3] RFPredict <- rbind(RFPredict , cbind(col1, col2, col3)) } RFPredict <-as.data.frame(t(RFPredict )) # transpose the matrix colnames(RFPredict ) <- rep(1:k) row.names(RFPredict ) <- row.names(iSAPredict) RFPredict TrueValueRF <- data.frame(col1=vector(), col2=vector(), col3=vector()) # makes a blank data frame with three columns to fill with the true values for(i in mget(ls(pattern = "TrueRF")) ) { col1 <-(i)[1] col2 <- (i)[2] col3 <- (i)[3] TrueValueRF <- rbind(TrueValueRF , cbind(col1, col2, col3)) } str(TrueValueRF ) TrueValueRF <- as.data.frame(t(TrueValueRF )) # transpose the matrix colnames(TrueValueRF ) <- rep(1:k) row.names(TrueValueRF ) <- row.names(iSAPredict) TrueValueRF RF_TABLE <- abs(RFPredict -TrueValueRF) MAE_RF<- mean(colMeans(RF_TABLE)) MAE_RF ################## # SVM K=10 ################## for(i in 1:k){ train <- subset(ttrain , split!=(i)) validation <- subset(ttrain , split==(i)) set.seed(123) newrf <- svm(Sentiment~ ., data=train, kernel='linear', cost = 10) predict <- predict(newrf, newdata=validation, type="class") df<- prop.table(table(predict)) df.name<-paste0("SVpred",i) assign(df.name,df) pf<-prop.table(table(validation$Sentiment)) pf.name<-paste0("TrueSV",i) assign(pf.name,pf) } SVPredict <- data.frame(col1=vector(), col2=vector(), col3=vector()) # makes a blank data frame with three columns to fill with the predictions for(i in mget(ls(pattern = "SVpred")) ) { col1 <-(i)[1] col2 <- (i)[2] col3 <- (i)[3] SVPredict <- rbind(SVPredict , cbind(col1, col2, col3)) } SVPredict <-as.data.frame(t(SVPredict )) # transpose the matrix colnames(SVPredict ) <- rep(1:k) row.names(SVPredict ) <- row.names(iSAPredict) SVPredict TrueValueSV <- data.frame(col1=vector(), col2=vector(), col3=vector()) # makes a blank data frame with three columns to fill with the true values for(i in mget(ls(pattern = "TrueSV")) ) { col1 <-(i)[1] col2 <- (i)[2] col3 <- (i)[3] TrueValueSV <- rbind(TrueValueSV , cbind(col1, col2, col3)) } str(TrueValueSV ) TrueValueSV <- as.data.frame(t(TrueValueSV )) # transpose the matrix colnames(TrueValueSV ) <- rep(1:k) row.names(TrueValueSV ) <- row.names(iSAPredict) TrueValueSV SV_TABLE <- abs(SVPredict -TrueValueSV) MAE_SV<- mean(colMeans(SV_TABLE)) MAE_SV ################## # NB K=10 ################## str(x) myCorpusTwitterTrain <- corpus(x) summary( myCorpusTwitterTrain ) N <- ndoc(myCorpusTwitterTrain ) # Number of desired splits k <- 10 set.seed(123) holdout <- split(sample(1:N), 1:k) docvars(myCorpusTwitterTrain , "id_numeric") <- 1:ndoc(myCorpusTwitterTrain ) for(i in 1:k){ train <- dfm(corpus_subset(myCorpusTwitterTrain , id_numeric %in% holdout[[i]]), remove = c(stopwords("english"), ("amp"), ("rt") ,("tco"), ("co"), ("u"), ("t"), ("s"), ("ed"), ("https"), ("â"), ("com"), ("ly")),remove_punct = TRUE, remove_numbers=TRUE, tolower = TRUE, remove_symbols=TRUE, remove_twitter = TRUE, remove_separators=TRUE, remove_url = TRUE) # Set the training set train <- dfm_trim(train,min_docfreq= 2, verbose=TRUE) # Trim the dfm validation <- dfm(corpus_subset(myCorpusTwitterTrain , !id_numeric %in% holdout[[i]]), remove = c(stopwords("english"), ("amp"), ("rt") ,("tco"), ("co"), ("u"), ("t"), ("s"), ("ed"), ("https"), ("â"), ("com"), ("ly")),remove_punct = TRUE, remove_numbers=TRUE, tolower = TRUE, remove_symbols=TRUE, remove_twitter = TRUE, remove_separators=TRUE, remove_url = TRUE) # Set the validation set validation <- dfm_trim(validation , min_docfreq= 2, verbose=TRUE) # Trim the dfm set.seed(123) newrf <- textmodel_nb(train, docvars(train, "Sentiment"), distribution = c("multinomial")) # Get your new model (just fit on the train data) and ADD the name of the output (in this case "Sentiment") validation <- dfm_select(validation, train) # Let's make the features identical by passing training_dfm to dfm_select() as a pattern newpred <- predict(newrf,validation) # Get the predicitons for the validation set (from the model just fit on the train data) actual_class <- docvars(validation, "Sentiment") # Add name output (in this case "Sentiment") predicted_class <- predict(newrf, newdata = validation) df<- prop.table(table(predicted_class )) df.name<-paste0("NBpred",i) assign(df.name,df) pf<-prop.table(table(validation@docvars$Sentiment )) pf.name<-paste0("TrueNB",i) assign(pf.name,pf) } NBPredict <- data.frame(col1=vector(), col2=vector(), col3=vector()) # makes a blank data frame with three columns to fill with the predictions for(i in mget(ls(pattern = "NBpred")) ) { col1 <-(i)[1] col2 <- (i)[2] col3 <- (i)[3] NBPredict <- rbind(NBPredict , cbind(col1, col2, col3)) } NBPredict <-as.data.frame(t(NBPredict )) # transpose the matrix colnames(NBPredict ) <- rep(1:k) row.names(NBPredict ) <- row.names(iSAPredict) NBPredict TrueValueNB <- data.frame(col1=vector(), col2=vector(), col3=vector()) # makes a blank data frame with three columns to fill with the true values for(i in mget(ls(pattern = "TrueNB")) ) { col1 <-(i)[1] col2 <- (i)[2] col3 <- (i)[3] TrueValueNB <- rbind(TrueValueNB , cbind(col1, col2, col3)) } str(TrueValueNB ) TrueValueNB <- as.data.frame(t(TrueValueNB )) # transpose the matrix colnames(TrueValueNB ) <- rep(1:k) row.names(TrueValueNB ) <- row.names(iSAPredict) NB_TABLE <- abs(NBPredict -TrueValueNB) MAE_NB<- mean(colMeans(NB_TABLE)) MAE_NB ############ GRAPH library(reshape2) gb1 <- as.data.frame(MAE_iSA) colnames(gb1)[1] <- "MAE iSA" gb2 <- as.data.frame(MAE_RF) colnames(gb2)[1] <- "MAE RF" gb3 <- as.data.frame(MAE_SV) colnames(gb3)[1] <- "MAE SV" gb4 <- as.data.frame(MAE_NB ) colnames(gb4)[1] <- "MAE NB" ac_tot <- cbind(gb1, gb2, gb3, gb4) ac_tot str(ac_tot) df.long_ac_tot<-melt(ac_tot) str(df.long_ac_tot) ggplot(df.long_ac_tot, aes(x=variable, y=value)) + geom_boxplot() + xlab("Algorithm") + ylab(label="Value") + ggtitle("Different ML K-fold cross-validation (K=10): MAE") + coord_flip()