rm(list=ls(all=TRUE))
setwd("C:/Users/luigi/Dropbox/TOPIC MODEL")
getwd()

library(quanteda)
library(readtext)
library(dplyr)
library(stringr)
library(e1071)
library(caret)
library(cvTools)
library(text2vec)
library(lsa)
library(quanteda.textstats)

# Let's focus on MOVIE reviews. We have a training-set and a test-set here.
# Let's combine the two sets to have a larger corpus on which to run the WE
x <- read.csv("train_review2.csv", stringsAsFactors=FALSE)
str(x)
# Note that in this case we do know the "true" value for sentiment of the texts included in the test-set!
x10 <- read.csv("test_review2.csv", stringsAsFactors=FALSE)
str(x10)
tot <- rbind(x, x10)
str(tot)
nrow(tot)

tot$text <- gsub("'", " ", tot$text) # let's replace the apostrophe with an empty space (sometimes quanteda has trouble removing it)
myCorpusTwitter <- corpus(tot)

# We do minimal preprocessing. We also follow the standard practice of keeping only words with a minimum count above
# a given threshold, usually between 5 and 10 (we choose 5).
tok2 <- tokens(myCorpusTwitter, remove_punct = TRUE, remove_numbers = TRUE, remove_symbols = TRUE,
               split_hyphens = TRUE, remove_separators = TRUE)
tok2 <- tokens_remove(tok2, stopwords("en"))
Dfm <- dfm(tok2)
topfeatures(Dfm)
# "s" and "t" are leftovers of the apostrophe replacement above ("it's" -> "it s"), so let's drop them
tok2 <- tokens_remove(tok2, c("s", "t"))
Dfm <- dfm(tok2)
topfeatures(Dfm)
Dfm <- dfm_trim(Dfm, min_termfreq = 5, verbose=TRUE)

#############################################
#############################################
# Applying the GloVe algorithm via Quanteda
#############################################
#############################################

# Let's first extract the vocabulary from our Dfm
Dfm_vocab <- featnames(Dfm)
str(Dfm_vocab)

# Then let's tokenize the corpus
mov_tokens <- tokens(myCorpusTwitter)

# Then let's match the two vocabularies (i.e., let's keep only those features that are present both in the corpus
# and in the Dfm, after pre-processing the texts).
# Note the following: the option "padding=TRUE" leaves an empty string where the removed tokens previously existed.
# This is useful whenever a positional match is needed between the pre- and post-selected tokens,
# for instance if a window of adjacency needs to be computed. It prevents words that are non-adjacent in the
# original text from becoming adjacent after pre-processing
mov_tokens2 <- tokens_select(mov_tokens, Dfm_vocab, padding = TRUE)

# Create a feature co-occurrence matrix (the default window is 5; you can change it via the "window" argument).
# Why use the weights option? This way we give more weight to tokens that are closer to the target word
fcmat_news <- fcm(mov_tokens2, context = "window", count = "weighted", weights = 1/(1:5))
fcmat_news
# alternatively, with no weights
fcmat_news_noweights <- fcm(mov_tokens2, context = "window")
fcmat_news_noweights
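# A minimal toy illustration (not part of the original pipeline): with count = "weighted"
# and weights = 1/(1:5), a co-occurrence at distance 1 contributes 1, at distance 2 it
# contributes 1/2, and so on, up to the window size. "toy_toks" is made up here for the example
toy_toks <- tokens(c(d1 = "good movie with a good plot"))
fcm(toy_toks, context = "window", window = 5, count = "weighted", weights = 1/(1:5))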
#############################################
#############################################
# Let's estimate word embeddings via GloVe
#############################################
#############################################

# Which are the main parameters to look at?
# rank = number of dimensions (100 dimensions is the default)
# x_max = maximum number of co-occurrences to use in the weighting function. Here we select 10.
# The GloVe algorithm balances the weight of very common and very uncommon words. For example, a very common word
# will have very high co-occurrence counts with many words, which would give it undue influence on the word vectors.
# On the other hand, very rare words have low co-occurrence counts but should still have some influence.
# For more info: https://nlp.stanford.edu/pubs/glove.pdf

# If you want to exactly reproduce your results, you have to set a seed and specify n_threads = 1.
# You will need more computational time, but at least replication is assured. Note that the differences
# when replicating the analysis with a larger number of threads would anyway be minimal
set.seed(123)
glove <- GlobalVectors$new(rank=100, x_max=10)
# n_iter and convergence_tol control the ML algorithm GloVe employs: you can increase n_iter or decrease
# convergence_tol for a better fit (but you would need more time...)
system.time(glove_main <- glove$fit_transform(fcmat_news, n_iter = 20, convergence_tol = 0.01, n_threads = 1)) # around 45 seconds
str(glove_main)

#############################################
#############################################
############# Plotting words in the dimensional space
#############################################
#############################################

glove_dataframe <- as.data.frame(glove_main)
nrow(glove_dataframe) # the same as our co-occurrence matrix, of course!
nrow(fcmat_news)
colnames(glove_dataframe)
# let's add to glove_dataframe a column called "word" with the list of features
glove_dataframe$word <- row.names(glove_dataframe)
colnames(glove_dataframe)

plot_words <- function(words, glove_dataframe){
  # empty plot
  plot(0, 0, xlim=c(-0.5, 0.5), ylim=c(-0.5, 0.5), type="n",
       xlab="Second dimension", ylab="Third dimension")
  for (word in words){
    # extract the second and third dimensions
    vector <- as.numeric(glove_dataframe[glove_dataframe$word==word, 2:3])
    # add to plot
    text(vector[1], vector[2], labels=word)
  }
}

plot_words(c("teen", "alien", "monster", "war", "bad", "murder", "good", "fear", "summer"), glove_dataframe)

# Now we can start to play. The measure of interest for comparing two vectors will be cosine similarity (do you remember?)
# Let's see what is similar to "girl"
girl <- glove_main["girl", , drop = FALSE]
cos_sim_girl <- sim2(x = glove_main, y = girl, method = "cosine", norm = "l2")
head(sort(cos_sim_girl[,1], decreasing = TRUE), 10)

# Once we have the vectors for each word, we can also compute the similarity between a pair of words:
similarity <- function(word1, word2){
  lsa::cosine(
    x=as.numeric(glove_dataframe[glove_dataframe$word==word1, 1:100]),
    y=as.numeric(glove_dataframe[glove_dataframe$word==word2, 1:100]))
}

similarity("father", "mother")
similarity("summer", "monster")

# Let's see an analogy.
# Which is the spatially closest word vector to "(vec)murder - (vec)detective + (vec)romance"?
ex <- glove_main["murder", , drop = FALSE] -
  glove_main["detective", , drop = FALSE] +
  glove_main["romance", , drop = FALSE]
cos_sim <- textstat_simil(x = as.dfm(glove_main), y = as.dfm(ex), method = "cosine")
head(sort(cos_sim[, 1], decreasing = TRUE), 5)
# alternatively
cos_sim_test <- sim2(x = glove_main, y = ex, method = "cosine", norm = "l2")
head(sort(cos_sim_test[,1], decreasing = TRUE), 5)
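# A quick sanity check (sketch, not part of the original pipeline): cosine similarity computed
# by hand on two of our vectors should match similarity("father", "mother") above
v1 <- as.numeric(glove_main["father", ])
v2 <- as.numeric(glove_main["mother", ])
sum(v1 * v2) / (sqrt(sum(v1^2)) * sqrt(sum(v2^2)))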
#############################################
#############################################
############# An important note
#############################################
#############################################

# Note that the GloVe model learns two sets of word vectors: main (the one we employed above!) and context.
# You can think of them as "target" vectors and "context" vectors: each word has its own vector (target),
# but it also serves as context for the other words. Since all computations are symmetric, in the end the two
# sets of vectors should be identical; but since the computations are approximate, in practice they differ slightly.
# While either word-vector matrix can be used as the result, it can be a good idea to average, or sum,
# the main and context vectors, which may lead to higher-quality embeddings. In the present case it brings
# no great improvement (so we will not do it). Having said that, if you want to take the sum of the two
# sets of vectors, you can simply write:
wv_context <- glove$components
str(t(wv_context)) # we need to transpose the matrix to directly compare it to the original glove_main matrix
str(glove_main)
dim(wv_context)
glove_main2 <- glove_main + t(wv_context)
str(t(wv_context))
str(glove_main)
str(glove_main2)

#############################################
#############################################
# A SVM with WE
#############################################
#############################################

glove_dataframe <- select(glove_dataframe, word, everything()) # move the "word" column to the front
colnames(glove_dataframe)
glove_dataframe[1:5, 2:11]

# At the moment glove_dataframe has 12,265 rows (one for each feature) and 101 columns
# (1 column for the word plus the 100 dimensions of the WE)
nrow(glove_dataframe)
# in the original Dfm we had 1,500 documents
ndoc(Dfm)

# What we want to do is estimate, for each document in the Dfm, the average position of its words
# in each of the 100 dimensions of the WE. As a result we generate a new matrix with 1,500 rows
# (one for each document) and 100 columns (the average position of each document in each dimension)
embed <- matrix(NA, nrow=ndoc(Dfm), ncol=100) # empty matrix
for (i in 1:ndoc(Dfm)){
  if (i %% 100 == 0) message(i, '/', ndoc(Dfm))
  # extract word counts
  vec <- as.numeric(Dfm[i,])
  # keep words with counts of 1 or more
  doc_words <- featnames(Dfm)[vec>0]
  # extract the embeddings for those words
  embed_vec <- glove_dataframe[glove_dataframe$word %in% doc_words, 2:101]
  # aggregate from word-level to document-level embeddings by taking the average
  embed[i,] <- colMeans(embed_vec, na.rm=TRUE)
  # if no words have embeddings, simply set the document vector to 0
  if (nrow(embed_vec)==0) embed[i,] <- 0
}
str(embed)
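# An optional, faster sketch of the same idea via matrix algebra (not in the original script).
# Note one difference: here each word is weighted by its count in the document, whereas the loop
# above averages over unique words; documents with no embedded words would yield NaN rows
W <- as.matrix(glove_dataframe[, 2:101])
rownames(W) <- glove_dataframe$word
M <- Dfm[, rownames(W)]                    # keep the dfm sparse; align its features to the embedding rows
embed_weighted <- as.matrix(M %*% W) / rowSums(M) # count-weighted document embeddings
str(embed_weighted)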
str(tot)
# let's separate our original training-set and test-set
training <- c(1:500)
str(training)
test <- c(501:1500)
str(test)

# our benchmark: 0.524
prop.table(table(tot$Sentiment[training]))

# SVM
set.seed(123)
system.time(SVM <- svm(x=embed[training,], y=as.factor(tot$Sentiment[training]), kernel='linear', cost = 1))

# computing the predicted values
predicted_SV <- predict(SVM, embed[test,], type="class")
table(predicted_SV)
prop.table(table(predicted_SV))

# In this case we do know the "true" value for Sentiment in the test-set.
# So let's build a confusion matrix contrasting the "true" values with the predicted ones
confusion <- confusionMatrix(predicted_SV, as.factor(tot$Sentiment[test]), mode = "everything")
confusion
accuracySV <- confusion$overall[1]
F1_0 <- (2*confusion$byClass[1]*confusion$byClass[3])/(confusion$byClass[1]+confusion$byClass[3]) # F1 value for the negative category
F1_1 <- (2*confusion$byClass[2]*confusion$byClass[4])/(confusion$byClass[2]+confusion$byClass[4]) # F1 value for the positive category
F1_SV <- (F1_0+F1_1)/2
accuracySV
F1_SV

###########################################################################
###########################################################################
############ SAME ANALYSIS with a pre-trained word embedding computed
# on a sample of Google news (not sharing that much with movie reviews... still...)
###########################################################################
###########################################################################

# extracting the word embeddings with 100 dimensions
pre_trained <- readr::read_delim("vector.txt", skip=1, delim=" ", quote="",
                                 col_names=c("word", paste0("V", 1:100))) # 100 dimensions + 1 column for the features
colnames(pre_trained)
nrow(pre_trained) # almost 72K features included; many more than the number of features in our corpus
nrow(glove_dataframe)

################# let's play a bit with this new pre-trained WE

# convert from a tibble to a data frame
class(pre_trained)
pre_trained2 <- as.data.frame(pre_trained)
class(pre_trained2)
row.names(pre_trained2) <- pre_trained2$word
pre_trained2 <- pre_trained2[-c(1)]
pre_trainedMatrix <- as.matrix(pre_trained2)
str(pre_trainedMatrix)
str(glove_main)

# Let's see an analogy.
# Which is the spatially closest word vector to "(vec)king - (vec)male + (vec)female"?
ex <- pre_trainedMatrix["king", , drop = FALSE] -
  pre_trainedMatrix["male", , drop = FALSE] +
  pre_trainedMatrix["female", , drop = FALSE]
cos_sim_test <- sim2(x = pre_trainedMatrix, y = ex, method = "cosine", norm = "l2")
head(sort(cos_sim_test[,1], decreasing = TRUE), 5)

# Let's see a second analogy.
# Which is the spatially closest word vector to "(vec)man - (vec)husband + (vec)woman"?
ex <- pre_trainedMatrix["man", , drop = FALSE] -
  pre_trainedMatrix["husband", , drop = FALSE] +
  pre_trainedMatrix["woman", , drop = FALSE]
cos_sim_test <- sim2(x = pre_trainedMatrix, y = ex, method = "cosine", norm = "l2")
head(sort(cos_sim_test[,1], decreasing = TRUE), 5)
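# A small hypothetical helper (not in the original script) that wraps the analogy steps above,
# so that further queries can be written in one line
analogy <- function(a, b, c, emb, n = 5) {
  # returns the n words whose vectors are closest to vec(a) - vec(b) + vec(c)
  target <- emb[a, , drop = FALSE] - emb[b, , drop = FALSE] + emb[c, , drop = FALSE]
  sims <- sim2(x = emb, y = target, method = "cosine", norm = "l2")
  head(sort(sims[, 1], decreasing = TRUE), n)
}
analogy("king", "male", "female", pre_trainedMatrix)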
#################
# let's NOW match the words included in the pre-trained WE object (pre_trained) with the words included in the Dfm of our corpus
pre_trained <- pre_trained[pre_trained$word %in% featnames(Dfm),]
nrow(pre_trained) # fewer words than in our Dfm. Why? Because some words included in the Dfm are not included in pre_trained
pre_trained[1:20, 1:11]

embed2 <- matrix(NA, nrow=ndoc(Dfm), ncol=100)
for (i in 1:ndoc(Dfm)){
  if (i %% 100 == 0) message(i, '/', ndoc(Dfm))
  # extract word counts
  vec <- as.numeric(Dfm[i,])
  # keep words with counts of 1 or more
  doc_words <- featnames(Dfm)[vec>0]
  # extract the embeddings for those words
  embed_vec2 <- pre_trained[pre_trained$word %in% doc_words, 2:101]
  # aggregate from word-level to document-level embeddings by taking the average
  embed2[i,] <- colMeans(embed_vec2, na.rm=TRUE)
  # if no words have embeddings, simply set the document vector to 0
  if (nrow(embed_vec2)==0) embed2[i,] <- 0
}
str(embed2) # 1,500 documents with 100 columns (1 for each dimension)

# SVM
set.seed(123)
system.time(SVM <- svm(x=embed2[training,], y=as.factor(tot$Sentiment[training]), kernel='linear', cost = 1))

# computing the predicted values
predicted_SV <- predict(SVM, embed2[test,], type="class")
table(predicted_SV)
prop.table(table(predicted_SV))
confusion <- confusionMatrix(predicted_SV, as.factor(tot$Sentiment[test]), mode = "everything")
accuracySV_newWE <- confusion$overall[1]
F1_0 <- (2*confusion$byClass[1]*confusion$byClass[3])/(confusion$byClass[1]+confusion$byClass[3]) # F1 value for the negative category
F1_1 <- (2*confusion$byClass[2]*confusion$byClass[4])/(confusion$byClass[2]+confusion$byClass[4]) # F1 value for the positive category
F1_SV_newWE <- (F1_0+F1_1)/2

# better performance than what we got when using the local WE
accuracySV_newWE
F1_SV_newWE
accuracySV
F1_SV

#############################################
#############################################
# A SVM with the pre-trained WE + features from the BoW approach
#############################################
#############################################

# Finally, we can also combine the bag-of-words and WE features into a single matrix,
# and let the ML algorithm choose the best set of features for us.
# This combination of features and classifier can help to reach the best performance
X <- as.matrix(cbind(Dfm, embed2))
nfeat(Dfm) # 12,283 features in our Dfm
str(X) # 12,283 features from the Dfm + 100 dimensions from the WE = 12,383 features

set.seed(123)
system.time(SVM_ALL <- svm(x=X[training,], y=as.factor(tot$Sentiment[training]), kernel='linear', cost = 1))

# computing the predicted values
predicted_SV_ALL <- predict(SVM_ALL, X[test,], type="class")
confusion <- confusionMatrix(predicted_SV_ALL, as.factor(tot$Sentiment[test]), mode = "everything")
accuracySV_ALL <- confusion$overall[1]
F1_0 <- (2*confusion$byClass[1]*confusion$byClass[3])/(confusion$byClass[1]+confusion$byClass[3]) # F1 value for the negative category
F1_1 <- (2*confusion$byClass[2]*confusion$byClass[4])/(confusion$byClass[2]+confusion$byClass[4]) # F1 value for the positive category
F1_SV_ALL <- (F1_0+F1_1)/2
accuracySV_ALL
F1_SV_ALL

# the external (pre-trained) WE beats the local one; the best model combines WE and BoW features
accuracySV
accuracySV_newWE
accuracySV_ALL
F1_SV
F1_SV_newWE
F1_SV_ALL
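# An optional sketch (not in the original script): collect the three models' results in one small table
results <- data.frame(
  model    = c("local WE", "pre-trained WE", "pre-trained WE + BoW"),
  accuracy = c(accuracySV, accuracySV_newWE, accuracySV_ALL),
  F1       = c(F1_SV, F1_SV_newWE, F1_SV_ALL)
)
results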
#############################################
#############################################
# A SVM with CV and WE? Same old stuff!
#############################################
#############################################

train <- as.matrix(embed[training,]) # let's focus on the local WE and turn it into a matrix
ttrain <- train # let's keep a copy under a different name, since "train" is reused inside the loop below

# let's split our training-set into 5 folds
set.seed(123) # set the seed for replicability
k <- 5 # the number of folds
folds <- cvFolds(NROW(ttrain), K=k)
str(folds)

system.time(for(i in 1:k){
  train <- ttrain[folds$subsets[folds$which != i], ] # set the training folds
  validation <- ttrain[folds$subsets[folds$which == i], ] # set the validation fold
  set.seed(123)
  # fit the model on the training folds only (note the name of the outcome: "Sentiment")
  newrf <- svm(y=as.factor(docvars(Dfm)$Sentiment[training][folds$subsets[folds$which != i]]),
               x=train, kernel='linear', cost = 1)
  # get the predictions for the validation fold (from the model just fit on the training folds)
  newpred <- predict(newrf, validation, type="class")
  class_table <- table("Predictions"=newpred,
                       "Actual"=docvars(Dfm)$Sentiment[training][folds$subsets[folds$which == i]])
  print(class_table)
  df <- confusionMatrix(class_table, mode = "everything")
  df.name <- paste0("conf.mat.sv", i) # create the name of the object that stores the confusion matrix of each fold (=5)
  assign(df.name, df)
})

SVMPredict <- data.frame(col1=vector(), col2=vector(), col3=vector())

for(i in mget(ls(pattern = "conf.mat.sv"))) {
  col1 <- (i)$overall[1] # save the accuracy value
  col2 <- (2*(i)$byClass[1]*(i)$byClass[3])/((i)$byClass[1]+(i)$byClass[3]) # save the F1 value for negative
  col3 <- (2*(i)$byClass[2]*(i)$byClass[4])/((i)$byClass[2]+(i)$byClass[4]) # save the F1 value for positive
  SVMPredict <- rbind(SVMPredict, cbind(col1, col2, col3))
}

colnames(SVMPredict)[1] <- "Accuracy"
colnames(SVMPredict)[2] <- "F1 negative"
colnames(SVMPredict)[3] <- "F1 positive"
SVMPredict[is.na(SVMPredict)] <- 0
SVMPredict
str(SVMPredict)

# Let's compute the average values for accuracy and F1
acc_sv_avg <- mean(SVMPredict[, 1])
f1_sv_avg <- mean(colMeans(SVMPredict[-1]))
acc_sv_avg
f1_sv_avg

# Note that the result you get via cross-validation is very close to the result you get on the test-set
# (when knowing its "true" values). Indeed, cross-validation is a good procedure!
accuracySV
F1_SV
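# An optional cross-check (not in the original script): e1071's svm() also has a built-in k-fold CV
# via the "cross" argument; per-fold accuracies land in $accuracies and the total in $tot.accuracy (percent)
set.seed(123)
SVM_cv <- svm(x=embed[training,], y=as.factor(tot$Sentiment[training]),
              kernel='linear', cost = 1, cross = 5)
SVM_cv$tot.accuracy / 100 # compare with acc_sv_avg above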