rm(list = ls(all = TRUE))
getwd()
setwd("C:/Users/luigi/Dropbox/TOPIC MODEL/")
getwd()

library(quanteda)
library(keyATM)
library(magrittr)

data("data_corpus_moviereviews", package = "quanteda.textmodels")
# keeping the last 500 and the first 500 texts ("+" concatenates two corpora in quanteda)
corp <- tail(data_corpus_moviereviews, 500) + head(data_corpus_moviereviews, 500)
ndoc(corp)

# just an example here: I could have also removed stopwords at the tokens stage, etc.
# however, be careful with the stemming option! Read below
tok4 <- tokens(corp, remove_numbers = TRUE)
dfmt <- dfm(tok4)

# remove English stopwords; then keep only tokens of at least 2 characters
# (min_nchar applies to the features you *keep*, so it belongs in dfm_keep(), not dfm_remove())
dfmt <- dfm_remove(dfmt, stopwords("en"))
dfmt <- dfm_keep(dfmt, min_nchar = 2)

# keep only words at or above the 90th percentile of the term-frequency distribution
# AND occurring in less than 10% of documents (i.e., very frequent but document-specific words)
dfmt <- dfm_trim(dfmt, min_termfreq = 0.90, termfreq_type = "quantile",
                 max_docfreq = 0.1, docfreq_type = "prop")
topfeatures(dfmt, 20)  # 20 top words

keywords <- list(people  = c("family", "couple", "kid", "kids", "children",
                             "parent", "parents", "husband", "wife"),
                 space   = c("alien", "aliens", "planet", "space"),
                 monster = c("monster", "monsters", "ghost", "ghosts", "zombie", "fear"),
                 war     = c("war", "soldier", "soldiers", "tanks", "military"),
                 crime   = c("crime", "crimes", "murder", "murders",
                             "killer", "killers", "police"))
keywords

keyATM_docs <- keyATM_read(texts = dfmt)
summary(keyATM_docs)

system.time(out <- keyATM(docs = keyATM_docs,      # text input
                          no_keyword_topics = 1,   # number of topics without keywords; here 1, but you can
                                                   # choose any number, including no no-keyword topic at all
                          keywords = keywords,     # keywords
                          model = "base",          # select the model
                          options = list(seed = 123)))

############################################################
# We can estimate the avg. topic coherence and exclusivity to compare different
# models, as we did for topic models and STM.
# We compute the averages over the keyword topics only (the ones we are mainly
# interested in; moreover, by construction the avg. topic coherence is likely
# to decrease as you increase the number of no-keyword topics...)
############################################################

# Let's first estimate the avg. topic coherence
words <- top_words(out, 10)
words

# let's get rid of the unicode check marks that keyATM appends to keywords
topic <- c(1:length(words))
topic
for (i in topic) {
  words[, i] <- gsub("\u2713", "", words[, i])
  words[, i] <- gsub("[[:punct:]]", "", words[, i])
  words[, i] <- gsub(" ", "", words[, i])
}
words

# let's estimate the coherence for each topic
# the coherence function below needs a dtm like the one we employ with the topicmodels package
library(topicmodels)
dtm <- convert(dfmt, to = "topicmodels")

# let's create an empty data frame that we will fill later on
results <- data.frame(first = vector())
results

# The coherence statistic can be computed with the function coherence().
# Note the ":::" to access internal functions of a package
library(topicdoc)
for (i in topic) {
  coherence <- topicdoc:::coherence(dtm, words[, i], 1)
  results <- rbind(results, cbind(coherence))
}
results

# avg. level of coherence across the 5 keyword topics
mean(results[, 1][1:5])
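############################################################
# For transparency, here is a minimal sketch of the UMass coherence statistic
# (Mimno et al. 2011) that topicdoc computes internally: for each ordered pair of
# top words (v_m, v_l), with m > l, it adds log((D(v_m, v_l) + smoothing) / D(v_l)),
# where D() counts the documents containing the word(s). This is an illustrative
# re-implementation under my own function name (umass_coherence), NOT a topicdoc
# function, and it assumes every cleaned top word is still a feature of the dfm.
############################################################
umass_coherence <- function(dfm_obj, top_terms, smoothing = 1) {
  # binary incidence matrix: does the word appear in the document at all?
  bin <- as.matrix(dfm_weight(dfm_obj, scheme = "boolean")[, top_terms])
  score <- 0
  for (m in 2:length(top_terms)) {
    for (l in 1:(m - 1)) {
      d_ml <- sum(bin[, m] * bin[, l])  # documents where both words co-occur
      d_l  <- sum(bin[, l])             # documents containing the earlier word
      score <- score + log((d_ml + smoothing) / d_l)
    }
  }
  score
}
# e.g., umass_coherence(dfmt, words[, 1]) should track the value from topicdoc:::coherence()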
# Let's now estimate the avg. topic exclusivity (following the same procedure employed in STM)
out$phi
str(out$phi)
tt <- t(out$phi)
str(tt)
apply(tt, 2, which.max)  # for each topic, the row index of its most likely word
rownames(tt)[c(3, 1137, 983, 334, 72, 78)]  # the words at the indices returned above
top_words(out, 2)

w <- 0.7  # weight to apply (this is the default value in STM)
M <- 10   # let's focus just on the top 10 words
tbeta <- t(out$phi)
s <- rowSums(tbeta)
mat <- tbeta / s                          # normalize each word's probability across topics
ex <- apply(mat, 2, rank) / nrow(mat)     # exclusivity ranks
fr <- apply(tbeta, 2, rank) / nrow(mat)   # frequency ranks
frex <- 1 / (w / ex + (1 - w) / fr)       # FREX: weighted harmonic mean of the two ranks
index <- apply(tbeta, 2, order, decreasing = TRUE)[1:M, ]
exclus <- vector(length = ncol(tbeta))
for (i in 1:ncol(frex)) {
  exclus[i] <- sum(frex[index[, i], i])
}
exclus

# avg. level of exclusivity across the 5 keyword topics
mean(exclus[1:5])

# we can now compare the above results with the ones we get when no_keyword_topics = 2,
# for example, to select across models

############################################################
# For example, let's now run a loop to explore the results for a number of
# no-keyword topics between 1 and 5.
# Note that in this case, to save time, I have specified iterations = 100.
# Of course this is NOT something you should do in your own analysis
# (that number of iterations is far too small!)
############################################################
library(topicmodels)
dtm <- convert(dfmt, to = "topicmodels")
library(topicdoc)

top <- c(1:5)
top

# let's create an empty data frame that we will fill later on
results_df <- data.frame(nokeywtopic = vector(), coherence = vector(), exclusivity = vector())
str(results_df)

for (j in top) {

  out <- keyATM(docs = keyATM_docs,
                no_keyword_topics = j,
                keywords = keywords,
                model = "base",
                options = list(seed = 123, iterations = 100))

  results_df[j, 1] <- j

  # clean the top words as above
  words <- top_words(out, 10)
  topic <- c(1:length(words))
  for (i in topic) {
    words[, i] <- gsub("\u2713", "", words[, i])
    words[, i] <- gsub("[[:punct:]]", "", words[, i])
    words[, i] <- gsub(" ", "", words[, i])
  }

  # avg. level of coherence
  results <- data.frame(first = vector())
  for (i in topic) {
    coherence <- topicdoc:::coherence(dtm, words[, i], 1)
    results <- rbind(results, cbind(coherence))
  }
  results_df[j, 2] <- mean(results[, 1][1:5])

  # avg. topic exclusivity (same procedure employed in STM)
  w <- 0.7  # weight to apply (this is the default value in STM)
  M <- 10   # let's focus just on the top 10 words
  tbeta <- t(out$phi)
  s <- rowSums(tbeta)
  mat <- tbeta / s
  ex <- apply(mat, 2, rank) / nrow(mat)
  fr <- apply(tbeta, 2, rank) / nrow(mat)
  frex <- 1 / (w / ex + (1 - w) / fr)
  index <- apply(tbeta, 2, order, decreasing = TRUE)[1:M, ]
  exclus <- vector(length = ncol(tbeta))
  for (i in 1:ncol(frex)) {
    exclus[i] <- sum(frex[index[, i], i])
  }
  results_df[j, 3] <- mean(exclus[1:5])
}

results_df

library(ggplot2)
# no_keyword_topics = 2 seems to be the best solution
ggplot(results_df, aes(x = coherence, y = exclusivity)) +
  geom_point() +
  geom_text(aes(label = nokeywtopic), vjust = -1) +
  ylab("Exclusivity") +
  xlab("Semantic Coherence") +
  theme_light()
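############################################################
# Having picked no_keyword_topics = 2 from the plot, a minimal sketch of the final
# step: refit the chosen model WITHOUT the iterations = 100 shortcut (so the package
# default applies) and inspect the output. out_final is my own name for the object;
# theta (document-topic proportions) and plot_modelfit() follow the keyATM documentation.
############################################################
out_final <- keyATM(docs = keyATM_docs,
                    no_keyword_topics = 2,
                    keywords = keywords,
                    model = "base",
                    options = list(seed = 123))
top_words(out_final, 10)   # top words per topic
head(out_final$theta)      # document-topic proportions
plot_modelfit(out_final)   # log-likelihood/perplexity trace, to check convergence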