rm(list = ls(all = TRUE))
getwd()
setwd("C:/Users/luigi/Dropbox/TOPIC MODEL/")
getwd()

library(quanteda)
library(keyATM)
library(magrittr)

data("data_corpus_moviereviews", package = "quanteda.textmodels")
# keeping the last 500 and the first 500 texts ("+" concatenates two corpora in quanteda)
corp <- tail(data_corpus_moviereviews, 500) + head(data_corpus_moviereviews, 500)
ndoc(corp)

# just an example here: I could have also removed stopwords at the tokens stage, etc.
# however, be careful with the stemming option! Read below
tok4 <- tokens(corp, remove_numbers = TRUE)
dfmt <- dfm(tok4)

# remove English stopwords; then keep only tokens of at least 2 characters
# (min_nchar applies to the features you *keep*, so it belongs in dfm_keep(), not dfm_remove())
dfmt <- dfm_remove(dfmt, stopwords("en"))
dfmt <- dfm_keep(dfmt, min_nchar = 2)

# keep only words at or above the 90th percentile of the term-frequency distribution
# AND occurring in less than 10% of documents (i.e., very frequent but document-specific words)
dfmt <- dfm_trim(dfmt, min_termfreq = 0.90, termfreq_type = "quantile",
                 max_docfreq = 0.1, docfreq_type = "prop")
topfeatures(dfmt, 20)  # 20 top words

keywords <- list(people  = c("family", "couple", "kid", "kids", "children",
                             "parent", "parents", "husband", "wife"),
                 space   = c("alien", "aliens", "planet", "space"),
                 monster = c("monster", "monsters", "ghost", "ghosts", "zombie", "fear"),
                 war     = c("war", "soldier", "soldiers", "tanks", "military"),
                 crime   = c("crime", "crimes", "murder", "murders",
                             "killer", "killers", "police"))
keywords

keyATM_docs <- keyATM_read(texts = dfmt)
summary(keyATM_docs)

system.time(out <- keyATM(docs = keyATM_docs,      # text input
                          no_keyword_topics = 1,   # number of topics without keywords; here 1, but you can
                                                   # choose any number, including no no-keyword topic at all
                          keywords = keywords,     # keywords
                          model = "base",          # select the model
                          options = list(seed = 123)))

############################################################
# We can estimate the avg. topic coherence and exclusivity to compare different
# models, as we did for topic models and STM.
# We compute the averages over the keyword topics only (the ones we are mainly
# interested in; moreover, by construction the avg. topic coherence is likely
# to decrease as you increase the number of no-keyword topics...)
############################################################

# Let's first estimate the avg. topic coherence
words <- top_words(out, 10)
words

# let's get rid of the unicode check marks that keyATM appends to keywords
topic <- c(1:length(words))
topic
for (i in topic) {
  words[, i] <- gsub("\u2713", "", words[, i])
  words[, i] <- gsub("[[:punct:]]", "", words[, i])
  words[, i] <- gsub(" ", "", words[, i])
}
words

# let's estimate the coherence for each topic
# the coherence function below needs a dtm like the one we employ with the topicmodels package
library(topicmodels)
dtm <- convert(dfmt, to = "topicmodels")

# let's create an empty data frame that we will fill later on
results <- data.frame(first = vector())
results

# The coherence statistic can be computed with the function coherence().
# Note the ":::" to access internal functions of a package
library(topicdoc)
for (i in topic) {
  coherence <- topicdoc:::coherence(dtm, words[, i], 1)
  results <- rbind(results, cbind(coherence))
}
results

# avg. level of coherence across the 5 keyword topics
mean(results[, 1][1:5])
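############################################################
# For transparency, here is a minimal sketch of the UMass coherence statistic
# (Mimno et al. 2011) that topicdoc computes internally: for each ordered pair of
# top words (v_m, v_l), with m > l, it adds log((D(v_m, v_l) + smoothing) / D(v_l)),
# where D() counts the documents containing the word(s). This is an illustrative
# re-implementation under my own function name (umass_coherence), NOT a topicdoc
# function, and it assumes every cleaned top word is still a feature of the dfm.
############################################################
umass_coherence <- function(dfm_obj, top_terms, smoothing = 1) {
  # binary incidence matrix: does the word appear in the document at all?
  bin <- as.matrix(dfm_weight(dfm_obj, scheme = "boolean")[, top_terms])
  score <- 0
  for (m in 2:length(top_terms)) {
    for (l in 1:(m - 1)) {
      d_ml <- sum(bin[, m] * bin[, l])  # documents where both words co-occur
      d_l  <- sum(bin[, l])             # documents containing the earlier word
      score <- score + log((d_ml + smoothing) / d_l)
    }
  }
  score
}
# e.g., umass_coherence(dfmt, words[, 1]) should track the value from topicdoc:::coherence()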
# Let's now estimate the avg. topic exclusivity (following the same procedure employed in STM)
out$phi
str(out$phi)
tt <- t(out$phi)
str(tt)
apply(tt, 2, which.max)  # for each topic, the row index of its most likely word
rownames(tt)[c(3, 1137, 983, 334, 72, 78)]  # the words at the indices returned above
top_words(out, 2)

w <- 0.7  # weight to apply (this is the default value in STM)
M <- 10   # let's focus just on the top 10 words
tbeta <- t(out$phi)
s <- rowSums(tbeta)
mat <- tbeta / s                          # normalize each word's probability across topics
ex <- apply(mat, 2, rank) / nrow(mat)     # exclusivity ranks
fr <- apply(tbeta, 2, rank) / nrow(mat)   # frequency ranks
frex <- 1 / (w / ex + (1 - w) / fr)       # FREX: weighted harmonic mean of the two ranks
index <- apply(tbeta, 2, order, decreasing = TRUE)[1:M, ]
exclus <- vector(length = ncol(tbeta))
for (i in 1:ncol(frex)) {
  exclus[i] <- sum(frex[index[, i], i])
}
exclus

# avg. level of exclusivity across the 5 keyword topics
mean(exclus[1:5])

# we can now compare the above results with the ones we get when no_keyword_topics = 2,
# for example, to select across models

############################################################
# For example, let's now run a loop to explore the results for a number of
# no-keyword topics between 1 and 5.
# Note that in this case, to save time, I have specified iterations = 100.
# Of course this is NOT something you should do in your own analysis
# (that number of iterations is far too small!)
############################################################
library(topicmodels)
dtm <- convert(dfmt, to = "topicmodels")
library(topicdoc)

top <- c(1:5)
top

# let's create an empty data frame that we will fill later on
results_df <- data.frame(nokeywtopic = vector(), coherence = vector(), exclusivity = vector())
str(results_df)

for (j in top) {

  out <- keyATM(docs = keyATM_docs,
                no_keyword_topics = j,
                keywords = keywords,
                model = "base",
                options = list(seed = 123, iterations = 100))

  results_df[j, 1] <- j

  # clean the top words as above
  words <- top_words(out, 10)
  topic <- c(1:length(words))
  for (i in topic) {
    words[, i] <- gsub("\u2713", "", words[, i])
    words[, i] <- gsub("[[:punct:]]", "", words[, i])
    words[, i] <- gsub(" ", "", words[, i])
  }

  # avg. level of coherence
  results <- data.frame(first = vector())
  for (i in topic) {
    coherence <- topicdoc:::coherence(dtm, words[, i], 1)
    results <- rbind(results, cbind(coherence))
  }
  results_df[j, 2] <- mean(results[, 1][1:5])

  # avg. topic exclusivity (same procedure employed in STM)
  w <- 0.7  # weight to apply (this is the default value in STM)
  M <- 10   # let's focus just on the top 10 words
  tbeta <- t(out$phi)
  s <- rowSums(tbeta)
  mat <- tbeta / s
  ex <- apply(mat, 2, rank) / nrow(mat)
  fr <- apply(tbeta, 2, rank) / nrow(mat)
  frex <- 1 / (w / ex + (1 - w) / fr)
  index <- apply(tbeta, 2, order, decreasing = TRUE)[1:M, ]
  exclus <- vector(length = ncol(tbeta))
  for (i in 1:ncol(frex)) {
    exclus[i] <- sum(frex[index[, i], i])
  }
  results_df[j, 3] <- mean(exclus[1:5])
}

results_df

library(ggplot2)
# no_keyword_topics = 2 seems to be the best solution
ggplot(results_df, aes(x = coherence, y = exclusivity)) +
  geom_point() +
  geom_text(aes(label = nokeywtopic), vjust = -1) +
  ylab("Exclusivity") +
  xlab("Semantic Coherence") +
  theme_light()
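############################################################
# Having picked no_keyword_topics = 2 from the plot, a minimal sketch of the final
# step: refit the chosen model WITHOUT the iterations = 100 shortcut (so the package
# default applies) and inspect the output. out_final is my own name for the object;
# theta (document-topic proportions) and plot_modelfit() follow the keyATM documentation.
############################################################
out_final <- keyATM(docs = keyATM_docs,
                    no_keyword_topics = 2,
                    keywords = keywords,
                    model = "base",
                    options = list(seed = 123))
top_words(out_final, 10)   # top words per topic
head(out_final$theta)      # document-topic proportions
plot_modelfit(out_final)   # log-likelihood/perplexity trace, to check convergence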