rm(list = ls(all = TRUE))
getwd()
setwd("C:/Users/luigi/Dropbox/TOPIC MODEL/")
getwd()

library(newsmap)
library(quanteda)

# Step 1: let's begin with 6 texts
text_en <- c(text1 = "This is an article about Italy",
             text2 = "Milan is a beautiful Italian city",
             text3 = "I miss the sea of Spain",
             text4 = "Granada is an ancient Spanish city",
             text5 = "We liked food more in Siena than in Milan",
             text6 = "In Granada you can feel history all around you")

corp <- corpus(text_en)
tok <- tokens(corp)
feat_dfm_en <- dfm(tok)
feat_dfm_en

# Step 2: let's apply a dictionary of seed words
# we use the list of seed words about countries that ships with newsmap - including word stems/wildcards!
data_dictionary_newsmap_en
names(data_dictionary_newsmap_en)
names(data_dictionary_newsmap_en[["EUROPE"]])
names(data_dictionary_newsmap_en[["EUROPE"]][["SOUTH"]])
print(data_dictionary_newsmap_en[["EUROPE"]][["SOUTH"]], max_nkey = 15)

label <- dfm_lookup(feat_dfm_en, dictionary = data_dictionary_newsmap_en)
label[1:6, 190:205]
# i.e., the keywords of the topic "IT" of our dictionary appear twice in the corpus, and the same holds
# for the topic "ES", while two texts (text5 and text6) contain no seed words at all

# Step 3: let's train the model
model_en <- textmodel_newsmap(feat_dfm_en, label)
# now all the words of our texts, including those not listed among the seed words, get a score!
# take a look at the scores for "milan" and "granada" (not originally included in the seed dictionary!)
model_en$model

# let's predict the labels - note that we also predicted (correctly) both text5 and text6,
# i.e., the two texts that contained no seed words at all. How was that possible?
predict(model_en)

# Step 4: we can now apply the trained model to other, new texts
text_en2 <- c(text7 = "Make India great again",
              text8 = "I love the sea!")
tok2 <- tokens(text_en2)
feat_dfm_en2 <- dfm(tok2, tolower = TRUE)

# can you explain why you get such results?
predict(model_en, newdata = feat_dfm_en2)
# we cannot classify text7 because none of its words appeared in the texts used during the
# training stage; text8, on the other hand, is classified as "ES" (Spain) thanks to the word
# "sea", which co-occurred with Spain's seed words in the training texts
model_en$model

########################################
### another example with movie reviews
########################################

# Step 1: let's begin with the texts
data("data_corpus_moviereviews", package = "quanteda.textmodels")
corp <- tail(data_corpus_moviereviews, 500) # keeping the last 500 texts
as.character(corp)[2]
ndoc(corp)

tok <- tokens(corp, remove_numbers = TRUE)
tok <- tokens_remove(tok, stopwords("en"))
dfmt <- dfm(tok)
# min_nchar: keep only features that are at least 2 characters long (shorter ones are dropped)
dfmt <- dfm_remove(dfmt, min_nchar = 2)
dfmt <- dfm_trim(dfmt, min_termfreq = 0.90, termfreq_type = "quantile",
                 max_docfreq = 0.1, docfreq_type = "prop")
topfeatures(dfmt, 50)

# Step 2: let's apply a dictionary of seed words - remember my dictionary option
dict <- dictionary(list(people = c("family", "couple", "kid*", "child*", "parents", "husband", "wife"),
                        space = c("alien*", "planet", "space"),
                        monster = c("monster*", "ghost*", "zombie*", "fear"),
                        war = c("war", "soldier*", "tanks", "military"),
                        crime = c("crime*", "murder*", "killer*", "police")))
dict

# dfm_lookup
label <- dfm_lookup(dfmt, dictionary = dict)
label[2:3, 1:5]
# i.e., the keywords of the topic "crime" of our dictionary appear twice in the second review, etc.
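# Before training, a quick sanity check can help (a minimal sketch added here, not part of the
# original walk-through; it only uses rowSums(), which works directly on the dfm returned by
# dfm_lookup()): newsmap learns from the documents that match at least one seed word, so it is
# worth knowing how many reviews are actually "labelled" at this stage
seed_hits <- rowSums(label)        # total number of seed-word matches per review
table(seed_hits > 0)               # reviews with vs. without at least one seed word
prop.table(table(seed_hits > 0))   # the same, as shares of the 500 reviews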
# Step 3: let's train the model
model_en <- textmodel_newsmap(dfmt, label)
# now all the words of our texts, including those not listed among the seed words, get a score!
# take a look at words such as "dating" and "tarantino"
str(model_en$model)
model_en$model[c("people", "space", "monster", "war", "crime"), c("dating", "tarantino")]

# let's see the top ten features associated with each of the 5 topics we defined above
for (i in 1:5) {
  print(dict[i])
  print(sort(model_en$model[i, ], decreasing = TRUE)[1:10])
}

# let's predict the labels
predict(model_en)
prop.table(table(predict(model_en)))

# let's plot it
newsmap_pred <- as.data.frame(prop.table(table(predict(model_en))))
str(newsmap_pred)
names(newsmap_pred)[1] <- "Topics"
str(newsmap_pred)

library(ggplot2)
ggplot(newsmap_pred, aes(x = reorder(Topics, -Freq), y = Freq)) +
  geom_bar(stat = "identity") +
  xlab("Topics in the training-set (via Newsmap)")

# let's save the predicted topic distribution
newsmap_pred <- prop.table(table(predict(model_en)))
newsmap_pred

# Step 4: let's apply the trained model to another set of texts
# (note: these 250 reviews are the last half of the 500 we trained on, so they are not strictly "new")
corp2 <- tail(data_corpus_moviereviews, 250) # keeping the last 250 texts
ndoc(corp2)
tok2 <- tokens(corp2, remove_numbers = TRUE)
tok2 <- tokens_remove(tok2, stopwords("en"))
dfmt2 <- dfm(tok2)
# min_nchar: keep only features that are at least 2 characters long (shorter ones are dropped)
dfmt2 <- dfm_remove(dfmt2, min_nchar = 2)
dfmt2 <- dfm_trim(dfmt2, min_termfreq = 0.90, termfreq_type = "quantile",
                  max_docfreq = 0.1, docfreq_type = "prop")

predict(model_en, newdata = dfmt2)
# compare the topic distribution in the training set with the one predicted for the new texts
prop.table(table(predict(model_en)))
prop.table(table(predict(model_en, newdata = dfmt2)))
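# A hedged follow-up sketch (added here; it assumes only quanteda's featnames() and dfm_match(),
# both part of the package's public API): the trained model can only score features it saw during
# the training stage, so checking the vocabulary overlap between the new dfm and the training dfm
# helps explain documents that come back unclassified or only weakly classified
mean(featnames(dfmt2) %in% featnames(dfmt))   # share of the new vocabulary known to the model
# dfm_match() aligns the new dfm to the training features, making that overlap explicit
dfmt2_matched <- dfm_match(dfmt2, features = featnames(dfmt))
dim(dfmt2_matched)                            # same documents, but only the training-stage features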