# FIX(review): removed rm(list = ls(all = TRUE)) — wiping the global
# environment at the top of a script is an anti-pattern; restart the R
# session instead if you need a clean workspace.
# NOTE(review): a hard-coded setwd() makes the script machine-specific;
# prefer an RStudio project or the here package. Kept so the relative
# readtext() path below still resolves on the original machine.
setwd("C:/Users/luigi/Dropbox/TOPIC MODEL")

library(readtext)
library(quanteda)
library(ggplot2)

######################################################################
# Running example: how to deal with the Japanese language
######################################################################

# To display Japanese kanji on a non-Japanese laptop, always set the
# locale at the beginning of your session:
Sys.setlocale("LC_CTYPE", "japanese")

# Suppose we want to analyze the text of speeches made on 17 Nov 2017
# and 20 Nov 2017 for the new Japanese parliamentary session.
myText <- readtext("Diet speeches/*.txt", encoding = "UTF-8")
str(myText)

# tokens() can segment Japanese text without additional tools, based on
# the rules defined in the ICU library, available via the stringi
# package (which ships with quanteda). ICU detects Japanese word
# boundaries using a dictionary with frequency information (see:
# http://source.icu-project.org/repos/icu/icu/tags/release-58-rc/source/data/brkitr/dictionaries/
# for the list of Asian languages covered, including Chinese and Thai,
# but not Korean).
icu_toks <- tokens(myText$text)

# FIX(review): the original comment said "first 40 characters" — this
# actually shows the first 40 TOKENS of document 2.
head(icu_toks[[2]], 40)

# Even with a morphological analysis tool, tokenization of Japanese text
# is far from perfect. You can refine tokens by compounding sequences of
# the same character class using textstat_collocations() and
# tokens_compound(). Explore this by yourself if you are interested!
# Japanese stopwords
stopwords("ja", source = "stopwords-iso")
# A better alternative is the "marimo" source.
stopwords("ja", source = "marimo")
# On stopwords in different languages, take a look here:
# https://cran.r-project.org/web/packages/stopwords/README.html
# Alternatively, remove them yourself with tokens_remove(), as below.

# FIX(review): passing remove_punct/remove to dfm() relies on arguments
# deprecated in quanteda 2 and defunct in quanteda >= 3. Do the cleaning
# at the tokens stage instead, then build the dfm.
jap_toks <- tokens(icu_toks, remove_punct = TRUE)
jap_toks <- tokens_remove(jap_toks, stopwords("ja", source = "marimo"))
jap_dfm <- dfm(jap_toks, tolower = FALSE)

topfeatures(jap_dfm, 20)
jap_dfm[, 1:5]

# I want to name the texts properly.
# FIX(review): never poke the @docvars slot directly — the internal
# representation is not part of quanteda's public API and may change
# between versions. Use the docnames() accessor instead.
docnames(jap_dfm)
docnames(jap_dfm) <- myText$doc_id
jap_dfm[, 1:5]

# I could also write directly the names I want to give to these texts
# (a hypothetical example with fictitious names!).
docnames(jap_dfm) <- c("A", "B", "C", "D", "E", "F", "G")
docnames(jap_dfm)
jap_dfm[, 1:5]

# For more accurate tokenization, install a morphological analysis tool
# and call it from R. For example, RcppMeCab (Chinese, Japanese, and
# Korean): https://github.com/junhewk/RcppMeCab

######################################################################
# Running example: how to deal with the Chinese language
######################################################################
library(quanteda.corpora)
corp <- quanteda.corpora::download(
  url = "https://www.dropbox.com/s/37ojd5knz1qeyul/data_corpus_chinesegovreport.rds?dl=1"
)

# Chinese stopwords
stopwords("zh", source = "misc")
# Alternative
stopwords("zh", source = "stopwords-iso")

# Tokenize first, inspect, then remove stopwords and punctuation.
ch_toks <- tokens(corp)
head(ch_toks[[2]], 40)

# FIX(review): same dfm() deprecation as above — clean at the tokens
# stage, then build the dfm.
ch_toks <- tokens(ch_toks, remove_punct = TRUE)
ch_toks <- tokens_remove(ch_toks, stopwords("zh", source = "misc"))
ch_dfm <- dfm(ch_toks, tolower = FALSE)
topfeatures(ch_dfm)