rm(list=ls(all=TRUE))
getwd()
# set your working directory (i.e., where you have saved the datasets, etc.)
# In my case my working directory is C:/Users/mw/Dropbox (VOICES)/TOPIC MODEL
# setwd("C:/Users/mw/Dropbox (VOICES)/TOPIC MODEL")
setwd("C:/Users/luigi/Dropbox/TOPIC MODEL")
getwd()

library(readtext)
library(quanteda)
library(ggplot2)

#########################################################################
# Running example: How to deal with the Japanese language
#########################################################################

# When you want to display Japanese characters on a non-Japanese laptop, always run the following
# at the beginning of your session:
Sys.setlocale("LC_CTYPE", "japanese")

# Suppose we want to analyze the text of the speeches made on 17 Nov 2017 and 20 Nov 2017
# for the new Japanese parliamentary session
myText <- readtext("Diet speeches/*.txt", encoding = "UTF-8")
str(myText)

# tokens() can segment Japanese texts without additional tools, based on the rules defined in the ICU library,
# which is available via the stringi package (installed together with quanteda). ICU detects the boundaries of
# Japanese words using a dictionary with frequency information
# (see http://source.icu-project.org/repos/icu/icu/tags/release-58-rc/source/data/brkitr/dictionaries/
# for the list of Asian languages covered, including Chinese and Thai, but not Korean)
icu_toks <- tokens(myText$text)

# this expression means: take document 2 and report the first 40 tokens appearing in it
head(icu_toks[[2]], 40)

# Japanese stopwords
stopwords("ja", source = "stopwords-iso")
# A better alternative is using the source "marimo"
stopwords("ja", source = "marimo")
# For stopwords in other languages, take a look here: https://cran.r-project.org/web/packages/stopwords/README.html
# Alternatively, you can remove them yourself by using the "remove" option discussed previously

# tokenize the corpus and apply pre-processing
toks <- tokens(myText$text, remove_punct = TRUE, remove_numbers = TRUE, remove_separators = TRUE)
head(icu_toks[[1]], 40)
head(toks[[1]], 40)

toks <- tokens_remove(toks, stopwords("ja", source = "marimo"))
head(toks[[1]], 40)

# You can select hiragana tokens only
# Note: "\\p{...}" matches any single character with the given Unicode property (here, a given script),
# "+" repeats it, and "$" anchors the match at the end of the string (the position after the last character),
# so the token must end in a run of characters from that script rather than merely containing one somewhere
# (a small stringi check of these patterns appears right after the dfm construction below)
toksHira <- tokens_select(toks, pattern = "\\p{script=Hira}+$", valuetype = "regex")
head(toks[[1]], 40)
head(toksHira[[1]], 40)

# or you can select katakana tokens only
toksKata <- tokens_select(toks, pattern = "\\p{script=Kana}+$", valuetype = "regex")
head(toks[[1]], 40)
head(toksHira[[1]], 40)
head(toksKata[[1]], 40)

# or you can select kanji tokens only
toksKanji <- tokens_select(toks, pattern = "\\p{script=Hani}+$", valuetype = "regex")
head(toksKanji[[1]], 40)
head(toksHira[[1]], 40)
head(toksKata[[1]], 40)

# Finally, you can also select Japanese words only (either hiragana, katakana, or kanji)
toks2 <- tokens_select(toks, pattern = c("\\p{script=Hira}+$", "\\p{script=Kana}+$", "\\p{script=Hani}+$"),
                       valuetype = "regex")
head(toks2[[1]], 40)

jap_dfm <- dfm(toks)
topfeatures(jap_dfm, 20)
jap_dfm[, 1:5]
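# A minimal sketch (not part of the original script) of how the Unicode script patterns above behave,
# using stringi directly; the sample tokens below are made up purely for illustration
library(stringi)
sample_toks <- c("これ", "データ", "経済", "English")  # hypothetical hiragana, katakana, kanji, and Latin tokens
stri_detect_regex(sample_toks, "\\p{script=Hira}+$")  # should be TRUE only for the hiragana token
stri_detect_regex(sample_toks, "\\p{script=Kana}+$")  # should be TRUE only for the katakana token
stri_detect_regex(sample_toks, "\\p{script=Hani}+$")  # should be TRUE only for the kanji token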
# I want to name the texts properly. The function docnames() also works for corpus and tokens objects
docnames(jap_dfm) <- myText$doc_id
jap_dfm[, 1:5]
# I could also write directly the names I want to give to the texts (this is a hypothetical example with fictitious names!)
docnames(jap_dfm) <- c("A", "B", "C", "D", "E", "F", "G")
jap_dfm[, 1:5]

# If you want to perform a more accurate tokenization, you need to install a morphological analysis tool and call it from R.
# For example: RcppMeCab (Chinese, Japanese, and Korean): https://github.com/junhewk/RcppMeCab
# (a hedged sketch of such a call is given at the end of this script)

#########################################################################
# Running example: How to deal with the Chinese language
#########################################################################

library(quanteda.corpora)
corp <- quanteda.corpora::download(url = "https://www.dropbox.com/s/37ojd5knz1qeyul/data_corpus_chinesegovreport.rds?dl=1")

# tokenize the corpus and apply pre-processing
ch_toks <- tokens(corp, remove_punct = TRUE, remove_numbers = TRUE, remove_separators = TRUE)
head(ch_toks[[1]], 40)

# note that we remove the stopwords from ch_toks (not from the Japanese toks object)
ch_toks <- tokens_remove(ch_toks, stopwords("zh_cn", source = "marimo"))
head(ch_toks[[1]], 40)

# Alternative Chinese stopwords
stopwords("zh", source = "misc")
stopwords("zh", source = "stopwords-iso")

# You can keep only Chinese characters
ch_toks <- tokens_select(ch_toks, pattern = "\\p{script=Hani}+$", valuetype = "regex")
head(ch_toks[[1]], 40)

ch_dfm <- dfm(ch_toks)
topfeatures(ch_dfm)
ch_dfm[, 1:5]
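# A minimal sketch (not from the original script) of how a morphological analyser could replace ICU segmentation
# for Japanese, as mentioned above. It assumes RcppMeCab and a MeCab dictionary are installed; the function name
# follows the package README at https://github.com/junhewk/RcppMeCab, so check it against your installed version.
library(RcppMeCab)
# pos() returns, for each input string, its morphemes annotated with part-of-speech tags
mecab_out <- pos(myText$text)
str(mecab_out, list.len = 2)
# once you know the output structure, you can strip the POS tags and rebuild a quanteda tokens object with
# as.tokens(); the exact return format can vary across RcppMeCab versions, so inspect mecab_out first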