rm(list=ls(all=TRUE))
getwd()
# set your working directory (i.e., where you have saved the datasets, etc.)
# In my case my working directory is C:/Users/mw/Dropbox (VOICES)/TOPIC MODEL
# setwd("C:/Users/mw/Dropbox (VOICES)/TOPIC MODEL")
setwd("C:/Users/luigi/Dropbox/TOPIC MODEL")
getwd()

library(readtext)
library(quanteda)
library(ggplot2)

#########################################################################
# Running example: How to deal with the Japanese language
#########################################################################

# When you want to display Japanese characters on a non-Japanese laptop, always run the following
# at the beginning of your session:
Sys.setlocale("LC_CTYPE", "japanese")

# Suppose we want to analyze the text of the speeches made on 17 Nov 2017 and 20 Nov 2017
# for the new Japanese parliamentary session
myText <- readtext("Diet speeches/*.txt", encoding = "UTF-8")
str(myText)

# tokens() can segment Japanese texts without additional tools, based on the rules defined in the ICU library,
# which is available via the stringi package (installed together with quanteda). ICU detects the boundaries of
# Japanese words using a dictionary with frequency information
# (see http://source.icu-project.org/repos/icu/icu/tags/release-58-rc/source/data/brkitr/dictionaries/
# for the list of Asian languages covered, including Chinese and Thai, but not Korean)
icu_toks <- tokens(myText$text)

# this expression means: take document 2 and report the first 40 tokens appearing in it
head(icu_toks[[2]], 40)

# Japanese stopwords
stopwords("ja", source = "stopwords-iso")
# A better alternative is using the source "marimo"
stopwords("ja", source = "marimo")
# For stopwords in other languages, take a look here: https://cran.r-project.org/web/packages/stopwords/README.html
# Alternatively, you can remove them yourself by using the "remove" option discussed previously

# tokenize the corpus and apply pre-processing
toks <- tokens(myText$text, remove_punct = TRUE, remove_numbers = TRUE, remove_separators = TRUE)
head(icu_toks[[1]], 40)
head(toks[[1]], 40)

toks <- tokens_remove(toks, stopwords("ja", source = "marimo"))
head(toks[[1]], 40)

# You can select hiragana tokens only
# Note: "\\p{...}" matches any single character with the given Unicode property (here, a given script),
# "+" repeats it, and "$" anchors the match at the end of the string (the position after the last character),
# so the token must end in a run of characters from that script rather than merely containing one somewhere
# (a small stringi check of these patterns appears right after the dfm construction below)
toksHira <- tokens_select(toks, pattern = "\\p{script=Hira}+$", valuetype = "regex")
head(toks[[1]], 40)
head(toksHira[[1]], 40)

# or you can select katakana tokens only
toksKata <- tokens_select(toks, pattern = "\\p{script=Kana}+$", valuetype = "regex")
head(toks[[1]], 40)
head(toksHira[[1]], 40)
head(toksKata[[1]], 40)

# or you can select kanji tokens only
toksKanji <- tokens_select(toks, pattern = "\\p{script=Hani}+$", valuetype = "regex")
head(toksKanji[[1]], 40)
head(toksHira[[1]], 40)
head(toksKata[[1]], 40)

# Finally, you can also select Japanese words only (either hiragana, katakana, or kanji)
toks2 <- tokens_select(toks, pattern = c("\\p{script=Hira}+$", "\\p{script=Kana}+$", "\\p{script=Hani}+$"),
                       valuetype = "regex")
head(toks2[[1]], 40)

jap_dfm <- dfm(toks)
topfeatures(jap_dfm, 20)
jap_dfm[, 1:5]
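# A minimal sketch (not part of the original script) of how the Unicode script patterns above behave,
# using stringi directly; the sample tokens below are made up purely for illustration
library(stringi)
sample_toks <- c("これ", "データ", "経済", "English")  # hypothetical hiragana, katakana, kanji, and Latin tokens
stri_detect_regex(sample_toks, "\\p{script=Hira}+$")  # should be TRUE only for the hiragana token
stri_detect_regex(sample_toks, "\\p{script=Kana}+$")  # should be TRUE only for the katakana token
stri_detect_regex(sample_toks, "\\p{script=Hani}+$")  # should be TRUE only for the kanji token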
# I want to name the texts properly. The function docnames() also works for corpus and tokens objects
docnames(jap_dfm) <- myText$doc_id
jap_dfm[, 1:5]
# I could also write directly the names I want to give to the texts (this is a hypothetical example with fictitious names!)
docnames(jap_dfm) <- c("A", "B", "C", "D", "E", "F", "G")
jap_dfm[, 1:5]

# If you want to perform a more accurate tokenization, you need to install a morphological analysis tool and call it from R.
# For example: RcppMeCab (Chinese, Japanese, and Korean): https://github.com/junhewk/RcppMeCab
# (a hedged sketch of such a call is given at the end of this script)

#########################################################################
# Running example: How to deal with the Chinese language
#########################################################################

library(quanteda.corpora)
corp <- quanteda.corpora::download(url = "https://www.dropbox.com/s/37ojd5knz1qeyul/data_corpus_chinesegovreport.rds?dl=1")

# tokenize the corpus and apply pre-processing
ch_toks <- tokens(corp, remove_punct = TRUE, remove_numbers = TRUE, remove_separators = TRUE)
head(ch_toks[[1]], 40)

# note that we remove the stopwords from ch_toks (not from the Japanese toks object)
ch_toks <- tokens_remove(ch_toks, stopwords("zh_cn", source = "marimo"))
head(ch_toks[[1]], 40)

# Alternative Chinese stopwords
stopwords("zh", source = "misc")
stopwords("zh", source = "stopwords-iso")

# You can keep only Chinese characters
ch_toks <- tokens_select(ch_toks, pattern = "\\p{script=Hani}+$", valuetype = "regex")
head(ch_toks[[1]], 40)

ch_dfm <- dfm(ch_toks)
topfeatures(ch_dfm)
ch_dfm[, 1:5]
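# A minimal sketch (not from the original script) of how a morphological analyser could replace ICU segmentation
# for Japanese, as mentioned above. It assumes RcppMeCab and a MeCab dictionary are installed; the function name
# follows the package README at https://github.com/junhewk/RcppMeCab, so check it against your installed version.
library(RcppMeCab)
# pos() returns, for each input string, its morphemes annotated with part-of-speech tags
mecab_out <- pos(myText$text)
str(mecab_out, list.len = 2)
# once you know the output structure, you can strip the POS tags and rebuild a quanteda tokens object with
# as.tokens(); the exact return format can vary across RcppMeCab versions, so inspect mecab_out first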