# ---- Session setup ----------------------------------------------------------
# NOTE(review): wiping the workspace and hard-coding a working directory are
# kept from the original script; both are machine-specific side effects, so
# adjust or drop them before running this elsewhere.
rm(list = ls(all = TRUE))
getwd()
setwd("C:/Users/mw/Dropbox (VOICES)/TOPIC MODEL")
getwd()

library(manifestoR)
library(quanteda)
library(ggplot2)

# Register the Manifesto Project API key (replace the placeholder string with
# your own key).
mp_setapikey(key.file = NULL, key = "YOUR API MANIFESTO KEY")

# ---- Downloading the digital texts of original party manifestos -------------

# Ask the API which Japanese documents exist and in what form.
Japan <- mp_availability(countryname == "Japan")
Japan
# Author's observation when the script was written: 77 Japanese manifestos are
# listed on-line, but only 2 of them are in the CMP archive in digital format,
# and original coding is available for those same 2 documents.

# Work with a plain data frame so the availability flags are easy to inspect.
Japan <- as.data.frame(Japan)
str(Japan)

# originals == TRUE  -> the programme can be viewed on-line (not necessarily
#                       held in the CMP archive in digital format)
# manifestos == TRUE -> the programme is in the CMP archive in digital format
availability_cols <- c("party", "date", "originals", "manifestos")
print(Japan[availability_cols])

# Open the original PDFs of the Japanese manifestos for the 2005-09 election
# (at most 6 documents).
mp_view_originals(countryname == "Japan" & date == 200509, maxn = 6)

# The only two programmes in the CMP archive in digital format both belong to
# the 2014 election: parties 71220 and 71320.
print(Japan[availability_cols])

# Fetch the documents that are available in digital format.
Japan_corpus <- mp_corpus(countryname == "Japan")
summary(Japan_corpus)

# Peek at the first quasi-sentences of party 71220, then print the whole text.
txt <- content(Japan_corpus[["71220_201412"]])
head(txt)
txt

# Same quick look for party 71320.
head(content(Japan_corpus[["71320_201412"]]))

# Which parties are 71220 and 71320? The CMP core dataset answers that.
# Download the CMP core dataset and look up the party names behind the two
# Japanese party codes found in the corpus.
cmp <- mp_maindataset()

# Keep only the rows for Japan at the 2014-12 election.
japan_cmp <- cmp[which(cmp$countryname == "Japan" & cmp$date == 201412), ]
print(japan_cmp[c("partyname", "party", "edate", "date")])
# Author's observation: 71220 is the Japanese Communist Party and 71320 is the
# Social Democratic Party.

# ---- Passing CMP party manifestos to quanteda --------------------------------

# Focus on the Irish manifestos of the 2016-02 election.
ireland <- mp_corpus(countryname == "Ireland" & date == 201602)
ireland
summary(ireland)

# Convert the manifestos recovered from the CMP archive to a quanteda corpus.
quanteda_ireland <- corpus(ireland)
summary(head(quanteda_ireland))
ndoc(quanteda_ireland)
# There are far more documents than manifestos (the author reports 10803 for
# 10 manifestos) because every quasi-sentence of every manifesto is stored as
# a separate document.

# FIRST: build the document-feature matrix at quasi-sentence level.
# The preprocessing is spelled out step by step instead of being passed as
# arguments to dfm(): the corpus method of dfm() and its `stem`/`remove`/
# `remove_*` arguments were deprecated and later dropped by quanteda, whereas
# this pipeline runs on old and new releases alike. The order (tokenise ->
# lowercase -> drop stopwords -> stem) mirrors what dfm() used to do
# internally.
irish_tokens <- tokens(
  quanteda_ireland,
  remove_punct = TRUE,
  remove_numbers = TRUE
)
dfm_irish <- dfm(irish_tokens, tolower = TRUE)
dfm_irish <- dfm_remove(dfm_irish, stopwords("english"))
dfm_irish <- dfm_wordstem(dfm_irish)
dfm_irish[1:10, 1:5]

# SECOND: collapse all quasi-sentences belonging to the same manifesto into a
# single document. "manifesto_id" (like "party") is a document variable that
# the corpus conversion carried over; see the summary below.
summary(head(quanteda_ireland))

# Pass the grouping values themselves rather than the docvar name as a string:
# a factor with one entry per document is accepted by every quanteda release,
# while the quoted-name form is rejected by newer ones.
manifesto_groups <- as.factor(docvars(dfm_irish, "manifesto_id"))
dfm_irish2 <- dfm_group(dfm_irish, groups = manifesto_groups)

# Show at most 10 documents so the preview cannot index past the last row when
# fewer manifestos are available.
dfm_irish2[seq_len(min(10, ndoc(dfm_irish2))), 1:5]
str(dfm_irish2)

# Next step: attach the original Irish party names from the CMP core dataset
# to the grouped document-feature matrix.
# Pull the Irish parties of the 2016-02 election from the CMP core dataset.
# NOTE: from here on `ireland` holds this data-frame subset, not the manifesto
# corpus it referred to earlier in the script.
ireland <- cmp[which(cmp$countryname == "Ireland" & cmp$date == 201602), ]
print(ireland[c("partyname", "party", "edate", "date", "partyabbrev")])
ireland$partyname
docnames(dfm_irish2)

# Rename the grouped documents with the party names.
# The names are matched on the party code instead of being assigned by
# position: the CMP subset and the grouped DFM need not have the same number
# of rows or the same ordering, and a positional assignment would then label
# manifestos with the wrong party without any warning.
# Assumes the grouped document names have the "<party>_<date>" form of the
# manifesto ids (e.g. "71220_201412"), so the code is the part before "_".
doc_party_codes <- sub("_.*$", "", docnames(dfm_irish2))
name_index <- match(doc_party_codes, as.character(ireland$party))
if (anyNA(name_index)) {
  stop(
    "No CMP party name found for document(s): ",
    paste(docnames(dfm_irish2)[is.na(name_index)], collapse = ", "),
    call. = FALSE
  )
}
# Use the accessor rather than writing to the @Dimnames slot directly, so that
# quanteda keeps its own record of the document names in step.
docnames(dfm_irish2) <- ireland$partyname[name_index]
str(dfm_irish2)

# ---- Using Wordfish on the CMP corpus -----------------------------------------

# `dir = c(1, 3)` fixes the direction of the scale: document 1 is constrained
# to lie to the left of document 3.
wfm <- textmodel_wordfish(dfm_irish2, dir = c(1, 3))
summary(wfm)

# Plot estimated word positions.
textplot_scale1d(wfm, margin = "features")

# Plot estimated document positions.
textplot_scale1d(wfm, margin = "documents")