# Wordscores by hand ----
#
# Fits a Wordscores model on a three-document toy corpus (two reference texts
# and one virgin text), then reproduces both the word scores and the predicted
# score of the virgin text step by step, without relying on
# textmodel_wordscores() or predict().

library(quanteda)
library(quanteda.textmodels)

# Corpus and document-feature matrix ----

# Let's create 3 toy texts in our corpus: the first two are the reference
# texts and the last one is the virgin text.
testText <- c(
  "The quick brown fox named over brown brown",
  "with the newspaper a boy named in his brown",
  "Text pre-processing: make some fox cleaning over brown"
)
testText

testCorpus <- corpus(testText)
tok3 <- tokens(
  testCorpus,
  remove_punct = TRUE,
  remove_numbers = TRUE,
  remove_symbols = TRUE,
  split_hyphens = TRUE,
  remove_separators = TRUE
)
tok3

myDfm <- dfm(tok3)
topfeatures(myDfm, 10) # 10 top features
myDfm[, 1:18]

# Reference scores and fitted model ----

# Let's define the reference scores: one per reference text, NA for the
# virgin text (it has no known position).
refscores <- c(3, 8, NA)
refscores

# Let's assign the reference scores to the dfm and fit the model.
ws <- textmodel_wordscores(myDfm, refscores)
summary(ws)

# Word scores, step by step ----

# How do we get such scores for the words? Let's redo the analysis step by
# step without the textmodel_wordscores() function.

# First, estimate the relative frequency of each word within each single
# reference text (transposed, so that words are in the rows).
# The relative frequency of "the" in text 1 is 0.125 because it appears once
# among the 8 words of that text.
tFwr <- t(dfm_weight(myDfm[1:2, ], scheme = "prop"))
tFwr
myDfm[1, 1:18]

# Now, what is the probability that we are reading text 1, given that we have
# only read the word "the"?
0.125 / (0.125 + 0.11111)

Pwr <- tFwr / rowSums(tFwr)
Pwr

# Let's now assign a score to each word: the reference scores weighted by the
# probabilities above. For "the" it is going to be equal to...
0.5294118 * 3 + 0.4705882 * 8

Sw <- Pwr %*% refscores[1:2]
Sw[, 1]

# Same result as above!
summary(ws)

# Score of the virgin text, step by step ----

# Let's predict the virgin text.
predict(ws, newdata = myDfm[c(3), ])

# How do we get such a score for the virgin text? Let's redo the analysis step
# by step without the predict() function.
# Note: the virgin text has just 3 words in common with the reference texts:
# brown, fox, over. Each of these 3 words appears once in the virgin text
# (i.e., the relative frequency of each word is therefore .3333), and given
# that the scores are brown = 4.143, fox = 3, over = 3, the predicted score
# for the virgin text is:
(4.143 * .3333) + (3 * .3333) + (3 * .3333)

# A different way to get the same outcome:
sw <- coef(ws)
sw

# dfm_match() takes a dfm and a set of features and makes the dfm's features
# match that set (remember: there are fewer features in sw than in the original
# dfm). It is the public quanteda function for the feature alignment that
# quanteda.textmodels performs internally, so there is no need to reach for an
# unexported helper with `:::`.
data <- dfm_match(myDfm, names(sw))

# In our original dfm we had 18 features...
nfeat(myDfm)
# ...now just 12, i.e., words kept in the dfm = number of estimated word scores.
nfeat(data)
str(ws)

# Counts, in the virgin text, of the words that also appear in the reference
# texts.
data[3, ]

# Let's compute the score for the virgin text by multiplying the relative
# frequencies of those words by their word scores and summing up.
rowSums(dfm_weight(data[3, ], scheme = "prop") %*% sw)