# Let's begin with an example: Wordscores on a toy corpus, first with the
# quanteda helpers and then reproduced step by step "by hand".

library(quanteda)
# textmodel_wordscores() and its predict()/coef() methods are provided by
# quanteda.textmodels, so that package has to be attached as well.
library(quanteda.textmodels)

# 3 random texts in our corpus: the first two are the reference texts and the
# last one is the virgin text
testText <- c(
  "The quick brown fox named over brown brown",
  "with the newspaper a boy named in his brown",
  "Text pre-processing: make some fox cleaning over brown"
)
testText

testCorpus <- corpus(testText)
tok3 <- tokens(
  testCorpus,
  remove_punct = TRUE,
  remove_numbers = TRUE,
  remove_symbols = TRUE,
  split_hyphens = TRUE,
  remove_separators = TRUE
)
tok3

myDfm <- dfm(tok3)
topfeatures(myDfm, 10) # the 10 most frequent features
myDfm[, 1:18] # the dfm restricted to its first 18 features

# Let's define the reference scores (NA marks the virgin text, whose score is
# unknown and will be predicted)
refscores <- c(3, 8, NA)
refscores

# Let's assign the reference scores to your dfm
ws <- textmodel_wordscores(myDfm, refscores)
summary(ws)

# How do you get such scores for words? Let's redo the analysis step by step
# w/o the textmodel_wordscores function.

# Relative frequency of each word within each of the two reference texts
# (transposed: words in rows, reference texts in columns)
tFwr <- t(dfm_weight(myDfm[1:2, ], "prop"))
tFwr

# Divide each row by its sum: for every word, the share of its relative
# frequency that comes from each reference text.
# NOTE(review): words that occur in neither reference text have a row sum of 0,
# so their rows are presumably 0/0 = NaN here; those are the words for which
# the fitted model has no score (sw below has fewer features than the dfm).
Pwr <- tFwr / rowSums(tFwr)
Pwr

# Word score = the reference scores weighted by those shares
Sw <- Pwr %*% refscores[1:2]
# same score for words!
Sw[, 1]
summary(ws)

# Let's predict the virgin text
predict(ws, newdata = myDfm[3, ])

# How do you get such scores for the virgin text? Let's redo the analysis
# step by step w/o the predict function.
# Note: you have just 3 words in common between the virgin and the reference
# texts: brown, fox, over. Each of these 3 words appears once in the virgin
# text (i.e., the relative frequency of each word is therefore .3333), and
# given that the score for brown = 4.143, fox = 3, over = 3, the predicted
# score for the virgin text is:
(4.143 * .3333) + (3 * .3333) + (3 * .3333)

# A different way to get the same outcome:
sw <- coef(ws)
sw

# dfm_match() takes a dfm and a set of features, and makes the dfm's features
# match the ones listed in the set (remember: you have fewer features in sw
# than in the original dfm). The internal helper
# quanteda.textmodels:::force_conformance does the same job, but `:::` reaches
# into non-exported code that may change between releases, so the exported
# function is used instead.
data <- dfm_match(myDfm, names(sw))
rowSums(dfm_weight(data[3, ], "prop") %*% sw)