library(quanteda)
library(quanteda.textmodels)

# Build a toy corpus of three texts: the first two are the reference texts,
# the third one is the virgin text that will be scored later.
testText <- c(
  "The quick brown fox named over brown brown",
  "with the newspaper a boy named in his brown",
  "Text pre-processing: make some fox cleaning over brown"
)
testText
testCorpus <- corpus(testText)

# Tokenise the corpus, dropping punctuation, numbers, symbols and separators;
# split_hyphens = TRUE asks quanteda to split hyphenated words such as
# "pre-processing".
tok3 <- tokens(
  testCorpus,
  remove_punct = TRUE,
  remove_numbers = TRUE,
  remove_symbols = TRUE,
  split_hyphens = TRUE,
  remove_separators = TRUE
)
tok3

# Document-feature matrix: one row per text, one column per word
myDfm <- dfm(tok3)
topfeatures(myDfm, 10)  # 10 top features

# Show every feature of the dfm. The column count is taken from the dfm itself
# rather than hard-coded, and max_nfeat lifts the default limit on the number
# of features that print() displays.
print(myDfm, max_nfeat = nfeat(myDfm))

# Reference scores, one per text: 3 for text 1, 8 for text 2, and NA for the
# third (virgin) text, whose position is unknown.
refscores <- c(3, 8, NA)
refscores

# Fit the Wordscores model on the dfm using those reference scores
ws <- textmodel_wordscores(x = myDfm, y = refscores)
summary(ws)

# How are the word scores obtained? Let's redo the analysis step by step
# without the textmodel_wordscores() function.

# Keep only the two reference texts and drop the words that never occur in
# them: such words have a relative frequency of 0 in both texts, so the ratio
# computed below would be 0 / 0 = NaN for them.
refDfm <- dfm_trim(myDfm[1:2, ], min_termfreq = 1)

# Relative frequency of each word within each reference text
# (transposed, so that words are in rows and texts in columns)
tFwr <- t(dfm_weight(refDfm, scheme = "prop"))
# The relative frequency of "the" in text 1 is 0.125 because it appears once
# over the 8 words of that text
tFwr
print(myDfm[1, ], max_nfeat = nfeat(myDfm))

# What is the probability of reading text 1 given that we only read the
# word "the"?
0.125 / (0.125 + 0.11111)
# The same ratio for every word: each row of tFwr divided by its row total
Pwr <- tFwr / rowSums(tFwr)
Pwr

# Now assign a score to each word: the reference scores weighted by the
# probabilities above. For "the" it is going to be equal to...
0.5294118 * 3 + 0.4705882 * 8
Sw <- Pwr %*% refscores[1:2]
Sw[, 1]
# Compare with the word scores reported by the fitted model
summary(ws)

# Score the virgin text (the third document) with the fitted model
predict(ws, newdata = myDfm[3, ])

# Where does that score come from? Let's redo the computation by hand,
# without the predict() function.
# Note: the virgin text shares just 3 words with the reference texts:
# brown, fox and over. Each of these 3 words appears once in the virgin text,
# so the relative frequency of each of them is .3333.
# Given that the scores are brown = 4.143, fox = 3 and over = 3, the predicted
# score for the virgin text is:
(4.143 * .3333) + (3 * .3333) + (3 * .3333)

# A different way to get the same outcome:
sw <- coef(ws)
sw
# dfm_match() is the exported quanteda function that takes a dfm and a set of
# features and makes the dfm's features match that set (remember: there are
# fewer features in sw than in the original dfm). It is used here instead of
# reaching into quanteda.textmodels internals with `:::`.
data <- dfm_match(myDfm, names(sw))
# In our original dfm we had 18 features
nfeat(myDfm)
# Now just 12, i.e., the words kept in the dfm = the estimated word scores
nfeat(data)
str(ws)
# Counts, in the virgin text, of the words that are also included in the
# reference texts (all features shown, not only the default first few)
print(data[3, ], max_nfeat = nfeat(data))
# Compute the score for the virgin text by multiplying the relative
# frequencies of those words by their word scores
rowSums(dfm_weight(data[3, ], scheme = "prop") %*% sw)