textmodel Performance Comparisons

library("quanteda")
## Package version: 4.1.0
## Unicode version: 15.1
## ICU version: 74.2
## Parallel computing: 4 of 4 threads used.
## See https://quanteda.io for tutorials and examples.
library("quanteda.textmodels")

Naive Bayes

quanteda.textmodels implements fast methods for fitting and predicting Naive Bayes textmodels built especially for sparse document-feature matrices from textual data. It implements two models: multinomial and Bernoulli. (See Manning, Raghavan, and Schütze 2008, Chapter 13.)

Here, we compare performance for the two models, and then to the performance from two other packages for fitting these models.

For these tests, we will choose the dataset of 50,000 movie reviews from Maas et. al. (2011). We will use their partition into test and training sets for training and fitting our models.

# large movie review database of 50,000 movie reviews
load(url("https://quanteda.org/data/data_corpus_LMRD.rda"))

dfmat <- tokens(data_corpus_LMRD) %>%
  dfm()
dfmat_train <- dfm_subset(dfmat, set == "train")
dfmat_test <- dfm_subset(dfmat, set == "test")

Comparing the performance of fitting the model:

library("microbenchmark")
microbenchmark(
    multi = textmodel_nb(dfmat_train, dfmat_train$polarity, distribution = "multinomial"),
    bern = textmodel_nb(dfmat_train, dfmat_train$polarity, distribution = "Bernoulli"),
    times = 20
)
## Unit: milliseconds
##   expr      min       lq      mean   median        uq      max neval
##  multi 79.04747 80.50046  90.08327 93.71772  94.81939 100.5724    20
##   bern 86.85288 87.24687 102.18642 98.12383 101.80704 243.0324    20

And for prediction:

microbenchmark(
    multi = predict(textmodel_nb(dfmat_train, dfmat_train$polarity, distribution = "multinomial"),
                    newdata = dfmat_test),
    bern = predict(textmodel_nb(dfmat_train, dfmat_train$polarity, distribution = "Bernoulli"),
                   newdata = dfmat_test),
    times = 20
)
## Unit: milliseconds
##   expr       min       lq     mean    median       uq      max neval
##  multi  90.38307  90.4792  96.1013  90.81755 104.7194 106.0697    20
##   bern 118.57859 132.0036 142.6051 133.05496 145.2316 280.1497    20

Now let’s see how textmodel_nb() compares to equivalent functions from other packages. Multinomial:

library("fastNaiveBayes")
library("naivebayes")
## naivebayes 1.0.0 loaded
## For more information please visit:
## https://majkamichal.github.io/naivebayes/

microbenchmark(
    textmodels = {
      tmod <-  textmodel_nb(dfmat_train, dfmat_train$polarity, smooth = 1, distribution = "multinomial")
      pred <- predict(tmod, newdata = dfmat_test)
    },
    fastNaiveBayes = { 
      tmod <- fnb.multinomial(as(dfmat_train, "dgCMatrix"), y = dfmat_train$polarity, laplace = 1, sparse = TRUE)
      pred <- predict(tmod, newdata = as(dfmat_test, "dgCMatrix"))
    },
    naivebayes = {
      tmod = multinomial_naive_bayes(as(dfmat_train, "dgCMatrix"), dfmat_train$polarity, laplace = 1)
      pred <- predict(tmod, newdata = as(dfmat_test, "dgCMatrix"))
    },
    times = 20
)
## Unit: milliseconds
##            expr       min        lq     mean    median       uq      max neval
##      textmodels  90.22805  90.75009 104.8857  93.96268 104.7333 249.5586    20
##  fastNaiveBayes 128.14467 140.01157 144.1268 142.32475 147.1867 182.1490    20
##      naivebayes 102.56294 103.61853 120.3717 116.19784 117.5862 263.5188    20

And Bernoulli. Note here that while we are supplying the Boolean matrix to textmodel_nb(), this re-weighting from the count matrix would have been performed automatically within the function had we not done so in advance - it’s done here just for comparison.

dfmat_train_bern <- dfm_weight(dfmat_train, scheme = "boolean")
dfmat_test_bern <- dfm_weight(dfmat_test, scheme = "boolean")

microbenchmark(
    textmodel_nb = {
      tmod <-  textmodel_nb(dfmat_train_bern, dfmat_train$polarity, smooth = 1, distribution = "Bernoulli")
      pred <- predict(tmod, newdata = dfmat_test)
    },
    fastNaiveBayes = { 
      tmod <- fnb.bernoulli(as(dfmat_train_bern, "dgCMatrix"), y = dfmat_train$polarity, laplace = 1, sparse = TRUE)
      pred <- predict(tmod, newdata = as(dfmat_test_bern, "dgCMatrix"))
    },
    naivebayes = {
      tmod = bernoulli_naive_bayes(as(dfmat_train_bern, "dgCMatrix"), dfmat_train$polarity, laplace = 1)
      pred <- predict(tmod, newdata = as(dfmat_test_bern, "dgCMatrix"))
    },
    times = 20
)
## Unit: milliseconds
##            expr      min       lq     mean   median       uq      max neval
##    textmodel_nb 118.2851 131.9372 135.1787 132.3858 136.2900 151.0798    20
##  fastNaiveBayes 140.4363 151.0463 160.1832 153.2031 158.2842 300.6095    20
##      naivebayes 112.4060 113.5875 126.7682 115.5624 126.2661 272.1587    20

References

Maas, Andrew L., Raymond E. Daly, Peter T. Pham, Dan Huang, Andrew Y. Ng, and Christopher Potts (2011). “Learning Word Vectors for Sentiment Analysis”. The 49th Annual Meeting of the Association for Computational Linguistics (ACL 2011).

Majka M (2020). naivebayes: High Performance Implementation of the Naive Bayes Algorithm in R. R package version 0.9.7, <URL: https://CRAN.R-project.org/package=naivebayes>. Date: 2020-03-08.

Manning, Christopher D., Prabhakar Raghavan, and Hinrich Schütze (2008). Introduction to Information Retrieval. Cambridge University Press.

Skogholt, Martin (2020). fastNaiveBayes: Extremely Fast Implementation of a Naive Bayes Classifier. R package version 2.2.1. https://github.com/mskogholt/fastNaiveBayes. Date: 2020-05-04.