From b7c75049f780c83d703590f44492ca45b2d4374b Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Sat, 23 Mar 2019 16:08:27 +0100
Subject: [PATCH] Support 'bow' architecture for TextCategorizer

This allows efficient ngram bag-of-words models, which are better when
the classifier needs to run quickly, especially when the texts are long.
Pass architecture="bow" to use it. The extra arguments ngram_size and
attr are also available, e.g. ngram_size=2 means unigram and bigram
features will be extracted.
---
 spacy/pipeline/pipes.pyx | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx
index 7ad67cb5a..5e94c2f95 100644
--- a/spacy/pipeline/pipes.pyx
+++ b/spacy/pipeline/pipes.pyx
@@ -25,6 +25,7 @@ from ..attrs import POS, ID
 from ..parts_of_speech import X
 from .._ml import Tok2Vec, build_tagger_model
 from .._ml import build_text_classifier, build_simple_cnn_text_classifier
+from .._ml import build_bow_text_classifier
 from .._ml import link_vectors_to_models, zero_init, flatten
 from .._ml import masked_language_model, create_default_optimizer
 from ..errors import Errors, TempErrors
@@ -876,6 +877,8 @@ class TextCategorizer(Pipe):
         if cfg.get("architecture") == "simple_cnn":
             tok2vec = Tok2Vec(token_vector_width, embed_size, **cfg)
             return build_simple_cnn_text_classifier(tok2vec, nr_class, **cfg)
+        elif cfg.get("architecture") == "bow":
+            return build_bow_text_classifier(nr_class, **cfg)
         else:
             return build_text_classifier(nr_class, **cfg)