Mirror of https://github.com/explosion/spaCy.git, synced 2025-10-02 18:06:46 +03:00
Fix code for bag-of-words feature extraction
The _ml.py module had two near-duplicate functions for extracting unigram bag-of-words features, one of which had a bug that set all feature values to 0. A third function extracted bigram features. Replace all three with a single extract_ngrams layer that supports arbitrary ngram sizes and allows control of which token attribute is used as the key (e.g. ORTH, LOWER, etc.).
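For context, here is a minimal sketch of how the replacement might be used, based on the code added in the diff below. The pipeline name, example texts, and setup are illustrative assumptions, not part of this commit:

import spacy
from spacy.attrs import ORTH
from spacy._ml import build_bow_text_classifier, extract_ngrams

# Any pipeline that produces Doc objects will do; en_core_web_sm is just
# an example model.
nlp = spacy.load("en_core_web_sm")
docs = [nlp(t) for t in ["the cat sat", "the dog ran"]]

# One layer now covers unigrams up through ngram_size-grams, keyed on any
# token attribute (ORTH, LOWER, ...).
ngram_layer = extract_ngrams(ngram_size=2, attr=ORTH)
(keys, vals, lengths), _ = ngram_layer.begin_update(docs)

# Or build the whole bag-of-words classifier in one call.
model = build_bow_text_classifier(3, ngram_size=2)
scores = model(docs)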
parent 06bf130890
commit 2ddd552d52
spacy/_ml.py | 88
@@ -81,18 +81,6 @@ def _zero_init(model):
     return model
 
 
-@layerize
-def _preprocess_doc(docs, drop=0.0):
-    keys = [doc.to_array(LOWER) for doc in docs]
-    # The dtype here matches what thinc is expecting -- which differs per
-    # platform (by int definition). This should be fixed once the problem
-    # is fixed on Thinc's side.
-    lengths = numpy.array([arr.shape[0] for arr in keys], dtype=numpy.int_)
-    keys = numpy.concatenate(keys)
-    vals = numpy.zeros(keys.shape, dtype='f')
-    return (keys, vals, lengths), None
-
-
 def with_cpu(ops, model):
     """Wrap a model that should run on CPU, transferring inputs and outputs
     as necessary."""
@@ -133,20 +121,31 @@ def _to_device(ops, X):
     return ops.asarray(X)
 
 
-@layerize
-def _preprocess_doc_bigrams(docs, drop=0.0):
-    unigrams = [doc.to_array(LOWER) for doc in docs]
-    ops = Model.ops
-    bigrams = [ops.ngrams(2, doc_unis) for doc_unis in unigrams]
-    keys = [ops.xp.concatenate(feats) for feats in zip(unigrams, bigrams)]
-    keys, vals = zip(*[ops.xp.unique(k, return_counts=True) for k in keys])
-    # The dtype here matches what thinc is expecting -- which differs per
-    # platform (by int definition). This should be fixed once the problem
-    # is fixed on Thinc's side.
-    lengths = ops.asarray([arr.shape[0] for arr in keys], dtype=numpy.int_)
-    keys = ops.xp.concatenate(keys)
-    vals = ops.asarray(ops.xp.concatenate(vals), dtype="f")
-    return (keys, vals, lengths), None
+class extract_ngrams(Model):
+    def __init__(self, ngram_size, attr=LOWER):
+        Model.__init__(self)
+        self.ngram_size = ngram_size
+        self.attr = attr
+
+    def begin_update(self, docs, drop=0.0):
+        batch_keys = []
+        batch_vals = []
+        for doc in docs:
+            unigrams = doc.to_array([self.attr])
+            ngrams = [unigrams]
+            for n in range(2, self.ngram_size + 1):
+                ngrams.append(self.ops.ngrams(n, unigrams))
+            keys = self.ops.xp.concatenate(ngrams)
+            keys, vals = self.ops.xp.unique(keys, return_counts=True)
+            batch_keys.append(keys)
+            batch_vals.append(vals)
+        # The dtype here matches what thinc is expecting -- which differs per
+        # platform (by int definition). This should be fixed once the problem
+        # is fixed on Thinc's side.
+        lengths = self.ops.asarray([arr.shape[0] for arr in batch_keys], dtype=numpy.int_)
+        batch_keys = self.ops.xp.concatenate(batch_keys)
+        batch_vals = self.ops.asarray(self.ops.xp.concatenate(batch_vals), dtype="f")
+        return (batch_keys, batch_vals, lengths), None
 
 
 @describe.on_data(
@@ -486,16 +485,6 @@ def zero_init(model):
     return model
 
 
-@layerize
-def preprocess_doc(docs, drop=0.0):
-    keys = [doc.to_array([LOWER]) for doc in docs]
-    ops = Model.ops
-    lengths = ops.asarray([arr.shape[0] for arr in keys])
-    keys = ops.xp.concatenate(keys)
-    vals = ops.allocate(keys.shape[0]) + 1
-    return (keys, vals, lengths), None
-
-
 def getitem(i):
     def getitem_fwd(X, drop=0.0):
         return X[i], None
@@ -602,10 +591,8 @@ def build_text_classifier(nr_class, width=64, **cfg):
         >> zero_init(Affine(nr_class, width, drop_factor=0.0))
     )
 
-    linear_model = (
-        _preprocess_doc
-        >> with_cpu(Model.ops, LinearModel(nr_class))
-    )
+    linear_model = build_bow_text_classifier(
+        nr_class, ngram_size=cfg.get("ngram_size", 1), no_output_layer=True)
     if cfg.get('exclusive_classes'):
         output_layer = Softmax(nr_class, nr_class * 2)
     else:
@@ -623,6 +610,33 @@ def build_text_classifier(nr_class, width=64, **cfg):
     return model
 
 
+def build_bow_text_classifier(nr_class, ngram_size=1, exclusive_classes=False,
+                              no_output_layer=False, **cfg):
+    with Model.define_operators({">>": chain}):
+        model = (
+            extract_ngrams(ngram_size, attr=ORTH)
+            >> with_cpu(Model.ops,
+                LinearModel(nr_class)
+            )
+        )
+        if not no_output_layer:
+            model = model >> (cpu_softmax if exclusive_classes else logistic)
+    model.nO = nr_class
+    return model
+
+
+@layerize
+def cpu_softmax(X, drop=0.):
+    ops = NumpyOps()
+
+    Y = ops.softmax(X)
+
+    def cpu_softmax_backward(dY, sgd=None):
+        return dY
+
+    return ops.softmax(X), cpu_softmax_backward
+
+
 def build_simple_cnn_text_classifier(tok2vec, nr_class, exclusive_classes=False, **cfg):
     """
     Build a simple CNN text classifier, given a token-to-vector model as inputs.
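To make the per-doc logic in extract_ngrams.begin_update concrete, here is a rough pure-numpy mirror of what it computes for a single document. The rolling-hash helper is a toy stand-in for thinc's ops.ngrams (the real implementation hashes windows differently); only the data flow is the point:

import numpy

def toy_ngram_keys(n, unigrams):
    # Combine each window of n attribute IDs into a single key.
    windows = [unigrams[i:len(unigrams) - n + 1 + i] for i in range(n)]
    keys = windows[0].copy()
    for w in windows[1:]:
        keys = keys * numpy.uint64(1000003) + w  # stand-in for thinc's hashing
    return keys

unigrams = numpy.array([11, 42, 11, 7], dtype=numpy.uint64)  # e.g. LOWER IDs
ngram_size = 2
ngrams = [unigrams]
for n in range(2, ngram_size + 1):
    ngrams.append(toy_ngram_keys(n, unigrams))
keys = numpy.concatenate(ngrams)
keys, vals = numpy.unique(keys, return_counts=True)
# keys: unique unigram/bigram IDs for this doc; vals: their counts.
print(keys, vals)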