Fixing ngram bug (#3953)

* minimal failing example for Issue #3661 * referenced Issue #3661 instead of Issue #3611 * cleanup
2026-03-03 03:11:28 +03:00 · 2019-07-12 10:01:35 +02:00 · 2019-07-12 10:01:35 +02:00 · ed774cb953
commit ed774cb953
parent 123929b58b
1 changed files with 51 additions and 0 deletions
--- a/spacy/tests/regression/test_issue3611.py
+++ b/spacy/tests/regression/test_issue3611.py
@ -0,0 +1,51 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+import pytest
+import spacy
+from spacy.util import minibatch, compounding
+
+
+def test_issue3611():
+    """ Test whether adding n-grams in the textcat works even when n > token length of some docs """
+    unique_classes = ["offensive", "inoffensive"]
+    x_train = ["This is an offensive text",
+               "This is the second offensive text",
+               "inoff"]
+    y_train = ["offensive", "offensive", "inoffensive"]
+
+    # preparing the data
+    pos_cats = list()
+    for train_instance in y_train:
+        pos_cats.append({label: label == train_instance for label in unique_classes})
+    train_data = list(zip(x_train, [{'cats': cats} for cats in pos_cats]))
+
+    # set up the spacy model with a text categorizer component
+    nlp = spacy.blank('en')
+
+    textcat = nlp.create_pipe(
+        "textcat",
+        config={
+            "exclusive_classes": True,
+            "architecture": "bow",
+            "ngram_size": 2
+        }
+    )
+
+    for label in unique_classes:
+        textcat.add_label(label)
+    nlp.add_pipe(textcat, last=True)
+
+    # training the network
+    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'textcat']
+    with nlp.disable_pipes(*other_pipes):
+        optimizer = nlp.begin_training()
+        for i in range(3):
+            losses = {}
+            batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
+
+            for batch in batches:
+                texts, annotations = zip(*batch)
+                nlp.update(docs=texts, golds=annotations, sgd=optimizer, drop=0.1, losses=losses)
+
+