Handle missing annotations in the edit tree lemmatizer (#12098)

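The losses/gradients of tokens with missing lemma annotations were not correctly masked out, because an empty-string gold lemma was not treated as missing. Fix this and check the masking in the incomplete data test (`test_incomplete_data`).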
2025-07-15 02:32:37 +03:00 · 2023-01-12 12:13:55 +01:00 · 2023-01-12 12:13:55 +01:00 · dda7331da3
commit dda7331da3
parent 319eb508b5
2 changed files with 15 additions and 1 deletion
--- a/spacy/pipeline/edit_tree_lemmatizer.py
+++ b/spacy/pipeline/edit_tree_lemmatizer.py
@@ -128,7 +128,7 @@ class EditTreeLemmatizer(TrainablePipe):
            for (predicted, gold_lemma) in zip(
                eg.predicted, eg.get_aligned("LEMMA", as_string=True)
            ):
-                if gold_lemma is None:
+                if gold_lemma is None or gold_lemma == "":
                    label = -1
                else:
                    tree_id = self.trees.add(predicted.text, gold_lemma)
--- a/spacy/tests/pipeline/test_edit_tree_lemmatizer.py
+++ b/spacy/tests/pipeline/test_edit_tree_lemmatizer.py
@@ -139,6 +139,20 @@ def test_incomplete_data():
    assert doc[1].lemma_ == "like"
    assert doc[2].lemma_ == "blue"

+    # Check that incomplete annotations are ignored.
+    scores, _ = lemmatizer.model([eg.predicted for eg in train_examples], is_train=True)
+    _, dX = lemmatizer.get_loss(train_examples, scores)
+    xp = lemmatizer.model.ops.xp
+
+    # Missing annotations.
+    assert xp.count_nonzero(dX[0][0]) == 0
+    assert xp.count_nonzero(dX[0][3]) == 0
+    assert xp.count_nonzero(dX[1][0]) == 0
+    assert xp.count_nonzero(dX[1][3]) == 0
+
+    # Misaligned annotations.
+    assert xp.count_nonzero(dX[1][1]) == 0
+

 def test_overfitting_IO():
    nlp = English()