From dda7331da38b8cff2861a26712d6685927bff73d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Thu, 12 Jan 2023 12:13:55 +0100 Subject: [PATCH] Handle missing annotations in the edit tree lemmatizer (#12098) The losses/gradients of missing annotations were not correctly masked out. Fix this and check the masking in the partial data test. --- spacy/pipeline/edit_tree_lemmatizer.py | 2 +- spacy/tests/pipeline/test_edit_tree_lemmatizer.py | 14 ++++++++++++++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/spacy/pipeline/edit_tree_lemmatizer.py b/spacy/pipeline/edit_tree_lemmatizer.py index a56c9975e..e83fe63ba 100644 --- a/spacy/pipeline/edit_tree_lemmatizer.py +++ b/spacy/pipeline/edit_tree_lemmatizer.py @@ -128,7 +128,7 @@ class EditTreeLemmatizer(TrainablePipe): for (predicted, gold_lemma) in zip( eg.predicted, eg.get_aligned("LEMMA", as_string=True) ): - if gold_lemma is None: + if gold_lemma is None or gold_lemma == "": label = -1 else: tree_id = self.trees.add(predicted.text, gold_lemma) diff --git a/spacy/tests/pipeline/test_edit_tree_lemmatizer.py b/spacy/tests/pipeline/test_edit_tree_lemmatizer.py index b12ca5dd4..c4f9b09f3 100644 --- a/spacy/tests/pipeline/test_edit_tree_lemmatizer.py +++ b/spacy/tests/pipeline/test_edit_tree_lemmatizer.py @@ -139,6 +139,20 @@ def test_incomplete_data(): assert doc[1].lemma_ == "like" assert doc[2].lemma_ == "blue" + # Check that incomplete annotations are ignored. + scores, _ = lemmatizer.model([eg.predicted for eg in train_examples], is_train=True) + _, dX = lemmatizer.get_loss(train_examples, scores) + xp = lemmatizer.model.ops.xp + + # Missing annotations. + assert xp.count_nonzero(dX[0][0]) == 0 + assert xp.count_nonzero(dX[0][3]) == 0 + assert xp.count_nonzero(dX[1][0]) == 0 + assert xp.count_nonzero(dX[1][3]) == 0 + + # Misaligned annotations. + assert xp.count_nonzero(dX[1][1]) == 0 + def test_overfitting_IO(): nlp = English()