Handle missing annotations in the edit tree lemmatizer (#12098)

The losses/gradients of missing annotations were not correctly masked
out. Fix this and check the masking in the partial data test.
This commit is contained in:
Daniël de Kok 2023-01-12 12:13:55 +01:00 committed by GitHub
parent 319eb508b5
commit dda7331da3
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 15 additions and 1 deletions

View File

@@ -128,7 +128,7 @@ class EditTreeLemmatizer(TrainablePipe):
for (predicted, gold_lemma) in zip(
eg.predicted, eg.get_aligned("LEMMA", as_string=True)
):
-if gold_lemma is None:
+if gold_lemma is None or gold_lemma == "":
label = -1
else:
tree_id = self.trees.add(predicted.text, gold_lemma)

View File

@@ -139,6 +139,20 @@ def test_incomplete_data():
assert doc[1].lemma_ == "like"
assert doc[2].lemma_ == "blue"
# Check that incomplete annotations are ignored.
scores, _ = lemmatizer.model([eg.predicted for eg in train_examples], is_train=True)
_, dX = lemmatizer.get_loss(train_examples, scores)
xp = lemmatizer.model.ops.xp
# Missing annotations.
assert xp.count_nonzero(dX[0][0]) == 0
assert xp.count_nonzero(dX[0][3]) == 0
assert xp.count_nonzero(dX[1][0]) == 0
assert xp.count_nonzero(dX[1][3]) == 0
# Misaligned annotations.
assert xp.count_nonzero(dX[1][1]) == 0
def test_overfitting_IO():
nlp = English()