mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-24 00:04:15 +03:00
Handle missing annotations in the edit tree lemmatizer (#12098)
The losses/gradients of missing annotations were not correctly masked out. Fix this and check the masking in the partial data test.
This commit is contained in:
parent
319eb508b5
commit
dda7331da3
|
@@ -128,7 +128,7 @@ class EditTreeLemmatizer(TrainablePipe):
|
|||
for (predicted, gold_lemma) in zip(
|
||||
eg.predicted, eg.get_aligned("LEMMA", as_string=True)
|
||||
):
|
||||
-if gold_lemma is None:
|
||||
+if gold_lemma is None or gold_lemma == "":
|
||||
label = -1
|
||||
else:
|
||||
tree_id = self.trees.add(predicted.text, gold_lemma)
|
||||
|
|
|
@@ -139,6 +139,20 @@ def test_incomplete_data():
|
|||
assert doc[1].lemma_ == "like"
|
||||
assert doc[2].lemma_ == "blue"
|
||||
|
||||
# Check that incomplete annotations are ignored.
|
||||
scores, _ = lemmatizer.model([eg.predicted for eg in train_examples], is_train=True)
|
||||
_, dX = lemmatizer.get_loss(train_examples, scores)
|
||||
xp = lemmatizer.model.ops.xp
|
||||
|
||||
# Missing annotations.
|
||||
assert xp.count_nonzero(dX[0][0]) == 0
|
||||
assert xp.count_nonzero(dX[0][3]) == 0
|
||||
assert xp.count_nonzero(dX[1][0]) == 0
|
||||
assert xp.count_nonzero(dX[1][3]) == 0
|
||||
|
||||
# Misaligned annotations.
|
||||
assert xp.count_nonzero(dX[1][1]) == 0
|
||||
|
||||
|
||||
def test_overfitting_IO():
|
||||
nlp = English()
|
||||
|
|
Loading…
Reference in New Issue
Block a user