Fix get_loss for values outside of labels in senter (#5730)

* Fix get_loss for None alignments in senter

When converting the `sent_start` values back to `SentenceRecognizer`
labels, handle `None` alignments.

* Handle SENT_START as -1

Handle SENT_START as -1 (or -1 converted to uint64) by treating any
values other than 1 the same as 0 in `SentenceRecognizer.get_loss`.
This commit is contained in:
Adriane Boyd 2020-07-09 01:41:58 +02:00 committed by GitHub
parent 9b49787f35
commit ad15499b3b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 17 additions and 1 deletion

View File

@@ -523,7 +523,18 @@ class SentenceRecognizer(Tagger):
def get_loss(self, examples, scores):
labels = self.labels
loss_func = SequenceCategoricalCrossentropy(names=labels, normalize=False)
truths = [[labels[x] for x in eg.get_aligned("sent_start")] for eg in examples]
truths = []
for eg in examples:
eg_truth = []
for x in eg.get_aligned("sent_start"):
if x == None:
eg_truth.append(None)
elif x == 1:
eg_truth.append(labels[1])
else:
# anything other than 1: 0, -1, -1 as uint64
eg_truth.append(labels[0])
truths.append(eg_truth)
d_scores, loss = loss_func(scores, truths)
if self.model.ops.xp.isnan(loss):
raise ValueError("nan value when computing loss")

View File

@@ -38,6 +38,11 @@ def test_overfitting_IO():
train_examples = []
for t in TRAIN_DATA:
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
# add some cases where SENT_START == -1
train_examples[0].reference[10].is_sent_start = False
train_examples[1].reference[1].is_sent_start = False
train_examples[1].reference[11].is_sent_start = False
nlp.add_pipe(senter)
optimizer = nlp.begin_training()