Fix get_loss for values outside of labels in senter (#5730)

* Fix get_loss for None alignments in senter

  When converting the `sent_start` values back to `SentenceRecognizer` labels, handle `None` alignments.

* Handle SENT_START as -1

  Handle SENT_START as -1 (or -1 converted to uint64) by treating any values other than 1 the same as 0 in `SentenceRecognizer.get_loss`.
2025-12-16 22:54:18 +03:00 · 2020-07-09 01:41:58 +02:00 · 2020-07-09 01:41:58 +02:00 · ad15499b3b
commit ad15499b3b
parent 9b49787f35
2 changed files with 17 additions and 1 deletions
--- a/spacy/pipeline/pipes.pyx
+++ b/spacy/pipeline/pipes.pyx
@@ -523,7 +523,18 @@ class SentenceRecognizer(Tagger):
    def get_loss(self, examples, scores):
        labels = self.labels
        loss_func = SequenceCategoricalCrossentropy(names=labels, normalize=False)
-        truths = [[labels[x] for x in eg.get_aligned("sent_start")] for eg in examples]
+        truths = []
+        for eg in examples:
+            eg_truth = []
+            for x in eg.get_aligned("sent_start"):
+                if x == None:
+                    eg_truth.append(None)
+                elif x == 1:
+                    eg_truth.append(labels[1])
+                else:
+                    # anything other than 1: 0, -1, -1 as uint64
+                    eg_truth.append(labels[0])
+            truths.append(eg_truth)
        d_scores, loss = loss_func(scores, truths)
        if self.model.ops.xp.isnan(loss):
            raise ValueError("nan value when computing loss")
--- a/spacy/tests/pipeline/test_senter.py
+++ b/spacy/tests/pipeline/test_senter.py
@@ -38,6 +38,11 @@ def test_overfitting_IO():
    train_examples = []
    for t in TRAIN_DATA:
        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
+    # add some cases where SENT_START == -1
+    train_examples[0].reference[10].is_sent_start = False
+    train_examples[1].reference[1].is_sent_start = False
+    train_examples[1].reference[11].is_sent_start = False
+
    nlp.add_pipe(senter)
    optimizer = nlp.begin_training()