diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx
index cc3c39f03..86c768e9b 100644
--- a/spacy/pipeline/pipes.pyx
+++ b/spacy/pipeline/pipes.pyx
@@ -523,7 +523,18 @@ class SentenceRecognizer(Tagger):
     def get_loss(self, examples, scores):
         labels = self.labels
         loss_func = SequenceCategoricalCrossentropy(names=labels, normalize=False)
-        truths = [[labels[x] for x in eg.get_aligned("sent_start")] for eg in examples]
+        truths = []
+        for eg in examples:
+            eg_truth = []
+            for x in eg.get_aligned("sent_start"):
+                if x is None:
+                    eg_truth.append(None)
+                elif x == 1:
+                    eg_truth.append(labels[1])
+                else:
+                    # anything other than 1: 0, -1, -1 as uint64
+                    eg_truth.append(labels[0])
+            truths.append(eg_truth)
         d_scores, loss = loss_func(scores, truths)
         if self.model.ops.xp.isnan(loss):
             raise ValueError("nan value when computing loss")
diff --git a/spacy/tests/pipeline/test_senter.py b/spacy/tests/pipeline/test_senter.py
index bfa1bd65a..82f536076 100644
--- a/spacy/tests/pipeline/test_senter.py
+++ b/spacy/tests/pipeline/test_senter.py
@@ -38,6 +38,11 @@ def test_overfitting_IO():
     train_examples = []
     for t in TRAIN_DATA:
         train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
+    # add some cases where SENT_START == -1
+    train_examples[0].reference[10].is_sent_start = False
+    train_examples[1].reference[1].is_sent_start = False
+    train_examples[1].reference[11].is_sent_start = False
+
     nlp.add_pipe(senter)
     optimizer = nlp.begin_training()