From c9f0f75778515a2cd00a96681b57358c95b83acf Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Wed, 8 Jul 2020 13:59:28 +0200
Subject: [PATCH] Update get_loss for senter and morphologizer (#5724)

* Update get_loss for senter

Update `SentenceRecognizer.get_loss` to compute the loss with
`SequenceCategoricalCrossentropy`, keeping it similar to `Tagger`.

* Update get_loss for morphologizer

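Update `Morphologizer.get_loss` to compute the loss with
`SequenceCategoricalCrossentropy`, keeping it similar to `Tagger`. Also make
`Morphology.feats_to_dict` return an empty dict for `Morphology.EMPTY_MORPH`.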
---
 spacy/morphology.pyx             |  2 +-
 spacy/pipeline/morphologizer.pyx | 30 +++++++++---------------------
 spacy/pipeline/pipes.pyx         | 29 ++++++-----------------------
 3 files changed, 16 insertions(+), 45 deletions(-)

diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx
index 78e8e17c0..a3aa8be22 100644
--- a/spacy/morphology.pyx
+++ b/spacy/morphology.pyx
@@ -272,7 +272,7 @@ cdef class Morphology:
 
     @staticmethod
     def feats_to_dict(feats):
-        if not feats:
+        if not feats or feats == Morphology.EMPTY_MORPH:
             return {}
         return {field: Morphology.VALUE_SEP.join(sorted(values.split(Morphology.VALUE_SEP))) for field, values in
                 [feat.split(Morphology.FIELD_SEP) for feat in feats.split(Morphology.FEATURE_SEP)]}
diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx
index f792d57b0..57b778434 100644
--- a/spacy/pipeline/morphologizer.pyx
+++ b/spacy/pipeline/morphologizer.pyx
@@ -3,7 +3,7 @@ cimport numpy as np
 
 import numpy
 import srsly
-from thinc.api import to_categorical
+from thinc.api import SequenceCategoricalCrossentropy
 
 from ..tokens.doc cimport Doc
 from ..vocab cimport Vocab
@@ -85,13 +85,10 @@ class Morphologizer(Tagger):
             doc.is_morphed = True
 
     def get_loss(self, examples, scores):
-        scores = self.model.ops.flatten(scores)
-        tag_index = {tag: i for i, tag in enumerate(self.labels)}
-        cdef int idx = 0
-        correct = numpy.zeros((scores.shape[0],), dtype="i")
-        guesses = scores.argmax(axis=1)
-        known_labels = numpy.ones((scores.shape[0], 1), dtype="f")
+        loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False)
+        truths = []
         for eg in examples:
+            eg_truths = []
             pos_tags = eg.get_aligned("POS", as_string=True)
             morphs = eg.get_aligned("MORPH", as_string=True)
             for i in range(len(morphs)):
@@ -104,20 +101,11 @@ class Morphologizer(Tagger):
                     morph = self.vocab.strings[self.vocab.morphology.add(feats)]
                 if morph == "":
                     morph = Morphology.EMPTY_MORPH
-                if morph is None:
-                    correct[idx] = guesses[idx]
-                elif morph in tag_index:
-                    correct[idx] = tag_index[morph]
-                else:
-                    correct[idx] = 0
-                    known_labels[idx] = 0.
-                idx += 1
-        correct = self.model.ops.xp.array(correct, dtype="i")
-        d_scores = scores - to_categorical(correct, n_classes=scores.shape[1])
-        d_scores *= self.model.ops.asarray(known_labels)
-        loss = (d_scores**2).sum()
-        docs = [eg.predicted for eg in examples]
-        d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs])
+                eg_truths.append(morph)
+            truths.append(eg_truths)
+        d_scores, loss = loss_func(scores, truths)
+        if self.model.ops.xp.isnan(loss):
+            raise ValueError("nan value when computing loss")
         return float(loss), d_scores
 
     def to_bytes(self, exclude=tuple()):
diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx
index 2b147785e..cc3c39f03 100644
--- a/spacy/pipeline/pipes.pyx
+++ b/spacy/pipeline/pipes.pyx
@@ -521,29 +521,12 @@ class SentenceRecognizer(Tagger):
                         doc.c[j].sent_start = -1
 
     def get_loss(self, examples, scores):
-        scores = self.model.ops.flatten(scores)
-        tag_index = range(len(self.labels))
-        cdef int idx = 0
-        correct = numpy.zeros((scores.shape[0],), dtype="i")
-        guesses = scores.argmax(axis=1)
-        known_labels = numpy.ones((scores.shape[0], 1), dtype="f")
-        for eg in examples:
-            sent_starts = eg.get_aligned("sent_start")
-            for sent_start in sent_starts:
-                if sent_start is None:
-                    correct[idx] = guesses[idx]
-                elif sent_start in tag_index:
-                    correct[idx] = sent_start
-                else:
-                    correct[idx] = 0
-                    known_labels[idx] = 0.
-                idx += 1
-        correct = self.model.ops.xp.array(correct, dtype="i")
-        d_scores = scores - to_categorical(correct, n_classes=scores.shape[1])
-        d_scores *= self.model.ops.asarray(known_labels)
-        loss = (d_scores**2).sum()
-        docs = [eg.predicted for eg in examples]
-        d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs])
+        labels = self.labels  # sent_start indexes labels; None/out-of-range values yield a None truth
+        loss_func = SequenceCategoricalCrossentropy(names=labels, normalize=False)
+        truths = [[labels[x] if x in range(len(labels)) else None for x in eg.get_aligned("sent_start")] for eg in examples]
+        d_scores, loss = loss_func(scores, truths)
+        if self.model.ops.xp.isnan(loss):
+            raise ValueError("nan value when computing loss")
         return float(loss), d_scores
 
     def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None,