From c9f0f75778515a2cd00a96681b57358c95b83acf Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Wed, 8 Jul 2020 13:59:28 +0200 Subject: [PATCH] Update get_loss for senter and morphologizer (#5724) * Update get_loss for senter Update `SentenceRecognizer.get_loss` to keep it similar to `Tagger`. * Update get_loss for morphologizer Update `Morphologizer.get_loss` to keep it similar to `Tagger`. --- spacy/morphology.pyx | 2 +- spacy/pipeline/morphologizer.pyx | 30 +++++++++--------------------- spacy/pipeline/pipes.pyx | 29 ++++++----------------------- 3 files changed, 16 insertions(+), 45 deletions(-) diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index 78e8e17c0..a3aa8be22 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -272,7 +272,7 @@ cdef class Morphology: @staticmethod def feats_to_dict(feats): - if not feats: + if not feats or feats == Morphology.EMPTY_MORPH: return {} return {field: Morphology.VALUE_SEP.join(sorted(values.split(Morphology.VALUE_SEP))) for field, values in [feat.split(Morphology.FIELD_SEP) for feat in feats.split(Morphology.FEATURE_SEP)]} diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx index f792d57b0..57b778434 100644 --- a/spacy/pipeline/morphologizer.pyx +++ b/spacy/pipeline/morphologizer.pyx @@ -3,7 +3,7 @@ cimport numpy as np import numpy import srsly -from thinc.api import to_categorical +from thinc.api import SequenceCategoricalCrossentropy from ..tokens.doc cimport Doc from ..vocab cimport Vocab @@ -85,13 +85,10 @@ class Morphologizer(Tagger): doc.is_morphed = True def get_loss(self, examples, scores): - scores = self.model.ops.flatten(scores) - tag_index = {tag: i for i, tag in enumerate(self.labels)} - cdef int idx = 0 - correct = numpy.zeros((scores.shape[0],), dtype="i") - guesses = scores.argmax(axis=1) - known_labels = numpy.ones((scores.shape[0], 1), dtype="f") + loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False) + truths = [] for eg in examples: + eg_truths = [] pos_tags = eg.get_aligned("POS", as_string=True) morphs = eg.get_aligned("MORPH", as_string=True) for i in range(len(morphs)): @@ -104,20 +101,11 @@ class Morphologizer(Tagger): morph = self.vocab.strings[self.vocab.morphology.add(feats)] if morph == "": morph = Morphology.EMPTY_MORPH - if morph is None: - correct[idx] = guesses[idx] - elif morph in tag_index: - correct[idx] = tag_index[morph] - else: - correct[idx] = 0 - known_labels[idx] = 0. - idx += 1 - correct = self.model.ops.xp.array(correct, dtype="i") - d_scores = scores - to_categorical(correct, n_classes=scores.shape[1]) - d_scores *= self.model.ops.asarray(known_labels) - loss = (d_scores**2).sum() - docs = [eg.predicted for eg in examples] - d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs]) + eg_truths.append(morph) + truths.append(eg_truths) + d_scores, loss = loss_func(scores, truths) + if self.model.ops.xp.isnan(loss): + raise ValueError("nan value when computing loss") return float(loss), d_scores def to_bytes(self, exclude=tuple()): diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index 2b147785e..cc3c39f03 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -521,29 +521,12 @@ class SentenceRecognizer(Tagger): doc.c[j].sent_start = -1 def get_loss(self, examples, scores): - scores = self.model.ops.flatten(scores) - tag_index = range(len(self.labels)) - cdef int idx = 0 - correct = numpy.zeros((scores.shape[0],), dtype="i") - guesses = scores.argmax(axis=1) - known_labels = numpy.ones((scores.shape[0], 1), dtype="f") - for eg in examples: - sent_starts = eg.get_aligned("sent_start") - for sent_start in sent_starts: - if sent_start is None: - correct[idx] = guesses[idx] - elif sent_start in tag_index: - correct[idx] = sent_start - else: - correct[idx] = 0 - known_labels[idx] = 0. - idx += 1 - correct = self.model.ops.xp.array(correct, dtype="i") - d_scores = scores - to_categorical(correct, n_classes=scores.shape[1]) - d_scores *= self.model.ops.asarray(known_labels) - loss = (d_scores**2).sum() - docs = [eg.predicted for eg in examples] - d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs]) + labels = self.labels + loss_func = SequenceCategoricalCrossentropy(names=labels, normalize=False) + truths = [[labels[x] for x in eg.get_aligned("sent_start")] for eg in examples] + d_scores, loss = loss_func(scores, truths) + if self.model.ops.xp.isnan(loss): + raise ValueError("nan value when computing loss") return float(loss), d_scores def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None,