Update get_loss for senter and morphologizer (#5724)

* Update get_loss for senter

Update `SentenceRecognizer.get_loss` to keep it similar to `Tagger.get_loss`.

* Update get_loss for morphologizer

Update `Morphologizer.get_loss` to keep it similar to `Tagger.get_loss`.
Adriane Boyd 2020-07-08 13:59:28 +02:00 committed by GitHub
parent 8cb7f9ccff
commit c9f0f75778
3 changed files with 16 additions and 45 deletions

@@ -272,7 +272,7 @@ cdef class Morphology:
     @staticmethod
     def feats_to_dict(feats):
-        if not feats:
+        if not feats or feats == Morphology.EMPTY_MORPH:
             return {}
         return {field: Morphology.VALUE_SEP.join(sorted(values.split(Morphology.VALUE_SEP))) for field, values in
                 [feat.split(Morphology.FIELD_SEP) for feat in feats.split(Morphology.FEATURE_SEP)]}
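
For illustration, a minimal standalone sketch of the changed check; the separator constants and the `"_"` value for `EMPTY_MORPH` are assumptions following the usual UD FEATS conventions, not taken from this diff:

```python
# Sketch of feats_to_dict after the change (assumed constants, see note above).
FEATURE_SEP = "|"
FIELD_SEP = "="
VALUE_SEP = ","
EMPTY_MORPH = "_"  # assumed placeholder for "no morphological features"

def feats_to_dict(feats):
    # The empty-morph placeholder is now treated like an empty string instead of
    # falling through to the field=value parsing below.
    if not feats or feats == EMPTY_MORPH:
        return {}
    return {field: VALUE_SEP.join(sorted(values.split(VALUE_SEP)))
            for field, values in
            [feat.split(FIELD_SEP) for feat in feats.split(FEATURE_SEP)]}

print(feats_to_dict("Number=Sing|Case=Nom"))  # {'Number': 'Sing', 'Case': 'Nom'}
print(feats_to_dict("_"))                     # {} (no bogus field/value split)
print(feats_to_dict(""))                      # {}
```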

@@ -3,7 +3,7 @@ cimport numpy as np
 import numpy
 import srsly
-from thinc.api import to_categorical
+from thinc.api import SequenceCategoricalCrossentropy

 from ..tokens.doc cimport Doc
 from ..vocab cimport Vocab
@@ -85,13 +85,10 @@ class Morphologizer(Tagger):
             doc.is_morphed = True

     def get_loss(self, examples, scores):
-        scores = self.model.ops.flatten(scores)
-        tag_index = {tag: i for i, tag in enumerate(self.labels)}
-        cdef int idx = 0
-        correct = numpy.zeros((scores.shape[0],), dtype="i")
-        guesses = scores.argmax(axis=1)
-        known_labels = numpy.ones((scores.shape[0], 1), dtype="f")
+        loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False)
+        truths = []
         for eg in examples:
+            eg_truths = []
             pos_tags = eg.get_aligned("POS", as_string=True)
             morphs = eg.get_aligned("MORPH", as_string=True)
             for i in range(len(morphs)):
@@ -104,20 +101,11 @@ class Morphologizer(Tagger):
                     morph = self.vocab.strings[self.vocab.morphology.add(feats)]
                 if morph == "":
                     morph = Morphology.EMPTY_MORPH
-                if morph is None:
-                    correct[idx] = guesses[idx]
-                elif morph in tag_index:
-                    correct[idx] = tag_index[morph]
-                else:
-                    correct[idx] = 0
-                    known_labels[idx] = 0.
-                idx += 1
-        correct = self.model.ops.xp.array(correct, dtype="i")
-        d_scores = scores - to_categorical(correct, n_classes=scores.shape[1])
-        d_scores *= self.model.ops.asarray(known_labels)
-        loss = (d_scores**2).sum()
-        docs = [eg.predicted for eg in examples]
-        d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs])
+                eg_truths.append(morph)
+            truths.append(eg_truths)
+        d_scores, loss = loss_func(scores, truths)
+        if self.model.ops.xp.isnan(loss):
+            raise ValueError("nan value when computing loss")
         return float(loss), d_scores

     def to_bytes(self, exclude=tuple()):
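
Both rewritten `get_loss` methods hand the bookkeeping to thinc's `SequenceCategoricalCrossentropy`, which takes one score matrix per sequence plus the gold labels as strings and returns the gradient together with the loss. A minimal standalone sketch; the label names and score values below are made up for illustration:

```python
import numpy
from thinc.api import SequenceCategoricalCrossentropy

# Hypothetical label inventory and scores for a single three-token sequence.
labels = ("Case=Nom", "Case=Acc", "_")
loss_func = SequenceCategoricalCrossentropy(names=labels, normalize=False)

# One score matrix per sequence: shape (n_tokens, n_labels).
scores = [numpy.asarray([[0.8, 0.1, 0.1],
                         [0.2, 0.7, 0.1],
                         [0.1, 0.1, 0.8]], dtype="f")]
# Gold annotations as label strings, matched against `names`.
truths = [["Case=Nom", "Case=Acc", "_"]]

# Same call pattern as in the new get_loss: gradient first, then the loss.
d_scores, loss = loss_func(scores, truths)
print(float(loss))        # scalar loss, not averaged (normalize=False)
print(d_scores[0].shape)  # (3, 3): gradient w.r.t. the sequence's scores
```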

@@ -521,29 +521,12 @@ class SentenceRecognizer(Tagger):
                     doc.c[j].sent_start = -1

     def get_loss(self, examples, scores):
-        scores = self.model.ops.flatten(scores)
-        tag_index = range(len(self.labels))
-        cdef int idx = 0
-        correct = numpy.zeros((scores.shape[0],), dtype="i")
-        guesses = scores.argmax(axis=1)
-        known_labels = numpy.ones((scores.shape[0], 1), dtype="f")
-        for eg in examples:
-            sent_starts = eg.get_aligned("sent_start")
-            for sent_start in sent_starts:
-                if sent_start is None:
-                    correct[idx] = guesses[idx]
-                elif sent_start in tag_index:
-                    correct[idx] = sent_start
-                else:
-                    correct[idx] = 0
-                    known_labels[idx] = 0.
-                idx += 1
-        correct = self.model.ops.xp.array(correct, dtype="i")
-        d_scores = scores - to_categorical(correct, n_classes=scores.shape[1])
-        d_scores *= self.model.ops.asarray(known_labels)
-        loss = (d_scores**2).sum()
-        docs = [eg.predicted for eg in examples]
-        d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs])
+        labels = self.labels
+        loss_func = SequenceCategoricalCrossentropy(names=labels, normalize=False)
+        truths = [[labels[x] for x in eg.get_aligned("sent_start")] for eg in examples]
+        d_scores, loss = loss_func(scores, truths)
+        if self.model.ops.xp.isnan(loss):
+            raise ValueError("nan value when computing loss")
         return float(loss), d_scores

     def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None,
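
For the sentence recognizer, the gold labels are produced by indexing into `self.labels` with the aligned `sent_start` values. A small sketch of that comprehension, assuming a label inventory of `("I", "S")` and illustrative alignment values:

```python
# Assumed label inventory: "S" = sentence start, "I" = inside a sentence.
labels = ("I", "S")

# Stand-in for eg.get_aligned("sent_start") on a five-token example (illustrative).
aligned_sent_start = [1, 0, 0, 1, 0]

# Same comprehension as the new get_loss: integer values become label strings,
# which is the format SequenceCategoricalCrossentropy(names=labels) expects.
truths = [labels[x] for x in aligned_sent_start]
print(truths)  # ['S', 'I', 'I', 'S', 'I']
```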