Ignore misaligned in Morphologizer.get_loss (#6363)
Fix bug where `Morphologizer.get_loss` treated misaligned annotation as `EMPTY_MORPH` rather than ignoring it. Remove unneeded default `EMPTY_MORPH` mappings.
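For illustration, here is a minimal, self-contained sketch of the behaviour this commit changes. It is not spaCy's actual implementation (it skips `Morphology.feats_to_dict` and the `StringStore` round-trip); `pos` and `morph` stand for the per-token values returned by `Example.get_aligned`, which are `None` when the annotation is misaligned:

```python
EMPTY_MORPH = "_"  # stand-in for Morphology.EMPTY_MORPH


def truth_before(pos, morph):
    # Old behaviour: a misaligned token was nulled out but still turned
    # into a label, which collapsed it to EMPTY_MORPH in the truths.
    if pos is None or morph is None:
        pos = None
        morph = None
    label = morph if morph else EMPTY_MORPH
    if pos:
        label += "|POS=" + pos
    return label


def truth_after(pos, morph):
    # New behaviour: a misaligned token yields None, which the loss
    # treats as a missing value and ignores.
    if pos is None or morph is None:
        return None
    label = morph if morph else EMPTY_MORPH
    if pos:
        label += "|POS=" + pos
    return label


print(truth_before("NOUN", None))  # '_'  -> model was trained toward EMPTY_MORPH
print(truth_after("NOUN", None))   # None -> position is ignored in the loss
```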
parent a0c899a0ff
commit a7e7d6c6c9
@@ -92,9 +92,6 @@ class Morphologizer(Tagger):
         # 2) labels_pos stores a mapping from morph+POS->POS
         cfg = {"labels_morph": labels_morph or {}, "labels_pos": labels_pos or {}}
         self.cfg = dict(sorted(cfg.items()))
-        # add mappings for empty morph
-        self.cfg["labels_morph"][Morphology.EMPTY_MORPH] = Morphology.EMPTY_MORPH
-        self.cfg["labels_pos"][Morphology.EMPTY_MORPH] = POS_IDS[""]
 
     @property
     def labels(self):
@@ -201,8 +198,8 @@ class Morphologizer(Tagger):
                 doc_tag_ids = doc_tag_ids.get()
             for j, tag_id in enumerate(doc_tag_ids):
                 morph = self.labels[tag_id]
-                doc.c[j].morph = self.vocab.morphology.add(self.cfg["labels_morph"][morph])
-                doc.c[j].pos = self.cfg["labels_pos"][morph]
+                doc.c[j].morph = self.vocab.morphology.add(self.cfg["labels_morph"].get(morph, 0))
+                doc.c[j].pos = self.cfg["labels_pos"].get(morph, 0)
 
     def get_loss(self, examples, scores):
         """Find the loss and gradient of loss for the batch of documents and
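The `.get(morph, 0)` fallback in `set_annotations` goes hand in hand with dropping the default `EMPTY_MORPH` entries from `cfg`: a label with no stored morph/POS mapping now resolves to `0`, which the diff uses as the empty fallback value, instead of requiring an explicit mapping. A tiny sketch of just that lookup pattern, with made-up label strings and an arbitrary stand-in POS id:

```python
# Hypothetical cfg contents; the real mappings are built from training data.
labels_morph = {"Feat=N|POS=NOUN": "Feat=N"}
labels_pos = {"Feat=N|POS=NOUN": 92}          # 92 is an arbitrary stand-in id

unmapped = "_"                                # a label with no stored mapping

# Before: plain indexing needed an explicit EMPTY_MORPH entry in cfg,
# otherwise this lookup would raise KeyError.
try:
    labels_morph[unmapped]
except KeyError:
    print("KeyError without a default mapping")

# After: .get() falls back to 0 for anything without a mapping.
print(labels_morph.get(unmapped, 0))          # 0
print(labels_pos.get(unmapped, 0))            # 0
```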
@@ -228,12 +225,12 @@ class Morphologizer(Tagger):
                 # doesn't, so if either is None, treat both as None here so that
                 # truths doesn't end up with an unknown morph+POS combination
                 if pos is None or morph is None:
-                    pos = None
-                    morph = None
-                label_dict = Morphology.feats_to_dict(morph)
-                if pos:
-                    label_dict[self.POS_FEAT] = pos
-                label = self.vocab.strings[self.vocab.morphology.add(label_dict)]
+                    label = None
+                else:
+                    label_dict = Morphology.feats_to_dict(morph)
+                    if pos:
+                        label_dict[self.POS_FEAT] = pos
+                    label = self.vocab.strings[self.vocab.morphology.add(label_dict)]
                 eg_truths.append(label)
             truths.append(eg_truths)
         d_scores, loss = loss_func(scores, truths)
@@ -116,3 +116,23 @@ def test_overfitting_IO():
     no_batch_deps = [doc.to_array([MORPH]) for doc in [nlp(text) for text in texts]]
     assert_equal(batch_deps_1, batch_deps_2)
     assert_equal(batch_deps_1, no_batch_deps)
+
+    # Test without POS
+    nlp.remove_pipe("morphologizer")
+    nlp.add_pipe("morphologizer")
+    for example in train_examples:
+        for token in example.reference:
+            token.pos_ = ""
+    optimizer = nlp.initialize(get_examples=lambda: train_examples)
+    for i in range(50):
+        losses = {}
+        nlp.update(train_examples, sgd=optimizer, losses=losses)
+    assert losses["morphologizer"] < 0.00001
+
+    # Test the trained model
+    test_text = "I like blue ham"
+    doc = nlp(test_text)
+    gold_morphs = ["Feat=N", "Feat=V", "", ""]
+    gold_pos_tags = ["", "", "", ""]
+    assert [str(t.morph) for t in doc] == gold_morphs
+    assert [t.pos_ for t in doc] == gold_pos_tags