Handle unset token.morph in Morphologizer (#6704)

* Handle unset token.morph in Morphologizer

Handle unset `token.morph` in `Morphologizer.initialize` and
`Morphologizer.get_loss`. If both `token.morph` and `token.pos` are
unset, treat the annotation as missing rather than empty.

* Add token.has_morph()
This commit is contained in:
Adriane Boyd 2021-01-15 17:20:10 +01:00 committed by GitHub
parent 7b3f0c6f1b
commit 9328dd5625
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 58 additions and 3 deletions

View File

@ -145,6 +145,10 @@ class Morphologizer(Tagger):
for example in get_examples(): for example in get_examples():
for i, token in enumerate(example.reference): for i, token in enumerate(example.reference):
pos = token.pos_ pos = token.pos_
# if both are unset, annotation is missing, so do not add
# an empty label
if pos == "" and not token.has_morph():
continue
morph = str(token.morph) morph = str(token.morph)
# create and add the combined morph+POS label # create and add the combined morph+POS label
morph_dict = Morphology.feats_to_dict(morph) morph_dict = Morphology.feats_to_dict(morph)
@ -155,7 +159,7 @@ class Morphologizer(Tagger):
if norm_label not in self.cfg["labels_morph"]: if norm_label not in self.cfg["labels_morph"]:
self.cfg["labels_morph"][norm_label] = morph self.cfg["labels_morph"][norm_label] = morph
self.cfg["labels_pos"][norm_label] = POS_IDS[pos] self.cfg["labels_pos"][norm_label] = POS_IDS[pos]
if len(self.labels) <= 1: if len(self.labels) < 1:
raise ValueError(Errors.E143.format(name=self.name)) raise ValueError(Errors.E143.format(name=self.name))
doc_sample = [] doc_sample = []
label_sample = [] label_sample = []
@ -217,15 +221,24 @@ class Morphologizer(Tagger):
pos = pos_tags[i] pos = pos_tags[i]
morph = morphs[i] morph = morphs[i]
# POS may align (same value for multiple tokens) when morph # POS may align (same value for multiple tokens) when morph
# doesn't, so if either is None, treat both as None here so that # doesn't, so if either is misaligned (None), treat the
# truths doesn't end up with an unknown morph+POS combination # annotation as missing so that truths doesn't end up with an
# unknown morph+POS combination
if pos is None or morph is None: if pos is None or morph is None:
label = None label = None
# If both are unset, the annotation is missing (empty morph
# converted from int is "_" rather than "")
elif pos == "" and morph == "":
label = None
# Otherwise, generate the combined label
else: else:
label_dict = Morphology.feats_to_dict(morph) label_dict = Morphology.feats_to_dict(morph)
if pos: if pos:
label_dict[self.POS_FEAT] = pos label_dict[self.POS_FEAT] = pos
label = self.vocab.strings[self.vocab.morphology.add(label_dict)] label = self.vocab.strings[self.vocab.morphology.add(label_dict)]
# As a fail-safe, skip any unrecognized labels
if label not in self.labels:
label = None
eg_truths.append(label) eg_truths.append(label)
truths.append(eg_truths) truths.append(eg_truths)
d_scores, loss = loss_func(scores, truths) d_scores, loss = loss_func(scores, truths)

View File

@ -136,3 +136,28 @@ def test_overfitting_IO():
gold_pos_tags = ["", "", "", ""] gold_pos_tags = ["", "", "", ""]
assert [str(t.morph) for t in doc] == gold_morphs assert [str(t.morph) for t in doc] == gold_morphs
assert [t.pos_ for t in doc] == gold_pos_tags assert [t.pos_ for t in doc] == gold_pos_tags
# Test with unset morph and partial POS
nlp.remove_pipe("morphologizer")
nlp.add_pipe("morphologizer")
for example in train_examples:
for token in example.reference:
if token.text == "ham":
token.pos_ = "NOUN"
else:
token.pos_ = ""
token.set_morph(None)
optimizer = nlp.initialize(get_examples=lambda: train_examples)
print(nlp.get_pipe("morphologizer").labels)
for i in range(50):
losses = {}
nlp.update(train_examples, sgd=optimizer, losses=losses)
assert losses["morphologizer"] < 0.00001
# Test the trained model
test_text = "I like blue ham"
doc = nlp(test_text)
gold_morphs = ["", "", "", ""]
gold_pos_tags = ["NOUN", "NOUN", "NOUN", "NOUN"]
assert [str(t.morph) for t in doc] == gold_morphs
assert [t.pos_ for t in doc] == gold_pos_tags

View File

@ -211,6 +211,14 @@ cdef class Token:
xp = get_array_module(vector) xp = get_array_module(vector)
return (xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm)) return (xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm))
def has_morph(self):
    """Check whether the token has annotated morph information.

    Return False when the morph annotation is unset/missing.

    RETURNS (bool): Whether the morph annotation is set.
    """
    # A stored morph value of 0 is what this method treats as "unset";
    # any other value counts as an annotation being present.
    return self.c.morph != 0
property morph: property morph:
def __get__(self): def __get__(self):
return MorphAnalysis.from_id(self.vocab, self.c.morph) return MorphAnalysis.from_id(self.vocab, self.c.morph)

View File

@ -191,6 +191,15 @@ the morph to an unset state.
| -------- | --------------------------------------------------------------------------------- | | -------- | --------------------------------------------------------------------------------- |
| features | The morphological features to set. ~~Union[int, dict, str, MorphAnalysis, None]~~ | | features | The morphological features to set. ~~Union[int, dict, str, MorphAnalysis, None]~~ |
## Token.has_morph {#has_morph tag="method"}
Check whether the token has annotated morph information. Return `False` when the
morph annotation is unset/missing.
| Name | Description |
| ----------- | --------------------------------------------- |
| **RETURNS** | Whether the morph annotation is set. ~~bool~~ |
## Token.is_ancestor {#is_ancestor tag="method" model="parser"} ## Token.is_ancestor {#is_ancestor tag="method" model="parser"}
Check whether this token is a parent, grandparent, etc. of another in the Check whether this token is a parent, grandparent, etc. of another in the