Handle unset token.morph in Morphologizer (#6704)
* Handle unset token.morph in Morphologizer

  Handle unset `token.morph` in `Morphologizer.initialize` and
  `Morphologizer.get_loss`. If both `token.morph` and `token.pos` are unset,
  treat the annotation as missing rather than empty.

* Add token.has_morph()
parent 7b3f0c6f1b
commit 9328dd5625
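For context, here is a minimal sketch (not part of the commit) of the behavior this change targets. It assumes a blank English pipeline and a single made-up training example; the words, the POS tag and the expected label output are illustrative only.

```python
# Minimal, illustrative sketch: with this change, tokens that have neither
# POS nor morph set are treated as missing annotation and no longer add an
# empty label during initialization.
import spacy
from spacy.training import Example

nlp = spacy.blank("en")
nlp.add_pipe("morphologizer")

words = ["I", "like", "ham"]
doc = nlp.make_doc(" ".join(words))
example = Example.from_dict(doc, {"words": words})
# Only "ham" gets a POS tag; "I" and "like" have neither POS nor morph.
example.reference[2].pos_ = "NOUN"

nlp.initialize(get_examples=lambda: [example])
# Expected to list only the combined POS=NOUN label, with no empty entry.
print(nlp.get_pipe("morphologizer").labels)
```

Before this change, the tokens with neither POS nor morph would each have contributed an empty label during initialization instead of being skipped.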
@@ -145,6 +145,10 @@ class Morphologizer(Tagger):
         for example in get_examples():
             for i, token in enumerate(example.reference):
                 pos = token.pos_
+                # if both are unset, annotation is missing, so do not add
+                # an empty label
+                if pos == "" and not token.has_morph():
+                    continue
                 morph = str(token.morph)
                 # create and add the combined morph+POS label
                 morph_dict = Morphology.feats_to_dict(morph)
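As an aside on the hunk above, a small hypothetical snippet showing how the combined morph+POS label is assembled; the feature string, the POS tag and the resulting label string are illustrative, and `Morphology.feats_to_dict` is assumed to be the public helper the component uses.

```python
from spacy.morphology import Morphology

# Hypothetical illustration of the combined morph+POS label built above.
morph = "Case=Nom|Number=Sing"   # as returned by str(token.morph)
pos = "NOUN"                     # as returned by token.pos_
morph_dict = Morphology.feats_to_dict(morph)
print(morph_dict)                # {'Case': 'Nom', 'Number': 'Sing'}
if pos:
    morph_dict["POS"] = pos      # the component's POS feature key
# The normalized label keyed into cfg["labels_morph"] then corresponds to a
# feature string along the lines of "Case=Nom|Number=Sing|POS=NOUN".
```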
@@ -155,7 +159,7 @@ class Morphologizer(Tagger):
                 if norm_label not in self.cfg["labels_morph"]:
                     self.cfg["labels_morph"][norm_label] = morph
                     self.cfg["labels_pos"][norm_label] = POS_IDS[pos]
-        if len(self.labels) <= 1:
+        if len(self.labels) < 1:
             raise ValueError(Errors.E143.format(name=self.name))
         doc_sample = []
         label_sample = []
@@ -217,15 +221,24 @@ class Morphologizer(Tagger):
                 pos = pos_tags[i]
                 morph = morphs[i]
                 # POS may align (same value for multiple tokens) when morph
-                # doesn't, so if either is None, treat both as None here so that
-                # truths doesn't end up with an unknown morph+POS combination
+                # doesn't, so if either is misaligned (None), treat the
+                # annotation as missing so that truths doesn't end up with an
+                # unknown morph+POS combination
                 if pos is None or morph is None:
                     label = None
+                # If both are unset, the annotation is missing (empty morph
+                # converted from int is "_" rather than "")
+                elif pos == "" and morph == "":
+                    label = None
+                # Otherwise, generate the combined label
                 else:
                     label_dict = Morphology.feats_to_dict(morph)
                     if pos:
                         label_dict[self.POS_FEAT] = pos
                     label = self.vocab.strings[self.vocab.morphology.add(label_dict)]
+                    # As a fail-safe, skip any unrecognized labels
+                    if label not in self.labels:
+                        label = None
                 eg_truths.append(label)
             truths.append(eg_truths)
         d_scores, loss = loss_func(scores, truths)
@@ -136,3 +136,28 @@ def test_overfitting_IO():
     gold_pos_tags = ["", "", "", ""]
     assert [str(t.morph) for t in doc] == gold_morphs
     assert [t.pos_ for t in doc] == gold_pos_tags
+
+    # Test with unset morph and partial POS
+    nlp.remove_pipe("morphologizer")
+    nlp.add_pipe("morphologizer")
+    for example in train_examples:
+        for token in example.reference:
+            if token.text == "ham":
+                token.pos_ = "NOUN"
+            else:
+                token.pos_ = ""
+            token.set_morph(None)
+    optimizer = nlp.initialize(get_examples=lambda: train_examples)
+    print(nlp.get_pipe("morphologizer").labels)
+    for i in range(50):
+        losses = {}
+        nlp.update(train_examples, sgd=optimizer, losses=losses)
+    assert losses["morphologizer"] < 0.00001
+
+    # Test the trained model
+    test_text = "I like blue ham"
+    doc = nlp(test_text)
+    gold_morphs = ["", "", "", ""]
+    gold_pos_tags = ["NOUN", "NOUN", "NOUN", "NOUN"]
+    assert [str(t.morph) for t in doc] == gold_morphs
+    assert [t.pos_ for t in doc] == gold_pos_tags
@@ -211,6 +211,14 @@ cdef class Token:
         xp = get_array_module(vector)
         return (xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm))
 
+    def has_morph(self):
+        """Check whether the token has annotated morph information.
+        Return False when the morph annotation is unset/missing.
+
+        RETURNS (bool): Whether the morph annotation is set.
+        """
+        return not self.c.morph == 0
+
     property morph:
         def __get__(self):
             return MorphAnalysis.from_id(self.vocab, self.c.morph)
@@ -191,6 +191,15 @@ the morph to an unset state.
 | -------- | --------------------------------------------------------------------------------- |
 | features | The morphological features to set. ~~Union[int, dict, str, MorphAnalysis, None]~~ |
 
+## Token.has_morph {#has_morph tag="method"}
+
+Check whether the token has annotated morph information. Return `False` when the
+morph annotation is unset/missing.
+
+| Name        | Description                                   |
+| ----------- | --------------------------------------------- |
+| **RETURNS** | Whether the morph annotation is set. ~~bool~~ |
+
 ## Token.is_ancestor {#is_ancestor tag="method" model="parser"}
 
 Check whether this token is a parent, grandparent, etc. of another in the
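To complement the documentation hunk above, a short usage sketch for the new method (not part of the commit; the pipeline and the feature string are made up):

```python
# Usage sketch for Token.has_morph (illustrative; blank pipeline, made-up feats).
import spacy

nlp = spacy.blank("en")
doc = nlp("I like blue ham")

# Freshly tokenized text has no morph annotation yet.
assert not doc[0].has_morph()
assert str(doc[0].morph) == ""

# Once features are set, has_morph reports True.
doc[0].set_morph("Case=Nom|Number=Sing")
assert doc[0].has_morph()

# Setting the morph to None resets the token to the unset state.
doc[0].set_morph(None)
assert not doc[0].has_morph()
```

Because an unset morph also stringifies to an empty string, `has_morph` gives the Morphologizer a way to tell genuinely missing annotation apart from an empty value.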