Update morphologizer (#5766)

* update `Morphologizer.begin_training` for use with `Example`

* make init and begin_training more consistent

* add `Morphology.normalize_features` to normalize outside of
`Morphology.add`

* make sure `get_loss` doesn't create unknown labels when the POS and
morph alignments differ
This commit is contained in:
Adriane Boyd 2020-07-19 11:10:51 +02:00 committed by GitHub
parent 38b59d728d
commit b81a89f0a9
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 91 additions and 40 deletions

View File

@ -58,7 +58,7 @@ cdef class Morphology:
FEATURE_SEP = "|" FEATURE_SEP = "|"
FIELD_SEP = "=" FIELD_SEP = "="
VALUE_SEP = "," VALUE_SEP = ","
EMPTY_MORPH = "_" EMPTY_MORPH = "_" # not an empty string so that the PreshMap key is not 0
def __init__(self, StringStore strings, tag_map, lemmatizer, exc=None): def __init__(self, StringStore strings, tag_map, lemmatizer, exc=None):
self.mem = Pool() self.mem = Pool()
@ -117,13 +117,7 @@ cdef class Morphology:
if not isinstance(features, dict): if not isinstance(features, dict):
warnings.warn(Warnings.W100.format(feature=features)) warnings.warn(Warnings.W100.format(feature=features))
features = {} features = {}
features = _normalize_props(features)
string_features = {self.strings.as_string(field): self.strings.as_string(values) for field, values in features.items()} string_features = {self.strings.as_string(field): self.strings.as_string(values) for field, values in features.items()}
# normalized UFEATS string with sorted fields and values
norm_feats_string = self.FEATURE_SEP.join(sorted([
self.FIELD_SEP.join([field, values])
for field, values in string_features.items()
]))
# intified ("Field", "Field=Value") pairs # intified ("Field", "Field=Value") pairs
field_feature_pairs = [] field_feature_pairs = []
for field in sorted(string_features): for field in sorted(string_features):
@ -137,6 +131,7 @@ cdef class Morphology:
# the hash key for the tag is either the hash of the normalized UFEATS # the hash key for the tag is either the hash of the normalized UFEATS
# string or the hash of an empty placeholder (using the empty string # string or the hash of an empty placeholder (using the empty string
# would give a hash key of 0, which is not good for PreshMap) # would give a hash key of 0, which is not good for PreshMap)
norm_feats_string = self.normalize_features(features)
if norm_feats_string: if norm_feats_string:
tag.key = self.strings.add(norm_feats_string) tag.key = self.strings.add(norm_feats_string)
else: else:
@ -144,6 +139,26 @@ cdef class Morphology:
self.insert(tag) self.insert(tag)
return tag.key return tag.key
def normalize_features(self, features):
    """Build a normalized UFEATS string from a features string or dict.

    A string input is first parsed with `feats_to_dict`. Input that is
    neither a string nor a dict triggers warning W100 and is treated as
    having no features.

    features (Union[dict, str]): Features as dict or UFEATS string.
    RETURNS (str): Features as normalized UFEATS string, or `EMPTY_MORPH`
        if the normalized string would be empty.
    """
    if isinstance(features, str):
        features = self.feats_to_dict(features)
    if not isinstance(features, dict):
        warnings.warn(Warnings.W100.format(feature=features))
        features = {}
    props = _normalize_props(features)
    # convert keys and values to strings via a dict, so that keys which
    # become identical after conversion end up as a single entry
    as_strings = {}
    for field, values in props.items():
        as_strings[self.strings.as_string(field)] = self.strings.as_string(values)
    # one "Field=Values" entry per field, sorted to get a canonical order
    entries = [
        self.FIELD_SEP.join([field, values])
        for field, values in as_strings.items()
    ]
    entries.sort()
    norm_feats_string = self.FEATURE_SEP.join(entries)
    if not norm_feats_string:
        # never return an empty string: fall back to the placeholder
        return self.EMPTY_MORPH
    return norm_feats_string
cdef MorphAnalysisC create_morph_tag(self, field_feature_pairs) except *: cdef MorphAnalysisC create_morph_tag(self, field_feature_pairs) except *:
"""Creates a MorphAnalysisC from a list of intified """Creates a MorphAnalysisC from a list of intified
("Field", "Field=Value") tuples where fields with multiple values have ("Field", "Field=Value") tuples where fields with multiple values have

View File

@ -23,29 +23,45 @@ from .defaults import default_morphologizer
@component("morphologizer", assigns=["token.morph", "token.pos"], default_model=default_morphologizer) @component("morphologizer", assigns=["token.morph", "token.pos"], default_model=default_morphologizer)
class Morphologizer(Tagger): class Morphologizer(Tagger):
POS_FEAT = "POS"
def __init__(self, vocab, model, **cfg): def __init__(self, vocab, model, **cfg):
self.vocab = vocab self.vocab = vocab
self.model = model self.model = model
self._rehearsal_model = None self._rehearsal_model = None
self.cfg = dict(sorted(cfg.items())) self.cfg = dict(sorted(cfg.items()))
self.cfg.setdefault("labels", {}) # to be able to set annotations without string operations on labels,
self.cfg.setdefault("morph_pos", {}) # store mappings from morph+POS labels to token-level annotations:
# 1) labels_morph stores a mapping from morph+POS->morph
self.cfg.setdefault("labels_morph", {})
# 2) labels_pos stores a mapping from morph+POS->POS
self.cfg.setdefault("labels_pos", {})
# add mappings for empty morph
self.cfg["labels_morph"][Morphology.EMPTY_MORPH] = Morphology.EMPTY_MORPH
self.cfg["labels_pos"][Morphology.EMPTY_MORPH] = POS_IDS[""]
@property @property
def labels(self): def labels(self):
return tuple(self.cfg["labels"].keys()) return tuple(self.cfg["labels_morph"].keys())
def add_label(self, label): def add_label(self, label):
if not isinstance(label, str): if not isinstance(label, str):
raise ValueError(Errors.E187) raise ValueError(Errors.E187)
if label in self.labels: if label in self.labels:
return 0 return 0
morph = Morphology.feats_to_dict(label) # normalize label
norm_morph_pos = self.vocab.strings[self.vocab.morphology.add(morph)] norm_label = self.vocab.morphology.normalize_features(label)
pos = morph.get("POS", "") # extract separate POS and morph tags
if norm_morph_pos not in self.cfg["labels"]: label_dict = Morphology.feats_to_dict(label)
self.cfg["labels"][norm_morph_pos] = norm_morph_pos pos = label_dict.get(self.POS_FEAT, "")
self.cfg["morph_pos"][norm_morph_pos] = POS_IDS[pos] if self.POS_FEAT in label_dict:
label_dict.pop(self.POS_FEAT)
# normalize morph string and add to morphology table
norm_morph = self.vocab.strings[self.vocab.morphology.add(label_dict)]
# add label mappings
if norm_label not in self.cfg["labels_morph"]:
self.cfg["labels_morph"][norm_label] = norm_morph
self.cfg["labels_pos"][norm_label] = POS_IDS[pos]
return 1 return 1
def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None, def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None,
@ -53,14 +69,16 @@ class Morphologizer(Tagger):
for example in get_examples(): for example in get_examples():
for i, token in enumerate(example.reference): for i, token in enumerate(example.reference):
pos = token.pos_ pos = token.pos_
morph = token.morph morph = token.morph_
norm_morph = self.vocab.strings[self.vocab.morphology.add(morph)] # create and add the combined morph+POS label
morph_dict = Morphology.feats_to_dict(morph)
if pos: if pos:
morph["POS"] = pos morph_dict[self.POS_FEAT] = pos
norm_morph_pos = self.vocab.strings[self.vocab.morphology.add(morph)] norm_label = self.vocab.strings[self.vocab.morphology.add(morph_dict)]
if norm_morph_pos not in self.cfg["labels"]: # add label->morph and label->POS mappings
self.cfg["labels"][norm_morph_pos] = norm_morph if norm_label not in self.cfg["labels_morph"]:
self.cfg["morph_pos"][norm_morph_pos] = POS_IDS[pos] self.cfg["labels_morph"][norm_label] = morph
self.cfg["labels_pos"][norm_label] = POS_IDS[pos]
self.set_output(len(self.labels)) self.set_output(len(self.labels))
self.model.initialize() self.model.initialize()
link_vectors_to_models(self.vocab) link_vectors_to_models(self.vocab)
@ -79,8 +97,8 @@ class Morphologizer(Tagger):
doc_tag_ids = doc_tag_ids.get() doc_tag_ids = doc_tag_ids.get()
for j, tag_id in enumerate(doc_tag_ids): for j, tag_id in enumerate(doc_tag_ids):
morph = self.labels[tag_id] morph = self.labels[tag_id]
doc.c[j].morph = self.vocab.morphology.add(self.cfg["labels"][morph]) doc.c[j].morph = self.vocab.morphology.add(self.cfg["labels_morph"][morph])
doc.c[j].pos = self.cfg["morph_pos"][morph] doc.c[j].pos = self.cfg["labels_pos"][morph]
doc.is_morphed = True doc.is_morphed = True
@ -94,14 +112,17 @@ class Morphologizer(Tagger):
for i in range(len(morphs)): for i in range(len(morphs)):
pos = pos_tags[i] pos = pos_tags[i]
morph = morphs[i] morph = morphs[i]
feats = Morphology.feats_to_dict(morph) # POS may align (same value for multiple tokens) when morph
# doesn't, so if either is None, treat both as None here so that
# truths doesn't end up with an unknown morph+POS combination
if pos is None or morph is None:
pos = None
morph = None
label_dict = Morphology.feats_to_dict(morph)
if pos: if pos:
feats["POS"] = pos label_dict[self.POS_FEAT] = pos
if len(feats) > 0: label = self.vocab.strings[self.vocab.morphology.add(label_dict)]
morph = self.vocab.strings[self.vocab.morphology.add(feats)] eg_truths.append(label)
if morph == "":
morph = Morphology.EMPTY_MORPH
eg_truths.append(morph)
truths.append(eg_truths) truths.append(eg_truths)
d_scores, loss = loss_func(scores, truths) d_scores, loss = loss_func(scores, truths)
if self.model.ops.xp.isnan(loss): if self.model.ops.xp.isnan(loss):

View File

@ -5,6 +5,7 @@ from spacy.gold import Example
from spacy.lang.en import English from spacy.lang.en import English
from spacy.language import Language from spacy.language import Language
from spacy.tests.util import make_tempdir from spacy.tests.util import make_tempdir
from spacy.morphology import Morphology
def test_label_types(): def test_label_types():
@ -23,9 +24,10 @@ TRAIN_DATA = [
"pos": ["NOUN", "VERB", "ADJ", "NOUN"], "pos": ["NOUN", "VERB", "ADJ", "NOUN"],
}, },
), ),
# test combinations of morph+POS
( (
"Eat blue ham", "Eat blue ham",
{"morphs": ["Feat=V", "Feat=J", "Feat=N"], "pos": ["VERB", "ADJ", "NOUN"]}, {"morphs": ["Feat=V", "", ""], "pos": ["", "ADJ", ""]},
), ),
] ]
@ -38,7 +40,12 @@ def test_overfitting_IO():
for inst in TRAIN_DATA: for inst in TRAIN_DATA:
train_examples.append(Example.from_dict(nlp.make_doc(inst[0]), inst[1])) train_examples.append(Example.from_dict(nlp.make_doc(inst[0]), inst[1]))
for morph, pos in zip(inst[1]["morphs"], inst[1]["pos"]): for morph, pos in zip(inst[1]["morphs"], inst[1]["pos"]):
morphologizer.add_label(morph + "|POS=" + pos) if morph and pos:
morphologizer.add_label(morph + Morphology.FEATURE_SEP + "POS" + Morphology.FIELD_SEP + pos)
elif pos:
morphologizer.add_label("POS" + Morphology.FIELD_SEP + pos)
elif morph:
morphologizer.add_label(morph)
nlp.add_pipe(morphologizer) nlp.add_pipe(morphologizer)
optimizer = nlp.begin_training() optimizer = nlp.begin_training()
@ -48,19 +55,27 @@ def test_overfitting_IO():
assert losses["morphologizer"] < 0.00001 assert losses["morphologizer"] < 0.00001
# test the trained model # test the trained model
test_text = "I like blue eggs" test_text = "I like blue ham"
doc = nlp(test_text) doc = nlp(test_text)
gold_morphs = [ gold_morphs = [
"Feat=N|POS=NOUN", "Feat=N",
"Feat=V|POS=VERB", "Feat=V",
"Feat=J|POS=ADJ", "",
"Feat=N|POS=NOUN", "",
]
gold_pos_tags = [
"NOUN",
"VERB",
"ADJ",
"",
] ]
assert [t.morph_ for t in doc] == gold_morphs assert [t.morph_ for t in doc] == gold_morphs
assert [t.pos_ for t in doc] == gold_pos_tags
# Also test the results are still the same after IO # Also test the results are still the same after IO
with make_tempdir() as tmp_dir: with make_tempdir() as tmp_dir:
nlp.to_disk(tmp_dir) nlp.to_disk(tmp_dir)
nlp2 = util.load_model_from_path(tmp_dir) nlp2 = util.load_model_from_path(tmp_dir)
doc2 = nlp2(test_text) doc2 = nlp2(test_text)
assert gold_morphs == [t.morph_ for t in doc2] assert [t.morph_ for t in doc2] == gold_morphs
assert [t.pos_ for t in doc2] == gold_pos_tags