From 50db3f0cdb7795526fa00748a8684cc29a2a489c Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Thu, 16 Jul 2020 21:20:15 +0200 Subject: [PATCH] Serialize morph rules with tagger Serialize `morph_rules` with the tagger alongside the `tag_map`. Use `Morphology.load_tag_map` and `Morphology.load_morph_exceptions` to load these settings rather than reinitializing the morphology each time they are changed. --- spacy/pipeline/pipes.pyx | 37 +++++++++++++++++------------ spacy/tests/pipeline/test_tagger.py | 8 +++++-- 2 files changed, 28 insertions(+), 17 deletions(-) diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index 1234733d0..08dce2fbd 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -359,9 +359,7 @@ class Tagger(Pipe): if new_tag_map: if "_SP" in orig_tag_map: new_tag_map["_SP"] = orig_tag_map["_SP"] - vocab.morphology = Morphology(vocab.strings, new_tag_map, - vocab.morphology.lemmatizer, - exc=vocab.morphology.exc) + vocab.morphology.load_tag_map(new_tag_map) self.set_output(len(self.labels)) doc_sample = [Doc(self.vocab, words=["hello", "world"])] if pipeline is not None: @@ -400,10 +398,7 @@ class Tagger(Pipe): if values is None: values = {POS: "X"} tag_map[label] = values - self.vocab.morphology = Morphology( - self.vocab.strings, tag_map=tag_map, - lemmatizer=self.vocab.morphology.lemmatizer, - exc=self.vocab.morphology.exc) + self.vocab.morphology.load_tag_map(tag_map) return 1 def use_params(self, params): @@ -417,6 +412,8 @@ class Tagger(Pipe): serialize["cfg"] = lambda: srsly.json_dumps(self.cfg) tag_map = dict(sorted(self.vocab.morphology.tag_map.items())) serialize["tag_map"] = lambda: srsly.msgpack_dumps(tag_map) + morph_rules = dict(self.vocab.morphology.exc) + serialize["morph_rules"] = lambda: srsly.msgpack_dumps(morph_rules) return util.to_bytes(serialize, exclude) def from_bytes(self, bytes_data, exclude=tuple()): @@ -428,14 +425,18 @@ class Tagger(Pipe): def load_tag_map(b): tag_map = 
srsly.msgpack_loads(b) - self.vocab.morphology = Morphology( - self.vocab.strings, tag_map=tag_map, - lemmatizer=self.vocab.morphology.lemmatizer, - exc=self.vocab.morphology.exc) + self.vocab.morphology.load_tag_map(tag_map) + def load_morph_rules(b): + morph_rules = srsly.msgpack_loads(b) + self.vocab.morphology.load_morph_exceptions(morph_rules) + + self.vocab.morphology = Morphology(self.vocab.strings, dict(), + lemmatizer=self.vocab.morphology.lemmatizer) deserialize = { "vocab": lambda b: self.vocab.from_bytes(b), "tag_map": load_tag_map, + "morph_rules": load_morph_rules, "cfg": lambda b: self.cfg.update(srsly.json_loads(b)), "model": lambda b: load_model(b), } @@ -444,9 +445,11 @@ class Tagger(Pipe): def to_disk(self, path, exclude=tuple()): tag_map = dict(sorted(self.vocab.morphology.tag_map.items())) + morph_rules = dict(self.vocab.morphology.exc) serialize = { "vocab": lambda p: self.vocab.to_disk(p), "tag_map": lambda p: srsly.write_msgpack(p, tag_map), + "morph_rules": lambda p: srsly.write_msgpack(p, morph_rules), "model": lambda p: self.model.to_disk(p), "cfg": lambda p: srsly.write_json(p, self.cfg), } @@ -462,15 +465,19 @@ class Tagger(Pipe): def load_tag_map(p): tag_map = srsly.read_msgpack(p) - self.vocab.morphology = Morphology( - self.vocab.strings, tag_map=tag_map, - lemmatizer=self.vocab.morphology.lemmatizer, - exc=self.vocab.morphology.exc) + self.vocab.morphology.load_tag_map(tag_map) + def load_morph_rules(p): + morph_rules = srsly.read_msgpack(p) + self.vocab.morphology.load_morph_exceptions(morph_rules) + + self.vocab.morphology = Morphology(self.vocab.strings, dict(), + lemmatizer=self.vocab.morphology.lemmatizer) deserialize = { "vocab": lambda p: self.vocab.from_disk(p), "cfg": lambda p: self.cfg.update(_load_cfg(p)), "tag_map": load_tag_map, + "morph_rules": load_morph_rules, "model": load_model, } util.from_disk(path, deserialize, exclude) diff --git a/spacy/tests/pipeline/test_tagger.py b/spacy/tests/pipeline/test_tagger.py index 
aedf8e2b3..81ae3a42c 100644 --- a/spacy/tests/pipeline/test_tagger.py +++ b/spacy/tests/pipeline/test_tagger.py @@ -17,6 +17,8 @@ def test_label_types(): TAG_MAP = {"N": {"pos": "NOUN"}, "V": {"pos": "VERB"}, "J": {"pos": "ADJ"}} +MORPH_RULES = {"V": {"like": {"lemma": "luck"}}} + TRAIN_DATA = [ ("I like green eggs", {"tags": ["N", "V", "J", "N"]}), ("Eat blue ham", {"tags": ["V", "J", "N"]}), @@ -26,9 +28,9 @@ TRAIN_DATA = [ def test_overfitting_IO(): # Simple test to try and quickly overfit the tagger - ensuring the ML models work correctly nlp = English() + nlp.vocab.morphology.load_tag_map(TAG_MAP) + nlp.vocab.morphology.load_morph_exceptions(MORPH_RULES) tagger = nlp.create_pipe("tagger") - for tag, values in TAG_MAP.items(): - tagger.add_label(tag, values) train_examples = [] for t in TRAIN_DATA: train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1])) @@ -47,6 +49,7 @@ def test_overfitting_IO(): assert doc[1].tag_ is "V" assert doc[2].tag_ is "J" assert doc[3].tag_ is "N" + assert doc[1].lemma_ == "luck" # Also test the results are still the same after IO with make_tempdir() as tmp_dir: @@ -57,3 +60,4 @@ def test_overfitting_IO(): assert doc2[1].tag_ is "V" assert doc2[2].tag_ is "J" assert doc2[3].tag_ is "N" + assert doc2[1].lemma_ == "luck"