From ab270364f17c45c2759c4bd057db8a08fdf19a62 Mon Sep 17 00:00:00 2001
From: Adriane Boyd
Date: Sun, 13 Sep 2020 14:06:07 +0200
Subject: [PATCH] Modify Token.morph to enable unsetting (#6043)

Modify `Token.morph` property so that `Token.c.morph` can be reset back
to an internal value of `0`.

Allow setting `Token.morph` from a hash as long as the morph string is
already in the `StringStore`, setting it indirectly through
`Token.morph_` so that the value is added to the morphology. If the hash
is not in the `StringStore`, raise an error.
---
 spacy/errors.py                       |  3 +++
 spacy/tests/doc/test_morphanalysis.py | 28 +++++++++++++++++++++++++++
 spacy/tokens/token.pyx                | 14 +++++++++++---
 3 files changed, 42 insertions(+), 3 deletions(-)

diff --git a/spacy/errors.py b/spacy/errors.py
index 7164598b6..8f95609a6 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -671,6 +671,9 @@ class Errors:
     E1007 = ("Unsupported DependencyMatcher operator '{op}'.")
     E1008 = ("Invalid pattern: each pattern should be a list of dicts. Check "
              "that you are providing a list of patterns as `List[List[dict]]`.")
+    E1009 = ("String for hash '{val}' not found in StringStore. Set the value "
+             "through token.morph_ instead or add the string to the "
+             "StringStore with `nlp.vocab.strings.add(string)`.")
 
 
 @add_codes
diff --git a/spacy/tests/doc/test_morphanalysis.py b/spacy/tests/doc/test_morphanalysis.py
index 6bfc198fd..f378ce042 100644
--- a/spacy/tests/doc/test_morphanalysis.py
+++ b/spacy/tests/doc/test_morphanalysis.py
@@ -66,3 +66,31 @@ def test_morph_set(i_has):
 def test_morph_str(i_has):
     assert str(i_has[0].morph) == "PronType=prs"
     assert str(i_has[1].morph) == "Number=sing|Person=three|Tense=pres|VerbForm=fin"
+
+
+def test_morph_property(tokenizer):
+    doc = tokenizer("a dog")
+
+    # set through token.morph_
+    doc[0].morph_ = "PronType=prs"
+    assert doc[0].morph_ == "PronType=prs"
+    assert doc.to_array(["MORPH"])[0] != 0
+
+    # unset with token.morph
+    doc[0].morph = 0
+    assert doc.to_array(["MORPH"])[0] == 0
+
+    # empty morph is equivalent to "_"
+    doc[0].morph_ = ""
+    assert doc[0].morph_ == ""
+    assert doc.to_array(["MORPH"])[0] == tokenizer.vocab.strings["_"]
+
+    # "_" morph is also equivalent to empty morph
+    doc[0].morph_ = "_"
+    assert doc[0].morph_ == ""
+    assert doc.to_array(["MORPH"])[0] == tokenizer.vocab.strings["_"]
+
+    # set through existing hash with token.morph
+    tokenizer.vocab.strings.add("Feat=Val")
+    doc[0].morph = tokenizer.vocab.strings.add("Feat=Val")
+    assert doc[0].morph_ == "Feat=Val"
diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx
index 50f1c5da3..2474f0637 100644
--- a/spacy/tokens/token.pyx
+++ b/spacy/tokens/token.pyx
@@ -214,9 +214,17 @@ cdef class Token:
         xp = get_array_module(vector)
         return (xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm))
 
-    @property
-    def morph(self):
-        return MorphAnalysis.from_id(self.vocab, self.c.morph)
+    property morph:
+        def __get__(self):
+            return MorphAnalysis.from_id(self.vocab, self.c.morph)
+
+        def __set__(self, attr_t morph):
+            if morph == 0:
+                self.c.morph = morph
+            elif morph in self.vocab.strings:
+                self.morph_ = self.vocab.strings[morph]
+            else:
+                raise ValueError(Errors.E1009.format(val=morph))
 
     property morph_:
         def __get__(self):