Add MORPH attr, add support in retokenizer (#4947)

* Add MORPH attr / symbol for token attrs
* Update retokenizer for MORPH
2025-07-15 18:52:29 +03:00 · 2020-01-29 17:45:46 +01:00 · 2020-01-29 17:45:46 +01:00 · 5ee9d8c9b8
commit 5ee9d8c9b8
parent a365359b36
8 changed files with 20 additions and 2 deletions
--- a/spacy/attrs.pxd
+++ b/spacy/attrs.pxd
@ -91,3 +91,4 @@ cdef enum attr_id_t:

    LANG
    ENT_KB_ID = symbols.ENT_KB_ID
+    MORPH
--- a/spacy/attrs.pyx
+++ b/spacy/attrs.pyx
@ -87,6 +87,7 @@ IDS = {
    "SPACY": SPACY,
    "PROB": PROB,
    "LANG": LANG,
+    "MORPH": MORPH,
 }


--- a/spacy/symbols.pxd
+++ b/spacy/symbols.pxd
@ -462,3 +462,4 @@ cdef enum symbol_t:
    acl

    ENT_KB_ID
+    MORPH
--- a/spacy/symbols.pyx
+++ b/spacy/symbols.pyx
@ -462,6 +462,7 @@ IDS = {

    "acl": acl,
    "LAW": LAW,
+    "MORPH": MORPH,
 }


--- a/spacy/tests/doc/test_retokenize_merge.py
+++ b/spacy/tests/doc/test_retokenize_merge.py
@ -8,7 +8,7 @@ from ..util import get_doc

 def test_doc_retokenize_merge(en_tokenizer):
    text = "WKRO played songs by the beach boys all night"
-    attrs = {"tag": "NAMED", "lemma": "LEMMA", "ent_type": "TYPE"}
+    attrs = {"tag": "NAMED", "lemma": "LEMMA", "ent_type": "TYPE", "morph": "Number=Plur"}
    doc = en_tokenizer(text)
    assert len(doc) == 9
    with doc.retokenize() as retokenizer:
@ -18,9 +18,11 @@ def test_doc_retokenize_merge(en_tokenizer):
    assert doc[4].text == "the beach boys"
    assert doc[4].text_with_ws == "the beach boys "
    assert doc[4].tag_ == "NAMED"
+    assert doc[4].morph_ == "Number=Plur"
    assert doc[5].text == "all night"
    assert doc[5].text_with_ws == "all night"
    assert doc[5].tag_ == "NAMED"
+    assert doc[5].morph_ == "Number=Plur"


 def test_doc_retokenize_merge_children(en_tokenizer):
--- a/spacy/tests/doc/test_retokenize_split.py
+++ b/spacy/tests/doc/test_retokenize_split.py
@ -22,15 +22,18 @@ def test_doc_retokenize_split(en_vocab):
                "tag": ["NNP"] * 2,
                "lemma": ["Los", "Angeles"],
                "ent_type": ["GPE"] * 2,
+                "morph": ["Number=Sing"] * 2,
            },
        )
    assert len(doc) == 4
    assert doc[0].text == "Los"
    assert doc[0].head.text == "Angeles"
    assert doc[0].idx == 0
+    assert doc[0].morph_ == "Number=Sing"
    assert doc[1].idx == 3
    assert doc[1].text == "Angeles"
    assert doc[1].head.text == "start"
+    assert doc[1].morph_ == "Number=Sing"
    assert doc[2].text == "start"
    assert doc[2].head.text == "."
    assert doc[3].text == "."
--- a/spacy/tokens/_retokenize.pyx
+++ b/spacy/tokens/_retokenize.pyx
@ -13,7 +13,7 @@ from .span cimport Span
 from .token cimport Token
 from ..lexeme cimport Lexeme, EMPTY_LEXEME
 from ..structs cimport LexemeC, TokenC
-from ..attrs cimport TAG
+from ..attrs cimport TAG, MORPH

 from .underscore import is_writable_attr
 from ..attrs import intify_attrs
@ -65,6 +65,8 @@ cdef class Retokenizer:
            attrs["_"] = extensions
        else:
            attrs = intify_attrs(attrs, strings_map=self.doc.vocab.strings)
+            if MORPH in attrs:
+                self.doc.vocab.morphology.add(self.doc.vocab.strings.as_string(attrs[MORPH]))
        self.merges.append((span, attrs))

    def split(self, Token token, orths, heads, attrs=SimpleFrozenDict()):
@ -96,6 +98,9 @@ cdef class Retokenizer:
            # NB: Since we support {"KEY": [value, value]} syntax here, this
            # will only "intify" the keys, not the values
            attrs = intify_attrs(attrs, strings_map=self.doc.vocab.strings)
+            if MORPH in attrs:
+                for morph in attrs[MORPH]:
+                    self.doc.vocab.morphology.add(self.doc.vocab.strings.as_string(morph))
        head_offsets = []
        for head in heads:
            if isinstance(head, Token):
--- a/spacy/tokens/token.pxd
+++ b/spacy/tokens/token.pxd
@ -43,6 +43,8 @@ cdef class Token:
            return token.pos
        elif feat_name == TAG:
            return token.tag
+        elif feat_name == MORPH:
+            return token.morph
        elif feat_name == DEP:
            return token.dep
        elif feat_name == HEAD:
@ -71,6 +73,8 @@ cdef class Token:
            token.pos = <univ_pos_t>value
        elif feat_name == TAG:
            token.tag = value
+        elif feat_name == MORPH:
+            token.morph = value
        elif feat_name == DEP:
            token.dep = value
        elif feat_name == HEAD: