From 5ee9d8c9b80c9e80491f320e58c0d86d2ec917b7 Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Wed, 29 Jan 2020 17:45:46 +0100 Subject: [PATCH] Add MORPH attr, add support in retokenizer (#4947) * Add MORPH attr / symbol for token attrs * Update retokenizer for MORPH --- spacy/attrs.pxd | 1 + spacy/attrs.pyx | 1 + spacy/symbols.pxd | 1 + spacy/symbols.pyx | 1 + spacy/tests/doc/test_retokenize_merge.py | 4 +++- spacy/tests/doc/test_retokenize_split.py | 3 +++ spacy/tokens/_retokenize.pyx | 7 ++++++- spacy/tokens/token.pxd | 4 ++++ 8 files changed, 20 insertions(+), 2 deletions(-) diff --git a/spacy/attrs.pxd b/spacy/attrs.pxd index d9aca078c..7fc0b9111 100644 --- a/spacy/attrs.pxd +++ b/spacy/attrs.pxd @@ -91,3 +91,4 @@ cdef enum attr_id_t: LANG ENT_KB_ID = symbols.ENT_KB_ID + MORPH diff --git a/spacy/attrs.pyx b/spacy/attrs.pyx index a601a7a66..97ca627fb 100644 --- a/spacy/attrs.pyx +++ b/spacy/attrs.pyx @@ -87,6 +87,7 @@ IDS = { "SPACY": SPACY, "PROB": PROB, "LANG": LANG, + "MORPH": MORPH, } diff --git a/spacy/symbols.pxd b/spacy/symbols.pxd index b95b4b805..5c1970628 100644 --- a/spacy/symbols.pxd +++ b/spacy/symbols.pxd @@ -462,3 +462,4 @@ cdef enum symbol_t: acl ENT_KB_ID + MORPH diff --git a/spacy/symbols.pyx b/spacy/symbols.pyx index 36b9ffa67..128946ec7 100644 --- a/spacy/symbols.pyx +++ b/spacy/symbols.pyx @@ -462,6 +462,7 @@ IDS = { "acl": acl, "LAW": LAW, + "MORPH": MORPH, } diff --git a/spacy/tests/doc/test_retokenize_merge.py b/spacy/tests/doc/test_retokenize_merge.py index c82c04eeb..17bcd2c64 100644 --- a/spacy/tests/doc/test_retokenize_merge.py +++ b/spacy/tests/doc/test_retokenize_merge.py @@ -8,7 +8,7 @@ from ..util import get_doc def test_doc_retokenize_merge(en_tokenizer): text = "WKRO played songs by the beach boys all night" - attrs = {"tag": "NAMED", "lemma": "LEMMA", "ent_type": "TYPE"} + attrs = {"tag": "NAMED", "lemma": "LEMMA", "ent_type": "TYPE", "morph": "Number=Plur"} doc = en_tokenizer(text) assert len(doc) == 9 with doc.retokenize() as retokenizer: @@ -18,9 +18,11 @@ def test_doc_retokenize_merge(en_tokenizer): assert doc[4].text == "the beach boys" assert doc[4].text_with_ws == "the beach boys " assert doc[4].tag_ == "NAMED" + assert doc[4].morph_ == "Number=Plur" assert doc[5].text == "all night" assert doc[5].text_with_ws == "all night" assert doc[5].tag_ == "NAMED" + assert doc[5].morph_ == "Number=Plur" def test_doc_retokenize_merge_children(en_tokenizer): diff --git a/spacy/tests/doc/test_retokenize_split.py b/spacy/tests/doc/test_retokenize_split.py index 33b6fbe81..5f40da425 100644 --- a/spacy/tests/doc/test_retokenize_split.py +++ b/spacy/tests/doc/test_retokenize_split.py @@ -22,15 +22,18 @@ def test_doc_retokenize_split(en_vocab): "tag": ["NNP"] * 2, "lemma": ["Los", "Angeles"], "ent_type": ["GPE"] * 2, + "morph": ["Number=Sing"] * 2, }, ) assert len(doc) == 4 assert doc[0].text == "Los" assert doc[0].head.text == "Angeles" assert doc[0].idx == 0 + assert doc[0].morph_ == "Number=Sing" assert doc[1].idx == 3 assert doc[1].text == "Angeles" assert doc[1].head.text == "start" + assert doc[1].morph_ == "Number=Sing" assert doc[2].text == "start" assert doc[2].head.text == "." assert doc[3].text == "." diff --git a/spacy/tokens/_retokenize.pyx b/spacy/tokens/_retokenize.pyx index cd7e5a426..ec7e8a9e8 100644 --- a/spacy/tokens/_retokenize.pyx +++ b/spacy/tokens/_retokenize.pyx @@ -13,7 +13,7 @@ from .span cimport Span from .token cimport Token from ..lexeme cimport Lexeme, EMPTY_LEXEME from ..structs cimport LexemeC, TokenC -from ..attrs cimport TAG +from ..attrs cimport TAG, MORPH from .underscore import is_writable_attr from ..attrs import intify_attrs @@ -65,6 +65,8 @@ cdef class Retokenizer: attrs["_"] = extensions else: attrs = intify_attrs(attrs, strings_map=self.doc.vocab.strings) + if MORPH in attrs: + self.doc.vocab.morphology.add(self.doc.vocab.strings.as_string(attrs[MORPH])) self.merges.append((span, attrs)) def split(self, Token token, orths, heads, attrs=SimpleFrozenDict()): @@ -96,6 +98,9 @@ cdef class Retokenizer: # NB: Since we support {"KEY": [value, value]} syntax here, this # will only "intify" the keys, not the values attrs = intify_attrs(attrs, strings_map=self.doc.vocab.strings) + if MORPH in attrs: + for morph in attrs[MORPH]: + self.doc.vocab.morphology.add(self.doc.vocab.strings.as_string(morph)) head_offsets = [] for head in heads: if isinstance(head, Token): diff --git a/spacy/tokens/token.pxd b/spacy/tokens/token.pxd index ec5df3fac..82d9c7c2a 100644 --- a/spacy/tokens/token.pxd +++ b/spacy/tokens/token.pxd @@ -43,6 +43,8 @@ cdef class Token: return token.pos elif feat_name == TAG: return token.tag + elif feat_name == MORPH: + return token.morph elif feat_name == DEP: return token.dep elif feat_name == HEAD: @@ -71,6 +73,8 @@ cdef class Token: token.pos = value elif feat_name == TAG: token.tag = value + elif feat_name == MORPH: + token.morph = value elif feat_name == DEP: token.dep = value elif feat_name == HEAD: