Add MORPH attr, add support in retokenizer (#4947)

* Add MORPH attr / symbol for token attrs

* Update retokenizer for MORPH
Author: adrianeboyd · 2020-01-29 17:45:46 +01:00 · committed by GitHub
Parent: a365359b36 · Commit: 5ee9d8c9b8
8 changed files with 20 additions and 2 deletions
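
As context for the diffs below: once MORPH is wired up, "morph" can be passed to the retokenizer like "tag" or "lemma". A minimal usage sketch (not part of the commit; assumes a blank English pipeline on this branch, mirroring the merge test further down):

    import spacy

    nlp = spacy.blank("en")
    doc = nlp("WKRO played songs by the beach boys all night")
    with doc.retokenize() as retokenizer:
        # "morph" is now accepted like any other writable token attribute
        retokenizer.merge(doc[4:7], attrs={"morph": "Number=Plur"})
    assert doc[4].text == "the beach boys"
    assert doc[4].morph_ == "Number=Plur"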

spacy/attrs.pxd

@@ -91,3 +91,4 @@ cdef enum attr_id_t:
     LANG
     ENT_KB_ID = symbols.ENT_KB_ID
+    MORPH

spacy/attrs.pyx

@@ -87,6 +87,7 @@ IDS = {
     "SPACY": SPACY,
     "PROB": PROB,
     "LANG": LANG,
+    "MORPH": MORPH,
 }

spacy/symbols.pxd

@@ -462,3 +462,4 @@ cdef enum symbol_t:
     acl
     ENT_KB_ID
+    MORPH

spacy/symbols.pyx

@@ -462,6 +462,7 @@ IDS = {
     "acl": acl,
     "LAW": LAW,
+    "MORPH": MORPH,
 }
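
A sanity check one might run against the four table updates above (a sketch, assuming the enum layouts shown, where MORPH directly follows ENT_KB_ID in both enums): the attribute ID and the symbol ID for MORPH should coincide.

    from spacy.attrs import IDS as ATTR_IDS
    from spacy.symbols import IDS as SYMBOL_IDS

    assert "MORPH" in ATTR_IDS
    assert "MORPH" in SYMBOL_IDS
    assert ATTR_IDS["MORPH"] == SYMBOL_IDS["MORPH"]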

spacy/tests/doc/test_retokenize_merge.py

@@ -8,7 +8,7 @@ from ..util import get_doc
 def test_doc_retokenize_merge(en_tokenizer):
     text = "WKRO played songs by the beach boys all night"
-    attrs = {"tag": "NAMED", "lemma": "LEMMA", "ent_type": "TYPE"}
+    attrs = {"tag": "NAMED", "lemma": "LEMMA", "ent_type": "TYPE", "morph": "Number=Plur"}
     doc = en_tokenizer(text)
     assert len(doc) == 9
     with doc.retokenize() as retokenizer:
@@ -18,9 +18,11 @@ def test_doc_retokenize_merge(en_tokenizer):
     assert doc[4].text == "the beach boys"
     assert doc[4].text_with_ws == "the beach boys "
     assert doc[4].tag_ == "NAMED"
+    assert doc[4].morph_ == "Number=Plur"
     assert doc[5].text == "all night"
     assert doc[5].text_with_ws == "all night"
     assert doc[5].tag_ == "NAMED"
+    assert doc[5].morph_ == "Number=Plur"


 def test_doc_retokenize_merge_children(en_tokenizer):

spacy/tests/doc/test_retokenize_split.py

@@ -22,15 +22,18 @@ def test_doc_retokenize_split(en_vocab):
                 "tag": ["NNP"] * 2,
                 "lemma": ["Los", "Angeles"],
                 "ent_type": ["GPE"] * 2,
+                "morph": ["Number=Sing"] * 2,
             },
         )
     assert len(doc) == 4
     assert doc[0].text == "Los"
     assert doc[0].head.text == "Angeles"
     assert doc[0].idx == 0
+    assert doc[0].morph_ == "Number=Sing"
     assert doc[1].idx == 3
     assert doc[1].text == "Angeles"
     assert doc[1].head.text == "start"
+    assert doc[1].morph_ == "Number=Sing"
     assert doc[2].text == "start"
     assert doc[2].head.text == "."
     assert doc[3].text == "."
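
For illustration, the list-valued syntax this test exercises looks like the following from user code (a sketch, not part of the diff; one "morph" value per new subtoken):

    import spacy

    nlp = spacy.blank("en")
    doc = nlp("LosAngeles start .")
    with doc.retokenize() as retokenizer:
        retokenizer.split(
            doc[0],
            ["Los", "Angeles"],
            heads=[(doc[0], 1), doc[1]],
            # one morph analysis per new subtoken
            attrs={"morph": ["Number=Sing"] * 2},
        )
    assert doc[0].morph_ == "Number=Sing"
    assert doc[1].morph_ == "Number=Sing"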

spacy/tokens/_retokenize.pyx

@@ -13,7 +13,7 @@ from .span cimport Span
 from .token cimport Token
 from ..lexeme cimport Lexeme, EMPTY_LEXEME
 from ..structs cimport LexemeC, TokenC
-from ..attrs cimport TAG
+from ..attrs cimport TAG, MORPH

 from .underscore import is_writable_attr
 from ..attrs import intify_attrs
@@ -65,6 +65,8 @@ cdef class Retokenizer:
             attrs["_"] = extensions
         else:
             attrs = intify_attrs(attrs, strings_map=self.doc.vocab.strings)
+            if MORPH in attrs:
+                self.doc.vocab.morphology.add(self.doc.vocab.strings.as_string(attrs[MORPH]))
         self.merges.append((span, attrs))

     def split(self, Token token, orths, heads, attrs=SimpleFrozenDict()):
@@ -96,6 +98,9 @@ cdef class Retokenizer:
             # NB: Since we support {"KEY": [value, value]} syntax here, this
             # will only "intify" the keys, not the values
             attrs = intify_attrs(attrs, strings_map=self.doc.vocab.strings)
+            if MORPH in attrs:
+                for morph in attrs[MORPH]:
+                    self.doc.vocab.morphology.add(self.doc.vocab.strings.as_string(morph))
         head_offsets = []
         for head in heads:
             if isinstance(head, Token):
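
Why the two new morphology.add() calls: intify_attrs() hashes both the keys and the string values, so the retokenizer registers the original feature string with the vocab's morphology before only the hash remains. A plain-Python sketch of that round trip (hedged; it reuses the same calls as the diff above):

    from spacy.attrs import MORPH, intify_attrs
    from spacy.vocab import Vocab

    vocab = Vocab()
    # "morph" is intified to the MORPH key; the value becomes a string hash
    attrs = intify_attrs({"morph": "Number=Plur"}, strings_map=vocab.strings)
    assert MORPH in attrs
    # registering the analysis lets the hash resolve back to features later
    vocab.morphology.add(vocab.strings.as_string(attrs[MORPH]))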

spacy/tokens/token.pxd

@@ -43,6 +43,8 @@ cdef class Token:
             return token.pos
         elif feat_name == TAG:
             return token.tag
+        elif feat_name == MORPH:
+            return token.morph
         elif feat_name == DEP:
             return token.dep
         elif feat_name == HEAD:
@@ -71,6 +73,8 @@ cdef class Token:
             token.pos = <univ_pos_t>value
         elif feat_name == TAG:
             token.tag = value
+        elif feat_name == MORPH:
+            token.morph = value
         elif feat_name == DEP:
             token.dep = value
         elif feat_name == HEAD:
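
Since get_struct_attr()/set_struct_attr() back Doc.to_array() and Doc.from_array(), these two branches also make MORPH exportable like TAG. A hedged sketch (assuming to_array() returns a (n_tokens, n_attrs) array of hashes, as it does for other string-valued attributes):

    import spacy
    from spacy.attrs import MORPH

    nlp = spacy.blank("en")
    doc = nlp("the beach boys")
    with doc.retokenize() as retokenizer:
        retokenizer.merge(doc[0:3], attrs={"morph": "Number=Plur"})
    # the new get_struct_attr branch is what makes this column readable
    morph_column = doc.to_array([MORPH])
    assert doc.vocab.strings[morph_column[0, 0]] == "Number=Plur"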