mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-24 17:06:29 +03:00
Add MORPH attr, add support in retokenizer (#4947)
* Add MORPH attr / symbol for token attrs
* Update retokenizer for MORPH
This commit is contained in:
parent
a365359b36
commit
5ee9d8c9b8
|
@ -91,3 +91,4 @@ cdef enum attr_id_t:
|
|||
|
||||
LANG
|
||||
ENT_KB_ID = symbols.ENT_KB_ID
|
||||
MORPH
|
||||
|
|
|
@ -87,6 +87,7 @@ IDS = {
|
|||
"SPACY": SPACY,
|
||||
"PROB": PROB,
|
||||
"LANG": LANG,
|
||||
"MORPH": MORPH,
|
||||
}
|
||||
|
||||
|
||||
|
|
|
@ -462,3 +462,4 @@ cdef enum symbol_t:
|
|||
acl
|
||||
|
||||
ENT_KB_ID
|
||||
MORPH
|
||||
|
|
|
@ -462,6 +462,7 @@ IDS = {
|
|||
|
||||
"acl": acl,
|
||||
"LAW": LAW,
|
||||
"MORPH": MORPH,
|
||||
}
|
||||
|
||||
|
||||
|
|
|
@ -8,7 +8,7 @@ from ..util import get_doc
|
|||
|
||||
def test_doc_retokenize_merge(en_tokenizer):
|
||||
text = "WKRO played songs by the beach boys all night"
|
||||
attrs = {"tag": "NAMED", "lemma": "LEMMA", "ent_type": "TYPE"}
|
||||
attrs = {"tag": "NAMED", "lemma": "LEMMA", "ent_type": "TYPE", "morph": "Number=Plur"}
|
||||
doc = en_tokenizer(text)
|
||||
assert len(doc) == 9
|
||||
with doc.retokenize() as retokenizer:
|
||||
|
@ -18,9 +18,11 @@ def test_doc_retokenize_merge(en_tokenizer):
|
|||
assert doc[4].text == "the beach boys"
|
||||
assert doc[4].text_with_ws == "the beach boys "
|
||||
assert doc[4].tag_ == "NAMED"
|
||||
assert doc[4].morph_ == "Number=Plur"
|
||||
assert doc[5].text == "all night"
|
||||
assert doc[5].text_with_ws == "all night"
|
||||
assert doc[5].tag_ == "NAMED"
|
||||
assert doc[5].morph_ == "Number=Plur"
|
||||
|
||||
|
||||
def test_doc_retokenize_merge_children(en_tokenizer):
|
||||
|
|
|
@ -22,15 +22,18 @@ def test_doc_retokenize_split(en_vocab):
|
|||
"tag": ["NNP"] * 2,
|
||||
"lemma": ["Los", "Angeles"],
|
||||
"ent_type": ["GPE"] * 2,
|
||||
"morph": ["Number=Sing"] * 2,
|
||||
},
|
||||
)
|
||||
assert len(doc) == 4
|
||||
assert doc[0].text == "Los"
|
||||
assert doc[0].head.text == "Angeles"
|
||||
assert doc[0].idx == 0
|
||||
assert doc[0].morph_ == "Number=Sing"
|
||||
assert doc[1].idx == 3
|
||||
assert doc[1].text == "Angeles"
|
||||
assert doc[1].head.text == "start"
|
||||
assert doc[1].morph_ == "Number=Sing"
|
||||
assert doc[2].text == "start"
|
||||
assert doc[2].head.text == "."
|
||||
assert doc[3].text == "."
|
||||
|
|
|
@ -13,7 +13,7 @@ from .span cimport Span
|
|||
from .token cimport Token
|
||||
from ..lexeme cimport Lexeme, EMPTY_LEXEME
|
||||
from ..structs cimport LexemeC, TokenC
|
||||
from ..attrs cimport TAG
|
||||
from ..attrs cimport TAG, MORPH
|
||||
|
||||
from .underscore import is_writable_attr
|
||||
from ..attrs import intify_attrs
|
||||
|
@ -65,6 +65,8 @@ cdef class Retokenizer:
|
|||
attrs["_"] = extensions
|
||||
else:
|
||||
attrs = intify_attrs(attrs, strings_map=self.doc.vocab.strings)
|
||||
if MORPH in attrs:
|
||||
self.doc.vocab.morphology.add(self.doc.vocab.strings.as_string(attrs[MORPH]))
|
||||
self.merges.append((span, attrs))
|
||||
|
||||
def split(self, Token token, orths, heads, attrs=SimpleFrozenDict()):
|
||||
|
@ -96,6 +98,9 @@ cdef class Retokenizer:
|
|||
# NB: Since we support {"KEY": [value, value]} syntax here, this
|
||||
# will only "intify" the keys, not the values
|
||||
attrs = intify_attrs(attrs, strings_map=self.doc.vocab.strings)
|
||||
if MORPH in attrs:
|
||||
for morph in attrs[MORPH]:
|
||||
self.doc.vocab.morphology.add(self.doc.vocab.strings.as_string(morph))
|
||||
head_offsets = []
|
||||
for head in heads:
|
||||
if isinstance(head, Token):
|
||||
|
|
|
@ -43,6 +43,8 @@ cdef class Token:
|
|||
return token.pos
|
||||
elif feat_name == TAG:
|
||||
return token.tag
|
||||
elif feat_name == MORPH:
|
||||
return token.morph
|
||||
elif feat_name == DEP:
|
||||
return token.dep
|
||||
elif feat_name == HEAD:
|
||||
|
@ -71,6 +73,8 @@ cdef class Token:
|
|||
token.pos = <univ_pos_t>value
|
||||
elif feat_name == TAG:
|
||||
token.tag = value
|
||||
elif feat_name == MORPH:
|
||||
token.morph = value
|
||||
elif feat_name == DEP:
|
||||
token.dep = value
|
||||
elif feat_name == HEAD:
|
||||
|
|
Loading…
Reference in New Issue
Block a user