Mirror of https://github.com/explosion/spaCy.git, synced 2024-12-24 17:06:29 +03:00
Add MORPH attr, add support in retokenizer (#4947)
* Add MORPH attr / symbol for token attrs
* Update retokenizer for MORPH
This commit is contained in:
parent a365359b36
commit 5ee9d8c9b8
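For orientation, a minimal usage sketch of what this change enables, assuming a spaCy build that includes this commit (the blank pipeline and example text are illustrative, not taken from the diff):

import spacy

nlp = spacy.blank("en")
doc = nlp("the beach boys sang")
with doc.retokenize() as retokenizer:
    # "morph" is now accepted alongside "tag", "lemma", "ent_type", etc.;
    # the string is registered with the vocab's morphology table on merge
    retokenizer.merge(doc[0:3], attrs={"morph": "Number=Plur"})
assert doc[0].text == "the beach boys"
assert doc[0].morph_ == "Number=Plur"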
spacy/attrs.pxd

@@ -91,3 +91,4 @@ cdef enum attr_id_t:
     LANG
     ENT_KB_ID = symbols.ENT_KB_ID
+    MORPH
spacy/attrs.pyx

@@ -87,6 +87,7 @@ IDS = {
     "SPACY": SPACY,
     "PROB": PROB,
     "LANG": LANG,
+    "MORPH": MORPH,
 }
spacy/symbols.pxd

@@ -462,3 +462,4 @@ cdef enum symbol_t:
     acl

     ENT_KB_ID
+    MORPH
spacy/symbols.pyx

@@ -462,6 +462,7 @@ IDS = {

     "acl": acl,
     "LAW": LAW,
+    "MORPH": MORPH,
 }
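The four registrations above mirror each other by design: the attrs enum pins ENT_KB_ID to its symbol, so placing MORPH directly after it in both enums keeps the attribute ID and symbol ID aligned. A quick hedged check (illustrative, not part of the commit):

from spacy import attrs, symbols

# MORPH follows ENT_KB_ID in both enums, so the two IDs should coincide
assert attrs.MORPH == symbols.MORPH
assert attrs.IDS["MORPH"] == attrs.MORPH
assert symbols.IDS["MORPH"] == symbols.MORPH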
spacy/tests/doc/test_retokenize_merge.py

@@ -8,7 +8,7 @@ from ..util import get_doc

 def test_doc_retokenize_merge(en_tokenizer):
     text = "WKRO played songs by the beach boys all night"
-    attrs = {"tag": "NAMED", "lemma": "LEMMA", "ent_type": "TYPE"}
+    attrs = {"tag": "NAMED", "lemma": "LEMMA", "ent_type": "TYPE", "morph": "Number=Plur"}
     doc = en_tokenizer(text)
     assert len(doc) == 9
     with doc.retokenize() as retokenizer:
@@ -18,9 +18,11 @@ def test_doc_retokenize_merge(en_tokenizer):
     assert doc[4].text == "the beach boys"
     assert doc[4].text_with_ws == "the beach boys "
     assert doc[4].tag_ == "NAMED"
+    assert doc[4].morph_ == "Number=Plur"
     assert doc[5].text == "all night"
     assert doc[5].text_with_ws == "all night"
     assert doc[5].tag_ == "NAMED"
+    assert doc[5].morph_ == "Number=Plur"


 def test_doc_retokenize_merge_children(en_tokenizer):
spacy/tests/doc/test_retokenize_split.py

@@ -22,15 +22,18 @@ def test_doc_retokenize_split(en_vocab):
             "tag": ["NNP"] * 2,
             "lemma": ["Los", "Angeles"],
             "ent_type": ["GPE"] * 2,
+            "morph": ["Number=Sing"] * 2,
         },
     )
     assert len(doc) == 4
     assert doc[0].text == "Los"
     assert doc[0].head.text == "Angeles"
     assert doc[0].idx == 0
+    assert doc[0].morph_ == "Number=Sing"
     assert doc[1].idx == 3
     assert doc[1].text == "Angeles"
     assert doc[1].head.text == "start"
+    assert doc[1].morph_ == "Number=Sing"
     assert doc[2].text == "start"
     assert doc[2].head.text == "."
     assert doc[3].text == "."
spacy/tokens/_retokenize.pyx

@@ -13,7 +13,7 @@ from .span cimport Span
 from .token cimport Token
 from ..lexeme cimport Lexeme, EMPTY_LEXEME
 from ..structs cimport LexemeC, TokenC
-from ..attrs cimport TAG
+from ..attrs cimport TAG, MORPH

 from .underscore import is_writable_attr
 from ..attrs import intify_attrs
@@ -65,6 +65,8 @@ cdef class Retokenizer:
             attrs["_"] = extensions
         else:
             attrs = intify_attrs(attrs, strings_map=self.doc.vocab.strings)
+            if MORPH in attrs:
+                self.doc.vocab.morphology.add(self.doc.vocab.strings.as_string(attrs[MORPH]))
         self.merges.append((span, attrs))

     def split(self, Token token, orths, heads, attrs=SimpleFrozenDict()):
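In the merge path, intify_attrs has already converted both keys and values to IDs, so attrs[MORPH] holds a string-store hash; the added morphology.add call teaches the morphology table the corresponding feature string. A rough illustration of that flow (hypothetical snippet, not from the commit):

import spacy
from spacy.attrs import MORPH, intify_attrs

vocab = spacy.blank("en").vocab
# single-value merge syntax: both the key and the value are intified
attrs = intify_attrs({"morph": "Number=Plur"}, strings_map=vocab.strings)
# attrs[MORPH] is now a hash; resolve it back to text and register the
# feature string with the morphology table, as the merge path does
vocab.morphology.add(vocab.strings.as_string(attrs[MORPH]))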
@@ -96,6 +98,9 @@ cdef class Retokenizer:
             # NB: Since we support {"KEY": [value, value]} syntax here, this
             # will only "intify" the keys, not the values
             attrs = intify_attrs(attrs, strings_map=self.doc.vocab.strings)
+            if MORPH in attrs:
+                for morph in attrs[MORPH]:
+                    self.doc.vocab.morphology.add(self.doc.vocab.strings.as_string(morph))
         head_offsets = []
         for head in heads:
             if isinstance(head, Token):
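On the split side, the {"KEY": [value, value]} syntax leaves the values as strings, so each morph entry is registered individually. A hedged usage sketch mirroring the tests (blank pipeline and example text are illustrative):

import spacy

nlp = spacy.blank("en")
doc = nlp("LosAngeles start .")
with doc.retokenize() as retokenizer:
    # list-valued attrs apply per subtoken; each "morph" string is added
    # to vocab.morphology before the split takes effect
    retokenizer.split(
        doc[0],
        ["Los", "Angeles"],
        heads=[(doc[0], 1), doc[1]],
        attrs={"morph": ["Number=Sing"] * 2},
    )
assert [t.morph_ for t in doc[:2]] == ["Number=Sing", "Number=Sing"]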
spacy/tokens/token.pxd

@@ -43,6 +43,8 @@ cdef class Token:
             return token.pos
         elif feat_name == TAG:
             return token.tag
+        elif feat_name == MORPH:
+            return token.morph
         elif feat_name == DEP:
             return token.dep
         elif feat_name == HEAD:
@@ -71,6 +73,8 @@ cdef class Token:
             token.pos = <univ_pos_t>value
         elif feat_name == TAG:
             token.tag = value
+        elif feat_name == MORPH:
+            token.morph = value
         elif feat_name == DEP:
             token.dep = value
         elif feat_name == HEAD:
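Since Doc.to_array reads attributes through these struct accessors, MORPH should now also be exportable by ID; a tentative sketch of that round trip (behavior assumed, not shown in the commit):

import spacy
from spacy.attrs import MORPH

nlp = spacy.blank("en")
doc = nlp("beach boys")
with doc.retokenize() as retokenizer:
    retokenizer.merge(doc[0:2], attrs={"morph": "Number=Plur"})
# each cell holds the hash of the token's morph string; resolving it
# through the shared StringStore should recover token.morph_
arr = doc.to_array([MORPH])
assert doc.vocab.strings[int(arr[0, 0])] == doc[0].morph_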