mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 18:26:30 +03:00
Add spacy.attrs.intify_attrs function, to normalize strings in token attribute dictionaries.
This commit is contained in:
parent
09f68bc641
commit
53d8ca8f51
|
@ -86,5 +86,44 @@ IDS = {
|
|||
"LANG": LANG,
|
||||
}
|
||||
|
||||
|
||||
# Attribute names, ordered by ascending attribute ID (the value stored in IDS).
NAMES = sorted(IDS, key=IDS.get)
|
||||
|
||||
|
||||
# ``basestring`` only exists on Python 2 (and as a Cython builtin); fall back
# to ``str`` elsewhere so string values can still be detected.
try:
    _STRING_TYPES = basestring
except NameError:
    _STRING_TYPES = str


def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
    '''Normalize a dictionary of attributes, converting them to ints.

    The input dictionary is never modified; a new dictionary is returned.

    Arguments:
        stringy_attrs (dict):
            Dictionary keyed by attribute string names (case-insensitive) or
            by integer attribute IDs. Values can be ints or strings.

        strings_map (StringStore):
            Defaults to None. If provided, encodes string values into ints
            by looking them up with ``strings_map[value]``.

        _do_deprecated (bool):
            Defaults to False. If True, the legacy keys 'F', 'L' and 'pos'
            are renamed to 'ORTH', 'LEMMA' and 'TAG' respectively, and the
            entries of a nested 'morph' dictionary are merged into the
            top-level attributes before conversion.

    Returns:
        inty_attrs (dict):
            Attributes dictionary with keys and optionally values converted to
            ints.

    Raises:
        KeyError: If a string key (upper-cased) is not a known attribute
            name in IDS.
    '''
    # Work on a shallow copy so the deprecated-key rewriting below cannot
    # leak back into the caller's dictionary.
    attrs = dict(stringy_attrs)
    if _do_deprecated:
        for old_key, new_key in (("F", "ORTH"), ("L", "LEMMA"), ("pos", "TAG")):
            if old_key in attrs:
                attrs[new_key] = attrs.pop(old_key)
        if 'morph' in attrs:
            morphs = attrs.pop('morph')
            for name, value in morphs.items():
                attrs[name] = value

    inty_attrs = {}
    for name, value in attrs.items():
        # Integer keys are assumed to already be attribute IDs, which makes
        # the function safe to apply to its own output.
        if isinstance(name, int):
            int_key = name
        else:
            int_key = IDS[name.upper()]
        if strings_map is not None and isinstance(value, _STRING_TYPES):
            value = strings_map[value]
        inty_attrs[int_key] = value
    return inty_attrs
|
||||
|
|
32
spacy/tests/unit/test_attrs.py
Normal file
32
spacy/tests/unit/test_attrs.py
Normal file
|
@ -0,0 +1,32 @@
|
|||
from ...attrs import *
|
||||
|
||||
|
||||
def test_key_no_value():
    '''A string key becomes its attribute ID; with no strings map the value
    is passed through unchanged.'''
    expected = {ORTH: "dog"}
    result = intify_attrs({"ORTH": "dog"})
    assert result == expected
|
||||
|
||||
|
||||
def test_lower_key():
    '''A lower-case attribute name resolves to the corresponding ID.'''
    expected = {NORM: "dog"}
    result = intify_attrs({"norm": "dog"})
    assert result == expected
|
||||
|
||||
|
||||
|
||||
def test_lower_key_value():
    '''With a strings map, both the lower-case key and the string value are
    converted to ints.'''
    string_ids = {'dog': 10}
    result = intify_attrs({"lemma": "dog"}, strings_map=string_ids)
    assert result == {LEMMA: 10}
|
||||
|
||||
|
||||
def test_idempotence():
    '''Feeding already-converted attributes back in leaves them unchanged.'''
    string_ids = {'dog': 10}
    first_pass = intify_attrs({"lemma": "dog", 'is_alpha': True},
                              strings_map=string_ids)
    second_pass = intify_attrs(first_pass)
    assert second_pass == {LEMMA: 10, IS_ALPHA: True}
|
||||
|
||||
|
||||
def test_do_deprecated():
    '''With _do_deprecated=True the legacy "F" key is treated as ORTH.'''
    string_ids = {'dog': 10}
    legacy_attrs = {"F": "dog", 'is_alpha': True}
    result = intify_attrs(legacy_attrs, strings_map=string_ids,
                          _do_deprecated=True)
    assert result == {ORTH: 10, IS_ALPHA: True}
|
Loading…
Reference in New Issue
Block a user