Add spacy.attrs.intify_attrs function, to normalize strings in token attribute dictionaries.

This commit is contained in:
Matthew Honnibal 2016-11-25 11:34:30 +01:00
parent 09f68bc641
commit 53d8ca8f51
2 changed files with 71 additions and 0 deletions

View File

@ -86,5 +86,44 @@ IDS = {
"LANG": LANG,
}
# Attribute names, ordered by ascending attribute ID (the values of IDS).
NAMES = sorted(IDS, key=IDS.get)
def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
    '''Normalize a dictionary of attributes, converting them to ints.

    Arguments:
        stringy_attrs (dict):
            Dictionary keyed by attribute string names (matched
            case-insensitively against IDS) or by int attribute IDs, which
            are passed through unchanged. Values can be ints or strings.
            The dictionary itself is never modified.
        strings_map (StringStore):
            Defaults to None. If provided, encodes string values into ints
            via strings_map[value]; non-string values are left as they are.
        _do_deprecated (bool):
            Defaults to False. If True, legacy keys are rewritten first:
            'F' -> 'ORTH', 'L' -> 'LEMMA', 'pos' -> 'TAG', and the entries
            of the nested 'morph' dictionary are merged into the top level.
    Returns:
        inty_attrs (dict):
            Attributes dictionary with keys and optionally values converted to
            ints.
    Raises:
        KeyError: If a string key, once upper-cased, is not present in IDS.
    '''
    # `basestring` only exists on Python 2; fall back to `str` where the
    # name is undefined so string values are still recognised.
    try:
        string_types = basestring
    except NameError:
        string_types = str

    if _do_deprecated:
        # Rewrite the legacy keys on a shallow copy so the caller's
        # dictionary is not altered as a side effect.
        stringy_attrs = dict(stringy_attrs)
        for old_key, new_key in (("F", "ORTH"), ("L", "LEMMA"), ("pos", "TAG")):
            if old_key in stringy_attrs:
                stringy_attrs[new_key] = stringy_attrs.pop(old_key)
        if 'morph' in stringy_attrs:
            morphs = stringy_attrs.pop('morph')
            for name, value in morphs.items():
                stringy_attrs[name] = value

    inty_attrs = {}
    for name, value in stringy_attrs.items():
        # Int keys are already attribute IDs; string keys go through IDS.
        int_key = name if isinstance(name, int) else IDS[name.upper()]
        if strings_map is not None and isinstance(value, string_types):
            value = strings_map[value]
        inty_attrs[int_key] = value
    return inty_attrs

View File

@ -0,0 +1,32 @@
from ...attrs import *
def test_key_no_value():
    '''A string key becomes its attribute ID; with no strings_map the value is kept.'''
    result = intify_attrs({"ORTH": "dog"})
    expected = {ORTH: "dog"}
    assert result == expected
def test_lower_key():
    '''A lower-case key resolves to the same attribute ID as its upper-case name.'''
    result = intify_attrs({"norm": "dog"})
    expected = {NORM: "dog"}
    assert result == expected
def test_lower_key_value():
    '''A lower-case key is resolved and its string value is encoded via strings_map.'''
    strings_map = {'dog': 10}
    result = intify_attrs({"lemma": "dog"}, strings_map=strings_map)
    expected = {LEMMA: 10}
    assert result == expected
def test_idempotence():
    '''Passing already-converted attributes through intify_attrs again changes nothing.'''
    strings_map = {'dog': 10}
    first_pass = intify_attrs({"lemma": "dog", 'is_alpha': True},
                              strings_map=strings_map)
    second_pass = intify_attrs(first_pass)
    expected = {LEMMA: 10, IS_ALPHA: True}
    assert second_pass == expected
def test_do_deprecated():
    '''With _do_deprecated=True the legacy key "F" is treated as ORTH.'''
    strings_map = {'dog': 10}
    legacy_attrs = {"F": "dog", 'is_alpha': True}
    result = intify_attrs(legacy_attrs, strings_map=strings_map,
                          _do_deprecated=True)
    expected = {ORTH: 10, IS_ALPHA: True}
    assert result == expected