mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-27 17:54:39 +03:00
Add spacy.attrs.intify_attrs function, to normalize strings in token attribute dictionaries.
This commit is contained in:
parent
09f68bc641
commit
53d8ca8f51
|
@ -86,5 +86,44 @@ IDS = {
|
||||||
"LANG": LANG,
|
"LANG": LANG,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
# ATTR IDs, in order of the symbol
|
# ATTR IDs, in order of the symbol
|
||||||
# Attribute names ordered by ascending integer ID (stable sort of the keys).
NAMES = sorted(IDS, key=IDS.get)
|
||||||
|
|
||||||
|
|
||||||
|
def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
    '''Normalize a dictionary of attributes, converting them to ints.

    Arguments:
        stringy_attrs (dict):
            Dictionary keyed by attribute string names (matched
            case-insensitively against IDS) or by integer attribute IDs.
            Values can be ints or strings. The dictionary itself is never
            modified.

        strings_map (StringStore):
            Defaults to None. If provided, encodes string values into ints.

        _do_deprecated (bool):
            Defaults to False. If True, the legacy keys 'F', 'L' and 'pos'
            are renamed to 'ORTH', 'LEMMA' and 'TAG' respectively, and the
            entries of a nested 'morph' dictionary are merged into the top
            level before conversion.

    Returns:
        inty_attrs (dict):
            Attributes dictionary with keys and optionally values converted to
            ints.

    Raises:
        KeyError: If a string key, once upper-cased, is not present in IDS.
    '''
    # `basestring` exists on Python 2 (and as a Cython builtin) but not on
    # plain Python 3, where `str` is the only text type.
    try:
        string_types = basestring
    except NameError:
        string_types = str

    # Work on a shallow copy so the deprecated-key rewriting below does not
    # leak back into the caller's dictionary.
    attrs = dict(stringy_attrs)
    if _do_deprecated:
        for old_key, new_key in (('F', 'ORTH'), ('L', 'LEMMA'), ('pos', 'TAG')):
            if old_key in attrs:
                attrs[new_key] = attrs.pop(old_key)
        if 'morph' in attrs:
            # Flatten the nested morphology features into the top level.
            attrs.update(attrs.pop('morph'))

    inty_attrs = {}
    for name, value in attrs.items():
        # Integer keys are already IDs; this is what makes the function
        # safe to call on its own output.
        int_key = name if isinstance(name, int) else IDS[name.upper()]
        if strings_map is not None and isinstance(value, string_types):
            value = strings_map[value]
        inty_attrs[int_key] = value
    return inty_attrs
|
||||||
|
|
32
spacy/tests/unit/test_attrs.py
Normal file
32
spacy/tests/unit/test_attrs.py
Normal file
|
@ -0,0 +1,32 @@
|
||||||
|
from ...attrs import *
|
||||||
|
|
||||||
|
|
||||||
|
def test_key_no_value():
    # With no strings_map, only the key is converted; the value stays a string.
    assert intify_attrs({"ORTH": "dog"}) == {ORTH: "dog"}
|
||||||
|
|
||||||
|
|
||||||
|
def test_lower_key():
    # Attribute names are matched case-insensitively.
    expected = {NORM: "dog"}
    assert intify_attrs({"norm": "dog"}) == expected
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def test_lower_key_value():
    # A strings_map turns string values into their integer codes.
    string_codes = {'dog': 10}
    result = intify_attrs({"lemma": "dog"}, strings_map=string_codes)
    assert result == {LEMMA: 10}
|
||||||
|
|
||||||
|
|
||||||
|
def test_idempotence():
    # Feeding already-converted attributes back in must leave them unchanged.
    string_codes = {'dog': 10}
    first_pass = intify_attrs({"lemma": "dog", 'is_alpha': True},
                              strings_map=string_codes)
    second_pass = intify_attrs(first_pass)
    assert second_pass == {LEMMA: 10, IS_ALPHA: True}
|
||||||
|
|
||||||
|
|
||||||
|
def test_do_deprecated():
    # The legacy 'F' key is accepted as ORTH when _do_deprecated is set.
    string_codes = {'dog': 10}
    legacy_attrs = {"F": "dog", 'is_alpha': True}
    result = intify_attrs(legacy_attrs, strings_map=string_codes,
                          _do_deprecated=True)
    assert result == {ORTH: 10, IS_ALPHA: True}
|
Loading…
Reference in New Issue
Block a user