From 53d8ca8f517f384b985f057078c9f94329e5b9b7 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Fri, 25 Nov 2016 11:34:30 +0100
Subject: [PATCH] Add spacy.attrs.intify_attrs function, to normalize strings in
 token attribute dictionaries.

---
Review notes (placed below the three-dash marker, so they are not part of the
commit message):

* Line structure restored. Header fields, diffstat, hunks and the Python
  indentation had been collapsed into runs of spaces. Indentation and the
  blank lines inside the docstring are reconstructed; the line counts match
  the hunk headers (39 and 32 added lines).
* The From: header carries no e-mail address in this copy.
* intify_attrs(..., _do_deprecated=True) rewrites the caller's dict in place:
  "F", "L", "pos" and "morph" are popped, and ORTH / LEMMA / TAG plus the
  entries of the morph dict are written back into stringy_attrs.
* Non-int keys are resolved with IDS[name.upper()], so a name that is missing
  from IDS raises KeyError.
* String values are only encoded when strings_map is given; otherwise they
  pass through unchanged (see test_key_no_value and test_lower_key).
* basestring is not a Python 3 builtin; presumably Cython resolves it for the
  .pyx build. TODO confirm.

 spacy/attrs.pyx                | 39 ++++++++++++++++++++++++++++++++++
 spacy/tests/unit/test_attrs.py | 32 ++++++++++++++++++++++++++++
 2 files changed, 71 insertions(+)
 create mode 100644 spacy/tests/unit/test_attrs.py

diff --git a/spacy/attrs.pyx b/spacy/attrs.pyx
index 9a191beda..3e847ccea 100644
--- a/spacy/attrs.pyx
+++ b/spacy/attrs.pyx
@@ -86,5 +86,44 @@ IDS = {
     "LANG": LANG,
 }
 
+
 # ATTR IDs, in order of the symbol
 NAMES = [key for key, value in sorted(IDS.items(), key=lambda item: item[1])]
+
+
+def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
+    '''Normalize a dictionary of attributes, converting them to ints.
+
+    Arguments:
+        stringy_attrs (dict):
+            Dictionary keyed by attribute string names. Values can be ints or strings.
+
+        strings_map (StringStore):
+            Defaults to None. If provided, encodes string values into ints.
+
+    Returns:
+        inty_attrs (dict):
+            Attributes dictionary with keys and optionally values converted to
+            ints.
+    '''
+    inty_attrs = {}
+    if _do_deprecated:
+        if 'F' in stringy_attrs:
+            stringy_attrs["ORTH"] = stringy_attrs.pop("F")
+        if 'L' in stringy_attrs:
+            stringy_attrs["LEMMA"] = stringy_attrs.pop("L")
+        if 'pos' in stringy_attrs:
+            stringy_attrs["TAG"] = stringy_attrs.pop("pos")
+        if 'morph' in stringy_attrs:
+            morphs = stringy_attrs.pop('morph')
+            for name, value in morphs.items():
+                stringy_attrs[name] = value
+    for name, value in stringy_attrs.items():
+        if isinstance(name, int):
+            int_key = name
+        else:
+            int_key = IDS[name.upper()]
+        if strings_map is not None and isinstance(value, basestring):
+            value = strings_map[value]
+        inty_attrs[int_key] = value
+    return inty_attrs
diff --git a/spacy/tests/unit/test_attrs.py b/spacy/tests/unit/test_attrs.py
new file mode 100644
index 000000000..22e3e8ce9
--- /dev/null
+++ b/spacy/tests/unit/test_attrs.py
@@ -0,0 +1,32 @@
+from ...attrs import *
+
+
+def test_key_no_value():
+    int_attrs = intify_attrs({"ORTH": "dog"})
+    assert int_attrs == {ORTH: "dog"}
+
+
+def test_lower_key():
+    int_attrs = intify_attrs({"norm": "dog"})
+    assert int_attrs == {NORM: "dog"}
+
+
+
+def test_lower_key_value():
+    vals = {'dog': 10}
+    int_attrs = intify_attrs({"lemma": "dog"}, strings_map=vals)
+    assert int_attrs == {LEMMA: 10}
+
+
+def test_idempotence():
+    vals = {'dog': 10}
+    int_attrs = intify_attrs({"lemma": "dog", 'is_alpha': True}, strings_map=vals)
+    int_attrs = intify_attrs(int_attrs)
+    assert int_attrs == {LEMMA: 10, IS_ALPHA: True}
+
+
+def test_do_deprecated():
+    vals = {'dog': 10}
+    int_attrs = intify_attrs({"F": "dog", 'is_alpha': True}, strings_map=vals,
+                             _do_deprecated=True)
+    assert int_attrs == {ORTH: 10, IS_ALPHA: True}