From fdb8815ef5cca0d6b3a1feddb055af4a36daf437 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 24 Jul 2020 09:28:06 +0200 Subject: [PATCH] Minor refactor for Morphology and MorphAnalysis (#5804) * `MorphAnalysis.get` returns only the field values * Move `_normalize_props` inside `Morphology` as `Morphology.normalize_attrs` and simplify * Simplify POS field detection/conversion * Convert all non-POS features to strings * `Morphology` returns an empty string for a missing morph to align with the FEATS string returned for an existing morph * Remove unused `list_to_feats` --- spacy/morphology.pyx | 89 ++++++++----------- spacy/tests/doc/test_morphanalysis.py | 12 +-- .../tests/morphology/test_morph_converters.py | 4 - spacy/tokens/morphanalysis.pyx | 6 +- 4 files changed, 47 insertions(+), 64 deletions(-) diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index 775fcbab0..b2ba32a59 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -18,43 +18,14 @@ from .util import ensure_path from . import symbols -def _normalize_props(props): - """Convert attrs dict so that POS is always by ID, other features are left - as is as long as they are strings or IDs. - """ - out = {} - props = dict(props) - for key, value in props.items(): - # convert POS value to ID - if key == POS: - if hasattr(value, 'upper'): - value = value.upper() - if value in POS_IDS: - value = POS_IDS[value] - out[key] = value - elif isinstance(key, str) and key.lower() == 'pos': - out[POS] = POS_IDS[value.upper()] - # sort values - elif isinstance(value, str) and Morphology.VALUE_SEP in value: - out[key] = Morphology.VALUE_SEP.join( - sorted(value.split(Morphology.VALUE_SEP))) - # accept any string or ID fields and values - elif isinstance(key, (int, str)) and isinstance(value, (int, str)): - out[key] = value - else: - warnings.warn(Warnings.W100.format(feature={key: value})) - return out - - cdef class Morphology: - '''Store the possible morphological analyses for a language, and index them + """Store the possible morphological analyses for a language, and index them by hash. - To save space on each token, tokens only know the hash of their morphological - analysis, so queries of morphological attributes are delegated + To save space on each token, tokens only know the hash of their + morphological analysis, so queries of morphological attributes are delegated to this class. - ''' - + """ FEATURE_SEP = "|" FIELD_SEP = "=" VALUE_SEP = "," @@ -86,7 +57,7 @@ cdef class Morphology: tag_map = dict(tag_map) tag_map['_SP'] = space_attrs for i, (tag_str, attrs) in enumerate(sorted(tag_map.items())): - attrs = _normalize_props(attrs) + attrs = self.normalize_attrs(attrs) self.add(attrs) self.tag_map[tag_str] = dict(attrs) self.reverse_index[self.strings.add(tag_str)] = i @@ -138,7 +109,7 @@ cdef class Morphology: return tag.key def normalize_features(self, features): - """Create a normalized UFEATS string from a features string or dict. + """Create a normalized FEATS string from a features string or dict. features (Union[dict, str]): Features as dict or UFEATS string. RETURNS (str): Features as normalized UFEATS string. @@ -148,7 +119,7 @@ cdef class Morphology: if not isinstance(features, dict): warnings.warn(Warnings.W100.format(feature=features)) features = {} - features = _normalize_props(features) + features = self.normalize_attrs(features) string_features = {self.strings.as_string(field): self.strings.as_string(values) for field, values in features.items()} # normalized UFEATS string with sorted fields and values norm_feats_string = self.FEATURE_SEP.join(sorted([ @@ -157,6 +128,33 @@ cdef class Morphology: ])) return norm_feats_string or self.EMPTY_MORPH + def normalize_attrs(self, attrs): + """Convert attrs dict so that POS is always by ID, other features are + by string. Values separated by VALUE_SEP are sorted. + """ + out = {} + attrs = dict(attrs) + for key, value in attrs.items(): + # convert POS value to ID + if key == POS or (isinstance(key, str) and key.upper() == "POS"): + if isinstance(value, str) and value.upper() in POS_IDS: + value = POS_IDS[value.upper()] + elif isinstance(value, int) and value not in POS_IDS.values(): + warnings.warn(Warnings.W100.format(feature={key: value})) + continue + out[POS] = value + # accept any string or ID fields and values and convert to strings + elif isinstance(key, (int, str)) and isinstance(value, (int, str)): + key = self.strings.as_string(key) + value = self.strings.as_string(value) + # sort values + if self.VALUE_SEP in value: + value = self.VALUE_SEP.join(sorted(value.split(self.VALUE_SEP))) + out[key] = value + else: + warnings.warn(Warnings.W100.format(feature={key: value})) + return out + cdef MorphAnalysisC create_morph_tag(self, field_feature_pairs) except *: """Creates a MorphAnalysisC from a list of intified ("Field", "Field=Value") tuples where fields with multiple values have @@ -183,7 +181,7 @@ cdef class Morphology: def get(self, hash_t morph): tag = self.tags.get(morph) if tag == NULL: - return [] + return "" else: return self.strings[tag.key] @@ -218,7 +216,7 @@ cdef class Morphology: orth (str): The word-form to key the exception. """ attrs = dict(attrs) - attrs = _normalize_props(attrs) + attrs = self.normalize_attrs(attrs) self.add(attrs) attrs = intify_attrs(attrs, self.strings, _do_deprecated=True) self._exc[(tag_str, self.strings.add(orth_str))] = attrs @@ -282,7 +280,7 @@ cdef class Morphology: # Map (form, pos) to attributes for tag, exc in morph_rules.items(): for orth, attrs in exc.items(): - attrs = _normalize_props(attrs) + attrs = self.normalize_attrs(attrs) self.add_special_case(self.strings.as_string(tag), self.strings.as_string(orth), attrs) @property @@ -309,19 +307,6 @@ cdef class Morphology: return "" return Morphology.FEATURE_SEP.join(sorted([Morphology.FIELD_SEP.join([field, Morphology.VALUE_SEP.join(sorted(values.split(Morphology.VALUE_SEP)))]) for field, values in feats_dict.items()])) - @staticmethod - def list_to_feats(feats_list): - if len(feats_list) == 0: - return "" - feats_dict = {} - for feat in feats_list: - field, value = feat.split(Morphology.FIELD_SEP) - if field not in feats_dict: - feats_dict[field] = set() - feats_dict[field].add(value) - feats_dict = {field: Morphology.VALUE_SEP.join(sorted(values)) for field, values in feats_dict.items()} - return Morphology.dict_to_feats(feats_dict) - cdef int check_feature(const MorphAnalysisC* morph, attr_t feature) nogil: cdef int i diff --git a/spacy/tests/doc/test_morphanalysis.py b/spacy/tests/doc/test_morphanalysis.py index 41b4acd0c..88557d100 100644 --- a/spacy/tests/doc/test_morphanalysis.py +++ b/spacy/tests/doc/test_morphanalysis.py @@ -35,7 +35,7 @@ def test_token_morph_key(i_has): def test_morph_props(i_has): - assert i_has[0].morph.get("PronType") == ["PronType=prs"] + assert i_has[0].morph.get("PronType") == ["prs"] assert i_has[1].morph.get("PronType") == [] @@ -47,20 +47,20 @@ def test_morph_iter(i_has): def test_morph_get(i_has): - assert i_has[0].morph.get("PronType") == ["PronType=prs"] + assert i_has[0].morph.get("PronType") == ["prs"] def test_morph_set(i_has): - assert i_has[0].morph.get("PronType") == ["PronType=prs"] + assert i_has[0].morph.get("PronType") == ["prs"] # set by string i_has[0].morph_ = "PronType=unk" - assert i_has[0].morph.get("PronType") == ["PronType=unk"] + assert i_has[0].morph.get("PronType") == ["unk"] # set by string, fields are alphabetized i_has[0].morph_ = "PronType=123|NounType=unk" assert i_has[0].morph_ == "NounType=unk|PronType=123" # set by dict - i_has[0].morph_ = {"AType": "123", "BType": "unk", "POS": "ADJ"} - assert i_has[0].morph_ == "AType=123|BType=unk|POS=ADJ" + i_has[0].morph_ = {"AType": "123", "BType": "unk"} + assert i_has[0].morph_ == "AType=123|BType=unk" # set by string with multiple values, fields and values are alphabetized i_has[0].morph_ = "BType=c|AType=b,a" assert i_has[0].morph_ == "AType=a,b|BType=c" diff --git a/spacy/tests/morphology/test_morph_converters.py b/spacy/tests/morphology/test_morph_converters.py index 9486cad45..6973bf782 100644 --- a/spacy/tests/morphology/test_morph_converters.py +++ b/spacy/tests/morphology/test_morph_converters.py @@ -4,10 +4,8 @@ from spacy.morphology import Morphology def test_feats_converters(): feats = "Case=dat,gen|Number=sing" feats_dict = {"Case": "dat,gen", "Number": "sing"} - feats_list = feats.split(Morphology.FEATURE_SEP) # simple conversions - assert Morphology.list_to_feats(feats_list) == feats assert Morphology.dict_to_feats(feats_dict) == feats assert Morphology.feats_to_dict(feats) == feats_dict @@ -18,8 +16,6 @@ def test_feats_converters(): # unsorted input is normalized unsorted_feats = "Number=sing|Case=gen,dat" unsorted_feats_dict = {"Case": "gen,dat", "Number": "sing"} - unsorted_feats_list = feats.split(Morphology.FEATURE_SEP) assert Morphology.feats_to_dict(unsorted_feats) == feats_dict assert Morphology.dict_to_feats(unsorted_feats_dict) == feats - assert Morphology.list_to_feats(unsorted_feats_list) == feats assert Morphology.dict_to_feats(Morphology.feats_to_dict(unsorted_feats)) == feats diff --git a/spacy/tokens/morphanalysis.pyx b/spacy/tokens/morphanalysis.pyx index c49aede4d..a7d1f2e44 100644 --- a/spacy/tokens/morphanalysis.pyx +++ b/spacy/tokens/morphanalysis.pyx @@ -2,6 +2,7 @@ from libc.string cimport memset cimport numpy as np from ..errors import Errors +from ..morphology import Morphology from ..vocab cimport Vocab from ..typedefs cimport hash_t, attr_t from ..morphology cimport list_features, check_feature, get_by_field @@ -58,10 +59,11 @@ cdef class MorphAnalysis: return self.key != other.key def get(self, field): - """Retrieve a feature by field.""" + """Retrieve feature values by field.""" cdef attr_t field_id = self.vocab.strings.as_int(field) cdef np.ndarray results = get_by_field(&self.c, field_id) - return [self.vocab.strings[result] for result in results] + features = [self.vocab.strings[result] for result in results] + return [f.split(Morphology.FIELD_SEP)[1] for f in features] def to_json(self): """Produce a json serializable representation as a UD FEATS-style