Minor refactor for Morphology and MorphAnalysis (#5804)

* `MorphAnalysis.get` returns only the field values
* Move `_normalize_props` inside `Morphology` as `Morphology.normalize_attrs` and simplify
* Simplify POS field detection/conversion
* Convert all non-POS features to strings
* `Morphology` returns an empty string for a missing morph to align with the FEATS string returned for an existing morph
* Remove unused `list_to_feats`
2025-11-06 10:57:34 +03:00 · 2020-07-24 09:28:06 +02:00 · 2020-07-24 09:28:06 +02:00 · fdb8815ef5
commit fdb8815ef5
parent d0c6d1efc5
4 changed files with 47 additions and 64 deletions
--- a/spacy/morphology.pyx
+++ b/spacy/morphology.pyx
@ -18,43 +18,14 @@ from .util import ensure_path
 from . import symbols


-def _normalize_props(props):
-    """Convert attrs dict so that POS is always by ID, other features are left
-    as is as long as they are strings or IDs.
-    """
-    out = {}
-    props = dict(props)
-    for key, value in props.items():
-        # convert POS value to ID
-        if key == POS:
-            if hasattr(value, 'upper'):
-                value = value.upper()
-            if value in POS_IDS:
-                value = POS_IDS[value]
-            out[key] = value
-        elif isinstance(key, str) and key.lower() == 'pos':
-            out[POS] = POS_IDS[value.upper()]
-        # sort values
-        elif isinstance(value, str) and Morphology.VALUE_SEP in value:
-            out[key] = Morphology.VALUE_SEP.join(
-                    sorted(value.split(Morphology.VALUE_SEP)))
-        # accept any string or ID fields and values
-        elif isinstance(key, (int, str)) and isinstance(value, (int, str)):
-            out[key] = value
-        else:
-            warnings.warn(Warnings.W100.format(feature={key: value}))
-    return out
-
-
 cdef class Morphology:
-    '''Store the possible morphological analyses for a language, and index them
+    """Store the possible morphological analyses for a language, and index them
    by hash.

-    To save space on each token, tokens only know the hash of their morphological
-    analysis, so queries of morphological attributes are delegated
+    To save space on each token, tokens only know the hash of their
+    morphological analysis, so queries of morphological attributes are delegated
    to this class.
-    '''
-
+    """
    FEATURE_SEP = "|"
    FIELD_SEP = "="
    VALUE_SEP = ","
@ -86,7 +57,7 @@ cdef class Morphology:
            tag_map = dict(tag_map)
            tag_map['_SP'] = space_attrs
        for i, (tag_str, attrs) in enumerate(sorted(tag_map.items())):
-            attrs = _normalize_props(attrs)
+            attrs = self.normalize_attrs(attrs)
            self.add(attrs)
            self.tag_map[tag_str] = dict(attrs)
            self.reverse_index[self.strings.add(tag_str)] = i
@ -138,7 +109,7 @@ cdef class Morphology:
        return tag.key

    def normalize_features(self, features):
-        """Create a normalized UFEATS string from a features string or dict.
+        """Create a normalized FEATS string from a features string or dict.

        features (Union[dict, str]): Features as dict or UFEATS string.
        RETURNS (str): Features as normalized UFEATS string.
@ -148,7 +119,7 @@ cdef class Morphology:
        if not isinstance(features, dict):
            warnings.warn(Warnings.W100.format(feature=features))
            features = {}
-        features = _normalize_props(features)
+        features = self.normalize_attrs(features)
        string_features = {self.strings.as_string(field): self.strings.as_string(values) for field, values in features.items()}
        # normalized UFEATS string with sorted fields and values
        norm_feats_string = self.FEATURE_SEP.join(sorted([
@ -157,6 +128,33 @@ cdef class Morphology:
        ]))
        return norm_feats_string or self.EMPTY_MORPH

+    def normalize_attrs(self, attrs):
+        """Convert attrs dict so that POS is always by ID, other features are
+        by string. Values separated by VALUE_SEP are sorted.
+        """
+        out = {}
+        attrs = dict(attrs)
+        for key, value in attrs.items():
+            # convert POS value to ID
+            if key == POS or (isinstance(key, str) and key.upper() == "POS"):
+                if isinstance(value, str) and value.upper() in POS_IDS:
+                    value = POS_IDS[value.upper()]
+                elif isinstance(value, int) and value not in POS_IDS.values():
+                    warnings.warn(Warnings.W100.format(feature={key: value}))
+                    continue
+                out[POS] = value
+            # accept any string or ID fields and values and convert to strings
+            elif isinstance(key, (int, str)) and isinstance(value, (int, str)):
+                key = self.strings.as_string(key)
+                value = self.strings.as_string(value)
+                # sort values
+                if self.VALUE_SEP in value:
+                    value = self.VALUE_SEP.join(sorted(value.split(self.VALUE_SEP)))
+                out[key] = value
+            else:
+                warnings.warn(Warnings.W100.format(feature={key: value}))
+        return out
+
    cdef MorphAnalysisC create_morph_tag(self, field_feature_pairs) except *:
        """Creates a MorphAnalysisC from a list of intified
        ("Field", "Field=Value") tuples where fields with multiple values have
@ -183,7 +181,7 @@ cdef class Morphology:
    def get(self, hash_t morph):
        tag = <MorphAnalysisC*>self.tags.get(morph)
        if tag == NULL:
-            return []
+            return ""
        else:
            return self.strings[tag.key]

@ -218,7 +216,7 @@ cdef class Morphology:
        orth (str): The word-form to key the exception.
        """
        attrs = dict(attrs)
-        attrs = _normalize_props(attrs)
+        attrs = self.normalize_attrs(attrs)
        self.add(attrs)
        attrs = intify_attrs(attrs, self.strings, _do_deprecated=True)
        self._exc[(tag_str, self.strings.add(orth_str))] = attrs
@ -282,7 +280,7 @@ cdef class Morphology:
        # Map (form, pos) to attributes
        for tag, exc in morph_rules.items():
            for orth, attrs in exc.items():
-                attrs = _normalize_props(attrs)
+                attrs = self.normalize_attrs(attrs)
                self.add_special_case(self.strings.as_string(tag), self.strings.as_string(orth), attrs)

    @property
@ -309,19 +307,6 @@ cdef class Morphology:
            return ""
        return Morphology.FEATURE_SEP.join(sorted([Morphology.FIELD_SEP.join([field, Morphology.VALUE_SEP.join(sorted(values.split(Morphology.VALUE_SEP)))]) for field, values in feats_dict.items()]))

-    @staticmethod
-    def list_to_feats(feats_list):
-        if len(feats_list) == 0:
-            return ""
-        feats_dict = {}
-        for feat in feats_list:
-            field, value = feat.split(Morphology.FIELD_SEP)
-            if field not in feats_dict:
-                feats_dict[field] = set()
-            feats_dict[field].add(value)
-        feats_dict = {field: Morphology.VALUE_SEP.join(sorted(values)) for field, values in feats_dict.items()}
-        return Morphology.dict_to_feats(feats_dict)
-

 cdef int check_feature(const MorphAnalysisC* morph, attr_t feature) nogil:
    cdef int i
--- a/spacy/tests/doc/test_morphanalysis.py
+++ b/spacy/tests/doc/test_morphanalysis.py
@ -35,7 +35,7 @@ def test_token_morph_key(i_has):


 def test_morph_props(i_has):
-    assert i_has[0].morph.get("PronType") == ["PronType=prs"]
+    assert i_has[0].morph.get("PronType") == ["prs"]
    assert i_has[1].morph.get("PronType") == []


@ -47,20 +47,20 @@ def test_morph_iter(i_has):


 def test_morph_get(i_has):
-    assert i_has[0].morph.get("PronType") == ["PronType=prs"]
+    assert i_has[0].morph.get("PronType") == ["prs"]


 def test_morph_set(i_has):
-    assert i_has[0].morph.get("PronType") == ["PronType=prs"]
+    assert i_has[0].morph.get("PronType") == ["prs"]
    # set by string
    i_has[0].morph_ = "PronType=unk"
-    assert i_has[0].morph.get("PronType") == ["PronType=unk"]
+    assert i_has[0].morph.get("PronType") == ["unk"]
    # set by string, fields are alphabetized
    i_has[0].morph_ = "PronType=123|NounType=unk"
    assert i_has[0].morph_ == "NounType=unk|PronType=123"
    # set by dict
-    i_has[0].morph_ = {"AType": "123", "BType": "unk", "POS": "ADJ"}
-    assert i_has[0].morph_ == "AType=123|BType=unk|POS=ADJ"
+    i_has[0].morph_ = {"AType": "123", "BType": "unk"}
+    assert i_has[0].morph_ == "AType=123|BType=unk"
    # set by string with multiple values, fields and values are alphabetized
    i_has[0].morph_ = "BType=c|AType=b,a"
    assert i_has[0].morph_ == "AType=a,b|BType=c"
--- a/spacy/tests/morphology/test_morph_converters.py
+++ b/spacy/tests/morphology/test_morph_converters.py
@ -4,10 +4,8 @@ from spacy.morphology import Morphology
 def test_feats_converters():
    feats = "Case=dat,gen|Number=sing"
    feats_dict = {"Case": "dat,gen", "Number": "sing"}
-    feats_list = feats.split(Morphology.FEATURE_SEP)

    # simple conversions
-    assert Morphology.list_to_feats(feats_list) == feats
    assert Morphology.dict_to_feats(feats_dict) == feats
    assert Morphology.feats_to_dict(feats) == feats_dict

@ -18,8 +16,6 @@ def test_feats_converters():
    # unsorted input is normalized
    unsorted_feats = "Number=sing|Case=gen,dat"
    unsorted_feats_dict = {"Case": "gen,dat", "Number": "sing"}
-    unsorted_feats_list = feats.split(Morphology.FEATURE_SEP)
    assert Morphology.feats_to_dict(unsorted_feats) == feats_dict
    assert Morphology.dict_to_feats(unsorted_feats_dict) == feats
-    assert Morphology.list_to_feats(unsorted_feats_list) == feats
    assert Morphology.dict_to_feats(Morphology.feats_to_dict(unsorted_feats)) == feats
--- a/spacy/tokens/morphanalysis.pyx
+++ b/spacy/tokens/morphanalysis.pyx
@ -2,6 +2,7 @@ from libc.string cimport memset
 cimport numpy as np

 from ..errors import Errors
+from ..morphology import Morphology
 from ..vocab cimport Vocab
 from ..typedefs cimport hash_t, attr_t
 from ..morphology cimport list_features, check_feature, get_by_field
@ -58,10 +59,11 @@ cdef class MorphAnalysis:
        return self.key != other.key

    def get(self, field):
-        """Retrieve a feature by field."""
+        """Retrieve feature values by field."""
        cdef attr_t field_id = self.vocab.strings.as_int(field)
        cdef np.ndarray results = get_by_field(&self.c, field_id)
-        return [self.vocab.strings[result] for result in results]
+        features = [self.vocab.strings[result] for result in results]
+        return [f.split(Morphology.FIELD_SEP)[1] for f in features]

    def to_json(self):
        """Produce a json serializable representation as a UD FEATS-style