From fdb8815ef5cca0d6b3a1feddb055af4a36daf437 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Fri, 24 Jul 2020 09:28:06 +0200
Subject: [PATCH] Minor refactor for Morphology and MorphAnalysis (#5804)

* `MorphAnalysis.get` returns only the field values
* Move `_normalize_props` inside `Morphology` as
  `Morphology.normalize_attrs` and simplify
  * Simplify POS field detection/conversion
  * Convert all non-POS features to strings
* `Morphology.get` returns an empty string for a missing morph to align
  with the FEATS string returned for an existing morph
* Remove unused `list_to_feats`
---
 spacy/morphology.pyx                          | 89 ++++++++-----------
 spacy/tests/doc/test_morphanalysis.py         | 12 +--
 .../tests/morphology/test_morph_converters.py |  4 -
 spacy/tokens/morphanalysis.pyx                |  6 +-
 4 files changed, 47 insertions(+), 64 deletions(-)

diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx
index 775fcbab0..b2ba32a59 100644
--- a/spacy/morphology.pyx
+++ b/spacy/morphology.pyx
@@ -18,43 +18,14 @@ from .util import ensure_path
 from . import symbols
 
 
-def _normalize_props(props):
-    """Convert attrs dict so that POS is always by ID, other features are left
-    as is as long as they are strings or IDs.
-    """
-    out = {}
-    props = dict(props)
-    for key, value in props.items():
-        # convert POS value to ID
-        if key == POS:
-            if hasattr(value, 'upper'):
-                value = value.upper()
-            if value in POS_IDS:
-                value = POS_IDS[value]
-            out[key] = value
-        elif isinstance(key, str) and key.lower() == 'pos':
-            out[POS] = POS_IDS[value.upper()]
-        # sort values
-        elif isinstance(value, str) and Morphology.VALUE_SEP in value:
-            out[key] = Morphology.VALUE_SEP.join(
-                    sorted(value.split(Morphology.VALUE_SEP)))
-        # accept any string or ID fields and values
-        elif isinstance(key, (int, str)) and isinstance(value, (int, str)):
-            out[key] = value
-        else:
-            warnings.warn(Warnings.W100.format(feature={key: value}))
-    return out
-
-
 cdef class Morphology:
-    '''Store the possible morphological analyses for a language, and index them
+    """Store the possible morphological analyses for a language, and index them
     by hash.
 
-    To save space on each token, tokens only know the hash of their morphological
-    analysis, so queries of morphological attributes are delegated
+    To save space on each token, tokens only know the hash of their
+    morphological analysis, so queries of morphological attributes are delegated
     to this class.
-    '''
-
+    """
     FEATURE_SEP = "|"
     FIELD_SEP = "="
     VALUE_SEP = ","
@@ -86,7 +57,7 @@ cdef class Morphology:
             tag_map = dict(tag_map)
             tag_map['_SP'] = space_attrs
         for i, (tag_str, attrs) in enumerate(sorted(tag_map.items())):
-            attrs = _normalize_props(attrs)
+            attrs = self.normalize_attrs(attrs)
             self.add(attrs)
             self.tag_map[tag_str] = dict(attrs)
             self.reverse_index[self.strings.add(tag_str)] = i
@@ -138,7 +109,7 @@ cdef class Morphology:
         return tag.key
 
     def normalize_features(self, features):
-        """Create a normalized UFEATS string from a features string or dict.
+        """Create a normalized FEATS string from a features string or dict.
 
         features (Union[dict, str]): Features as dict or UFEATS string.
         RETURNS (str): Features as normalized UFEATS string.
@@ -148,7 +119,7 @@ cdef class Morphology:
         if not isinstance(features, dict):
             warnings.warn(Warnings.W100.format(feature=features))
             features = {}
-        features = _normalize_props(features)
+        features = self.normalize_attrs(features)
         string_features = {self.strings.as_string(field): self.strings.as_string(values) for field, values in features.items()}
         # normalized UFEATS string with sorted fields and values
         norm_feats_string = self.FEATURE_SEP.join(sorted([
@@ -157,6 +128,33 @@ cdef class Morphology:
         ]))
         return norm_feats_string or self.EMPTY_MORPH
 
+    def normalize_attrs(self, attrs):
+        """Normalize attrs: POS is keyed by ID with known POS strings converted
+        to IDs; other features become strings with VALUE_SEP values sorted.
+        """
+        out = {}
+        attrs = dict(attrs)
+        for key, value in attrs.items():
+            # convert POS value to ID; warn about and skip unknown integer IDs
+            if key == POS or (isinstance(key, str) and key.upper() == "POS"):
+                if isinstance(value, str) and value.upper() in POS_IDS:
+                    value = POS_IDS[value.upper()]
+                elif isinstance(value, int) and value not in POS_IDS.values():
+                    warnings.warn(Warnings.W100.format(feature={key: value}))
+                    continue
+                out[POS] = value
+            # accept any string or ID fields and values and convert to strings
+            elif isinstance(key, (int, str)) and isinstance(value, (int, str)):
+                key = self.strings.as_string(key)
+                value = self.strings.as_string(value)
+                # sort multiple values so equal feature sets compare equal
+                if self.VALUE_SEP in value:
+                    value = self.VALUE_SEP.join(sorted(value.split(self.VALUE_SEP)))
+                out[key] = value
+            else:
+                warnings.warn(Warnings.W100.format(feature={key: value}))
+        return out
+
     cdef MorphAnalysisC create_morph_tag(self, field_feature_pairs) except *:
         """Creates a MorphAnalysisC from a list of intified
         ("Field", "Field=Value") tuples where fields with multiple values have
@@ -183,7 +181,7 @@ cdef class Morphology:
     def get(self, hash_t morph):
         tag = <MorphAnalysisC*>self.tags.get(morph)
         if tag == NULL:
-            return []
+            return ""
         else:
             return self.strings[tag.key]
 
@@ -218,7 +216,7 @@ cdef class Morphology:
         orth (str): The word-form to key the exception.
         """
         attrs = dict(attrs)
-        attrs = _normalize_props(attrs)
+        attrs = self.normalize_attrs(attrs)
         self.add(attrs)
         attrs = intify_attrs(attrs, self.strings, _do_deprecated=True)
         self._exc[(tag_str, self.strings.add(orth_str))] = attrs
@@ -282,7 +280,7 @@ cdef class Morphology:
         # Map (form, pos) to attributes
         for tag, exc in morph_rules.items():
             for orth, attrs in exc.items():
-                attrs = _normalize_props(attrs)
+                attrs = self.normalize_attrs(attrs)
                 self.add_special_case(self.strings.as_string(tag), self.strings.as_string(orth), attrs)
 
     @property
@@ -309,19 +307,6 @@ cdef class Morphology:
             return ""
         return Morphology.FEATURE_SEP.join(sorted([Morphology.FIELD_SEP.join([field, Morphology.VALUE_SEP.join(sorted(values.split(Morphology.VALUE_SEP)))]) for field, values in feats_dict.items()]))
 
-    @staticmethod
-    def list_to_feats(feats_list):
-        if len(feats_list) == 0:
-            return ""
-        feats_dict = {}
-        for feat in feats_list:
-            field, value = feat.split(Morphology.FIELD_SEP)
-            if field not in feats_dict:
-                feats_dict[field] = set()
-            feats_dict[field].add(value)
-        feats_dict = {field: Morphology.VALUE_SEP.join(sorted(values)) for field, values in feats_dict.items()}
-        return Morphology.dict_to_feats(feats_dict)
-
 
 cdef int check_feature(const MorphAnalysisC* morph, attr_t feature) nogil:
     cdef int i
diff --git a/spacy/tests/doc/test_morphanalysis.py b/spacy/tests/doc/test_morphanalysis.py
index 41b4acd0c..88557d100 100644
--- a/spacy/tests/doc/test_morphanalysis.py
+++ b/spacy/tests/doc/test_morphanalysis.py
@@ -35,7 +35,7 @@ def test_token_morph_key(i_has):
 
 
 def test_morph_props(i_has):
-    assert i_has[0].morph.get("PronType") == ["PronType=prs"]
+    assert i_has[0].morph.get("PronType") == ["prs"]
     assert i_has[1].morph.get("PronType") == []
 
 
@@ -47,20 +47,20 @@ def test_morph_iter(i_has):
 
 
 def test_morph_get(i_has):
-    assert i_has[0].morph.get("PronType") == ["PronType=prs"]
+    assert i_has[0].morph.get("PronType") == ["prs"]
 
 
 def test_morph_set(i_has):
-    assert i_has[0].morph.get("PronType") == ["PronType=prs"]
+    assert i_has[0].morph.get("PronType") == ["prs"]
     # set by string
     i_has[0].morph_ = "PronType=unk"
-    assert i_has[0].morph.get("PronType") == ["PronType=unk"]
+    assert i_has[0].morph.get("PronType") == ["unk"]
     # set by string, fields are alphabetized
     i_has[0].morph_ = "PronType=123|NounType=unk"
     assert i_has[0].morph_ == "NounType=unk|PronType=123"
     # set by dict
-    i_has[0].morph_ = {"AType": "123", "BType": "unk", "POS": "ADJ"}
-    assert i_has[0].morph_ == "AType=123|BType=unk|POS=ADJ"
+    i_has[0].morph_ = {"AType": "123", "BType": "unk"}
+    assert i_has[0].morph_ == "AType=123|BType=unk"
     # set by string with multiple values, fields and values are alphabetized
     i_has[0].morph_ = "BType=c|AType=b,a"
     assert i_has[0].morph_ == "AType=a,b|BType=c"
diff --git a/spacy/tests/morphology/test_morph_converters.py b/spacy/tests/morphology/test_morph_converters.py
index 9486cad45..6973bf782 100644
--- a/spacy/tests/morphology/test_morph_converters.py
+++ b/spacy/tests/morphology/test_morph_converters.py
@@ -4,10 +4,8 @@ from spacy.morphology import Morphology
 def test_feats_converters():
     feats = "Case=dat,gen|Number=sing"
     feats_dict = {"Case": "dat,gen", "Number": "sing"}
-    feats_list = feats.split(Morphology.FEATURE_SEP)
 
     # simple conversions
-    assert Morphology.list_to_feats(feats_list) == feats
     assert Morphology.dict_to_feats(feats_dict) == feats
     assert Morphology.feats_to_dict(feats) == feats_dict
 
@@ -18,8 +16,6 @@ def test_feats_converters():
     # unsorted input is normalized
     unsorted_feats = "Number=sing|Case=gen,dat"
     unsorted_feats_dict = {"Case": "gen,dat", "Number": "sing"}
-    unsorted_feats_list = feats.split(Morphology.FEATURE_SEP)
     assert Morphology.feats_to_dict(unsorted_feats) == feats_dict
     assert Morphology.dict_to_feats(unsorted_feats_dict) == feats
-    assert Morphology.list_to_feats(unsorted_feats_list) == feats
     assert Morphology.dict_to_feats(Morphology.feats_to_dict(unsorted_feats)) == feats
diff --git a/spacy/tokens/morphanalysis.pyx b/spacy/tokens/morphanalysis.pyx
index c49aede4d..a7d1f2e44 100644
--- a/spacy/tokens/morphanalysis.pyx
+++ b/spacy/tokens/morphanalysis.pyx
@@ -2,6 +2,7 @@ from libc.string cimport memset
 cimport numpy as np
 
 from ..errors import Errors
+from ..morphology import Morphology
 from ..vocab cimport Vocab
 from ..typedefs cimport hash_t, attr_t
 from ..morphology cimport list_features, check_feature, get_by_field
@@ -58,10 +59,11 @@ cdef class MorphAnalysis:
         return self.key != other.key
 
     def get(self, field):
-        """Retrieve a feature by field."""
+        """Retrieve feature values by field."""
         cdef attr_t field_id = self.vocab.strings.as_int(field)
         cdef np.ndarray results = get_by_field(&self.c, field_id)
-        return [self.vocab.strings[result] for result in results]
+        features = [self.vocab.strings[result] for result in results]
+        return [f.split(Morphology.FIELD_SEP)[1] for f in features]
 
     def to_json(self):
         """Produce a json serializable representation as a UD FEATS-style