Morphology/Morphologizer optimizations and refactoring (#11024)

* `Morphology`: Refactor to use C types, reduce allocations, remove unused code * `Morphologizer`: Avoid unnecessary sorting of morpho features * `Morphologizer`: Remove excessive reallocations of labels, improve hash lookups of labels, coerce `numpy` numeric types to native ints Update docs * Remove unused method * Replace `unique_ptr` usage with `shared_ptr` * Add type annotations to internal Python methods, rename `hash` variable, fix typos * Add comment to clarify implementation detail * Fix return type * `Morphology`: Stop early when splitting fields and values
2024-12-24 17:06:29 +03:00 · 2022-07-15 11:14:08 +02:00 · 2022-07-15 11:14:08 +02:00 · ba18d2913d
commit ba18d2913d
parent 851a7ca4fa
8 changed files with 235 additions and 161 deletions
--- a/spacy/morphology.pxd
+++ b/spacy/morphology.pxd
@ -1,23 +1,41 @@
-from cymem.cymem cimport Pool
-from preshed.maps cimport PreshMap
 cimport numpy as np
-from libc.stdint cimport uint64_t
+from libc.stdint cimport uint32_t, uint64_t
+from libcpp.unordered_map cimport unordered_map
+from libcpp.vector cimport vector
+from libcpp.memory cimport shared_ptr

-from .structs cimport MorphAnalysisC
 from .strings cimport StringStore
 from .typedefs cimport attr_t, hash_t


+cdef cppclass Feature:
+    hash_t field
+    hash_t value
+
+    __init__():
+        this.field = 0
+        this.value = 0
+
+
+cdef cppclass MorphAnalysisC:
+    hash_t key  
+    vector[Feature] features
+
+    __init__():
+        this.key = 0
+
 cdef class Morphology:
-    cdef readonly Pool mem
    cdef readonly StringStore strings
-    cdef PreshMap tags # Keyed by hash, value is pointer to tag
+    cdef unordered_map[hash_t, shared_ptr[MorphAnalysisC]] tags

-    cdef MorphAnalysisC create_morph_tag(self, field_feature_pairs) except *
-    cdef int insert(self, MorphAnalysisC tag) except -1
+    cdef shared_ptr[MorphAnalysisC] _lookup_tag(self, hash_t tag_hash)
+    cdef void _intern_morph_tag(self, hash_t tag_key, feats)
+    cdef hash_t _add(self, features)
+    cdef str _normalize_features(self, features)
+    cdef str get_morph_str(self, hash_t morph_key)
+    cdef shared_ptr[MorphAnalysisC] get_morph_c(self, hash_t morph_key)

-
-cdef int check_feature(const MorphAnalysisC* morph, attr_t feature) nogil
-cdef list list_features(const MorphAnalysisC* morph)
-cdef np.ndarray get_by_field(const MorphAnalysisC* morph, attr_t field)
-cdef int get_n_by_field(attr_t* results, const MorphAnalysisC* morph, attr_t field) nogil
+cdef int check_feature(const shared_ptr[MorphAnalysisC] morph, attr_t feature) nogil
+cdef list list_features(const shared_ptr[MorphAnalysisC] morph)
+cdef np.ndarray get_by_field(const shared_ptr[MorphAnalysisC] morph, attr_t field)
+cdef int get_n_by_field(attr_t* results, const shared_ptr[MorphAnalysisC] morph, attr_t field) nogil
--- a/spacy/morphology.pyx
+++ b/spacy/morphology.pyx
@ -1,10 +1,10 @@
 # cython: infer_types
 import numpy
 import warnings
+from typing import Union, Tuple, List, Dict, Optional
+from cython.operator cimport dereference as deref
+from libcpp.memory cimport shared_ptr

-from .attrs cimport POS
-
-from .parts_of_speech import IDS as POS_IDS
 from .errors import Warnings
 from . import symbols

@ -24,134 +24,187 @@ cdef class Morphology:
    EMPTY_MORPH = symbols.NAMES[symbols._]

    def __init__(self, StringStore strings):
-        self.mem = Pool()
        self.strings = strings
-        self.tags = PreshMap()

    def __reduce__(self):
        tags = set([self.get(self.strings[s]) for s in self.strings])
        tags -= set([""])
        return (unpickle_morphology, (self.strings, sorted(tags)), None, None)

-    def add(self, features):
+    cdef shared_ptr[MorphAnalysisC] _lookup_tag(self, hash_t tag_hash):
+        match = self.tags.find(tag_hash)
+        if match != self.tags.const_end():
+            return deref(match).second
+        else:
+            return shared_ptr[MorphAnalysisC]()
+
+    def _normalize_attr(self, attr_key : Union[int, str], attr_value : Union[int, str]) -> Optional[Tuple[str, Union[str, List[str]]]]:
+        if isinstance(attr_key, (int, str)) and isinstance(attr_value, (int, str)):
+            attr_key = self.strings.as_string(attr_key)
+            attr_value = self.strings.as_string(attr_value)
+
+            # Preserve multiple values as a list
+            if self.VALUE_SEP in attr_value:
+                values = attr_value.split(self.VALUE_SEP)
+                values.sort()
+                attr_value = values
+        else:
+            warnings.warn(Warnings.W100.format(feature={attr_key: attr_value}))
+            return None
+
+        return attr_key, attr_value
+
+    def _str_to_normalized_feat_dict(self, feats: str) -> Dict[str, str]:
+        if not feats or feats == self.EMPTY_MORPH:
+            return {}
+
+        out = []
+        for feat in feats.split(self.FEATURE_SEP):
+            field, values = feat.split(self.FIELD_SEP, 1)
+            normalized_attr = self._normalize_attr(field, values)
+            if normalized_attr is None:
+                continue
+            out.append((normalized_attr[0], normalized_attr[1]))
+        out.sort(key=lambda x: x[0])
+        return dict(out)
+
+    def _dict_to_normalized_feat_dict(self, feats: Dict[Union[int, str], Union[int, str]]) -> Dict[str, str]:
+        out = []
+        for field, values in feats.items():
+            normalized_attr = self._normalize_attr(field, values)
+            if normalized_attr is None:
+                continue
+            out.append((normalized_attr[0], normalized_attr[1]))
+        out.sort(key=lambda x: x[0])
+        return dict(out)
+
+
+    def _normalized_feat_dict_to_str(self, feats: Dict[str, str]) -> str:
+        norm_feats_string = self.FEATURE_SEP.join([
+                self.FIELD_SEP.join([field, self.VALUE_SEP.join(values) if isinstance(values, list) else values])
+            for field, values in feats.items()
+        ])
+        return norm_feats_string or self.EMPTY_MORPH
+
+
+    cdef hash_t _add(self, features):
        """Insert a morphological analysis in the morphology table, if not
        already present. The morphological analysis may be provided in the UD
        FEATS format as a string or in the tag map dict format.
        Returns the hash of the new analysis.
        """
-        cdef MorphAnalysisC* tag_ptr
+        cdef hash_t tag_hash = 0
+        cdef shared_ptr[MorphAnalysisC] tag
        if isinstance(features, str):
            if features == "":
                features = self.EMPTY_MORPH
-            tag_ptr = <MorphAnalysisC*>self.tags.get(<hash_t>self.strings[features])
-            if tag_ptr != NULL:
-                return tag_ptr.key
-            features = self.feats_to_dict(features)
-        if not isinstance(features, dict):
+
+            tag_hash = self.strings[features]
+            tag = self._lookup_tag(tag_hash)
+            if tag:
+                return deref(tag).key
+
+            features = self._str_to_normalized_feat_dict(features)
+        elif isinstance(features, dict):
+            features = self._dict_to_normalized_feat_dict(features)
+        else:
            warnings.warn(Warnings.W100.format(feature=features))
            features = {}
-        string_features = {self.strings.as_string(field): self.strings.as_string(values) for field, values in features.items()}
-        # intified ("Field", "Field=Value") pairs
-        field_feature_pairs = []
-        for field in sorted(string_features):
-            values = string_features[field]
-            for value in values.split(self.VALUE_SEP):
-                field_feature_pairs.append((
-                    self.strings.add(field),
-                    self.strings.add(field + self.FIELD_SEP + value),
-                ))
-        cdef MorphAnalysisC tag = self.create_morph_tag(field_feature_pairs)
+
        # the hash key for the tag is either the hash of the normalized UFEATS
        # string or the hash of an empty placeholder
-        norm_feats_string = self.normalize_features(features)
-        tag.key = self.strings.add(norm_feats_string)
-        self.insert(tag)
-        return tag.key
+        norm_feats_string = self._normalized_feat_dict_to_str(features)
+        tag_hash = self.strings.add(norm_feats_string)
+        tag = self._lookup_tag(tag_hash)
+        if tag:
+            return deref(tag).key

-    def normalize_features(self, features):
+        self._intern_morph_tag(tag_hash, features)
+        return tag_hash
+
+    cdef void _intern_morph_tag(self, hash_t tag_key, feats):
+        # intified ("Field", "Field=Value") pairs where fields with multiple values have
+        # been split into individual tuples, e.g.:
+        # [("Field1", "Field1=Value1"), ("Field1", "Field1=Value2"),
+        # ("Field2", "Field2=Value3")]
+        field_feature_pairs = []
+
+        # Feat dict is normalized at this point.
+        for field, values in feats.items():
+            field_key = self.strings.add(field)
+            if isinstance(values, list):
+                for value in values:
+                    value_key = self.strings.add(field + self.FIELD_SEP + value)
+                    field_feature_pairs.append((field_key, value_key))
+            else:
+                # We could box scalar values into a list and use a common
+                # code path to generate features but that incurs a small 
+                # but measurable allocation/iteration overhead (as this
+                # branch is taken often enough).
+                value_key = self.strings.add(field + self.FIELD_SEP + values)
+                field_feature_pairs.append((field_key, value_key))
+
+        num_features = len(field_feature_pairs)
+        cdef shared_ptr[MorphAnalysisC] tag = shared_ptr[MorphAnalysisC](new MorphAnalysisC())
+        deref(tag).key = tag_key
+        deref(tag).features.resize(num_features)
+
+        for i in range(num_features):
+            deref(tag).features[i].field = field_feature_pairs[i][0]
+            deref(tag).features[i].value = field_feature_pairs[i][1]
+
+        self.tags[tag_key] = tag
+
+    cdef str get_morph_str(self, hash_t morph_key):
+        cdef shared_ptr[MorphAnalysisC] tag = self._lookup_tag(morph_key)
+        if not tag:
+            return ""
+        else:
+            return self.strings[deref(tag).key]
+
+    cdef shared_ptr[MorphAnalysisC] get_morph_c(self, hash_t morph_key):
+        return self._lookup_tag(morph_key)
+
+    cdef str _normalize_features(self, features):
        """Create a normalized FEATS string from a features string or dict.

        features (Union[dict, str]): Features as dict or UFEATS string.
        RETURNS (str): Features as normalized UFEATS string.
        """
        if isinstance(features, str):
-            features = self.feats_to_dict(features)
-        if not isinstance(features, dict):
+            features = self._str_to_normalized_feat_dict(features)
+        elif isinstance(features, dict):
+            features = self._dict_to_normalized_feat_dict(features)
+        else:
            warnings.warn(Warnings.W100.format(feature=features))
            features = {}
-        features = self.normalize_attrs(features)
-        string_features = {self.strings.as_string(field): self.strings.as_string(values) for field, values in features.items()}
-        # normalized UFEATS string with sorted fields and values
-        norm_feats_string = self.FEATURE_SEP.join(sorted([
-                self.FIELD_SEP.join([field, values])
-            for field, values in string_features.items()
-        ]))
-        return norm_feats_string or self.EMPTY_MORPH

-    def normalize_attrs(self, attrs):
-        """Convert attrs dict so that POS is always by ID, other features are
-        by string. Values separated by VALUE_SEP are sorted.
-        """
-        out = {}
-        attrs = dict(attrs)
-        for key, value in attrs.items():
-            # convert POS value to ID
-            if key == POS or (isinstance(key, str) and key.upper() == "POS"):
-                if isinstance(value, str) and value.upper() in POS_IDS:
-                    value = POS_IDS[value.upper()]
-                elif isinstance(value, int) and value not in POS_IDS.values():
-                    warnings.warn(Warnings.W100.format(feature={key: value}))
-                    continue
-                out[POS] = value
-            # accept any string or ID fields and values and convert to strings
-            elif isinstance(key, (int, str)) and isinstance(value, (int, str)):
-                key = self.strings.as_string(key)
-                value = self.strings.as_string(value)
-                # sort values
-                if self.VALUE_SEP in value:
-                    value = self.VALUE_SEP.join(sorted(value.split(self.VALUE_SEP)))
-                out[key] = value
-            else:
-                warnings.warn(Warnings.W100.format(feature={key: value}))
-        return out
+        return self._normalized_feat_dict_to_str(features)

-    cdef MorphAnalysisC create_morph_tag(self, field_feature_pairs) except *:
-        """Creates a MorphAnalysisC from a list of intified
-        ("Field", "Field=Value") tuples where fields with multiple values have
-        been split into individual tuples, e.g.:
-        [("Field1", "Field1=Value1"), ("Field1", "Field1=Value2"),
-        ("Field2", "Field2=Value3")]
-        """
-        cdef MorphAnalysisC tag
-        tag.length = len(field_feature_pairs)
-        if tag.length > 0:
-            tag.fields = <attr_t*>self.mem.alloc(tag.length, sizeof(attr_t))
-            tag.features = <attr_t*>self.mem.alloc(tag.length, sizeof(attr_t))
-        for i, (field, feature) in enumerate(field_feature_pairs):
-            tag.fields[i] = field
-            tag.features[i] = feature
-        return tag
+    def add(self, features):
+        return self._add(features)

-    cdef int insert(self, MorphAnalysisC tag) except -1:
-        cdef hash_t key = tag.key
-        if self.tags.get(key) == NULL:
-            tag_ptr = <MorphAnalysisC*>self.mem.alloc(1, sizeof(MorphAnalysisC))
-            tag_ptr[0] = tag
-            self.tags.set(key, <void*>tag_ptr)
+    def get(self, morph_key):
+        return self.get_morph_str(morph_key)

-    def get(self, hash_t morph):
-        tag = <MorphAnalysisC*>self.tags.get(morph)
-        if tag == NULL:
-            return ""
-        else:
-            return self.strings[tag.key]
+    def normalize_features(self, features):
+        return self._normalize_features(features)

    @staticmethod
-    def feats_to_dict(feats):
+    def feats_to_dict(feats, *, sort_values=True):
        if not feats or feats == Morphology.EMPTY_MORPH:
            return {}
-        return {field: Morphology.VALUE_SEP.join(sorted(values.split(Morphology.VALUE_SEP))) for field, values in
-                [feat.split(Morphology.FIELD_SEP) for feat in feats.split(Morphology.FEATURE_SEP)]}
+
+        out = {}
+        for feat in feats.split(Morphology.FEATURE_SEP):
+            field, values = feat.split(Morphology.FIELD_SEP, 1)
+            if sort_values:
+                values = values.split(Morphology.VALUE_SEP)
+                values.sort()
+                values = Morphology.VALUE_SEP.join(values)
+
+            out[field] = values
+        return out

    @staticmethod
    def dict_to_feats(feats_dict):
@ -160,34 +213,34 @@ cdef class Morphology:
        return Morphology.FEATURE_SEP.join(sorted([Morphology.FIELD_SEP.join([field, Morphology.VALUE_SEP.join(sorted(values.split(Morphology.VALUE_SEP)))]) for field, values in feats_dict.items()]))


-cdef int check_feature(const MorphAnalysisC* morph, attr_t feature) nogil:
+cdef int check_feature(const shared_ptr[MorphAnalysisC] morph, attr_t feature) nogil:
    cdef int i
-    for i in range(morph.length):
-        if morph.features[i] == feature:
+    for i in range(deref(morph).features.size()):
+        if deref(morph).features[i].value == feature:
            return True
    return False


-cdef list list_features(const MorphAnalysisC* morph):
+cdef list list_features(const shared_ptr[MorphAnalysisC] morph):
    cdef int i
    features = []
-    for i in range(morph.length):
-        features.append(morph.features[i])
+    for i in range(deref(morph).features.size()):
+        features.append(deref(morph).features[i].value)
    return features


-cdef np.ndarray get_by_field(const MorphAnalysisC* morph, attr_t field):
-    cdef np.ndarray results = numpy.zeros((morph.length,), dtype="uint64")
+cdef np.ndarray get_by_field(const shared_ptr[MorphAnalysisC] morph, attr_t field):
+    cdef np.ndarray results = numpy.zeros((deref(morph).features.size(),), dtype="uint64")
    n = get_n_by_field(<uint64_t*>results.data, morph, field)
    return results[:n]


-cdef int get_n_by_field(attr_t* results, const MorphAnalysisC* morph, attr_t field) nogil:
+cdef int get_n_by_field(attr_t* results, const shared_ptr[MorphAnalysisC] morph, attr_t field) nogil:
    cdef int n_results = 0
    cdef int i
-    for i in range(morph.length):
-        if morph.fields[i] == field:
-            results[n_results] = morph.features[i]
+    for i in range(deref(morph).features.size()):
+        if deref(morph).features[i].field == field:
+            results[n_results] = deref(morph).features[i].value
            n_results += 1
    return n_results

--- a/spacy/pipeline/morphologizer.pyx
+++ b/spacy/pipeline/morphologizer.pyx
@ -127,8 +127,8 @@ class Morphologizer(Tagger):

    @property
    def labels(self):
-        """RETURNS (Tuple[str]): The labels currently added to the component."""
-        return tuple(self.cfg["labels_morph"].keys())
+        """RETURNS (Iterable[str]): The labels currently added to the component."""
+        return self.cfg["labels_morph"].keys()

    @property
    def label_data(self) -> Dict[str, Dict[str, Union[str, float, int, None]]]:
@ -151,7 +151,7 @@ class Morphologizer(Tagger):
        # normalize label
        norm_label = self.vocab.morphology.normalize_features(label)
        # extract separate POS and morph tags
-        label_dict = Morphology.feats_to_dict(label)
+        label_dict = Morphology.feats_to_dict(label, sort_values=False)
        pos = label_dict.get(self.POS_FEAT, "")
        if self.POS_FEAT in label_dict:
            label_dict.pop(self.POS_FEAT)
@ -189,7 +189,7 @@ class Morphologizer(Tagger):
                        continue
                    morph = str(token.morph)
                    # create and add the combined morph+POS label
-                    morph_dict = Morphology.feats_to_dict(morph)
+                    morph_dict = Morphology.feats_to_dict(morph, sort_values=False)
                    if pos:
                        morph_dict[self.POS_FEAT] = pos
                    norm_label = self.vocab.strings[self.vocab.morphology.add(morph_dict)]
@ -206,7 +206,7 @@ class Morphologizer(Tagger):
            for i, token in enumerate(example.reference):
                pos = token.pos_
                morph = str(token.morph)
-                morph_dict = Morphology.feats_to_dict(morph)
+                morph_dict = Morphology.feats_to_dict(morph, sort_values=False)
                if pos:
                    morph_dict[self.POS_FEAT] = pos
                norm_label = self.vocab.strings[self.vocab.morphology.add(morph_dict)]
@ -231,26 +231,29 @@ class Morphologizer(Tagger):
        cdef Vocab vocab = self.vocab
        cdef bint overwrite = self.cfg["overwrite"]
        cdef bint extend = self.cfg["extend"]
-        labels = self.labels
+
+        # We require random access for the upcoming ops, so we need
+        # to allocate a compatible container out of the iterable.
+        labels = tuple(self.labels)
        for i, doc in enumerate(docs):
            doc_tag_ids = batch_tag_ids[i]
            if hasattr(doc_tag_ids, "get"):
                doc_tag_ids = doc_tag_ids.get()
            for j, tag_id in enumerate(doc_tag_ids):
-                morph = labels[tag_id]
+                morph = labels[int(tag_id)]
                # set morph
                if doc.c[j].morph == 0 or overwrite or extend:
                    if overwrite and extend:
                        # morphologizer morph overwrites any existing features
                        # while extending
-                        extended_morph = Morphology.feats_to_dict(self.vocab.strings[doc.c[j].morph])
-                        extended_morph.update(Morphology.feats_to_dict(self.cfg["labels_morph"].get(morph, 0)))
+                        extended_morph = Morphology.feats_to_dict(self.vocab.strings[doc.c[j].morph], sort_values=False)
+                        extended_morph.update(Morphology.feats_to_dict(self.cfg["labels_morph"].get(morph, 0), sort_values=False))
                        doc.c[j].morph = self.vocab.morphology.add(extended_morph)
                    elif extend:
                        # existing features are preserved and any new features
                        # are added
-                        extended_morph = Morphology.feats_to_dict(self.cfg["labels_morph"].get(morph, 0))
-                        extended_morph.update(Morphology.feats_to_dict(self.vocab.strings[doc.c[j].morph]))
+                        extended_morph = Morphology.feats_to_dict(self.cfg["labels_morph"].get(morph, 0), sort_values=False)
+                        extended_morph.update(Morphology.feats_to_dict(self.vocab.strings[doc.c[j].morph], sort_values=False))
                        doc.c[j].morph = self.vocab.morphology.add(extended_morph)
                    else:
                        # clobber
@ -270,7 +273,7 @@ class Morphologizer(Tagger):
        DOCS: https://spacy.io/api/morphologizer#get_loss
        """
        validate_examples(examples, "Morphologizer.get_loss")
-        loss_func = SequenceCategoricalCrossentropy(names=self.labels, normalize=False)
+        loss_func = SequenceCategoricalCrossentropy(names=tuple(self.labels), normalize=False)
        truths = []
        for eg in examples:
            eg_truths = []
@ -291,7 +294,7 @@ class Morphologizer(Tagger):
                    label = None
                # Otherwise, generate the combined label
                else:
-                    label_dict = Morphology.feats_to_dict(morph)
+                    label_dict = Morphology.feats_to_dict(morph, sort_values=False)
                    if pos:
                        label_dict[self.POS_FEAT] = pos
                    label = self.vocab.strings[self.vocab.morphology.add(label_dict)]
--- a/spacy/structs.pxd
+++ b/spacy/structs.pxd
@ -58,14 +58,6 @@ cdef struct TokenC:
    hash_t ent_id


-cdef struct MorphAnalysisC:
-    hash_t key
-    int length
-
-    attr_t* fields
-    attr_t* features
-
-
 # Internal struct, for storage and disambiguation of entities.
 cdef struct KBEntryC:

--- a/spacy/tokens/morphanalysis.pxd
+++ b/spacy/tokens/morphanalysis.pxd
@ -1,9 +1,12 @@
 from ..vocab cimport Vocab
 from ..typedefs cimport hash_t
-from ..structs cimport MorphAnalysisC
+from ..morphology cimport MorphAnalysisC
+from libcpp.memory cimport shared_ptr


 cdef class MorphAnalysis:
    cdef readonly Vocab vocab
    cdef readonly hash_t key
-    cdef MorphAnalysisC c
+    cdef shared_ptr[MorphAnalysisC] c
+
+    cdef void _init_c(self, hash_t key)
--- a/spacy/tokens/morphanalysis.pyx
+++ b/spacy/tokens/morphanalysis.pyx
@ -5,7 +5,12 @@ from ..errors import Errors
 from ..morphology import Morphology
 from ..vocab cimport Vocab
 from ..typedefs cimport hash_t, attr_t
-from ..morphology cimport list_features, check_feature, get_by_field
+from ..morphology cimport list_features, check_feature, get_by_field, MorphAnalysisC
+from libcpp.memory cimport shared_ptr
+from cython.operator cimport dereference as deref
+
+
+cdef shared_ptr[MorphAnalysisC] EMPTY_MORPH_TAG = shared_ptr[MorphAnalysisC](new MorphAnalysisC())


 cdef class MorphAnalysis:
@ -13,39 +18,38 @@ cdef class MorphAnalysis:
    def __init__(self, Vocab vocab, features=dict()):
        self.vocab = vocab
        self.key = self.vocab.morphology.add(features)
-        analysis = <const MorphAnalysisC*>self.vocab.morphology.tags.get(self.key)
-        if analysis is not NULL:
-            self.c = analysis[0]
+        self._init_c(self.key)
+
+    cdef void _init_c(self, hash_t key):
+        cdef shared_ptr[MorphAnalysisC] analysis = self.vocab.morphology.get_morph_c(key)
+        if analysis:
+            self.c = analysis
        else:
-            memset(&self.c, 0, sizeof(self.c))
+            self.c = EMPTY_MORPH_TAG

    @classmethod
    def from_id(cls, Vocab vocab, hash_t key):
        """Create a morphological analysis from a given ID."""
-        cdef MorphAnalysis morph = MorphAnalysis.__new__(MorphAnalysis, vocab)
+        cdef MorphAnalysis morph = MorphAnalysis(vocab)
        morph.vocab = vocab
        morph.key = key
-        analysis = <const MorphAnalysisC*>vocab.morphology.tags.get(key)
-        if analysis is not NULL:
-            morph.c = analysis[0]
-        else:
-            memset(&morph.c, 0, sizeof(morph.c))
+        morph._init_c(key)
        return morph

    def __contains__(self, feature):
        """Test whether the morphological analysis contains some feature."""
        cdef attr_t feat_id = self.vocab.strings.as_int(feature)
-        return check_feature(&self.c, feat_id)
+        return check_feature(self.c, feat_id)

    def __iter__(self):
        """Iterate over the features in the analysis."""
        cdef attr_t feature
-        for feature in list_features(&self.c):
+        for feature in list_features(self.c):
            yield self.vocab.strings[feature]

    def __len__(self):
        """The number of features in the analysis."""
-        return self.c.length
+        return deref(self.c).features.size()

    def __hash__(self):
        return self.key
@ -61,7 +65,7 @@ cdef class MorphAnalysis:
    def get(self, field):
        """Retrieve feature values by field."""
        cdef attr_t field_id = self.vocab.strings.as_int(field)
-        cdef np.ndarray results = get_by_field(&self.c, field_id)
+        cdef np.ndarray results = get_by_field(self.c, field_id)
        features = [self.vocab.strings[result] for result in results]
        return [f.split(Morphology.FIELD_SEP)[1] for f in features]

@ -69,7 +73,7 @@ cdef class MorphAnalysis:
        """Produce a json serializable representation as a UD FEATS-style
        string.
        """
-        morph_string = self.vocab.strings[self.c.key]
+        morph_string = self.vocab.strings[deref(self.c).key]
        if morph_string == self.vocab.morphology.EMPTY_MORPH:
            return ""
        return morph_string
--- a/spacy/tokens/token.pyx
+++ b/spacy/tokens/token.pyx
@ -22,6 +22,7 @@ from .. import parts_of_speech
 from ..errors import Errors, Warnings
 from ..attrs import IOB_STRINGS
 from .underscore import Underscore, get_ext_args
+from cython.operator cimport dereference as deref


 cdef class Token:
@ -230,7 +231,7 @@ cdef class Token:
            # Check that the morph has the same vocab
            if self.vocab != morph.vocab:
                raise ValueError(Errors.E1013)
-            self.c.morph = morph.c.key
+            self.c.morph = deref(morph.c).key

    def set_morph(self, features):
        cdef hash_t key
--- a/website/docs/api/morphologizer.md
+++ b/website/docs/api/morphologizer.md
@ -401,7 +401,7 @@ coarse-grained POS as the feature `POS`.

 | Name        | Description                                            |
 | ----------- | ------------------------------------------------------ |
-| **RETURNS** | The labels added to the component. ~~Tuple[str, ...]~~ |
+| **RETURNS** | The labels added to the component. ~~Iterable[str]~~ |

 ## Morphologizer.label_data {#label_data tag="property" new="3"}