Handle unset token.morph in Morphologizer (#6704)

* Handle unset token.morph in Morphologizer

  Handle unset `token.morph` in `Morphologizer.initialize` and
  `Morphologizer.get_loss`. If both `token.morph` and `token.pos` are unset,
  treat the annotation as missing rather than empty.

* Add token.has_morph()
2025-08-06 05:10:21 +03:00 · 2021-01-15 17:20:10 +01:00 · 2021-01-15 17:20:10 +01:00 · 9328dd5625
commit 9328dd5625
parent 7b3f0c6f1b
4 changed files with 58 additions and 3 deletions
--- a/spacy/pipeline/morphologizer.pyx
+++ b/spacy/pipeline/morphologizer.pyx
@ -145,6 +145,10 @@ class Morphologizer(Tagger):
            for example in get_examples():
                for i, token in enumerate(example.reference):
                    pos = token.pos_
+                    # if both are unset, annotation is missing, so do not add
+                    # an empty label
+                    if pos == "" and not token.has_morph():
+                        continue
                    morph = str(token.morph)
                    # create and add the combined morph+POS label
                    morph_dict = Morphology.feats_to_dict(morph)
@ -155,7 +159,7 @@ class Morphologizer(Tagger):
                    if norm_label not in self.cfg["labels_morph"]:
                        self.cfg["labels_morph"][norm_label] = morph
                        self.cfg["labels_pos"][norm_label] = POS_IDS[pos]
-        if len(self.labels) <= 1:
+        if len(self.labels) < 1:
            raise ValueError(Errors.E143.format(name=self.name))
        doc_sample = []
        label_sample = []
@ -217,15 +221,24 @@ class Morphologizer(Tagger):
                pos = pos_tags[i]
                morph = morphs[i]
                # POS may align (same value for multiple tokens) when morph
-                # doesn't, so if either is None, treat both as None here so that
-                # truths doesn't end up with an unknown morph+POS combination
+                # doesn't, so if either is misaligned (None), treat the
+                # annotation as missing so that truths doesn't end up with an
+                # unknown morph+POS combination
                if pos is None or morph is None:
                    label = None
+                # If both are unset, the annotation is missing (empty morph
+                # converted from int is "_" rather than "")
+                elif pos == "" and morph == "":
+                    label = None
+                # Otherwise, generate the combined label
                else:
                    label_dict = Morphology.feats_to_dict(morph)
                    if pos:
                        label_dict[self.POS_FEAT] = pos
                    label = self.vocab.strings[self.vocab.morphology.add(label_dict)]
+                    # As a fail-safe, skip any unrecognized labels
+                    if label not in self.labels:
+                        label = None
                eg_truths.append(label)
            truths.append(eg_truths)
        d_scores, loss = loss_func(scores, truths)
--- a/spacy/tests/pipeline/test_morphologizer.py
+++ b/spacy/tests/pipeline/test_morphologizer.py
@ -136,3 +136,28 @@ def test_overfitting_IO():
    gold_pos_tags = ["", "", "", ""]
    assert [str(t.morph) for t in doc] == gold_morphs
    assert [t.pos_ for t in doc] == gold_pos_tags
+
+    # Test with unset morph and partial POS
+    nlp.remove_pipe("morphologizer")
+    nlp.add_pipe("morphologizer")
+    for example in train_examples:
+        for token in example.reference:
+            if token.text == "ham":
+                token.pos_ = "NOUN"
+            else:
+                token.pos_ = ""
+            token.set_morph(None)
+    optimizer = nlp.initialize(get_examples=lambda: train_examples)
+    print(nlp.get_pipe("morphologizer").labels)
+    for i in range(50):
+        losses = {}
+        nlp.update(train_examples, sgd=optimizer, losses=losses)
+    assert losses["morphologizer"] < 0.00001
+
+    # Test the trained model
+    test_text = "I like blue ham"
+    doc = nlp(test_text)
+    gold_morphs = ["", "", "", ""]
+    gold_pos_tags = ["NOUN", "NOUN", "NOUN", "NOUN"]
+    assert [str(t.morph) for t in doc] == gold_morphs
+    assert [t.pos_ for t in doc] == gold_pos_tags
--- a/spacy/tokens/token.pyx
+++ b/spacy/tokens/token.pyx
@ -211,6 +211,14 @@ cdef class Token:
        xp = get_array_module(vector)
        return (xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm))

+    def has_morph(self):
+        """Report whether morph information has been annotated on the token.
+        An unset/missing morph annotation yields False.
+
+        RETURNS (bool): Whether the morph annotation is set.
+        """
+        return self.c.morph != 0
+
    property morph:
        def __get__(self):
            return MorphAnalysis.from_id(self.vocab, self.c.morph)
--- a/website/docs/api/token.md
+++ b/website/docs/api/token.md
@ -191,6 +191,15 @@ the morph to an unset state.
 | -------- | --------------------------------------------------------------------------------- |
 | features | The morphological features to set. ~~Union[int, dict, str, MorphAnalysis, None]~~ |

+## Token.has_morph {#has_morph tag="method"}
+
+Check whether the token has annotated morph information. Return `False` when the
+morph annotation is unset/missing.
+
+| Name        | Description                                   |
+| ----------- | --------------------------------------------- |
+| **RETURNS** | Whether the morph annotation is set. ~~bool~~ |
+
 ## Token.is_ancestor {#is_ancestor tag="method" model="parser"}

 Check whether this token is a parent, grandparent, etc. of another in the