Add morph to morphology in Doc.from_array (#5762)

* Add morph to morphology in Doc.from_array: add morphological analyses to the morphology table in `Doc.from_array`.
* Use a separate vocab in the DocBin roundtrip test.
2025-12-13 13:14:32 +03:00 · 2020-07-14 14:07:35 +02:00 · 2020-07-14 14:07:35 +02:00 · a7a7e0d2a6
commit a7a7e0d2a6
parent 872938ec76
3 changed files with 26 additions and 3 deletions
--- a/spacy/tests/doc/test_doc_api.py
+++ b/spacy/tests/doc/test_doc_api.py
@@ -2,7 +2,7 @@ import pytest
 import numpy
 from spacy.tokens import Doc, Span
 from spacy.vocab import Vocab
-from spacy.attrs import ENT_TYPE, ENT_IOB, SENT_START, HEAD, DEP
+from spacy.attrs import ENT_TYPE, ENT_IOB, SENT_START, HEAD, DEP, MORPH

 from ..util import get_doc

@@ -290,6 +290,24 @@ def test_doc_from_array_sent_starts(en_vocab):
    assert new_doc.is_parsed


+def test_doc_from_array_morph(en_vocab):
+    words = ["I", "live", "in", "New", "York", "."]
+    # fmt: off
+    morphs = ["Feat1=A", "Feat1=B", "Feat1=C", "Feat1=A|Feat2=D", "Feat2=E", "Feat3=F"]
+    # fmt: on
+    doc = Doc(en_vocab, words=words)
+    for i, morph in enumerate(morphs):
+        doc[i].morph_ = morph
+
+    attrs = [MORPH]
+    arr = doc.to_array(attrs)
+    new_doc = Doc(en_vocab, words=words)
+    new_doc.from_array(attrs, arr)
+
+    assert [t.morph_ for t in new_doc] == morphs
+    assert [t.morph_ for t in doc] == [t.morph_ for t in new_doc]
+
+
 def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
    en_texts = ["Merging the docs is fun.", "They don't think alike."]
    de_text = "Wie war die Frage?"
--- a/spacy/tests/test_gold.py
+++ b/spacy/tests/test_gold.py
@@ -515,6 +515,8 @@ def test_roundtrip_docs_to_docbin(doc):

    # roundtrip to DocBin
    with make_tempdir() as tmpdir:
+        # use a separate vocab to test that all labels are added
+        reloaded_nlp = English()
        json_file = tmpdir / "roundtrip.json"
        srsly.write_json(json_file, [docs_to_json(doc)])
        goldcorpus = Corpus(str(json_file), str(json_file))
@@ -523,8 +525,8 @@ def test_roundtrip_docs_to_docbin(doc):
        with output_file.open("wb") as file_:
            file_.write(data)
        goldcorpus = Corpus(train_loc=str(output_file), dev_loc=str(output_file))
-        reloaded_example = next(goldcorpus.dev_dataset(nlp=nlp))
-        assert len(doc) == goldcorpus.count_train(nlp)
+        reloaded_example = next(goldcorpus.dev_dataset(nlp=reloaded_nlp))
+        assert len(doc) == goldcorpus.count_train(reloaded_nlp)
    assert text == reloaded_example.reference.text
    assert idx == [t.idx for t in reloaded_example.reference]
    assert tags == [t.tag_ for t in reloaded_example.reference]
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -870,6 +870,9 @@ cdef class Doc:
            for j in range(n_attrs):
                if attr_ids[j] != TAG:
                    value = values[j * stride + i]
+                    if attr_ids[j] == MORPH:
+                        # add morph to morphology table
+                        self.vocab.morphology.add(self.vocab.strings[value])
                    Token.set_struct_attr(token, attr_ids[j], value)
        # Set flags
        self.is_parsed = bool(self.is_parsed or HEAD in attrs)