Serialize POS attribute when doc.is_tagged (#4092)

* fix and unit test for issue 3959
* additional unit test for another manifestation of the same (resolved) bug
2026-02-02 05:26:01 +03:00 · 2019-08-21 21:59:30 +02:00 · 2019-08-21 21:59:30 +02:00 · 01c5980187
commit 01c5980187
parent 7539a4f3a8
3 changed files with 61 additions and 1 deletions
--- a/spacy/tests/regression/test_issue3959.py
+++ b/spacy/tests/regression/test_issue3959.py
@ -0,0 +1,29 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from spacy.lang.en import English
+from ..util import make_tempdir
+
+
+def test_issue3959():
+    """Check that a manually assigned pos_ survives a to_disk/from_disk round trip."""
+    pipeline = English()
+    text = "displaCy uses JavaScript, SVG and CSS to show you how computers understand language"
+    doc = pipeline(text)
+    first = doc[0]
+    # the blank English pipeline is expected to leave pos_ empty
+    assert first.pos_ == ""
+    first.pos_ = "NOUN"
+    assert first.pos_ == "NOUN"
+
+    # set by hand: with proper models (rather than blank English) this is usually True already
+    doc.is_tagged = True
+
+    with make_tempdir() as tmp_dir:
+        target = tmp_dir / "my_doc"
+        doc.to_disk(target)
+
+        # load the saved state into a doc created from an empty text
+        restored = pipeline("")
+        restored.from_disk(target)
+        assert restored[0].pos_ == "NOUN"
--- a/spacy/tests/regression/test_issue4133.py
+++ b/spacy/tests/regression/test_issue4133.py
@ -0,0 +1,31 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from spacy.lang.en import English
+from spacy.tokens import Doc
+from spacy.vocab import Vocab
+
+
+def test_issue4133(en_vocab):
+    """Check that hand-set pos_ values survive Doc.to_bytes / Doc.from_bytes.
+
+    The doc is deserialized against a separate Vocab restored from bytes.
+    """
+    serialized_vocab = English().vocab.to_bytes()
+    words = ["Apple", "is", "looking", "at", "buying", "a", "startup"]
+    pos = ["NOUN", "VERB", "ADP", "VERB", "PROPN", "NOUN", "ADP"]
+    source_doc = Doc(en_vocab, words=words)
+    for token, label in zip(source_doc, pos):
+        token.pos_ = label
+
+    # set by hand: with proper models (rather than blank English) this is usually True already
+    source_doc.is_tagged = True
+
+    serialized_doc = source_doc.to_bytes()
+
+    # rebuild the doc on a fresh Vocab loaded from the serialized vocab bytes
+    restored_vocab = Vocab().from_bytes(serialized_vocab)
+    restored_doc = Doc(restored_vocab).from_bytes(serialized_doc)
+
+    actual = [token.pos_ for token in restored_doc]
+    assert actual == pos
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@ -864,7 +864,7 @@ cdef class Doc:
        """
        array_head = [LENGTH, SPACY, LEMMA, ENT_IOB, ENT_TYPE]  # TODO: ENT_KB_ID ?
        if self.is_tagged:
-            array_head.append(TAG)
+            array_head.extend([TAG, POS])
        # If doc parsed add head and dep attribute
        if self.is_parsed:
            array_head.extend([HEAD, DEP])