Serialize POS attribute when doc.is_tagged (#4092)

* Fix and unit test for issue 3959
* Additional unit test for another manifestation of the same (resolved) bug
2025-06-28 00:43:09 +03:00 · 2019-08-21 21:59:30 +02:00 · 2019-08-21 21:59:30 +02:00 · 01c5980187
commit 01c5980187
parent 7539a4f3a8
3 changed files with 61 additions and 1 deletions
--- a/spacy/tests/regression/test_issue3959.py
+++ b/spacy/tests/regression/test_issue3959.py
@ -0,0 +1,29 @@
 # coding: utf8
 from __future__ import unicode_literals
 from spacy.lang.en import English
 from ..util import make_tempdir
 def test_issue3959():
    """Ensure a manually set pos attribute survives a to_disk/from_disk round trip."""
    nlp = English()
    text = "displaCy uses JavaScript, SVG and CSS to show you how computers understand language"
    doc = nlp(text)
    # The blank English pipeline leaves the first token without a pos value.
    assert doc[0].pos_ == ""
    doc[0].pos_ = "NOUN"
    assert doc[0].pos_ == "NOUN"
    # Set the flag by hand: it is usually already True when starting from
    # proper models rather than blank English.
    doc.is_tagged = True
    with make_tempdir() as tmp_dir:
        saved_path = tmp_dir / "my_doc"
        doc.to_disk(saved_path)
        # Load the saved state into a fresh, empty doc and check the pos value.
        restored = nlp("")
        restored.from_disk(saved_path)
        assert restored[0].pos_ == "NOUN"
--- a/spacy/tests/regression/test_issue4133.py
+++ b/spacy/tests/regression/test_issue4133.py
@ -0,0 +1,31 @@
 # coding: utf8
 from __future__ import unicode_literals
 from spacy.lang.en import English
 from spacy.tokens import Doc
 from spacy.vocab import Vocab
 def test_issue4133(en_vocab):
    """Ensure manually set pos attributes survive a to_bytes/from_bytes round trip."""
    nlp = English()
    vocab_bytes = nlp.vocab.to_bytes()
    words = ["Apple", "is", "looking", "at", "buying", "a", "startup"]
    pos = ["NOUN", "VERB", "ADP", "VERB", "PROPN", "NOUN", "ADP"]
    doc = Doc(en_vocab, words=words)
    for token, pos_value in zip(doc, pos):
        token.pos_ = pos_value
    # Set the flag by hand: it is usually already True when starting from
    # proper models rather than blank English.
    doc.is_tagged = True
    doc_bytes = doc.to_bytes()
    # Rebuild the doc on top of a vocab restored from the serialized bytes.
    restored_vocab = Vocab().from_bytes(vocab_bytes)
    restored_doc = Doc(restored_vocab).from_bytes(doc_bytes)
    actual = [token.pos_ for token in restored_doc]
    assert actual == pos
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@ -864,7 +864,7 @@ cdef class Doc:
        """
        array_head = [LENGTH, SPACY, LEMMA, ENT_IOB, ENT_TYPE]  # TODO: ENT_KB_ID ?
        if self.is_tagged:
-            array_head.append(TAG)
+            array_head.extend([TAG, POS])
        # If doc parsed add head and dep attribute
        if self.is_parsed:
            array_head.extend([HEAD, DEP])