# Regression test for issue 3959: a `pos_` value assigned by hand to a token
# must still be present after a Doc.to_disk() / Doc.from_disk() round trip.
# The test starts from a blank English pipeline, so it sets `doc.is_tagged = True`
# itself before saving; the doc.pyx hunk in this patch only adds POS (next to TAG)
# to the exported attribute list when `is_tagged` is set.
diff --git a/spacy/tests/regression/test_issue3959.py b/spacy/tests/regression/test_issue3959.py
new file mode 100644
index 000000000..c1f7fe100
--- /dev/null
+++ b/spacy/tests/regression/test_issue3959.py
@@ -0,0 +1,29 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from spacy.lang.en import English
+from ..util import make_tempdir
+
+
+def test_issue3959():
+    """ Ensure that a modified pos attribute is serialized correctly."""
+    nlp = English()
+    doc = nlp(
+        "displaCy uses JavaScript, SVG and CSS to show you how computers understand language"
+    )
+    assert doc[0].pos_ == ""
+
+    doc[0].pos_ = "NOUN"
+    assert doc[0].pos_ == "NOUN"
+
+    # usually this is already True when starting from proper models instead of blank English
+    doc.is_tagged = True
+
+    with make_tempdir() as tmp_dir:
+        file_path = tmp_dir / "my_doc"
+        doc.to_disk(file_path)
+
+        doc2 = nlp("")
+        doc2.from_disk(file_path)
+
+        assert doc2[0].pos_ == "NOUN"
# Regression test for issue 4133: per-token `pos_` values must be unchanged after
# a Doc.to_bytes() / Doc.from_bytes() round trip into a Doc built on a Vocab that
# was itself restored from bytes. The POS labels are assigned per token in order
# and compared as a list at the end.
# NOTE(review): the source Doc is built on the `en_vocab` fixture while the
# restored Vocab comes from `English().vocab` -- presumably interchangeable for
# this test; confirm against the fixture definition.
diff --git a/spacy/tests/regression/test_issue4133.py b/spacy/tests/regression/test_issue4133.py
new file mode 100644
index 000000000..93262f8cf
--- /dev/null
+++ b/spacy/tests/regression/test_issue4133.py
@@ -0,0 +1,31 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from spacy.lang.en import English
+from spacy.tokens import Doc
+from spacy.vocab import Vocab
+
+
+def test_issue4133(en_vocab):
+    nlp = English()
+    vocab_bytes = nlp.vocab.to_bytes()
+    words = ["Apple", "is", "looking", "at", "buying", "a", "startup"]
+    pos = ["NOUN", "VERB", "ADP", "VERB", "PROPN", "NOUN", "ADP"]
+    doc = Doc(en_vocab, words=words)
+    for i, token in enumerate(doc):
+        token.pos_ = pos[i]
+
+    # usually this is already True when starting from proper models instead of blank English
+    doc.is_tagged = True
+
+    doc_bytes = doc.to_bytes()
+
+    vocab = Vocab()
+    vocab = vocab.from_bytes(vocab_bytes)
+    doc = Doc(vocab).from_bytes(doc_bytes)
+
+    actual = []
+    for token in doc:
+        actual.append(token.pos_)
+
+    assert actual == pos
diff --git 
a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 21f29f304..e5c213383 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -864,7 +864,7 @@ cdef class Doc: """ array_head = [LENGTH, SPACY, LEMMA, ENT_IOB, ENT_TYPE] # TODO: ENT_KB_ID ? if self.is_tagged: - array_head.append(TAG) + array_head.extend([TAG, POS]) # If doc parsed add head and dep attribute if self.is_parsed: array_head.extend([HEAD, DEP])