mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-11 17:56:30 +03:00
Serialize POS attribute when doc.is_tagged (#4092)
* Fix and unit test for issue 3959
* Additional unit test for another manifestation of the same (now resolved) bug
This commit is contained in:
parent
7539a4f3a8
commit
01c5980187
29
spacy/tests/regression/test_issue3959.py
Normal file
29
spacy/tests/regression/test_issue3959.py
Normal file
|
@ -0,0 +1,29 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from spacy.lang.en import English
|
||||
from ..util import make_tempdir
|
||||
|
||||
|
||||
def test_issue3959():
    """Check that a manually set POS value survives a to_disk/from_disk round trip."""
    nlp = English()
    text = "displaCy uses JavaScript, SVG and CSS to show you how computers understand language"
    doc = nlp(text)

    # Straight out of the blank English pipeline the first token has no POS.
    first_token = doc[0]
    assert first_token.pos_ == ""

    # Assign a POS by hand and confirm it is visible through the doc.
    first_token.pos_ = "NOUN"
    assert doc[0].pos_ == "NOUN"

    # A pipeline loaded from a proper model would normally have set this flag
    # already; with blank English it has to be switched on explicitly.
    doc.is_tagged = True

    with make_tempdir() as tmp_dir:
        doc_path = tmp_dir / "my_doc"
        doc.to_disk(doc_path)

        # Load the stored doc into a fresh, empty document.
        restored = nlp("")
        restored.from_disk(doc_path)

        assert restored[0].pos_ == "NOUN"
|
31
spacy/tests/regression/test_issue4133.py
Normal file
31
spacy/tests/regression/test_issue4133.py
Normal file
|
@ -0,0 +1,31 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from spacy.lang.en import English
|
||||
from spacy.tokens import Doc
|
||||
from spacy.vocab import Vocab
|
||||
|
||||
|
||||
def test_issue4133(en_vocab):
    """Ensure per-token POS values survive a Doc.to_bytes/from_bytes round trip.

    The doc is built on the ``en_vocab`` fixture, serialized to bytes, and then
    restored into a Doc backed by a separate Vocab that was itself rebuilt from
    the bytes of a blank English vocab.

    en_vocab: vocab fixture used to construct the original Doc.
    """
    nlp = English()
    vocab_bytes = nlp.vocab.to_bytes()
    words = ["Apple", "is", "looking", "at", "buying", "a", "startup"]
    pos = ["NOUN", "VERB", "ADP", "VERB", "PROPN", "NOUN", "ADP"]

    doc = Doc(en_vocab, words=words)
    # Pair each token with its expected POS directly instead of indexing.
    for token, pos_value in zip(doc, pos):
        token.pos_ = pos_value

    # A pipeline loaded from a proper model would normally have set this flag
    # already; for a hand-built Doc it has to be switched on explicitly.
    doc.is_tagged = True

    doc_bytes = doc.to_bytes()

    # Restore into a Doc that uses an independently deserialized vocab.
    vocab = Vocab().from_bytes(vocab_bytes)
    restored = Doc(vocab).from_bytes(doc_bytes)

    assert [token.pos_ for token in restored] == pos
|
|
@ -864,7 +864,7 @@ cdef class Doc:
|
|||
"""
|
||||
array_head = [LENGTH, SPACY, LEMMA, ENT_IOB, ENT_TYPE] # TODO: ENT_KB_ID ?
|
||||
if self.is_tagged:
|
||||
array_head.append(TAG)
|
||||
array_head.extend([TAG, POS])
|
||||
# If doc parsed add head and dep attribute
|
||||
if self.is_parsed:
|
||||
array_head.extend([HEAD, DEP])
|
||||
|
|
Loading…
Reference in New Issue
Block a user