Serialize POS attribute when doc.is_tagged (#4092)
* fix and unit test for issue 3959
* additional unit test for manifestation of the same (resolved) bug
This commit is contained in:
parent 7539a4f3a8
commit 01c5980187
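Background: Doc.to_bytes()/to_disk() serialize only the per-token attributes collected in array_head (see the doc.pyx hunk below). Before this change the list included TAG but not POS when doc.is_tagged was set, so coarse-grained pos_ values were silently dropped on round-trip. A minimal sketch of the user-facing effect with this patch applied, assuming spaCy 2.x and a blank English pipeline (illustration only, not part of the commit):

# A manually set coarse-grained POS tag should survive byte serialization
# once the doc is marked as tagged.
from spacy.lang.en import English
from spacy.tokens import Doc

nlp = English()
doc = nlp("displaCy uses JavaScript")
doc[0].pos_ = "NOUN"
doc.is_tagged = True  # trained models set this automatically after tagging

doc2 = Doc(nlp.vocab).from_bytes(doc.to_bytes())
assert doc2[0].pos_ == "NOUN"  # without the fix, POS was dropped and this was ""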
spacy/tests/regression/test_issue3959.py (new file, 29 lines)
@@ -0,0 +1,29 @@
# coding: utf8
from __future__ import unicode_literals

from spacy.lang.en import English
from ..util import make_tempdir


def test_issue3959():
    """ Ensure that a modified pos attribute is serialized correctly."""
    nlp = English()
    doc = nlp(
        "displaCy uses JavaScript, SVG and CSS to show you how computers understand language"
    )
    assert doc[0].pos_ == ""

    doc[0].pos_ = "NOUN"
    assert doc[0].pos_ == "NOUN"

    # usually this is already True when starting from proper models instead of blank English
    doc.is_tagged = True

    with make_tempdir() as tmp_dir:
        file_path = tmp_dir / "my_doc"
        doc.to_disk(file_path)

        doc2 = nlp("")
        doc2.from_disk(file_path)

        assert doc2[0].pos_ == "NOUN"
spacy/tests/regression/test_issue4133.py (new file, 31 lines)
@@ -0,0 +1,31 @@
# coding: utf8
from __future__ import unicode_literals

from spacy.lang.en import English
from spacy.tokens import Doc
from spacy.vocab import Vocab


def test_issue4133(en_vocab):
    nlp = English()
    vocab_bytes = nlp.vocab.to_bytes()
    words = ["Apple", "is", "looking", "at", "buying", "a", "startup"]
    pos = ["NOUN", "VERB", "ADP", "VERB", "PROPN", "NOUN", "ADP"]
    doc = Doc(en_vocab, words=words)
    for i, token in enumerate(doc):
        token.pos_ = pos[i]

    # usually this is already True when starting from proper models instead of blank English
    doc.is_tagged = True

    doc_bytes = doc.to_bytes()

    vocab = Vocab()
    vocab = vocab.from_bytes(vocab_bytes)
    doc = Doc(vocab).from_bytes(doc_bytes)

    actual = []
    for token in doc:
        actual.append(token.pos_)

    assert actual == pos
spacy/tokens/doc.pyx
@@ -864,7 +864,7 @@ cdef class Doc:
         """
         array_head = [LENGTH, SPACY, LEMMA, ENT_IOB, ENT_TYPE]  # TODO: ENT_KB_ID ?
         if self.is_tagged:
-            array_head.append(TAG)
+            array_head.extend([TAG, POS])
         # If doc parsed add head and dep attribute
         if self.is_parsed:
             array_head.extend([HEAD, DEP])
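For context, array_head is the attribute list passed to Doc.to_array() when the doc is serialized, and Doc.from_array() restores those attributes on load; extending it with POS is what makes the coarse-grained tag survive the round-trip. A rough sketch of that to_array/from_array mechanism under spaCy 2.x (illustration only, not code from the commit):

from spacy.attrs import POS, TAG
from spacy.lang.en import English
from spacy.tokens import Doc

nlp = English()
doc = nlp("Apple is looking at buying a startup")
for token in doc:
    token.pos_ = "NOUN"  # pretend a tagger ran

attrs = [TAG, POS]         # analogous to array_head after this patch
arr = doc.to_array(attrs)  # uint64 array of shape (n_tokens, len(attrs))
doc2 = Doc(nlp.vocab, words=[t.text for t in doc]).from_array(attrs, arr)
assert [t.pos_ for t in doc2] == ["NOUN"] * len(doc)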