mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-10 09:16:31 +03:00
4a1ec332de
* Add Span.kb_id/Span.id strings to Doc/DocBin serialization if set * Format
226 lines
7.2 KiB
Python
226 lines
7.2 KiB
Python
import copy
|
|
import pickle
|
|
|
|
import numpy
|
|
import pytest
|
|
|
|
from spacy.attrs import DEP, HEAD
|
|
from spacy.lang.en import English
|
|
from spacy.language import Language
|
|
from spacy.matcher import Matcher, PhraseMatcher
|
|
from spacy.tokens import Doc
|
|
from spacy.vectors import Vectors
|
|
from spacy.vocab import Vocab
|
|
|
|
from ..util import make_tempdir
|
|
|
|
|
|
@pytest.mark.issue(1727)
def test_issue1727():
    """Regression test for issue 1727.

    A tagger created without pretrained vectors is saved to disk after
    vectors have been attached to its vocab; the reloaded tagger must still
    report ``pretrained_dims`` as 0 (or not set at all).
    """
    pipeline = Language(Vocab())
    ones = numpy.ones((3, 300), dtype="f")
    vectors = Vectors(data=ones, keys=["I", "am", "Matt"])

    tagger = pipeline.create_pipe("tagger")
    tagger.add_label("PRP")
    # Before any vectors are attached there are no pretrained dimensions.
    assert tagger.cfg.get("pretrained_dims", 0) == 0

    tagger.vocab.vectors = vectors
    with make_tempdir() as model_dir:
        tagger.to_disk(model_dir)
        reloaded = pipeline.create_pipe("tagger").from_disk(model_dir)
        assert reloaded.cfg.get("pretrained_dims", 0) == 0
|
|
|
|
|
|
@pytest.mark.issue(1799)
def test_issue1799():
    """Regression test for issue 1799.

    Loading a HEAD/DEP array describing a non-projective parse into a Doc
    must yield exactly one sentence.
    """
    # One [HEAD, DEP] row per token. The last two head values are
    # 2**64 - 1 and 2**64 - 2 -- presumably the offsets -1 and -2 stored as
    # unsigned integers; confirm against Doc.from_array.
    # NOTE(review): 8206900633647566924 looks like the string ID of "ROOT",
    # which is registered in the vocab below -- verify.
    rows = [
        [1, 397],
        [4, 436],
        [2, 426],
        [1, 402],
        [0, 8206900633647566924],
        [18446744073709551615, 440],
        [18446744073709551614, 442],
    ]
    heads_deps = numpy.asarray(rows, dtype="uint64")

    tokens = "Just what I was looking for .".split()
    doc = Doc(Vocab(), words=tokens)
    doc.vocab.strings.add("ROOT")
    doc = doc.from_array([HEAD, DEP], heads_deps)

    sentences = list(doc.sents)
    assert len(sentences) == 1
|
|
|
|
|
|
@pytest.mark.issue(1834)
def test_issue1834():
    """Regression test for issue 1834.

    A bytes round trip must keep manually set sentence starts and must
    neither invent nor drop DEP/TAG annotation.
    """
    words = ["This", "is", "a", "first", "sentence", ".", "And", "another", "one"]
    n_words = len(words)

    # Case 1: only a sentence boundary is set; no tags or dependencies.
    bare_doc = Doc(Vocab(), words=words)
    bare_doc[6].is_sent_start = True
    restored = Doc(bare_doc.vocab).from_bytes(bare_doc.to_bytes())
    assert restored[6].sent_start
    assert not restored.has_annotation("DEP")
    assert not restored.has_annotation("TAG")

    # Case 2: tags, heads and deps are supplied for every token.
    annotated_doc = Doc(
        Vocab(),
        words=words,
        tags=["TAG"] * n_words,
        heads=[0] * 6 + [6] * 3,
        deps=["dep"] * n_words,
    )
    restored = Doc(annotated_doc.vocab).from_bytes(annotated_doc.to_bytes())
    assert restored[6].sent_start
    assert restored.has_annotation("DEP")
    assert restored.has_annotation("TAG")
|
|
|
|
|
|
@pytest.mark.issue(1883)
def test_issue1883():
    """Regression test for issue 1883.

    A deep-copied Matcher must still find the same single match as the
    matcher it was copied from.
    """
    hello_pattern = [{"orth": "hello"}]
    matcher = Matcher(Vocab())
    matcher.add("pat1", [hello_pattern])

    doc = Doc(matcher.vocab, words=["hello"])
    assert len(matcher(doc)) == 1

    # The copy has its own vocab, so build the doc from that vocab.
    cloned_matcher = copy.deepcopy(matcher)
    cloned_doc = Doc(cloned_matcher.vocab, words=["hello"])
    assert len(cloned_matcher(cloned_doc)) == 1
|
|
|
|
|
|
@pytest.mark.issue(2564)
def test_issue2564():
    """Regression test for issue 2564.

    Docs must report TAG annotation both when the pipeline is called
    directly and when they come out of ``Language.pipe``.
    """
    nlp = Language()
    nlp.add_pipe("tagger").add_label("A")
    nlp.initialize()

    direct_doc = nlp("hello world")
    assert direct_doc.has_annotation("TAG")

    # Only the first doc of the stream is checked.
    stream = nlp.pipe(["hello", "world"])
    first_piped = next(stream)
    assert first_piped.has_annotation("TAG")
|
|
|
|
|
|
@pytest.mark.issue(3248)
def test_issue3248_2():
    """Regression test for issue 3248.

    A PhraseMatcher that goes through pickle must report the same length
    as the original.
    """
    nlp = English()
    matcher = PhraseMatcher(nlp.vocab)
    matcher.add("TEST1", [nlp(text) for text in ("a", "b", "c")])
    matcher.add("TEST2", [nlp("d")])

    pickled = pickle.dumps(matcher)
    restored = pickle.loads(pickled)
    assert len(restored) == len(matcher)
|
|
|
|
|
|
@pytest.mark.issue(3289)
def test_issue3289():
    """Regression test for issue 3289.

    ``Language.to_bytes`` / ``from_bytes`` must work for a pipeline whose
    textcat component was added but never initialized. The test passes if
    neither call raises.
    """
    source_nlp = English()
    source_nlp.add_pipe("textcat")
    payload = source_nlp.to_bytes()

    target_nlp = English()
    target_nlp.add_pipe("textcat")
    target_nlp.from_bytes(payload)
|
|
|
|
|
|
@pytest.mark.issue(3468)
def test_issue3468():
    """Regression test for issue 3468.

    Sentence boundaries set by the sentencizer must survive a bytes round
    trip, so that ``has_annotation("SENT_START")`` and ``doc.sents`` give
    the same answers before and after.
    """
    nlp = English()
    nlp.add_pipe("sentencizer")

    original = nlp("Hello world")
    assert original[0].is_sent_start
    assert original.has_annotation("SENT_START")
    original_sents = list(original.sents)
    assert len(original_sents) == 1

    payload = original.to_bytes()
    restored = Doc(nlp.vocab).from_bytes(payload)
    assert restored[0].is_sent_start
    assert restored.has_annotation("SENT_START")
    restored_sents = list(restored.sents)
    assert len(restored_sents) == 1
|
|
|
|
|
|
@pytest.mark.issue(3959)
def test_issue3959():
    """Regression test for issue 3959.

    A ``pos_`` value assigned by hand on a token must still be there after
    the doc is written to disk and read back.
    """
    nlp = English()
    text = "displaCy uses JavaScript, SVG and CSS to show you how computers understand language"
    doc = nlp(text)

    first_token = doc[0]
    assert first_token.pos_ == ""
    first_token.pos_ = "NOUN"
    assert doc[0].pos_ == "NOUN"

    # NOTE(review): the original carried the comment "usually this is already
    # True when starting from proper models instead of blank English" at this
    # point; it appears to refer to a statement that is no longer present.
    with make_tempdir() as tmp_dir:
        doc_path = tmp_dir / "my_doc"
        doc.to_disk(doc_path)
        # Load into a doc created from empty text, in place.
        doc2 = nlp("")
        doc2.from_disk(doc_path)

    assert doc2[0].pos_ == "NOUN"
|
|
|
|
|
|
def test_serialize_empty_doc(en_vocab):
    """Check that a Doc with no words survives a bytes round trip.

    The restored doc must have the same length as the original, and the
    token texts must agree pairwise.
    """
    original = Doc(en_vocab)
    payload = original.to_bytes()

    restored = Doc(en_vocab)
    restored.from_bytes(payload)

    assert len(original) == len(restored)
    assert all(left.text == right.text for left, right in zip(original, restored))
|
|
|
|
|
|
def test_serialize_doc_roundtrip_bytes(en_vocab):
    """Check that re-serializing a restored Doc gives the same bytes.

    The doc carries ``cats`` so the category scores are part of the payload.
    """
    original = Doc(en_vocab, words=["hello", "world"])
    original.cats = {"A": 0.5}

    payload = original.to_bytes()
    restored = Doc(en_vocab).from_bytes(payload)
    assert restored.to_bytes() == payload
|
|
|
|
|
|
def test_serialize_doc_roundtrip_disk(en_vocab):
    """Check that a Doc written to disk and read back serializes to the
    same bytes as the original (path given as a path object)."""
    original = Doc(en_vocab, words=["hello", "world"])
    with make_tempdir() as tmp_dir:
        target = tmp_dir / "doc"
        original.to_disk(target)
        from_disk = Doc(en_vocab).from_disk(target)
        assert original.to_bytes() == from_disk.to_bytes()
|
|
|
|
|
|
def test_serialize_doc_roundtrip_disk_str_path(en_vocab):
    """Check the disk round trip when the path is passed as a plain ``str``
    instead of a path object."""
    original = Doc(en_vocab, words=["hello", "world"])
    with make_tempdir() as tmp_dir:
        # Convert to str on purpose: this is the variant under test.
        target = str(tmp_dir / "doc")
        original.to_disk(target)
        from_disk = Doc(en_vocab).from_disk(target)
        assert original.to_bytes() == from_disk.to_bytes()
|
|
|
|
|
|
def test_serialize_doc_exclude(en_vocab):
    """Check that ``exclude=["user_data"]`` is honored on both the
    ``from_bytes`` side and the ``to_bytes`` side."""
    skipped = ["user_data"]
    original = Doc(en_vocab, words=["hello", "world"])
    original.user_data["foo"] = "bar"

    # Without exclusion, user_data comes back.
    restored = Doc(en_vocab).from_bytes(original.to_bytes())
    assert restored.user_data["foo"] == "bar"

    # Excluded while loading.
    restored = Doc(en_vocab).from_bytes(original.to_bytes(), exclude=skipped)
    assert not restored.user_data

    # Excluded while dumping.
    restored = Doc(en_vocab).from_bytes(original.to_bytes(exclude=skipped))
    assert not restored.user_data
|
|
|
|
|
|
def test_serialize_doc_span_groups(en_vocab):
    """Check that a span stored in ``doc.spans`` keeps its ``label_``,
    ``id_`` and ``kb_id_`` strings across a bytes round trip."""
    label = "test_serialize_doc_span_groups_label"
    span_id = "test_serialize_doc_span_groups_id"
    kb_id = "test_serialize_doc_span_groups_kb_id"

    original = Doc(en_vocab, words=["hello", "world", "!"])
    span = original[0:2]
    span.label_ = label
    span.id_ = span_id
    span.kb_id_ = kb_id
    original.spans["content"] = [span]

    restored = Doc(en_vocab).from_bytes(original.to_bytes())
    assert len(restored.spans["content"]) == 1
    assert restored.spans["content"][0].label_ == label
    assert restored.spans["content"][0].id_ == span_id
    assert restored.spans["content"][0].kb_id_ == kb_id
|