Mirror of https://github.com/explosion/spaCy.git, synced 2025-01-27 09:44:36 +03:00
Modernise and merge serialization tests
This commit is contained in:
parent
442237787c
commit
d084676cd0
@@ -1,127 +1,40 @@
+# coding: utf-8
 from __future__ import unicode_literals
 
+from ...serialize.packer import Packer
+from ..util import get_doc, assert_docs_equal
+
 import pytest
 
-from spacy.tokens import Doc
-import spacy.en
-from spacy.serialize.packer import Packer
+TEXT = ["This", "is", "a", "test", "sentence", "."]
+TAGS = ['DT', 'VBZ', 'DT', 'NN', 'NN', '.']
+DEPS = ['nsubj', 'ROOT', 'det', 'compound', 'attr', 'punct']
+ENTS = [('hi', 'PERSON', 0, 1)]
 
 
-def equal(doc1, doc2):
-    # tokens
-    assert [ t.orth for t in doc1 ] == [ t.orth for t in doc2 ]
-
-    # tags
-    assert [ t.pos for t in doc1 ] == [ t.pos for t in doc2 ]
-    assert [ t.tag for t in doc1 ] == [ t.tag for t in doc2 ]
-
-    # parse
-    assert [ t.head.i for t in doc1 ] == [ t.head.i for t in doc2 ]
-    assert [ t.dep for t in doc1 ] == [ t.dep for t in doc2 ]
-    if doc1.is_parsed and doc2.is_parsed:
-        assert [ s for s in doc1.sents ] == [ s for s in doc2.sents ]
-
-    # entities
-    assert [ t.ent_type for t in doc1 ] == [ t.ent_type for t in doc2 ]
-    assert [ t.ent_iob for t in doc1 ] == [ t.ent_iob for t in doc2 ]
-    assert [ ent for ent in doc1.ents ] == [ ent for ent in doc2.ents ]
-
-
-@pytest.mark.models
-def test_serialize_tokens(EN):
-    doc1 = EN(u'This is a test sentence.',tag=False, parse=False, entity=False)
-
-    doc2 = Doc(EN.vocab).from_bytes(doc1.to_bytes())
-    equal(doc1, doc2)
-
-
-@pytest.mark.models
-def test_serialize_tokens_tags(EN):
-    doc1 = EN(u'This is a test sentence.',tag=True, parse=False, entity=False)
-    doc2 = Doc(EN.vocab).from_bytes(doc1.to_bytes())
-    equal(doc1, doc2)
-
-
-@pytest.mark.models
-def test_serialize_tokens_parse(EN):
-    doc1 = EN(u'This is a test sentence.',tag=False, parse=True, entity=False)
-
-    doc2 = Doc(EN.vocab).from_bytes(doc1.to_bytes())
-    equal(doc1, doc2)
-
-
-@pytest.mark.models
-def test_serialize_tokens_ner(EN):
-    doc1 = EN(u'This is a test sentence.', tag=False, parse=False, entity=True)
-
-    doc2 = Doc(EN.vocab).from_bytes(doc1.to_bytes())
-    equal(doc1, doc2)
-
-
-@pytest.mark.models
-def test_serialize_tokens_tags_parse(EN):
-    doc1 = EN(u'This is a test sentence.', tag=True, parse=True, entity=False)
-
-    doc2 = Doc(EN.vocab).from_bytes(doc1.to_bytes())
-    equal(doc1, doc2)
-
-
-@pytest.mark.models
-def test_serialize_tokens_tags_ner(EN):
-    doc1 = EN(u'This is a test sentence.', tag=True, parse=False, entity=True)
-
-    doc2 = Doc(EN.vocab).from_bytes(doc1.to_bytes())
-    equal(doc1, doc2)
-
-
-@pytest.mark.models
-def test_serialize_tokens_ner_parse(EN):
-    doc1 = EN(u'This is a test sentence.', tag=False, parse=True, entity=True)
-
-    doc2 = Doc(EN.vocab).from_bytes(doc1.to_bytes())
-    equal(doc1, doc2)
-
-
-@pytest.mark.models
-def test_serialize_tokens_tags_parse_ner(EN):
-    doc1 = EN(u'This is a test sentence.', tag=True, parse=True, entity=True)
-
-    doc2 = Doc(EN.vocab).from_bytes(doc1.to_bytes())
-    equal(doc1, doc2)
-
-
-def test_serialize_empty_doc():
-    vocab = spacy.en.English.Defaults.create_vocab()
-    doc = Doc(vocab)
-    packer = Packer(vocab, {})
+def test_serialize_empty_doc(en_vocab):
+    doc = get_doc(en_vocab)
+    packer = Packer(en_vocab, {})
     b = packer.pack(doc)
     assert b == b''
-    loaded = Doc(vocab).from_bytes(b)
+    loaded = get_doc(en_vocab).from_bytes(b)
     assert len(loaded) == 0
 
 
-def test_serialize_after_adding_entity():
-    # Re issue #514
-    vocab = spacy.en.English.Defaults.create_vocab()
-    entity_recognizer = spacy.en.English.Defaults.create_entity()
-
-    doc = Doc(vocab, words=u'This is a sentence about pasta .'.split())
-    entity_recognizer.add_label('Food')
-    entity_recognizer(doc)
-
-    label_id = vocab.strings[u'Food']
-    doc.ents = [(label_id, 5,6)]
-
-    assert [(ent.label_, ent.text) for ent in doc.ents] == [(u'Food', u'pasta')]
-
-    byte_string = doc.to_bytes()
+@pytest.mark.parametrize('text', [TEXT])
+def test_serialize_tokens(en_vocab, text):
+    doc1 = get_doc(en_vocab, [t for t in text])
+    doc2 = get_doc(en_vocab).from_bytes(doc1.to_bytes())
+    assert_docs_equal(doc1, doc2)
 
 
 @pytest.mark.models
-def test_serialize_after_adding_entity(EN):
-    EN.entity.add_label(u'Food')
-    doc = EN(u'This is a sentence about pasta.')
-    label_id = EN.vocab.strings[u'Food']
-    doc.ents = [(label_id, 5,6)]
-    byte_string = doc.to_bytes()
-    doc2 = Doc(EN.vocab).from_bytes(byte_string)
-    assert [(ent.label_, ent.text) for ent in doc2.ents] == [(u'Food', u'pasta')]
+@pytest.mark.parametrize('text', [TEXT])
+@pytest.mark.parametrize('tags', [TAGS, []])
+@pytest.mark.parametrize('deps', [DEPS, []])
+@pytest.mark.parametrize('ents', [ENTS, []])
+def test_serialize_tokens_ner(EN, text, tags, deps, ents):
+    doc1 = get_doc(EN.vocab, [t for t in text], tags=tags, deps=deps, ents=ents)
+    doc2 = get_doc(EN.vocab).from_bytes(doc1.to_bytes())
+    assert_docs_equal(doc1, doc2)
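The merged tests replace the file-local equal() helper with assert_docs_equal, imported from the suite's shared ..util module. Judging from the helper deleted above, the shared version presumably performs the same field-by-field comparison; the following is a sketch inferred from that removed code, not the actual spacy/tests/util.py source:

def assert_docs_equal(doc1, doc2):
    # Tokens: same orth IDs in the same order.
    assert [t.orth for t in doc1] == [t.orth for t in doc2]
    # Tags: coarse and fine part-of-speech IDs match per token.
    assert [t.pos for t in doc1] == [t.pos for t in doc2]
    assert [t.tag for t in doc1] == [t.tag for t in doc2]
    # Parse: head offsets and dependency labels match; sentence spans are
    # only compared when both docs actually carry a parse.
    assert [t.head.i for t in doc1] == [t.head.i for t in doc2]
    assert [t.dep for t in doc1] == [t.dep for t in doc2]
    if doc1.is_parsed and doc2.is_parsed:
        assert [s for s in doc1.sents] == [s for s in doc2.sents]
    # Entities: per-token NER type and IOB codes, plus the entity spans.
    assert [t.ent_type for t in doc1] == [t.ent_type for t in doc2]
    assert [t.ent_iob for t in doc1] == [t.ent_iob for t in doc2]
    assert [ent for ent in doc1.ents] == [ent for ent in doc2.ents]

Since both docs are built from the same vocab, comparing the integer IDs (t.orth, t.tag, t.dep) is equivalent to comparing the corresponding strings.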
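The rewritten tests also take an en_vocab fixture instead of building a vocab inline. A minimal sketch of what the suite's conftest presumably provides, assuming it simply wraps the same call the removed tests made directly (the real fixture definition is not part of this diff):

import pytest
import spacy.en

@pytest.fixture
def en_vocab():
    # English vocab built without loading any statistical models; mirrors
    # the removed tests' spacy.en.English.Defaults.create_vocab() calls.
    return spacy.en.English.Defaults.create_vocab()

Constructing docs from a bare vocab via get_doc is what lets test_serialize_empty_doc and test_serialize_tokens drop the @pytest.mark.models marker; only test_serialize_tokens_ner still uses the model-backed EN fixture, so it alone keeps the marker.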