# coding: utf-8
from __future__ import unicode_literals

from ...serialize.packer import Packer
from ..util import get_doc, assert_docs_equal

import pytest
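

# Shared test data: a tokenized sentence with matching POS tags, dependency
# labels and a single named-entity annotation, reused by the tests below.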
TEXT = ["This", "is", "a", "test", "sentence", "."]
TAGS = ['DT', 'VBZ', 'DT', 'NN', 'NN', '.']
DEPS = ['nsubj', 'ROOT', 'det', 'compound', 'attr', 'punct']
ENTS = [('hi', 'PERSON', 0, 1)]
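# NB: the entity tuple layout is assumed here to be (ent_id, label, start, end)
# in token offsets, i.e. whatever the get_doc test helper consumes.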


def test_serialize_empty_doc(en_vocab):
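    # An empty Doc should pack to an empty byte string and deserialise
    # back to a zero-length Doc.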
    doc = get_doc(en_vocab)
    packer = Packer(en_vocab, {})
    b = packer.pack(doc)
    assert b == b''
    loaded = get_doc(en_vocab).from_bytes(b)
    assert len(loaded) == 0


@pytest.mark.parametrize('text', [TEXT])
def test_serialize_tokens(en_vocab, text):
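    # Round-trip a plain tokenized Doc through to_bytes()/from_bytes() and
    # check that the loaded copy matches token for token.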
    doc1 = get_doc(en_vocab, [t for t in text])
    doc2 = get_doc(en_vocab).from_bytes(doc1.to_bytes())
    assert_docs_equal(doc1, doc2)


@pytest.mark.models
@pytest.mark.parametrize('text', [TEXT])
@pytest.mark.parametrize('tags', [TAGS, []])
@pytest.mark.parametrize('deps', [DEPS, []])
@pytest.mark.parametrize('ents', [ENTS, []])
def test_serialize_tokens_ner(EN, text, tags, deps, ents):
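    # Same round-trip, but with tag/dep/entity annotations attached (and the
    # empty variants from the parametrize lists). Uses the EN fixture, so it
    # only runs when the English model is installed (@pytest.mark.models).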
    doc1 = get_doc(EN.vocab, [t for t in text], tags=tags, deps=deps, ents=ents)
    doc2 = get_doc(EN.vocab).from_bytes(doc1.to_bytes())
    assert_docs_equal(doc1, doc2)