mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-25 00:34:20 +03:00
49 lines
1.6 KiB
Python
49 lines
1.6 KiB
Python
# coding: utf-8
|
|
from __future__ import unicode_literals
|
|
|
|
from ...tokens import Doc
|
|
from ..util import get_doc
|
|
|
|
import pytest
|
|
|
|
|
|
def test_serialize_io_read_write(en_vocab, text_file_b):
    """Round-trip two docs through a file object and compare their text.

    Two docs are built from word lists, serialized with ``to_bytes`` and
    written back to back into ``text_file_b``. After rewinding the file,
    ``Doc.read_bytes`` must produce exactly two items (the unpacking below
    fails otherwise); each one is loaded into a fresh doc whose
    ``text_with_ws`` must match that of the corresponding original.
    """
    first_words = ["This", "is", "a", "simple", "test", ".", "With", "a", "couple", "of", "sentences", "."]
    second_words = ["This", "is", "another", "test", "document", "."]

    # Build both docs before anything is written to the file.
    originals = (get_doc(en_vocab, first_words), get_doc(en_vocab, second_words))

    for original in originals:
        text_file_b.write(original.to_bytes())
    # Rewind so that reading starts at the first serialized doc.
    text_file_b.seek(0)

    first_payload, second_payload = Doc.read_bytes(text_file_b)

    # Restore both docs first, then compare them in order.
    restored_docs = [
        get_doc(en_vocab).from_bytes(payload)
        for payload in (first_payload, second_payload)
    ]
    for restored, original in zip(restored_docs, originals):
        assert restored.text_with_ws == original.text_with_ws
|
|
|
|
|
|
def test_serialize_io_left_right(en_vocab):
    """Check head/children links of a doc restored from its own bytes.

    A doc is built from a word list, serialized with ``to_bytes`` and loaded
    into a new ``Doc`` via ``from_bytes``. For every token of the restored
    doc, the head index must equal the head index of the token at the same
    position in the source doc, and the head/children, ``lefts`` and
    ``rights`` relations must agree with each other.
    """
    words = ["This", "is", "a", "simple", "test", ".", "With", "a", "couple", "of", "sentences", "."]
    source_doc = get_doc(en_vocab, words)
    serialized = source_doc.to_bytes()
    restored_doc = Doc(en_vocab).from_bytes(serialized)

    for tok in restored_doc:
        head = tok.head
        assert head.i == source_doc[tok.i].head.i

        # A token that is not its own head must be listed among the
        # children of that head.
        if head is not tok:
            child_indices = [w.i for w in head.children]
            assert tok.i in child_indices

        # Every left and right dependent must point back at this token.
        for left_child in tok.lefts:
            assert left_child.head.i == tok.i
        for right_child in tok.rights:
            assert right_child.head.i == tok.i
|
|
|
|
|
|
@pytest.mark.models
def test_lemmas(EN):
    """Check that lemmas are still available after a bytes round trip.

    The text is processed with ``EN``, the resulting doc is serialized with
    ``to_bytes`` and loaded into a new ``Doc`` sharing the same vocab; the
    lemmas of tokens 1-3 are then compared against the expected strings.
    """
    processed = EN("The geese are flying")
    payload = processed.to_bytes()
    result = Doc(processed.vocab).from_bytes(payload)

    # (token index, expected lemma), checked in token order.
    expected_lemmas = ((1, 'goose'), (2, 'be'), (3, 'fly'))
    for index, lemma in expected_lemmas:
        assert result[index].lemma_ == lemma
|