# spaCy/spacy/tests/serialize/test_packer.py
#
# 99 lines
# 2.9 KiB
# Python

# coding: utf-8
from __future__ import unicode_literals
from ...attrs import TAG, DEP, HEAD
from ...serialize.packer import Packer
from ...serialize.bits import BitArray
from ..util import get_doc
import pytest
@pytest.fixture
def text():
    """Provide the unicode sample sentence used by the packer tests."""
    sample = "the dog jumped"
    return sample
@pytest.fixture
def text_b():
    """Provide the sample sentence as a byte string."""
    sample = b"the dog jumped"
    return sample
def test_serialize_char_packer(en_vocab, text_b):
    """Round-trip raw bytes through the packer's character codec.

    The bytes are encoded into a ``BitArray``, decoded back into a
    pre-sized list, and the decoded values must rebuild the original
    byte sequence.
    """
    packer = Packer(en_vocab, [])
    codec = packer.char_codec

    bit_buffer = BitArray()
    bit_buffer.seek(0)

    original = bytearray(text_b)
    codec.encode(original, bit_buffer)

    # Seek back to position 0 before decoding what was just encoded.
    bit_buffer.seek(0)
    decoded = [b''] * len(original)
    codec.decode(bit_buffer, decoded)

    assert bytearray(decoded) == original
def test_serialize_packer_unannotated(en_tokenizer, text):
    """Pack and unpack a tokenized doc that has no extra annotations.

    The text (including whitespace) must match the input both before
    packing and after unpacking.
    """
    # The packer is created before tokenizing, as in the original test.
    packer = Packer(en_tokenizer.vocab, [])
    doc = en_tokenizer(text)
    assert doc.text_with_ws == text

    packed = packer.pack(doc)
    restored = packer.unpack(packed)
    assert restored.text_with_ws == text
def test_packer_annotated(en_vocab, text):
    """Pack and unpack a doc carrying TAG, DEP and HEAD annotations.

    A packer is built with frequency tables for the three attributes.
    The tags, dependency labels and head offsets (``head.i - i``) must
    be identical on the constructed doc and on the unpacked result.
    """
    expected_heads = [1, 1, 0]
    expected_deps = ['det', 'nsubj', 'ROOT']
    expected_tags = ['DT', 'NN', 'VBD']

    strings = en_vocab.strings
    # String lookups are kept in the original order: tags first, then deps.
    tag_freqs = [
        (strings['NN'], 0.1),
        (strings['DT'], 0.2),
        (strings['JJ'], 0.01),
        (strings['VBD'], 0.05),
    ]
    dep_freqs = {
        strings['det']: 0.2,
        strings['nsubj']: 0.1,
        strings['adj']: 0.05,
        strings['ROOT']: 0.1,
    }
    head_freqs = {0: 0.05, 1: 0.2, -1: 0.2, -2: 0.1, 2: 0.1}
    attr_freqs = [
        (TAG, tag_freqs),
        (DEP, dep_freqs.items()),
        (HEAD, head_freqs.items()),
    ]

    packer = Packer(en_vocab, attr_freqs)
    doc = get_doc(en_vocab, text.split(), tags=expected_tags,
                  deps=expected_deps, heads=expected_heads)

    # NOTE(review): the original keeps the ``text_with_ws == text`` check
    # disabled for both the doc and the unpacked result; it stays disabled.
    assert [token.tag_ for token in doc] == expected_tags
    assert [token.dep_ for token in doc] == expected_deps
    assert [(token.head.i - token.i) for token in doc] == expected_heads

    packed = packer.pack(doc)
    restored = packer.unpack(packed)

    assert [token.tag_ for token in restored] == expected_tags
    assert [token.dep_ for token in restored] == expected_deps
    assert [(token.head.i - token.i) for token in restored] == expected_heads
def test_packer_bad_chars(en_tokenizer):
    """Pack and unpack a tokenized text containing a non-ASCII character.

    The unpacked document must expose the same ``string`` as the
    document that was packed.
    """
    text = "naja gut, is eher bl\xf6d und nicht mit reddit.com/digg.com vergleichbar; vielleicht auf dem weg dahin"
    # The packer is created before tokenizing, as in the original test.
    packer = Packer(en_tokenizer.vocab, [])
    doc = en_tokenizer(text)

    packed = packer.pack(doc)
    restored = packer.unpack(packed)

    assert restored.string == doc.string
@pytest.mark.models
def test_packer_bad_chars_tags(EN):
    """Round-trip a tagged doc with a non-ASCII character via bytes.

    The doc is built from the tokenizer's words plus a fixed tag list,
    serialized with ``to_bytes`` and loaded with ``from_bytes``. The
    tags read from the loaded doc must equal those of the source doc.
    """
    text = "naja gut, is eher bl\xf6d und nicht mit reddit.com/digg.com vergleichbar; vielleicht auf dem weg dahin"
    tags = ['JJ', 'NN', ',', 'VBZ', 'DT', 'NN', 'JJ', 'NN', 'NN',
            'ADD', 'NN', ':', 'NN', 'NN', 'NN', 'NN', 'NN']

    tokens = EN.tokenizer(text)
    vocab = tokens.vocab
    words = [token.text for token in tokens]
    doc = get_doc(vocab, words, tags=tags)

    serialized = doc.to_bytes()
    restored = get_doc(vocab).from_bytes(serialized)

    restored_tags = [token.tag_ for token in restored]
    source_tags = [token.tag_ for token in doc]
    assert restored_tags == source_tags