spaCy/spacy/tests/serialize/test_packer.py
Matthew Honnibal aec130af56 Use util.Package class for io
Previous Sputnik integration caused API change: Vocab, Tagger, etc
were loaded via a from_package classmethod, that required a
sputnik.Package instance. This forced users to first create a
sputnik.Sputnik() instance, in order to acquire a Package via
sp.pool().

Instead I've created a small file-system shim, util.Package, which
allows classes to have a .load() classmethod, that accepts either
util.Package objects, or strings. We can later gut the internals
of this and make it a proxy for Sputnik if we need more functionality
that should live in the Sputnik library.

Sputnik is now only used to download and install the data, in
spacy.en.download
2015-12-29 18:00:48 +01:00

133 lines
3.5 KiB
Python

from __future__ import unicode_literals
import re
import pytest
import numpy
from spacy.language import Language
from spacy.en import English
from spacy.vocab import Vocab
from spacy.tokens.doc import Doc
from spacy.tokenizer import Tokenizer
from os import path
import os
from spacy.attrs import ORTH, SPACY, TAG, DEP, HEAD
from spacy.serialize.packer import Packer
from spacy.serialize.bits import BitArray
@pytest.fixture
def vocab():
if os.environ.get('SPACY_DATA'):
data_path = os.environ.get('SPACY_DATA')
else:
data_path = None
vocab = English.default_vocab(package=data_path)
lex = vocab['dog']
assert vocab[vocab.strings['dog']].orth_ == 'dog'
lex = vocab['the']
lex = vocab['quick']
lex = vocab['jumped']
return vocab
@pytest.fixture
def tokenizer(vocab):
null_re = re.compile(r'!!!!!!!!!')
tokenizer = Tokenizer(vocab, {}, null_re, null_re, null_re)
return tokenizer
def test_char_packer(vocab):
packer = Packer(vocab, [])
bits = BitArray()
bits.seek(0)
byte_str = bytearray(b'the dog jumped')
packer.char_codec.encode(byte_str, bits)
bits.seek(0)
result = [b''] * len(byte_str)
packer.char_codec.decode(bits, result)
assert bytearray(result) == byte_str
def test_packer_unannotated(tokenizer):
packer = Packer(tokenizer.vocab, [])
msg = tokenizer(u'the dog jumped')
assert msg.string == 'the dog jumped'
bits = packer.pack(msg)
result = packer.unpack(bits)
assert result.string == 'the dog jumped'
@pytest.mark.models
def test_packer_annotated(tokenizer):
vocab = tokenizer.vocab
nn = vocab.strings['NN']
dt = vocab.strings['DT']
vbd = vocab.strings['VBD']
jj = vocab.strings['JJ']
det = vocab.strings['det']
nsubj = vocab.strings['nsubj']
adj = vocab.strings['adj']
root = vocab.strings['ROOT']
attr_freqs = [
(TAG, [(nn, 0.1), (dt, 0.2), (jj, 0.01), (vbd, 0.05)]),
(DEP, {det: 0.2, nsubj: 0.1, adj: 0.05, root: 0.1}.items()),
(HEAD, {0: 0.05, 1: 0.2, -1: 0.2, -2: 0.1, 2: 0.1}.items())
]
packer = Packer(vocab, attr_freqs)
msg = tokenizer(u'the dog jumped')
msg.from_array(
[TAG, DEP, HEAD],
numpy.array([
[dt, det, 1],
[nn, nsubj, 1],
[vbd, root, 0]
], dtype=numpy.int32))
assert msg.string == 'the dog jumped'
assert [t.tag_ for t in msg] == ['DT', 'NN', 'VBD']
assert [t.dep_ for t in msg] == ['det', 'nsubj', 'ROOT']
assert [(t.head.i - t.i) for t in msg] == [1, 1, 0]
bits = packer.pack(msg)
result = packer.unpack(bits)
assert result.string == 'the dog jumped'
assert [t.tag_ for t in result] == ['DT', 'NN', 'VBD']
assert [t.dep_ for t in result] == ['det', 'nsubj', 'ROOT']
assert [(t.head.i - t.i) for t in result] == [1, 1, 0]
def test_packer_bad_chars(tokenizer):
string = u'naja gut, is eher bl\xf6d und nicht mit reddit.com/digg.com vergleichbar; vielleicht auf dem weg dahin'
packer = Packer(tokenizer.vocab, [])
doc = tokenizer(string)
bits = packer.pack(doc)
result = packer.unpack(bits)
assert result.string == doc.string
@pytest.mark.models
def test_packer_bad_chars(EN):
string = u'naja gut, is eher bl\xf6d und nicht mit reddit.com/digg.com vergleichbar; vielleicht auf dem weg dahin'
doc = EN(string)
byte_string = doc.to_bytes()
result = Doc(EN.vocab).from_bytes(byte_string)
assert [t.tag_ for t in result] == [t.tag_ for t in doc]