* Tests for serializer

This commit is contained in:
Matthew Honnibal 2015-07-17 21:21:10 +02:00
parent cf0c788892
commit e950f5a408
2 changed files with 168 additions and 0 deletions

View File

@@ -0,0 +1,73 @@
from __future__ import unicode_literals
import pytest
import numpy
from spacy.vocab import Vocab
from spacy.serialize.packer import _BinaryCodec
from spacy.serialize.packer import make_vocab_codec
from spacy.serialize.packer import _AttributeCodec
from spacy.serialize.bits import BitArray
def test_binary():
    """Encode a short 0/1 sequence with _BinaryCodec and decode it back."""
    binary_codec = _BinaryCodec()
    bit_buffer = BitArray()
    original = numpy.array([0, 1, 0, 1, 1], numpy.int32)
    binary_codec.encode(original, bit_buffer)
    # Zero-filled output buffer with one slot per encoded value.
    decoded = numpy.zeros(5, dtype=numpy.int32)
    # This codec is handed an iterator over the bits, not the BitArray itself.
    bit_stream = iter(bit_buffer)
    binary_codec.decode(bit_stream, decoded)
    assert list(original) == list(decoded)
def test_attribute():
    """Encode two attribute ids with _AttributeCodec and decode them back."""
    freqs = {'the': 10, 'quick': 3, 'brown': 4, 'fox': 1, 'jumped': 5, 'over': 8,
             'lazy': 1, 'dog': 2, '.': 9}
    int_map = {'the': 0, 'quick': 1, 'brown': 2, 'fox': 3, 'jumped': 4, 'over': 5,
               'lazy': 6, 'dog': 7, '.': 8}
    # Pair every word's integer id with its frequency, following the
    # iteration order of ``freqs``.
    id_freq_pairs = []
    for word, count in freqs.items():
        id_freq_pairs.append((int_map[word], count))
    attr_codec = _AttributeCodec(id_freq_pairs)
    bit_buffer = BitArray()
    original = numpy.array([1, 7], dtype=numpy.int32)
    # Snapshot the input as a list before it is handed to the encoder.
    expected = list(original)
    attr_codec.encode(original, bit_buffer)
    decoded = numpy.zeros(2, dtype=numpy.int32)
    attr_codec.decode(bit_buffer, decoded)
    assert expected == list(decoded)
def test_vocab_codec():
    """Encode vocabulary ids with the codec from make_vocab_codec and decode them back."""
    def lex_props(word, prob):
        # Minimal lexeme properties: every string-valued field except
        # prefix/suffix is the word itself; numeric fields other than
        # length and prob are zero.
        props = {
            'flags': 0,
            'length': len(word),
            'orth': word,
            'lower': word,
            'norm': word,
            'shape': word,
            'prefix': word[0],
            'suffix': word[-3:],
            'cluster': 0,
            'prob': prob,
            'sentiment': 0
        }
        return props

    lexicon = Vocab()
    for word, prob in (('dog', 0.001), ('the', 0.05), ('jumped', 0.005)):
        lexicon[word] = lex_props(word, prob)
    vocab_codec = make_vocab_codec(lexicon)
    bit_buffer = BitArray()
    word_ids = []
    for word in ('the', 'dog', 'jumped'):
        word_ids.append(lexicon[word].id)
    original = numpy.array(word_ids, dtype=numpy.int32)
    # Snapshot the input as a list before it is handed to the encoder.
    expected = list(original)
    vocab_codec.encode(original, bit_buffer)
    # Output buffer pre-filled with 0..n-1; the decoder overwrites it.
    decoded = numpy.arange(len(original), dtype=numpy.int32)
    vocab_codec.decode(bit_buffer, decoded)
    assert expected == list(decoded)

View File

@@ -0,0 +1,95 @@
from __future__ import unicode_literals
import pytest
import numpy
from spacy.vocab import Vocab
from spacy.tokens.doc import Doc
from spacy.attrs import ID, SPACY, TAG, DEP, HEAD
from spacy.serialize.packer import Packer
from spacy.serialize.bits import BitArray
def get_lex_props(string, prob=-22):
    """Build a minimal lexeme-property dict for ``string``.

    The orth, lower, norm and shape entries are all the string itself;
    flags, cluster and sentiment are always 0.

    Args:
        string: The word form to describe. May be empty.
        prob: Value stored under ``'prob'``. Defaults to -22.

    Returns:
        A dict with the keys flags, length, orth, lower, norm, shape,
        prefix, suffix, cluster, prob and sentiment.
    """
    return {
        'flags': 0,
        'length': len(string),
        'orth': string,
        'lower': string,
        'norm': string,
        'shape': string,
        # Slice instead of indexing so an empty string yields '' rather than
        # raising IndexError, mirroring how the suffix slice already behaves.
        'prefix': string[:1],
        'suffix': string[-3:],
        'cluster': 0,
        'prob': prob,
        'sentiment': 0
    }
@pytest.fixture
def vocab():
    """Provide a Vocab pre-populated with four words and explicit probabilities."""
    lexicon = Vocab(get_lex_props=get_lex_props)
    entries = (
        ('dog', 0.001),
        ('the', 0.01),
        ('quick', 0.005),
        ('jumped', 0.007),
    )
    for word, prob in entries:
        lexicon[word] = get_lex_props(word, prob)
    return lexicon
def test_packer_unannotated(vocab):
    """Pack and unpack a Doc that carries only word ids and spacing flags."""
    doc_packer = Packer(vocab, [(ID, {}), (SPACY, {})])
    word_ids = []
    for word in 'the dog jumped'.split():
        word_ids.append(vocab[word].id)
    original = Doc.from_ids(vocab, word_ids, [1, 1, 0])
    # Check the Doc renders as expected before it goes through the packer.
    assert original.string == 'the dog jumped'
    packed = doc_packer.pack(original)
    restored = doc_packer.unpack(packed)
    assert restored.string == 'the dog jumped'
def test_packer_annotated(vocab):
    """Pack and unpack a Doc carrying TAG, DEP and HEAD annotations."""
    # Look up the integer ids of every label, in a fixed order.
    nn, dt, vbd, jj, det, nsubj, adj, root = [
        vocab.strings[label]
        for label in ('NN', 'DT', 'VBD', 'JJ', 'det', 'nsubj', 'adj', 'ROOT')
    ]
    tag_freqs = [(nn, 0.1), (dt, 0.2), (jj, 0.01), (vbd, 0.05)]
    dep_freqs = {det: 0.2, nsubj: 0.1, adj: 0.05, root: 0.1}
    head_freqs = {0: 0.05, 1: 0.2, -1: 0.2, -2: 0.1, 2: 0.1}
    doc_packer = Packer(vocab, [
        (ID, []),
        (SPACY, []),
        (TAG, tag_freqs),
        (DEP, dep_freqs.items()),
        (HEAD, head_freqs.items())
    ])

    word_ids = []
    for word in 'the dog jumped'.split():
        word_ids.append(vocab[word].id)
    original = Doc.from_ids(vocab, word_ids, [1, 1, 0])
    # One row per token; columns follow the [TAG, DEP, HEAD] attribute list.
    annotations = numpy.array([
        [dt, det, 1],
        [nn, nsubj, 1],
        [vbd, root, 0]
    ], dtype=numpy.int32)
    original.from_array([TAG, DEP, HEAD], annotations)

    def check_annotations(doc):
        # The same four expectations hold before packing and after unpacking.
        assert doc.string == 'the dog jumped'
        assert [t.tag_ for t in doc] == ['DT', 'NN', 'VBD']
        assert [t.dep_ for t in doc] == ['det', 'nsubj', 'ROOT']
        assert [(t.head.i - t.i) for t in doc] == [1, 1, 0]

    check_annotations(original)
    packed = doc_packer.pack(original)
    restored = doc_packer.unpack(packed)
    check_annotations(restored)