* Update test_packer

Matthew Honnibal 2015-07-20 01:38:29 +02:00
parent fb7202a173
commit f13d5dae91


@@ -1,10 +1,16 @@
 from __future__ import unicode_literals
 
+import re
 import pytest
 import numpy
 
 from spacy.vocab import Vocab
 from spacy.tokens.doc import Doc
+from spacy.tokenizer import Tokenizer
+from spacy.en import LOCAL_DATA_DIR
+from os import path
 
 from spacy.attrs import ORTH, SPACY, TAG, DEP, HEAD
 from spacy.serialize.packer import Packer
@@ -26,6 +32,7 @@ def get_lex_props(string, prob=-22):
         'sentiment': 0
     }
 
+
 @pytest.fixture
 def vocab():
     vocab = Vocab(get_lex_props=get_lex_props)
@ -37,23 +44,43 @@ def vocab():
return vocab return vocab
def test_packer_unannotated(vocab): @pytest.fixture
packer = Packer(vocab, [(ORTH, [(lex.orth, lex.prob) for lex in vocab]), def tokenizer(vocab):
(SPACY, [])]) null_re = re.compile(r'!!!!!!!!!')
tokenizer = Tokenizer(vocab, {}, null_re, null_re, null_re)
return tokenizer
ids = [vocab[w].orth for w in 'the dog jumped'.split()]
msg = Doc.from_ids(vocab, ids, [1, 1, 0]) def test_char_packer(vocab):
packer = Packer(vocab, [])
bits = BitArray()
bits.seek(0)
byte_str = b'the dog jumped'
packer.char_codec.encode(byte_str, bits)
bits.seek(0)
result = [b''] * len(byte_str)
packer.char_codec.decode(bits, result)
assert b''.join(result) == byte_str
def test_packer_unannotated(tokenizer):
packer = Packer(tokenizer.vocab, [])
msg = tokenizer(u'the dog jumped')
assert msg.string == 'the dog jumped' assert msg.string == 'the dog jumped'
bits = packer.pack(msg) bits = packer.pack(msg)
result = packer.unpack(bits) result = packer.unpack(bits)
assert result.string == 'the dog jumped' assert result.string == 'the dog jumped'
def test_packer_annotated(vocab):
def test_packer_annotated(tokenizer):
vocab = tokenizer.vocab
nn = vocab.strings['NN'] nn = vocab.strings['NN']
dt = vocab.strings['DT'] dt = vocab.strings['DT']
vbd = vocab.strings['VBD'] vbd = vocab.strings['VBD']
@@ -64,8 +91,6 @@ def test_packer_annotated(vocab):
     root = vocab.strings['ROOT']
 
     attr_freqs = [
-        (ORTH, [(lex.orth, lex.prob) for lex in vocab]),
-        (SPACY, []),
         (TAG, [(nn, 0.1), (dt, 0.2), (jj, 0.01), (vbd, 0.05)]),
         (DEP, {det: 0.2, nsubj: 0.1, adj: 0.05, root: 0.1}.items()),
         (HEAD, {0: 0.05, 1: 0.2, -1: 0.2, -2: 0.1, 2: 0.1}.items())
@ -73,8 +98,8 @@ def test_packer_annotated(vocab):
packer = Packer(vocab, attr_freqs) packer = Packer(vocab, attr_freqs)
ids = [vocab[w].orth for w in 'the dog jumped'.split()] msg = tokenizer(u'the dog jumped')
msg = Doc.from_ids(vocab, ids, [1, 1, 0])
msg.from_array( msg.from_array(
[TAG, DEP, HEAD], [TAG, DEP, HEAD],
numpy.array([ numpy.array([
@@ -95,3 +120,5 @@ def test_packer_annotated(vocab):
     assert [t.tag_ for t in result] == ['DT', 'NN', 'VBD']
     assert [t.dep_ for t in result] == ['det', 'nsubj', 'ROOT']
     assert [(t.head.i - t.i) for t in result] == [1, 1, 0]
+
+
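
For context, below is a minimal sketch of the round-trip these updated tests settle on, written against the spaCy 0.x serialization API exactly as it appears in the hunks above (Packer, BitArray, and the never-matching-regex Tokenizer trick). It is illustrative only, not part of the commit: get_lex_props stands in for the helper defined earlier in this test file, and the comments describe behavior only as far as the tests themselves demonstrate it.

    # Sketch only -- assumes the spaCy 0.x API shown in this diff.
    import re

    from spacy.vocab import Vocab
    from spacy.tokenizer import Tokenizer
    from spacy.serialize.packer import Packer

    # get_lex_props is the helper defined earlier in this test file.
    vocab = Vocab(get_lex_props=get_lex_props)

    # A regex that never matches ordinary text, so the tokenizer has no
    # prefix/suffix/infix special cases and splits on whitespace alone.
    null_re = re.compile(r'!!!!!!!!!')
    tokenizer = Tokenizer(vocab, {}, null_re, null_re, null_re)

    # An empty attribute-frequency list still round-trips plain text
    # (test_packer_unannotated), via the character codec that
    # test_char_packer exercises directly.
    packer = Packer(vocab, [])

    doc = tokenizer(u'the dog jumped')
    bits = packer.pack(doc)        # Doc -> compressed bit string
    result = packer.unpack(bits)   # bit string -> Doc
    assert result.string == doc.string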