From f13d5dae910bd1b5c4bdef81ee525eb7faa023a1 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Mon, 20 Jul 2015 01:38:29 +0200
Subject: [PATCH] * Update test_packer

---
 tests/serialize/test_packer.py | 49 ++++++++++++++++++++++++++--------
 1 file changed, 38 insertions(+), 11 deletions(-)

diff --git a/tests/serialize/test_packer.py b/tests/serialize/test_packer.py
index 7be2f0fa5..3e377e9c8 100644
--- a/tests/serialize/test_packer.py
+++ b/tests/serialize/test_packer.py
@@ -1,10 +1,16 @@
 from __future__ import unicode_literals
 
+import re
+
 import pytest
 import numpy
 
 from spacy.vocab import Vocab
 from spacy.tokens.doc import Doc
+from spacy.tokenizer import Tokenizer
+from spacy.en import LOCAL_DATA_DIR
+from os import path
+
 from spacy.attrs import ORTH, SPACY, TAG, DEP, HEAD
 
 from spacy.serialize.packer import Packer
@@ -26,6 +32,7 @@ def get_lex_props(string, prob=-22):
         'sentiment': 0
     }
 
+
 @pytest.fixture
 def vocab():
     vocab = Vocab(get_lex_props=get_lex_props)
@@ -37,23 +44,43 @@ def vocab():
     return vocab
 
 
-def test_packer_unannotated(vocab):
-    packer = Packer(vocab, [(ORTH, [(lex.orth, lex.prob) for lex in vocab]),
-                            (SPACY, [])])
+@pytest.fixture
+def tokenizer(vocab):
+    null_re = re.compile(r'!!!!!!!!!')
+    tokenizer = Tokenizer(vocab, {}, null_re, null_re, null_re)
+    return tokenizer
 
-    ids = [vocab[w].orth for w in 'the dog jumped'.split()]
-    msg = Doc.from_ids(vocab, ids, [1, 1, 0])
+
+def test_char_packer(vocab):
+    packer = Packer(vocab, [])
+    bits = BitArray()
+    bits.seek(0)
+
+    byte_str = b'the dog jumped'
+    packer.char_codec.encode(byte_str, bits)
+    bits.seek(0)
+    result = [b''] * len(byte_str)
+    packer.char_codec.decode(bits, result)
+    assert b''.join(result) == byte_str
+
+
+def test_packer_unannotated(tokenizer):
+    packer = Packer(tokenizer.vocab, [])
+
+    msg = tokenizer(u'the dog jumped')
 
     assert msg.string == 'the dog jumped'
+
     bits = packer.pack(msg)
 
     result = packer.unpack(bits)
 
     assert result.string == 'the dog jumped'
 
 
-
-def test_packer_annotated(vocab):
+
+def test_packer_annotated(tokenizer):
+    vocab = tokenizer.vocab
     nn = vocab.strings['NN']
     dt = vocab.strings['DT']
     vbd = vocab.strings['VBD']
@@ -64,8 +91,6 @@ def test_packer_annotated(vocab):
     root = vocab.strings['ROOT']
 
     attr_freqs = [
-        (ORTH, [(lex.orth, lex.prob) for lex in vocab]),
-        (SPACY, []),
         (TAG, [(nn, 0.1), (dt, 0.2), (jj, 0.01), (vbd, 0.05)]),
         (DEP, {det: 0.2, nsubj: 0.1, adj: 0.05, root: 0.1}.items()),
         (HEAD, {0: 0.05, 1: 0.2, -1: 0.2, -2: 0.1, 2: 0.1}.items())
@@ -73,8 +98,8 @@ def test_packer_annotated(vocab):
 
     packer = Packer(vocab, attr_freqs)
 
-    ids = [vocab[w].orth for w in 'the dog jumped'.split()]
-    msg = Doc.from_ids(vocab, ids, [1, 1, 0])
+    msg = tokenizer(u'the dog jumped')
+
     msg.from_array(
         [TAG, DEP, HEAD],
         numpy.array([
@@ -95,3 +120,5 @@ def test_packer_annotated(vocab):
     assert [t.tag_ for t in result] == ['DT', 'NN', 'VBD']
     assert [t.dep_ for t in result] == ['det', 'nsubj', 'ROOT']
     assert [(t.head.i - t.i) for t in result] == [1, 1, 0]
+
+