mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-26 01:04:34 +03:00
Remove old serialization tests
This commit is contained in:
parent
f9327343ce
commit
7253b4e649
|
@ -1,51 +0,0 @@
|
||||||
# coding: utf-8
|
|
||||||
from __future__ import unicode_literals
|
|
||||||
|
|
||||||
from ...serialize.packer import _BinaryCodec
|
|
||||||
from ...serialize.huffman import HuffmanCodec
|
|
||||||
from ...serialize.bits import BitArray
|
|
||||||
|
|
||||||
import numpy
|
|
||||||
import pytest
|
|
||||||
|
|
||||||
|
|
||||||
def test_serialize_codecs_binary():
    """Encode a 0/1 int32 array with ``_BinaryCodec`` and decode it back."""
    original = numpy.array([0, 1, 0, 1, 1], numpy.int32)
    decoded = numpy.array([0, 0, 0, 0, 0], numpy.int32)
    binary_codec = _BinaryCodec()
    buffer = BitArray()

    binary_codec.encode(original, buffer)
    # Seek back to position 0 before decoding into the zero-filled array.
    buffer.seek(0)
    binary_codec.decode(buffer, decoded)

    assert list(original) == list(decoded)
|
|
||||||
|
|
||||||
|
|
||||||
def test_serialize_codecs_attribute():
    """Round-trip two integer ids through a ``HuffmanCodec`` built from
    (id, frequency) pairs.
    """
    freqs = {'the': 10, 'quick': 3, 'brown': 4, 'fox': 1, 'jumped': 5,
             'over': 8, 'lazy': 1, 'dog': 2, '.': 9}
    int_map = {'the': 0, 'quick': 1, 'brown': 2, 'fox': 3, 'jumped': 4,
               'over': 5, 'lazy': 6, 'dog': 7, '.': 8}

    # Pair each word's integer id with that word's frequency.
    weighted_ids = [(int_map[word], count) for word, count in freqs.items()]
    huffman = HuffmanCodec(weighted_ids)

    original = numpy.array([1, 7], dtype=numpy.int32)
    decoded = numpy.array([0, 0], dtype=numpy.int32)
    buffer = BitArray()

    huffman.encode(original, buffer)
    buffer.seek(0)
    huffman.decode(buffer, decoded)

    assert list(original) == list(decoded)
|
|
||||||
|
|
||||||
|
|
||||||
def test_serialize_codecs_vocab(en_vocab):
    """Round-trip the orth ids of three words through a ``HuffmanCodec``
    built from every lexeme in ``en_vocab``.
    """
    sample_words = ["the", "dog", "jumped"]
    # Look each word up before building the codec; presumably this makes sure
    # the vocab holds a lexeme for it (confirm against Vocab.__getitem__).
    for sample in sample_words:
        _ = en_vocab[sample]

    huffman = HuffmanCodec([(lexeme.orth, lexeme.prob) for lexeme in en_vocab])
    buffer = BitArray()

    orth_ids = [en_vocab[sample].orth for sample in sample_words]
    original = numpy.array(orth_ids, dtype=numpy.int32)
    huffman.encode(original, buffer)

    # The output array starts as 0..n-1 and is overwritten by decode().
    decoded = numpy.array(range(len(original)), dtype=numpy.int32)
    buffer.seek(0)
    huffman.decode(buffer, decoded)

    assert list(original) == list(decoded)
|
|
|
@ -1,110 +0,0 @@
|
||||||
# coding: utf-8
|
|
||||||
from __future__ import unicode_literals
|
|
||||||
from __future__ import division
|
|
||||||
|
|
||||||
from ...serialize.huffman import HuffmanCodec
|
|
||||||
from ...serialize.bits import BitArray
|
|
||||||
|
|
||||||
|
|
||||||
from heapq import heappush, heappop, heapify
|
|
||||||
from collections import defaultdict
|
|
||||||
import numpy
|
|
||||||
import pytest
|
|
||||||
|
|
||||||
|
|
||||||
def py_encode(symb2freq):
    """Huffman encode the given dict mapping symbols to weights.

    Pure-Python reference implementation (from Rosetta Code).

    Args:
        symb2freq: Mapping of symbol -> weight.

    Returns:
        A dict mapping every symbol to its code, a string of '0' and '1'
        characters. An empty mapping yields an empty dict, and a mapping
        with a single symbol assigns that symbol the empty string.
    """
    # With no symbols there is nothing on the heap to pop; return an empty
    # code table instead of letting heappop() raise IndexError.
    if not symb2freq:
        return {}

    # Each heap entry is [total_weight, [symbol, code], [symbol, code], ...].
    heap = [[weight, [symbol, ""]] for symbol, weight in symb2freq.items()]
    heapify(heap)
    while len(heap) > 1:
        lo = heappop(heap)
        hi = heappop(heap)
        # Prefix '0' to every code in the lighter subtree and '1' to every
        # code in the heavier one, then merge them into a single entry.
        for pair in lo[1:]:
            pair[1] = '0' + pair[1]
        for pair in hi[1:]:
            pair[1] = '1' + pair[1]
        heappush(heap, [lo[0] + hi[0]] + lo[1:] + hi[1:])
    return dict(heappop(heap)[1:])
|
|
||||||
|
|
||||||
|
|
||||||
def test_serialize_huffman_1():
    """Check that ``HuffmanCodec`` produces the same code strings as the
    pure-Python ``py_encode`` reference for ten fixed probabilities.
    """
    probs = numpy.array(
        [0.3, 0.2, 0.15, 0.1, 0.06, 0.02, 0.01, 0.005, 0.0001, 0.000001],
        dtype=numpy.float32,
    )

    codec = HuffmanCodec(list(enumerate(probs)))

    # Order the reference codes by symbol index before comparing.
    reference = py_encode(dict(enumerate(probs)))
    expected = [code for _, code in sorted(reference.items())]
    assert codec.strings == expected
|
|
||||||
|
|
||||||
|
|
||||||
def test_serialize_huffman_empty():
    """A codec built from an empty mapping exposes no code strings."""
    empty_codec = HuffmanCodec({})
    no_codes = []
    assert empty_codec.strings == no_codes
|
|
||||||
|
|
||||||
|
|
||||||
def test_serialize_huffman_round_trip():
    """Encode a word sequence, check the emitted bit string word by word
    against the codec's own code table, then decode it back.
    """
    words = ['the', 'quick', 'brown', 'fox', 'jumped', 'over', 'the', 'the',
             'lazy', 'dog', '.']
    freqs = {'the': 10, 'quick': 3, 'brown': 4, 'fox': 1, 'jumped': 5,
             'over': 8, 'lazy': 1, 'dog': 2, '.': 9}

    codec = HuffmanCodec(freqs.items())
    code_strings = list(codec.strings)
    code_for = {codec.leaves[i]: code_strings[i]
                for i in range(len(codec.leaves))}

    bits = codec.encode(words)
    # Render every byte as 8 binary digits, reversed, and concatenate them.
    bitstring = ''.join('{0:b}'.format(c).rjust(8, '0')[::-1]
                        for c in bits.as_bytes())

    # Walk the bit string with a cursor instead of re-slicing it each step.
    cursor = 0
    for word in words:
        expected_code = code_for[word]
        stop = cursor + len(expected_code)
        assert bitstring[cursor:stop] == expected_code
        cursor = stop

    unpacked = [0] * len(words)
    bits.seek(0)
    codec.decode(bits, unpacked)
    assert words == unpacked
|
|
||||||
|
|
||||||
|
|
||||||
def test_serialize_huffman_rosetta():
    """Compare ``HuffmanCodec`` with the ``py_encode`` reference on the
    Rosetta Code sample sentence.

    The two implementations may assign different codes, so the test compares
    the total weighted code length (sum of code length times character
    count) rather than the codes themselves.
    """
    text = "this is an example for huffman encoding"
    symb2freq = defaultdict(int)
    for ch in text:
        symb2freq[ch] += 1

    codec = HuffmanCodec(symb2freq.items())
    py_codec = py_encode(symb2freq)

    codes = {codec.leaves[i]: codec.strings[i]
             for i in range(len(codec.leaves))}

    # Weighted code length of each implementation over the same counts.
    my_exp_len = sum(len(codes[symb]) * freq
                     for symb, freq in symb2freq.items())
    py_exp_len = sum(len(py_codec[symb]) * freq
                     for symb, freq in symb2freq.items())
    assert my_exp_len == py_exp_len
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.models
def test_vocab(EN):
    """Build a codec over ``EN.vocab`` and check that the summed
    ``len(code) * exp(prob)`` over all leaves lies strictly between 8 and 15.
    """
    weighted = [(lex.orth, numpy.exp(lex.prob)) for lex in EN.vocab]
    codec = HuffmanCodec(weighted)

    expected_length = 0
    for index, code in enumerate(codec.strings):
        # codec.leaves[index] is used as the key to look the lexeme up again.
        lexeme = EN.vocab[codec.leaves[index]]
        expected_length += len(code) * numpy.exp(lexeme.prob)
    assert 8 < expected_length < 15
|
|
|
@ -1,48 +0,0 @@
|
||||||
# coding: utf-8
|
|
||||||
from __future__ import unicode_literals
|
|
||||||
|
|
||||||
from ...tokens import Doc
|
|
||||||
from ..util import get_doc
|
|
||||||
|
|
||||||
import pytest
|
|
||||||
|
|
||||||
|
|
||||||
def test_serialize_io_read_write(en_vocab, text_file_b):
    """Write two serialized docs to one file, read them back with
    ``Doc.read_bytes`` and compare the restored text.
    """
    first_words = ["This", "is", "a", "simple", "test", ".", "With", "a", "couple", "of", "sentences", "."]
    second_words = ["This", "is", "another", "test", "document", "."]

    originals = [get_doc(en_vocab, words) for words in (first_words, second_words)]
    for doc in originals:
        text_file_b.write(doc.to_bytes())
    text_file_b.seek(0)

    # Exactly two byte strings are expected back from the file.
    first_bytes, second_bytes = Doc.read_bytes(text_file_b)
    restored = [get_doc(en_vocab).from_bytes(payload)
                for payload in (first_bytes, second_bytes)]

    for result, doc in zip(restored, originals):
        assert result.text_with_ws == doc.text_with_ws
|
|
||||||
|
|
||||||
|
|
||||||
def test_serialize_io_left_right(en_vocab):
    """Check head/children/lefts/rights consistency on a doc restored from
    its own ``to_bytes()`` output.
    """
    words = ["This", "is", "a", "simple", "test", ".", "With", "a", "couple", "of", "sentences", "."]
    doc = get_doc(en_vocab, words)
    result = Doc(en_vocab).from_bytes(doc.to_bytes())

    for token in result:
        # The restored head must match the head in the source doc.
        assert token.head.i == doc[token.i].head.i
        if token.head is not token:
            assert any(sibling.i == token.i for sibling in token.head.children)
        # Every left and right child must point back at this token.
        for side in (token.lefts, token.rights):
            for child in side:
                assert child.head.i == token.i
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.models
def test_lemmas(EN):
    """Lemmas at positions 1-3 are as expected on a doc restored from bytes."""
    doc = EN("The geese are flying")
    result = Doc(doc.vocab).from_bytes(doc.to_bytes())

    expected_lemmas = ((1, 'goose'), (2, 'be'), (3, 'fly'))
    for position, lemma in expected_lemmas:
        assert result[position].lemma_ == lemma
|
|
|
@ -1,98 +0,0 @@
|
||||||
# coding: utf-8
|
|
||||||
from __future__ import unicode_literals
|
|
||||||
|
|
||||||
from ...attrs import TAG, DEP, HEAD
|
|
||||||
from ...serialize.packer import Packer
|
|
||||||
from ...serialize.bits import BitArray
|
|
||||||
|
|
||||||
from ..util import get_doc
|
|
||||||
|
|
||||||
import pytest
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
def text():
    """Sample sentence used by the packer tests (text string)."""
    sample = "the dog jumped"
    return sample
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
def text_b():
    """Sample sentence used by the packer tests (byte string)."""
    sample = b"the dog jumped"
    return sample
|
|
||||||
|
|
||||||
|
|
||||||
def test_serialize_char_packer(en_vocab, text_b):
    """Round-trip the bytes of ``text_b`` through the packer's ``char_codec``."""
    char_codec = Packer(en_vocab, []).char_codec
    buffer = BitArray()
    buffer.seek(0)

    payload = bytearray(text_b)
    char_codec.encode(payload, buffer)

    buffer.seek(0)
    # One placeholder per encoded byte; decode() fills the list in place.
    decoded = [b''] * len(payload)
    char_codec.decode(buffer, decoded)

    assert bytearray(decoded) == payload
|
|
||||||
|
|
||||||
|
|
||||||
def test_serialize_packer_unannotated(en_tokenizer, text):
    """Pack and unpack a tokenized doc with no attribute codecs and check
    that the text is preserved.
    """
    packer = Packer(en_tokenizer.vocab, [])
    doc = en_tokenizer(text)
    assert doc.text_with_ws == text

    restored = packer.unpack(packer.pack(doc))
    assert restored.text_with_ws == text
|
|
||||||
|
|
||||||
|
|
||||||
def test_packer_annotated(en_vocab, text):
    """Pack and unpack a doc carrying tags, deps and heads, checking the
    annotations both before packing and after unpacking.
    """
    heads = [1, 1, 0]
    deps = ['det', 'nsubj', 'ROOT']
    tags = ['DT', 'NN', 'VBD']

    # Frequency tables per attribute. The string-store lookups are kept in
    # the same order as the original test (tags first, then deps).
    strings = en_vocab.strings
    tag_freqs = [(strings['NN'], 0.1),
                 (strings['DT'], 0.2),
                 (strings['JJ'], 0.01),
                 (strings['VBD'], 0.05)]
    dep_freqs = {strings['det']: 0.2,
                 strings['nsubj']: 0.1,
                 strings['adj']: 0.05,
                 strings['ROOT']: 0.1}.items()
    head_freqs = {0: 0.05, 1: 0.2, -1: 0.2, -2: 0.1, 2: 0.1}.items()
    attr_freqs = [(TAG, tag_freqs), (DEP, dep_freqs), (HEAD, head_freqs)]

    def check_annotations(tokens):
        # Heads are compared as offsets relative to each token's own index.
        assert [tok.tag_ for tok in tokens] == tags
        assert [tok.dep_ for tok in tokens] == deps
        assert [(tok.head.i - tok.i) for tok in tokens] == heads

    packer = Packer(en_vocab, attr_freqs)
    doc = get_doc(en_vocab, text.split(), tags=tags, deps=deps, heads=heads)

    # NOTE: the original test had the `text_with_ws == text` comparison
    # commented out for both `doc` and `result`; it stays disabled here.
    check_annotations(doc)

    result = packer.unpack(packer.pack(doc))
    check_annotations(result)
|
|
||||||
|
|
||||||
|
|
||||||
def test_packer_bad_chars(en_tokenizer):
    """Pack and unpack a sentence containing a non-ASCII character and check
    that ``.string`` is unchanged.
    """
    text = "naja gut, is eher bl\xf6d und nicht mit reddit.com/digg.com vergleichbar; vielleicht auf dem weg dahin"
    doc = en_tokenizer(text)

    packer = Packer(en_tokenizer.vocab, [])
    restored = packer.unpack(packer.pack(doc))
    assert restored.string == doc.string
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.models
def test_packer_bad_chars_tags(EN):
    """Tags survive a ``to_bytes``/``from_bytes`` round trip on a sentence
    containing a non-ASCII character.
    """
    text = "naja gut, is eher bl\xf6d und nicht mit reddit.com/digg.com vergleichbar; vielleicht auf dem weg dahin"
    tags = ['JJ', 'NN', ',', 'VBZ', 'DT', 'NN', 'JJ', 'NN', 'NN',
            'ADD', 'NN', ':', 'NN', 'NN', 'NN', 'NN', 'NN']

    tokenized = EN.tokenizer(text)
    vocab = tokenized.vocab
    doc = get_doc(vocab, [token.text for token in tokenized], tags=tags)

    serialized = doc.to_bytes()
    result = get_doc(vocab).from_bytes(serialized)
    assert [token.tag_ for token in result] == [token.tag_ for token in doc]
|
|
Loading…
Reference in New Issue
Block a user