mirror of https://github.com/explosion/spaCy.git
synced 2025-11-04 01:48:04 +03:00

Remove old serialization tests

parent f9327343ce
commit 7253b4e649
@@ -1,51 +0,0 @@
# coding: utf-8
from __future__ import unicode_literals

from ...serialize.packer import _BinaryCodec
from ...serialize.huffman import HuffmanCodec
from ...serialize.bits import BitArray

import numpy
import pytest


def test_serialize_codecs_binary():
    codec = _BinaryCodec()
    bits = BitArray()
    array = numpy.array([0, 1, 0, 1, 1], numpy.int32)
    codec.encode(array, bits)
    result = numpy.array([0, 0, 0, 0, 0], numpy.int32)
    bits.seek(0)
    codec.decode(bits, result)
    assert list(array) == list(result)


def test_serialize_codecs_attribute():
    freqs = {'the': 10, 'quick': 3, 'brown': 4, 'fox': 1, 'jumped': 5,
             'over': 8, 'lazy': 1, 'dog': 2, '.': 9}
    int_map = {'the': 0, 'quick': 1, 'brown': 2, 'fox': 3, 'jumped': 4,
               'over': 5, 'lazy': 6, 'dog': 7, '.': 8}

    codec = HuffmanCodec([(int_map[string], freq) for string, freq in freqs.items()])
    bits = BitArray()
    array = numpy.array([1, 7], dtype=numpy.int32)
    codec.encode(array, bits)
    result = numpy.array([0, 0], dtype=numpy.int32)
    bits.seek(0)
    codec.decode(bits, result)
    assert list(array) == list(result)


def test_serialize_codecs_vocab(en_vocab):
    words = ["the", "dog", "jumped"]
    for word in words:
        _ = en_vocab[word]
    codec = HuffmanCodec([(lex.orth, lex.prob) for lex in en_vocab])
    bits = BitArray()
    ids = [en_vocab[s].orth for s in words]
    array = numpy.array(ids, dtype=numpy.int32)
    codec.encode(array, bits)
    result = numpy.array(range(len(array)), dtype=numpy.int32)
    bits.seek(0)
    codec.decode(bits, result)
    assert list(array) == list(result)

@@ -1,110 +0,0 @@
# coding: utf-8
from __future__ import unicode_literals
from __future__ import division

from ...serialize.huffman import HuffmanCodec
from ...serialize.bits import BitArray


from heapq import heappush, heappop, heapify
from collections import defaultdict
import numpy
import pytest


def py_encode(symb2freq):
    """Huffman encode the given dict mapping symbols to weights
    From Rosetta Code
    """
    heap = [[wt, [sym, ""]] for sym, wt in symb2freq.items()]
    heapify(heap)
    while len(heap) > 1:
        lo = heappop(heap)
        hi = heappop(heap)
        for pair in lo[1:]:
            pair[1] = '0' + pair[1]
        for pair in hi[1:]:
            pair[1] = '1' + pair[1]
        heappush(heap, [lo[0] + hi[0]] + lo[1:] + hi[1:])
    return dict(heappop(heap)[1:])
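
# A minimal illustration (not from the original test file): py_encode maps each
# symbol to a bit string, giving the most frequent symbol the shortest code.
# With no frequency ties the code lengths here are deterministic.
def _py_encode_example():
    codes = py_encode({'a': 5, 'b': 2, 'c': 1})
    assert len(codes['a']) == 1                               # most frequent symbol
    assert sorted(len(code) for code in codes.values()) == [1, 2, 2]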


def test_serialize_huffman_1():
    probs = numpy.zeros(shape=(10,), dtype=numpy.float32)
    probs[0] = 0.3
    probs[1] = 0.2
    probs[2] = 0.15
    probs[3] = 0.1
    probs[4] = 0.06
    probs[5] = 0.02
    probs[6] = 0.01
    probs[7] = 0.005
    probs[8] = 0.0001
    probs[9] = 0.000001

    codec = HuffmanCodec(list(enumerate(probs)))
    py_codes = py_encode(dict(enumerate(probs)))
    py_codes = list(py_codes.items())
    py_codes.sort()
    assert codec.strings == [c for i, c in py_codes]


def test_serialize_huffman_empty():
    codec = HuffmanCodec({})
    assert codec.strings == []


def test_serialize_huffman_round_trip():
    words = ['the', 'quick', 'brown', 'fox', 'jumped', 'over', 'the', 'the',
             'lazy', 'dog', '.']
    freqs = {'the': 10, 'quick': 3, 'brown': 4, 'fox': 1, 'jumped': 5,
             'over': 8, 'lazy': 1, 'dog': 2, '.': 9}

    codec = HuffmanCodec(freqs.items())
    strings = list(codec.strings)
    codes = dict([(codec.leaves[i], strings[i]) for i in range(len(codec.leaves))])
    bits = codec.encode(words)
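    # Rebuild the emitted bit string from the packed bytes: each byte is rendered
    # as 8 binary digits and reversed, since the codec appears to fill each byte
    # starting from the least significant bit.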
    string = ''.join('{0:b}'.format(c).rjust(8, '0')[::-1] for c in bits.as_bytes())
    for word in words:
        code = codes[word]
        assert string[:len(code)] == code
        string = string[len(code):]
    unpacked = [0] * len(words)
    bits.seek(0)
    codec.decode(bits, unpacked)
    assert words == unpacked


def test_serialize_huffman_rosetta():
    text = "this is an example for huffman encoding"
    symb2freq = defaultdict(int)
    for ch in text:
        symb2freq[ch] += 1
    by_freq = list(symb2freq.items())
    by_freq.sort(reverse=True, key=lambda item: item[1])
    symbols = [sym for sym, prob in by_freq]

    codec = HuffmanCodec(symb2freq.items())
    py_codec = py_encode(symb2freq)

    codes = dict([(codec.leaves[i], codec.strings[i]) for i in range(len(codec.leaves))])

    my_lengths = defaultdict(int)
    py_lengths = defaultdict(int)
    for symb, freq in symb2freq.items():
        my = codes[symb]
        my_lengths[len(my)] += freq
        py_lengths[len(py_codec[symb])] += freq
    my_exp_len = sum(length * weight for length, weight in my_lengths.items())
    py_exp_len = sum(length * weight for length, weight in py_lengths.items())
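    # Huffman codes for a given distribution are not unique, so the two encoders
    # may assign different codes to individual symbols, but every optimal prefix
    # code has the same total weighted length.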
    assert my_exp_len == py_exp_len


@pytest.mark.models
def test_vocab(EN):
    codec = HuffmanCodec([(w.orth, numpy.exp(w.prob)) for w in EN.vocab])
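    # w.prob is a log unigram probability, so numpy.exp(w.prob) recovers p(w); the
    # loop below accumulates the expected code length in bits per token,
    # sum(p(w) * len(code(w))), which should land in a plausible range for English.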
    expected_length = 0
    for i, code in enumerate(codec.strings):
        leaf = codec.leaves[i]
        expected_length += len(code) * numpy.exp(EN.vocab[leaf].prob)
    assert 8 < expected_length < 15

@@ -1,48 +0,0 @@
# coding: utf-8
from __future__ import unicode_literals

from ...tokens import Doc
from ..util import get_doc

import pytest


def test_serialize_io_read_write(en_vocab, text_file_b):
    text1 = ["This", "is", "a", "simple", "test", ".", "With", "a", "couple", "of", "sentences", "."]
    text2 = ["This", "is", "another", "test", "document", "."]

    doc1 = get_doc(en_vocab, text1)
    doc2 = get_doc(en_vocab, text2)
    text_file_b.write(doc1.to_bytes())
    text_file_b.write(doc2.to_bytes())
    text_file_b.seek(0)
    bytes1, bytes2 = Doc.read_bytes(text_file_b)
    result1 = get_doc(en_vocab).from_bytes(bytes1)
    result2 = get_doc(en_vocab).from_bytes(bytes2)
    assert result1.text_with_ws == doc1.text_with_ws
    assert result2.text_with_ws == doc2.text_with_ws


def test_serialize_io_left_right(en_vocab):
    text = ["This", "is", "a", "simple", "test", ".", "With", "a", "couple", "of", "sentences", "."]
    doc = get_doc(en_vocab, text)
    result = Doc(en_vocab).from_bytes(doc.to_bytes())

    for token in result:
        assert token.head.i == doc[token.i].head.i
        if token.head is not token:
            assert token.i in [w.i for w in token.head.children]
        for child in token.lefts:
            assert child.head.i == token.i
        for child in token.rights:
            assert child.head.i == token.i


@pytest.mark.models
def test_lemmas(EN):
    text = "The geese are flying"
    doc = EN(text)
    result = Doc(doc.vocab).from_bytes(doc.to_bytes())
    assert result[1].lemma_ == 'goose'
    assert result[2].lemma_ == 'be'
    assert result[3].lemma_ == 'fly'

@@ -1,98 +0,0 @@
# coding: utf-8
from __future__ import unicode_literals

from ...attrs import TAG, DEP, HEAD
from ...serialize.packer import Packer
from ...serialize.bits import BitArray

from ..util import get_doc

import pytest


@pytest.fixture
def text():
    return "the dog jumped"


@pytest.fixture
def text_b():
    return b"the dog jumped"


def test_serialize_char_packer(en_vocab, text_b):
    packer = Packer(en_vocab, [])
    bits = BitArray()
    bits.seek(0)
    byte_str = bytearray(text_b)
    packer.char_codec.encode(byte_str, bits)
    bits.seek(0)
    result = [b''] * len(byte_str)
    packer.char_codec.decode(bits, result)
    assert bytearray(result) == byte_str


def test_serialize_packer_unannotated(en_tokenizer, text):
    packer = Packer(en_tokenizer.vocab, [])
    tokens = en_tokenizer(text)
    assert tokens.text_with_ws == text
    bits = packer.pack(tokens)
    result = packer.unpack(bits)
    assert result.text_with_ws == text


def test_packer_annotated(en_vocab, text):
    heads = [1, 1, 0]
    deps = ['det', 'nsubj', 'ROOT']
    tags = ['DT', 'NN', 'VBD']

    attr_freqs = [
        (TAG, [(en_vocab.strings['NN'], 0.1),
               (en_vocab.strings['DT'], 0.2),
               (en_vocab.strings['JJ'], 0.01),
               (en_vocab.strings['VBD'], 0.05)]),
        (DEP, {en_vocab.strings['det']: 0.2,
               en_vocab.strings['nsubj']: 0.1,
               en_vocab.strings['adj']: 0.05,
               en_vocab.strings['ROOT']: 0.1}.items()),
        (HEAD, {0: 0.05, 1: 0.2, -1: 0.2, -2: 0.1, 2: 0.1}.items())
    ]
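    # attr_freqs pairs each attribute ID (TAG, DEP, HEAD) with estimated value
    # frequencies; the Packer presumably builds one codec per attribute from them.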

    packer = Packer(en_vocab, attr_freqs)
    doc = get_doc(en_vocab, [t for t in text.split()], tags=tags, deps=deps, heads=heads)

    # assert doc.text_with_ws == text
    assert [t.tag_ for t in doc] == tags
    assert [t.dep_ for t in doc] == deps
    assert [(t.head.i-t.i) for t in doc] == heads

    bits = packer.pack(doc)
    result = packer.unpack(bits)

    # assert result.text_with_ws == text
    assert [t.tag_ for t in result] == tags
    assert [t.dep_ for t in result] == deps
    assert [(t.head.i-t.i) for t in result] == heads


def test_packer_bad_chars(en_tokenizer):
    text = "naja gut, is eher bl\xf6d und nicht mit reddit.com/digg.com vergleichbar; vielleicht auf dem weg dahin"
    packer = Packer(en_tokenizer.vocab, [])

    doc = en_tokenizer(text)
    bits = packer.pack(doc)
    result = packer.unpack(bits)
    assert result.string == doc.string


@pytest.mark.models
def test_packer_bad_chars_tags(EN):
    text = "naja gut, is eher bl\xf6d und nicht mit reddit.com/digg.com vergleichbar; vielleicht auf dem weg dahin"
    tags = ['JJ', 'NN', ',', 'VBZ', 'DT', 'NN', 'JJ', 'NN', 'NN',
            'ADD', 'NN', ':', 'NN', 'NN', 'NN', 'NN', 'NN']

    tokens = EN.tokenizer(text)
    doc = get_doc(tokens.vocab, [t.text for t in tokens], tags=tags)
    byte_string = doc.to_bytes()
    result = get_doc(tokens.vocab).from_bytes(byte_string)
    assert [t.tag_ for t in result] == [t.tag_ for t in doc]