mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-11-04 18:07:26 +03:00 
			
		
		
		
	
		
			
				
	
	
		
			99 lines
		
	
	
		
			2.9 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			99 lines
		
	
	
		
			2.9 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
# coding: utf-8
 | 
						|
from __future__ import unicode_literals
 | 
						|
 | 
						|
from ...attrs import TAG, DEP, HEAD
 | 
						|
from ...serialize.packer import Packer
 | 
						|
from ...serialize.bits import BitArray
 | 
						|
 | 
						|
from ..util import get_doc
 | 
						|
 | 
						|
import pytest
 | 
						|
 | 
						|
 | 
						|
@pytest.fixture
def text():
    """Provide the sample sentence shared by the packer tests."""
    sample = "the dog jumped"
    return sample
 | 
						|
 | 
						|
 | 
						|
@pytest.fixture
def text_b():
    """Provide the sample sentence as a byte string."""
    sample = b"the dog jumped"
    return sample
 | 
						|
 | 
						|
 | 
						|
def test_serialize_char_packer(en_vocab, text_b):
    """Round-trip a byte string through the packer's character codec.

    The bytes are encoded into a BitArray, the array is rewound, and the
    decoded output is compared against the input.
    """
    packer = Packer(en_vocab, [])
    buffer = BitArray()
    buffer.seek(0)
    expected = bytearray(text_b)
    codec = packer.char_codec
    codec.encode(expected, buffer)
    # Rewind so that decoding starts from the first encoded bit.
    buffer.seek(0)
    # Placeholder list with one slot per input byte; decode() fills it in.
    decoded = [b''] * len(expected)
    codec.decode(buffer, decoded)
    assert bytearray(decoded) == expected
 | 
						|
 | 
						|
 | 
						|
def test_serialize_packer_unannotated(en_tokenizer, text):
    """Pack and unpack a tokenized doc without annotations.

    The text (including whitespace) must match the input both after
    tokenization and after the pack/unpack round trip.
    """
    packer = Packer(en_tokenizer.vocab, [])
    doc = en_tokenizer(text)
    assert doc.text_with_ws == text
    packed = packer.pack(doc)
    restored = packer.unpack(packed)
    assert restored.text_with_ws == text
 | 
						|
 | 
						|
 | 
						|
def test_packer_annotated(en_vocab, text):
    """Pack and unpack a doc annotated with tags, deps and heads.

    The tag, dependency label and relative head offset of every token are
    checked on the doc before packing and again on the unpacked result.
    """
    heads = [1, 1, 0]
    deps = ['det', 'nsubj', 'ROOT']
    tags = ['DT', 'NN', 'VBD']

    # Frequency tables handed to the Packer, one per attribute.
    strings = en_vocab.strings
    tag_freqs = [
        (strings['NN'], 0.1),
        (strings['DT'], 0.2),
        (strings['JJ'], 0.01),
        (strings['VBD'], 0.05),
    ]
    dep_freqs = {
        strings['det']: 0.2,
        strings['nsubj']: 0.1,
        strings['adj']: 0.05,
        strings['ROOT']: 0.1,
    }
    head_freqs = {0: 0.05, 1: 0.2, -1: 0.2, -2: 0.1, 2: 0.1}
    attr_freqs = [
        (TAG, tag_freqs),
        (DEP, dep_freqs.items()),
        (HEAD, head_freqs.items()),
    ]

    packer = Packer(en_vocab, attr_freqs)
    words = text.split()
    doc = get_doc(en_vocab, words, tags=tags, deps=deps, heads=heads)

    # The text_with_ws comparison is disabled in this test.
    assert [token.tag_ for token in doc] == tags
    assert [token.dep_ for token in doc] == deps
    assert [token.head.i - token.i for token in doc] == heads

    packed = packer.pack(doc)
    restored = packer.unpack(packed)

    # The text_with_ws comparison is disabled for the unpacked doc as well.
    assert [token.tag_ for token in restored] == tags
    assert [token.dep_ for token in restored] == deps
    assert [token.head.i - token.i for token in restored] == heads
 | 
						|
 | 
						|
 | 
						|
def test_packer_bad_chars(en_tokenizer):
    """Pack and unpack a text containing a non-ASCII character (\\xf6).

    The unpacked doc's string must equal the string of the tokenized doc.
    """
    text = "naja gut, is eher bl\xf6d und nicht mit reddit.com/digg.com vergleichbar; vielleicht auf dem weg dahin"
    packer = Packer(en_tokenizer.vocab, [])

    doc = en_tokenizer(text)
    packed = packer.pack(doc)
    restored = packer.unpack(packed)
    assert restored.string == doc.string
 | 
						|
 | 
						|
 | 
						|
@pytest.mark.models
def test_packer_bad_chars_tags(EN):
    """Round-trip a tagged doc with a non-ASCII character via to/from bytes.

    The doc is built from the tokenizer's output with explicit tags,
    serialized with to_bytes(), loaded into a fresh doc with from_bytes(),
    and the tags of both docs are compared.
    """
    text = "naja gut, is eher bl\xf6d und nicht mit reddit.com/digg.com vergleichbar; vielleicht auf dem weg dahin"
    tags = [
        'JJ', 'NN', ',', 'VBZ', 'DT', 'NN', 'JJ', 'NN', 'NN',
        'ADD', 'NN', ':', 'NN', 'NN', 'NN', 'NN', 'NN',
    ]

    tokens = EN.tokenizer(text)
    words = [token.text for token in tokens]
    doc = get_doc(tokens.vocab, words, tags=tags)
    serialized = doc.to_bytes()
    restored = get_doc(tokens.vocab).from_bytes(serialized)
    assert [token.tag_ for token in restored] == [token.tag_ for token in doc]
 |