mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-11-04 01:48:04 +03:00 
			
		
		
		
	Modernise serializer I/O tests and don't depend on models where possible
This commit is contained in:
		
							parent
							
								
									4bb5b89ee4
								
							
						
					
					
						commit
						38d60f6b90
					
				| 
						 | 
					@ -1,58 +1,48 @@
 | 
				
			||||||
 | 
					# coding: utf-8
 | 
				
			||||||
 | 
					from __future__ import unicode_literals
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					from ...tokens import Doc
 | 
				
			||||||
 | 
					from ..util import get_doc
 | 
				
			||||||
 | 
					
 | 
				
			||||||
import pytest
 | 
					import pytest
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def test_serialize_io_read_write(en_vocab, text_file_b):
    """Round-trip two docs through one binary stream.

    Both docs are serialized back-to-back into the same buffer; reading
    them back via `Doc.read_bytes` must reproduce each doc's
    whitespace-preserving text exactly.
    """
    words1 = ["This", "is", "a", "simple", "test", ".", "With", "a", "couple", "of", "sentences", "."]
    words2 = ["This", "is", "another", "test", "document", "."]
    original1 = get_doc(en_vocab, words1)
    original2 = get_doc(en_vocab, words2)

    # Write both serialized docs into the same binary file object.
    for original in (original1, original2):
        text_file_b.write(original.to_bytes())
    text_file_b.seek(0)

    # Doc.read_bytes yields one byte string per serialized doc; exactly
    # two were written, so unpacking also checks the count.
    data1, data2 = Doc.read_bytes(text_file_b)
    restored1 = get_doc(en_vocab).from_bytes(data1)
    restored2 = get_doc(en_vocab).from_bytes(data2)
    assert restored1.text_with_ws == original1.text_with_ws
    assert restored2.text_with_ws == original2.text_with_ws
 | 
				
			||||||
def test_serialize_io_left_right(en_vocab):
    """A deserialized doc must keep the original dependency structure:
    head indices, children membership, and left/right dependents all
    have to line up with the pre-serialization doc."""
    words = ["This", "is", "a", "simple", "test", ".", "With", "a", "couple", "of", "sentences", "."]
    source = get_doc(en_vocab, words)
    restored = Doc(en_vocab).from_bytes(source.to_bytes())

    for word in restored:
        # Head index survives the round-trip.
        assert word.head.i == source[word.i].head.i
        # Every non-root token appears among its head's children.
        if word.head is not word:
            assert word.i in [w.i for w in word.head.children]
        # Left and right dependents point back at this token.
        for dependent in word.lefts:
            assert dependent.head.i == word.i
        for dependent in word.rights:
            assert dependent.head.i == word.i
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@pytest.mark.models
def test_lemmas(EN):
    """Lemmas must survive a to_bytes/from_bytes round-trip.

    Needs a loaded English model (lemmatizer), hence the models marker.
    """
    doc = EN("The geese are flying")
    restored = Doc(doc.vocab).from_bytes(doc.to_bytes())
    # geese -> goose, are -> be, flying -> fly
    for index, lemma in [(1, 'goose'), (2, 'be'), (3, 'fly')]:
        assert restored[index].lemma_ == lemma
				
			||||||
 | 
					 | 
				
			||||||
 
 | 
					 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
		Loading…
	
		Reference in New Issue
	
	Block a user