mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-11-01 00:17:44 +03:00 
			
		
		
		
	
		
			
				
	
	
		
			49 lines
		
	
	
		
			1.6 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			49 lines
		
	
	
		
			1.6 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| # coding: utf-8
 | |
| from __future__ import unicode_literals
 | |
| 
 | |
| from ...tokens import Doc
 | |
| from ..util import get_doc
 | |
| 
 | |
| import pytest
 | |
| 
 | |
| 
 | |
def test_serialize_io_read_write(en_vocab, text_file_b):
    """Round-trip two docs through a single binary stream.

    Both docs are serialized with ``to_bytes`` and written back to back into
    ``text_file_b``. After rewinding, ``Doc.read_bytes`` must yield exactly
    two payloads, and deserializing each must reproduce the ``text_with_ws``
    of the corresponding original doc.
    """
    first_words = ["This", "is", "a", "simple", "test", ".", "With", "a", "couple", "of", "sentences", "."]
    second_words = ["This", "is", "another", "test", "document", "."]

    originals = (get_doc(en_vocab, first_words), get_doc(en_vocab, second_words))

    # Write the serialized docs one after the other, then rewind for reading.
    for original in originals:
        text_file_b.write(original.to_bytes())
    text_file_b.seek(0)

    # Unpacking fails unless the stream splits into exactly two payloads.
    first_payload, second_payload = Doc.read_bytes(text_file_b)
    restored = (
        get_doc(en_vocab).from_bytes(first_payload),
        get_doc(en_vocab).from_bytes(second_payload),
    )

    for loaded, original in zip(restored, originals):
        assert loaded.text_with_ws == original.text_with_ws
 | |
| 
 | |
| 
 | |
def test_serialize_io_left_right(en_vocab):
    """Check head/child links stay consistent after a bytes round trip.

    A doc is serialized with ``to_bytes`` and loaded into a fresh ``Doc``.
    For every token of the loaded doc the test verifies that:

    * its head sits at the same position as in the original doc,
    * unless the token is its own head, it is listed among its head's
      children,
    * every token in its ``lefts`` and ``rights`` points back to it as head.
    """
    text = ["This", "is", "a", "simple", "test", ".", "With", "a",  "couple", "of", "sentences", "."]
    doc = get_doc(en_vocab, text)
    result = Doc(en_vocab).from_bytes(doc.to_bytes())

    for token in result:
        head = token.head
        assert head.i == doc[token.i].head.i
        # Compare positions rather than object identity: `is` only works if
        # the same Token wrapper object is returned on every access, whereas
        # the index is stable. This also matches the other checks here, which
        # all compare `.i`.
        if head.i != token.i:
            assert token.i in [w.i for w in head.children]
        for child in token.lefts:
            assert child.head.i == token.i
        for child in token.rights:
            assert child.head.i == token.i
 | |
| 
 | |
| 
 | |
@pytest.mark.models
def test_lemmas(EN):
    """Lemmas must still be available after a ``to_bytes``/``from_bytes`` cycle.

    The sentence is processed with the ``EN`` fixture, serialized, and loaded
    into a new ``Doc`` sharing the same vocab; the lemmas of tokens 1-3 are
    then checked on the loaded doc.
    """
    processed = EN("The geese are flying")
    payload = processed.to_bytes()
    restored = Doc(processed.vocab).from_bytes(payload)

    # (token index, expected lemma), checked in document order.
    expected_lemmas = ((1, 'goose'), (2, 'be'), (3, 'fly'))
    for index, lemma in expected_lemmas:
        assert restored[index].lemma_ == lemma
 |