mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-31 07:57:35 +03:00 
			
		
		
		
	Space tokens are now always attached to the previous non-space token. There are two exceptions: leading space tokens are attached to the first following non-space token; and in input that consists exclusively of space tokens, the last space token is the head of all others.
		
			
				
	
	
		
			91 lines
		
	
	
		
			2.3 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			91 lines
		
	
	
		
			2.3 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| from __future__ import unicode_literals
 | |
| 
 | |
| import pytest
 | |
| import numpy
 | |
| from spacy.attrs import HEAD
 | |
| 
 | |
def make_doc(EN, sentstr):
    """Build a tagged doc from a string of tokens separated by single spaces.

    The string is split on the literal ' ' character only, so pieces such
    as tabs and newlines stay in the word list as separate items.
    """
    words = sentstr.split(' ')
    tagged_doc = EN.tokenizer.tokens_from_list(words)
    EN.tagger(tagged_doc)
    return tagged_doc
 | |
| 
 | |
| 
 | |
@pytest.mark.models
def test_space_attachment(EN):
    """A sentence made of a single token must not be a lone space token."""
    doc = EN('This is a test.\nTo ensure  spaces are attached well.')

    # Only one-token sentences are of interest here.
    one_token_sents = (sent for sent in doc.sents if len(sent) == 1)
    for sent in one_token_sents:
        assert not sent[-1].is_space
 | |
| 
 | |
| 
 | |
@pytest.mark.models
def test_sentence_space(EN):
    """Text with a double space after the first period yields two sentences."""
    text = ('''I look forward to using Thingamajig.  I've been told it will '''
            '''make my life easier...''')
    doc = EN(text)
    sentences = list(doc.sents)
    assert len(sentences) == 2
 | |
| 
 | |
| 
 | |
@pytest.mark.models
def test_space_attachment_leading_space(EN):
    """Leading space tokens are headed by the first following non-space token."""
    doc = make_doc(EN, '\t \n This is a sentence .')

    # The first two tokens are whitespace; the first word sits at index 2.
    for idx in (0, 1):
        assert doc[idx].is_space
    assert doc[2].orth_ == 'This'

    # Enter and leave the step-through context without issuing transitions.
    with EN.parser.step_through(doc) as stepwise:
        pass

    for idx in (0, 1):
        assert doc[idx].head.i == 2
    assert stepwise.stack == {2}
 | |
| 
 | |
| 
 | |
@pytest.mark.models
def test_space_attachment_intermediate_and_trailing_space(EN):
    """Check head indices when space tokens sit between and after the words."""
    doc = make_doc(EN, 'This is \t a \t\n \n sentence . \n\n \n')

    # Positions of the whitespace tokens in the ten-token input.
    space_positions = (2, 4, 5, 8, 9)
    for idx in space_positions:
        assert doc[idx].is_space

    # Drive the parser by hand through a fixed sequence of transitions.
    moves = ('L-nsubj', 'S', 'L-det', 'R-attr', 'D', 'R-punct')
    with EN.parser.step_through(doc) as stepwise:
        for move in moves:
            stepwise.transition(move)
    assert stepwise.stack == set()

    # Every token without a dependency label must be a space token.
    for tok in doc:
        assert tok.dep != 0 or tok.is_space

    head_indices = [tok.head.i for tok in doc]
    assert head_indices == [1, 1, 1, 6, 3, 3, 1, 1, 7, 7]
 | |
| 
 | |
| 
 | |
@pytest.mark.models
def test_space_attachment_one_space_sentence(EN):
    """A doc holding a single space token has that token as its own head."""
    doc = make_doc(EN, '\n')
    assert len(doc) == 1

    # Enter and leave the step-through context without issuing transitions.
    with EN.parser.step_through(doc):
        pass

    only_token = doc[0]
    assert only_token.is_space
    assert only_token.head.i == 0
 | |
| 
 | |
| 
 | |
@pytest.mark.models
def test_space_attachment_only_space_sentence(EN):
    """In a doc made only of space tokens, the last token heads all of them."""
    doc = make_doc(EN, '\n \t \n\n \t')
    assert len(doc) == 4
    for space_tok in doc:
        assert space_tok.is_space

    # Enter and leave the step-through context without issuing transitions.
    with EN.parser.step_through(doc):
        pass

    # Every token, the last one included, points at index 3.
    last_index = 3
    for space_tok in doc:
        assert space_tok.head.i == last_index
 |