mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-11-04 01:48:04 +03:00 
			
		
		
		
	Space tokens are now always attached to the previous non-space token. There are two exceptions: leading space tokens are attached to the first following non-space token, and in input that consists exclusively of space tokens, the last space token is the head of all others.
		
			
				
	
	
		
			91 lines
		
	
	
		
			2.3 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			91 lines
		
	
	
		
			2.3 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
from __future__ import unicode_literals
 | 
						|
 | 
						|
import pytest
 | 
						|
import numpy
 | 
						|
from spacy.attrs import HEAD
 | 
						|
 | 
						|
def make_doc(EN, sentstr):
    """Build a tagged document from a pre-tokenized, space-delimited string.

    The string is split on single space characters only (not on arbitrary
    whitespace), so chunks such as ``'\\t'`` or ``'\\n\\n'`` survive as
    tokens of their own. The tests below rely on this to inject space
    tokens at known positions.

    Args:
        EN: Pipeline object exposing ``tokenizer.tokens_from_list`` and a
            callable ``tagger``.
        sentstr: Tokens joined by single ``' '`` characters.

    Returns:
        The document returned by ``EN.tokenizer.tokens_from_list`` after it
        has been passed to ``EN.tagger``.
    """
    words = sentstr.split(' ')
    doc = EN.tokenizer.tokens_from_list(words)
    # The tagger's return value is discarded; it is called only for its
    # effect on ``doc``.
    # NOTE(review): presumably the parser needs tags to be set -- confirm.
    EN.tagger(doc)
    return doc
 | 
						|
 | 
						|
 | 
						|
@pytest.mark.models
def test_space_attachment(EN):
    """No sentence may consist of a single space token.

    The input contains a newline between sentences and a double space
    inside the second one. Every one-token sentence the pipeline produces
    is checked to make sure that token is not whitespace.
    """
    sentence = 'This is a test.\nTo ensure  spaces are attached well.'
    doc = EN(sentence)

    for sent in doc.sents:
        # Only one-token sentences are inspected; longer ones pass trivially.
        if len(sent) == 1:
            assert not sent[-1].is_space
 | 
						|
 | 
						|
 | 
						|
@pytest.mark.models
def test_sentence_space(EN):
    """A double space after a full stop must not create an extra sentence.

    The text holds two sentences separated by two spaces; the pipeline is
    expected to yield exactly two sentences.
    """
    text = ('''I look forward to using Thingamajig.  I've been told it will '''
            '''make my life easier...''')
    doc = EN(text)
    sentences = list(doc.sents)
    assert len(sentences) == 2
 | 
						|
 | 
						|
 | 
						|
@pytest.mark.models
def test_space_attachment_leading_space(EN):
    """Leading space tokens attach to the first following non-space token."""
    # Tokens 0 and 1 are whitespace ('\t' and '\n'); token 2 is 'This'.
    doc = make_doc(EN, '\t \n This is a sentence .')
    assert doc[0].is_space
    assert doc[1].is_space
    assert doc[2].orth_ == 'This'

    # No transitions are applied by hand inside the block.
    # NOTE(review): the head assertions below imply that space attachment
    # has happened by the time the context manager exits -- confirm
    # against the parser.
    with EN.parser.step_through(doc) as stepwise:
        pass

    assert doc[0].head.i == 2
    assert doc[1].head.i == 2
    assert stepwise.stack == {2}
 | 
						|
 | 
						|
 | 
						|
@pytest.mark.models
def test_space_attachment_intermediate_and_trailing_space(EN):
    """Intermediate and trailing space tokens attach to the previous word."""
    # Token layout after splitting on ' ':
    #   0 This  1 is  2 '\t'  3 a  4 '\t\n'  5 '\n'  6 sentence  7 .
    #   8 '\n\n'  9 '\n'
    doc = make_doc(EN, 'This is \t a \t\n \n sentence . \n\n \n')
    for space_index in (2, 4, 5, 8, 9):
        assert doc[space_index].is_space

    # Drive the parser by hand with a fixed transition sequence.
    # NOTE(review): the names look like Left/Shift/Right/reDuce moves with
    # dependency labels -- confirm against the parser's transition system.
    with EN.parser.step_through(doc) as stepwise:
        stepwise.transition('L-nsubj')
        stepwise.transition('S')
        stepwise.transition('L-det')
        stepwise.transition('R-attr')
        stepwise.transition('D')
        stepwise.transition('R-punct')

    assert stepwise.stack == set()

    # Every non-space token must have received a dependency label.
    for tok in doc:
        assert tok.dep != 0 or tok.is_space

    # Space tokens 2, 4/5 and 8/9 point at the preceding non-space tokens
    # 1, 3 and 7 respectively.
    expected_heads = [1, 1, 1, 6, 3, 3, 1, 1, 7, 7]
    assert [tok.head.i for tok in doc] == expected_heads
 | 
						|
 | 
						|
 | 
						|
@pytest.mark.models
def test_space_attachment_one_space_sentence(EN):
    """A document that is a single space token has that token as its own head."""
    doc = make_doc(EN, '\n')
    assert len(doc) == 1

    # No manual transitions; the stepwise handle itself is not needed.
    with EN.parser.step_through(doc) as _:
        pass

    assert doc[0].is_space
    assert doc[0].head.i == 0
 | 
						|
 | 
						|
 | 
						|
@pytest.mark.models
def test_space_attachment_only_space_sentence(EN):
    """In space-only input, the last space token heads all tokens."""
    # Splits into four whitespace tokens: '\n', '\t', '\n\n', '\t'.
    doc = make_doc(EN, '\n \t \n\n \t')
    assert len(doc) == 4
    for tok in doc:
        assert tok.is_space

    # No manual transitions; the stepwise handle itself is not needed.
    with EN.parser.step_through(doc) as _:
        pass

    # All tokens, including the last one itself, are attached to the last
    # token (index 3, given the length asserted above).
    last_index = len(doc) - 1
    for tok in doc:
        assert tok.head.i == last_index
 |