# spaCy/spacy/tests/parser/test_space_attachment.py

from __future__ import unicode_literals

import pytest
import numpy
from spacy.attrs import HEAD

def make_doc(EN, sentstr):
    """Build a tagged doc from a string whose tokens are separated by ' '.

    The string is split on the single space character only, so tabs and
    newlines remain inside the resulting list items. The list is handed to
    ``EN.tokenizer.tokens_from_list`` and the doc it returns is run through
    ``EN.tagger`` before being returned.
    """
    words = sentstr.split(' ')
    tagged = EN.tokenizer.tokens_from_list(words)
    EN.tagger(tagged)
    return tagged


@pytest.mark.models
def test_space_attachment(EN):
    """The only token of a one-token sentence must not be whitespace."""
    parsed = EN('This is a test.\nTo ensure  spaces are attached well.')

    for span in parsed.sents:
        # Only sentences consisting of exactly one token are checked.
        if len(span) != 1:
            continue
        assert not span[-1].is_space


@pytest.mark.models
def test_sentence_space(EN):
    """Text with a double space between two sentences yields two sentences."""
    text = ("I look forward to using Thingamajig.  I've been told it will "
            "make my life easier...")
    sentences = list(EN(text).sents)
    assert len(sentences) == 2


@pytest.mark.models
def test_space_attachment_leading_space(EN):
    """Leading space tokens have the first non-space token as their head."""
    doc = make_doc(EN, '\t \n This is a sentence .')
    leading_spaces = (0, 1)
    first_word = 2
    for idx in leading_spaces:
        assert doc[idx].is_space
    assert doc[first_word].orth_ == 'This'
    # No transition is applied by hand; the step-through context is only
    # entered and left again.
    with EN.parser.step_through(doc) as state:
        pass
    for idx in leading_spaces:
        assert doc[idx].head.i == first_word
    assert state.stack == {first_word}


@pytest.mark.models
def test_space_attachment_intermediate_and_trailing_space(EN):
    """Check the heads of space tokens inside and at the end of the input."""
    doc = make_doc(EN, 'This is \t a \t\n \n sentence . \n\n \n')
    for space_idx in (2, 4, 5, 8, 9):
        assert doc[space_idx].is_space
    # The parser is driven by hand; transitions are applied in this order.
    moves = ('L-nsubj', 'S', 'L-det', 'R-attr', 'D', 'R-punct')
    with EN.parser.step_through(doc) as state:
        for move in moves:
            state.transition(move)
    assert state.stack == set()
    # Only space tokens may have a dep value of 0.
    for token in doc:
        assert token.is_space or token.dep != 0
    expected_heads = [1, 1, 1, 6, 3, 3, 1, 1, 7, 7]
    assert [token.head.i for token in doc] == expected_heads


@pytest.mark.models
def test_space_attachment_one_space_sentence(EN):
    """A doc holding a single space token has that token as its own head."""
    doc = make_doc(EN, '\n')
    assert len(doc) == 1
    # Enter and leave the step-through context without applying transitions.
    with EN.parser.step_through(doc):
        pass
    only_token = doc[0]
    assert only_token.is_space
    assert only_token.head.i == 0


@pytest.mark.models
def test_space_attachment_only_space_sentence(EN):
    """In a doc made only of space tokens, every head is the last token."""
    doc = make_doc(EN, '\n \t \n\n \t')
    assert len(doc) == 4
    assert all(token.is_space for token in doc)
    # Enter and leave the step-through context without applying transitions.
    with EN.parser.step_through(doc):
        pass
    # Index 3 is the last of the four tokens; it heads all of them,
    # including itself.
    assert all(token.head.i == 3 for token in doc)
* Add test for how spaces are attached by the parser. 2015-10-10 08:03:13 +03:00			`from __future__ import unicode_literals`

			`import pytest`
* Add test for Issue #184: Whitespace at sentence boundary causes sentence boundary error. 2016-01-19 01:04:38 +03:00			`import numpy`
			`from spacy.attrs import HEAD`

Different handling of space tokens: space tokens are now always attached to the previous non-space token. There are two exceptions: leading space tokens are attached to the first following non-space token; in input that consists exclusively of space tokens, the last space token is the head of all others. 2016-04-13 16:28:28 +03:00			`def make_doc(EN, sentstr):`
			`sent = sentstr.split(' ')`
			`doc = EN.tokenizer.tokens_from_list(sent)`
			`EN.tagger(doc)`
			`return doc`

* Add test for how spaces are attached by the parser. 2015-10-10 08:03:13 +03:00
			`@pytest.mark.models`
			`def test_space_attachment(EN):`
			`sentence = 'This is a test.\nTo ensure spaces are attached well.'`
			`doc = EN(sentence)`

* Fix test_space_attachment 2015-10-14 19:20:51 +03:00			`for sent in doc.sents:`
* Fix test_space_attachment 2015-10-14 19:24:57 +03:00			`if len(sent) == 1:`
			`assert not sent[-1].is_space`
* Add test for Issue #184: Whitespace at sentence boundary causes sentence boundary error. 2016-01-19 01:04:38 +03:00

mark test_sentence_space() as model test 2016-02-10 09:49:11 +03:00			`@pytest.mark.models`
* Add test for Issue #184: Whitespace at sentence boundary causes sentence boundary error. 2016-01-19 01:04:38 +03:00			`def test_sentence_space(EN):`
			`text = ('''I look forward to using Thingamajig. I've been told it will '''`
			`'''make my life easier...''')`
			`doc = EN(text)`
			`assert len(list(doc.sents)) == 2`

Different handling of space tokens: space tokens are now always attached to the previous non-space token. There are two exceptions: leading space tokens are attached to the first following non-space token; in input that consists exclusively of space tokens, the last space token is the head of all others. 2016-04-13 16:28:28 +03:00
			`@pytest.mark.models`
			`def test_space_attachment_leading_space(EN):`
			`# leading space token`
			`doc = make_doc(EN, '\t \n This is a sentence .')`
			`assert doc[0].is_space`
			`assert doc[1].is_space`
			`assert doc[2].orth_ == 'This'`
			`with EN.parser.step_through(doc) as stepwise:`
			`pass`
			`assert doc[0].head.i == 2`
			`assert doc[1].head.i == 2`
			`assert stepwise.stack == set([2])`


			`@pytest.mark.models`
			`def test_space_attachment_intermediate_and_trailing_space(EN):`
			`# intermediate and trailing space tokens`
			`doc = make_doc(EN, 'This is \t a \t\n \n sentence . \n\n \n')`
			`assert doc[2].is_space`
			`assert doc[4].is_space`
			`assert doc[5].is_space`
			`assert doc[8].is_space`
			`assert doc[9].is_space`
			`with EN.parser.step_through(doc) as stepwise:`
			`stepwise.transition('L-nsubj')`
			`stepwise.transition('S')`
			`stepwise.transition('L-det')`
			`stepwise.transition('R-attr')`
			`stepwise.transition('D')`
			`stepwise.transition('R-punct')`
			`assert stepwise.stack == set([])`
			`for tok in doc:`
			`assert tok.dep != 0 or tok.is_space`
			`assert [ tok.head.i for tok in doc ] == [1,1,1,6,3,3,1,1,7,7]`


			`@pytest.mark.models`
			`def test_space_attachment_one_space_sentence(EN):`
			`# one space token sentence`
			`doc = make_doc(EN, '\n')`
			`assert len(doc) == 1`
			`with EN.parser.step_through(doc) as _:`
			`pass`
			`assert doc[0].is_space`
			`assert doc[0].head.i == 0`


			`@pytest.mark.models`
			`def test_space_attachment_only_space_sentence(EN):`
			`# space-exclusive sentence`
			`doc = make_doc(EN, '\n \t \n\n \t')`
			`assert len(doc) == 4`
			`for tok in doc:`
			`assert tok.is_space`
			`with EN.parser.step_through(doc) as _:`
			`pass`
			`# all tokens are attached to the last one`
			`for tok in doc:`
			`assert tok.head.i == 3`