2015-10-10 08:03:13 +03:00
|
|
|
from __future__ import unicode_literals
|
|
|
|
|
|
|
|
import pytest
|
2016-01-19 01:04:38 +03:00
|
|
|
import numpy
|
|
|
|
from spacy.attrs import HEAD
|
|
|
|
|
2016-04-13 16:28:28 +03:00
|
|
|
def make_doc(EN, sentstr):
|
|
|
|
sent = sentstr.split(' ')
|
|
|
|
doc = EN.tokenizer.tokens_from_list(sent)
|
|
|
|
EN.tagger(doc)
|
|
|
|
return doc
|
|
|
|
|
2015-10-10 08:03:13 +03:00
|
|
|
|
|
|
|
@pytest.mark.models
|
|
|
|
def test_space_attachment(EN):
|
|
|
|
sentence = 'This is a test.\nTo ensure spaces are attached well.'
|
|
|
|
doc = EN(sentence)
|
|
|
|
|
2015-10-14 19:20:51 +03:00
|
|
|
for sent in doc.sents:
|
2015-10-14 19:24:57 +03:00
|
|
|
if len(sent) == 1:
|
|
|
|
assert not sent[-1].is_space
|
2016-01-19 01:04:38 +03:00
|
|
|
|
|
|
|
|
2016-02-10 09:49:11 +03:00
|
|
|
@pytest.mark.models
|
2016-01-19 01:04:38 +03:00
|
|
|
def test_sentence_space(EN):
|
|
|
|
text = ('''I look forward to using Thingamajig. I've been told it will '''
|
|
|
|
'''make my life easier...''')
|
|
|
|
doc = EN(text)
|
|
|
|
assert len(list(doc.sents)) == 2
|
|
|
|
|
2016-04-13 16:28:28 +03:00
|
|
|
|
|
|
|
@pytest.mark.models
|
|
|
|
def test_space_attachment_leading_space(EN):
|
|
|
|
# leading space token
|
|
|
|
doc = make_doc(EN, '\t \n This is a sentence .')
|
|
|
|
assert doc[0].is_space
|
|
|
|
assert doc[1].is_space
|
|
|
|
assert doc[2].orth_ == 'This'
|
|
|
|
with EN.parser.step_through(doc) as stepwise:
|
|
|
|
pass
|
|
|
|
assert doc[0].head.i == 2
|
|
|
|
assert doc[1].head.i == 2
|
|
|
|
assert stepwise.stack == set([2])
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.models
|
|
|
|
def test_space_attachment_intermediate_and_trailing_space(EN):
|
|
|
|
# intermediate and trailing space tokens
|
|
|
|
doc = make_doc(EN, 'This is \t a \t\n \n sentence . \n\n \n')
|
|
|
|
assert doc[2].is_space
|
|
|
|
assert doc[4].is_space
|
|
|
|
assert doc[5].is_space
|
|
|
|
assert doc[8].is_space
|
|
|
|
assert doc[9].is_space
|
|
|
|
with EN.parser.step_through(doc) as stepwise:
|
|
|
|
stepwise.transition('L-nsubj')
|
|
|
|
stepwise.transition('S')
|
|
|
|
stepwise.transition('L-det')
|
|
|
|
stepwise.transition('R-attr')
|
|
|
|
stepwise.transition('D')
|
|
|
|
stepwise.transition('R-punct')
|
|
|
|
assert stepwise.stack == set([])
|
|
|
|
for tok in doc:
|
|
|
|
assert tok.dep != 0 or tok.is_space
|
|
|
|
assert [ tok.head.i for tok in doc ] == [1,1,1,6,3,3,1,1,7,7]
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.models
|
|
|
|
def test_space_attachment_one_space_sentence(EN):
|
|
|
|
# one space token sentence
|
|
|
|
doc = make_doc(EN, '\n')
|
|
|
|
assert len(doc) == 1
|
|
|
|
with EN.parser.step_through(doc) as _:
|
|
|
|
pass
|
|
|
|
assert doc[0].is_space
|
|
|
|
assert doc[0].head.i == 0
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.models
|
|
|
|
def test_space_attachment_only_space_sentence(EN):
|
|
|
|
# space-exclusive sentence
|
|
|
|
doc = make_doc(EN, '\n \t \n\n \t')
|
|
|
|
assert len(doc) == 4
|
|
|
|
for tok in doc:
|
|
|
|
assert tok.is_space
|
|
|
|
with EN.parser.step_through(doc) as _:
|
|
|
|
pass
|
|
|
|
# all tokens are attached to the last one
|
|
|
|
for tok in doc:
|
|
|
|
assert tok.head.i == 3
|