mirror of
https://github.com/explosion/spaCy.git
synced 2024-11-14 05:37:03 +03:00
d99a9cbce9
Space tokens are now always attached to the previous non-space token. There are two exceptions: leading space tokens are attached to the first following non-space token; and in input that consists exclusively of space tokens, the last space token is the head of all others.
91 lines
2.3 KiB
Python
91 lines
2.3 KiB
Python
from __future__ import unicode_literals
|
|
|
|
import pytest
|
|
import numpy
|
|
from spacy.attrs import HEAD
|
|
|
|
def make_doc(EN, sentstr):
    """Build a tagged doc from a string whose tokens are separated by spaces.

    The input is cut at every single space character, so pieces made of
    other whitespace (tabs, newlines) remain as separate list items. The
    resulting list is passed to ``EN.tokenizer.tokens_from_list`` and the
    doc it returns is run through ``EN.tagger`` before being handed back.
    """
    words = sentstr.split(' ')
    tagged = EN.tokenizer.tokens_from_list(words)
    EN.tagger(tagged)
    return tagged
|
|
|
|
|
|
@pytest.mark.models
def test_space_attachment(EN):
    """A one-token sentence must not end in (i.e. consist of) a space token."""
    text = 'This is a test.\nTo ensure spaces are attached well.'
    parsed = EN(text)

    for span in parsed.sents:
        # Only single-token sentences are of interest here.
        if len(span) != 1:
            continue
        assert not span[-1].is_space
|
|
|
|
|
|
@pytest.mark.models
def test_sentence_space(EN):
    """The two-sentence sample text must be segmented into exactly two sentences."""
    # NOTE(review): the test name suggests the whitespace between the two
    # sentences is what is being exercised; if this file was recovered from a
    # rendered page, repeated spaces may have been collapsed -- verify the
    # literal against the upstream repository.
    text = ("I look forward to using Thingamajig. I've been told it will "
            "make my life easier...")
    parsed = EN(text)
    sentences = list(parsed.sents)
    assert len(sentences) == 2
|
|
|
|
|
|
@pytest.mark.models
def test_space_attachment_leading_space(EN):
    """Leading space tokens are headed by the first non-space token (index 2)."""
    tokens = make_doc(EN, '\t \n This is a sentence .')
    leading = (0, 1)

    # The first two pieces are whitespace; the first word follows them.
    for index in leading:
        assert tokens[index].is_space
    assert tokens[2].orth_ == 'This'

    # Open and close the stepwise parser without issuing any transition.
    with EN.parser.step_through(tokens) as stepper:
        pass

    for index in leading:
        assert tokens[index].head.i == 2
    assert stepper.stack == {2}
|
|
|
|
|
|
@pytest.mark.models
def test_space_attachment_intermediate_and_trailing_space(EN):
    """Check the heads of space tokens in the middle and at the end of input."""
    tokens = make_doc(EN, 'This is \t a \t\n \n sentence . \n\n \n')

    # Positions of the whitespace-only pieces in the split input.
    for index in (2, 4, 5, 8, 9):
        assert tokens[index].is_space

    # Drive the parser by hand through a fixed sequence of transitions.
    moves = ('L-nsubj', 'S', 'L-det', 'R-attr', 'D', 'R-punct')
    with EN.parser.step_through(tokens) as stepper:
        for move in moves:
            stepper.transition(move)
    assert stepper.stack == set()

    # Only space tokens are allowed to have a zero dependency label.
    for token in tokens:
        assert not (token.dep == 0 and not token.is_space)

    expected_heads = [1, 1, 1, 6, 3, 3, 1, 1, 7, 7]
    assert [token.head.i for token in tokens] == expected_heads
|
|
|
|
|
|
@pytest.mark.models
def test_space_attachment_one_space_sentence(EN):
    """A doc made of one space token: that token is its own head."""
    tokens = make_doc(EN, '\n')
    assert len(tokens) == 1

    # Open and close the stepwise parser without issuing any transition.
    with EN.parser.step_through(tokens):
        pass

    assert tokens[0].is_space
    assert tokens[0].head.i == 0
|
|
|
|
|
|
@pytest.mark.models
def test_space_attachment_only_space_sentence(EN):
    """In input made only of space tokens, the last token heads all of them."""
    tokens = make_doc(EN, '\n \t \n\n \t')
    assert len(tokens) == 4
    assert all(token.is_space for token in tokens)

    # Open and close the stepwise parser without issuing any transition.
    with EN.parser.step_through(tokens):
        pass

    # Every token, including the last one itself, points at index 3.
    last_index = 3
    for token in tokens:
        assert token.head.i == last_index
|