mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-25 17:36:30 +03:00
* Add test for Issue #351: Indices off when leading whitespace
This commit is contained in:
parent
76021cb853
commit
b4bfc6ae55
|
@ -33,3 +33,35 @@ def test_newline_double_space(en_tokenizer):
|
|||
def test_newline_space_wrap(en_tokenizer):
    '''A newline wrapped in single spaces between two words gives 3 tokens.'''
    doc = en_tokenizer('hello \n possums')
    token_count = len(doc)
    assert token_count == 3
|
||||
|
||||
|
||||
def test_leading_space_offsets(en_tokenizer):
    '''Regression test for Issue #351: indices off when leading whitespace.

    Reproduction from the issue report:

        # this works
        text1 = u"This is a cat."
        a = english_spacy(text1)

        tok0 = list(a.sents)[0][0]
        print tok0, tok0.idx, text1[tok0.idx]

        tok1 = list(a.sents)[0][1]
        print tok1, tok1.idx, text1[tok1.idx]

        print "=="

        # this does not work
        text2 = u"   This is a cat."
        b = english_spacy(text2)

        tok0 = list(b.sents)[0][0]
        print tok0, tok0.idx, text2[tok0.idx]

        tok1 = list(b.sents)[0][1]
        print tok1, tok1.idx, text2[tok1.idx]
    '''
    # The leading whitespace run must be exactly three characters long so
    # that the input agrees with the length and offset assertions below.
    doc = en_tokenizer(u"   This is a cat.")
    # The first token is expected to start at offset 0 and span the three
    # leading whitespace characters.
    assert doc[0].idx == 0
    assert len(doc[0]) == 3
    # The following token is therefore expected to start at offset 3.
    assert doc[1].idx == 3
|
||||
|
|
Loading…
Reference in New Issue
Block a user