diff --git a/spacy/tests/regression/test_issue351.py b/spacy/tests/regression/test_issue351.py
new file mode 100644
index 000000000..84d4398c5
--- /dev/null
+++ b/spacy/tests/regression/test_issue351.py
@@ -0,0 +1,16 @@
+from __future__ import unicode_literals
+from ...en import English
+
+import pytest
+
+
+@pytest.fixture
+def en_tokenizer():
+    return English.Defaults.create_tokenizer()
+
+
+def test_issue351(en_tokenizer):
+    doc = en_tokenizer("   This is a cat.")
+    assert doc[0].idx == 0
+    assert len(doc[0]) == 3
+    assert doc[1].idx == 3
diff --git a/spacy/tests/tokenizer/test_whitespace.py b/spacy/tests/tokenizer/test_whitespace.py
index 906ad310c..8ba138b0c 100644
--- a/spacy/tests/tokenizer/test_whitespace.py
+++ b/spacy/tests/tokenizer/test_whitespace.py
@@ -42,35 +42,3 @@ def test_tokenizer_splits_newline_double_space(en_tokenizer, text):
 def test_tokenizer_splits_newline_space_wrap(en_tokenizer, text):
     tokens = en_tokenizer(text)
     assert len(tokens) == 3
-
-
-def test_leading_space_offsets(en_tokenizer):
-    '''Issue #351
-    # this works
-
-    text1 = u"This is a cat."
-    a = english_spacy(text1)
-
-    tok0 = list(a.sents)[0][0]
-    print tok0, tok0.idx, text1[tok0.idx]
-
-    tok1 = list(a.sents)[0][1]
-    print tok1, tok1.idx, text1[tok1.idx]
-
-    print "=="
-
-    # this does not work
-
-    text2 = u"   This is a cat."
-    b = english_spacy(text2)
-
-    tok0 = list(b.sents)[0][0]
-    print tok0, tok0.idx, text2[tok0.idx]
-
-    tok1 = list(b.sents)[0][1]
-    print tok1, tok1.idx, text2[tok1.idx]
-    '''
-    doc = en_tokenizer(u"   This is a cat.")
-    assert doc[0].idx == 0
-    assert len(doc[0]) == 3
-    assert doc[1].idx == 3
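
The invariant the new regression test pins down is that a run of leading whitespace becomes a single token at character offset 0, so `Token.idx` for every later token still indexes into the raw input string. A minimal sketch of that round-trip follows, assuming the spaCy 1.x-era API the test itself uses (`spacy.en.English`, `English.Defaults.create_tokenizer()`, and the `Token.text` / `Token.idx` attributes):

    from __future__ import unicode_literals
    from spacy.en import English

    # Sketch only: same tokenizer construction as in the fixture above.
    tokenizer = English.Defaults.create_tokenizer()

    text = "   This is a cat."
    doc = tokenizer(text)

    # doc[0] is the three-space whitespace token at offset 0, so "This"
    # starts at offset 3. Each token's idx should slice back out of the
    # original string exactly.
    for token in doc:
        assert text[token.idx:token.idx + len(token)] == token.text

This offset round-trip is the behavior the report quoted in the removed docstring found broken ("this does not work") when the text started with spaces, which is why the test asserts both the whitespace token's length and the following token's idx.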