Modernize tokenizer tests for whitespace

Ines Montani 2017-01-04 00:46:35 +01:00
parent aafc894285
commit 667051375d


@@ -1,37 +1,46 @@
 """Test that tokens are created correctly for whitespace."""
 
 from __future__ import unicode_literals
 
 import pytest
 
 
-def test_single_space(en_tokenizer):
-    tokens = en_tokenizer('hello possums')
+@pytest.mark.parametrize('text', ["hello possums"])
+def test_tokenizer_splits_single_space(en_tokenizer, text):
+    tokens = en_tokenizer(text)
     assert len(tokens) == 2
 
 
-def test_double_space(en_tokenizer):
-    tokens = en_tokenizer('hello  possums')
+@pytest.mark.parametrize('text', ["hello  possums"])
+def test_tokenizer_splits_double_space(en_tokenizer, text):
+    tokens = en_tokenizer(text)
     assert len(tokens) == 3
-    assert tokens[1].orth_ == ' '
+    assert tokens[1].text == " "
 
 
-def test_newline(en_tokenizer):
-    tokens = en_tokenizer('hello\npossums')
+@pytest.mark.parametrize('text', ["hello\npossums"])
+def test_tokenizer_splits_newline(en_tokenizer, text):
+    tokens = en_tokenizer(text)
     assert len(tokens) == 3
+    assert tokens[1].text == "\n"
 
 
-def test_newline_space(en_tokenizer):
-    tokens = en_tokenizer('hello \npossums')
+@pytest.mark.parametrize('text', ["hello \npossums"])
+def test_tokenizer_splits_newline_space(en_tokenizer, text):
+    tokens = en_tokenizer(text)
     assert len(tokens) == 3
 
 
-def test_newline_double_space(en_tokenizer):
-    tokens = en_tokenizer('hello  \npossums')
+@pytest.mark.parametrize('text', ["hello  \npossums"])
+def test_tokenizer_splits_newline_double_space(en_tokenizer, text):
+    tokens = en_tokenizer(text)
     assert len(tokens) == 3
 
 
-def test_newline_space_wrap(en_tokenizer):
-    tokens = en_tokenizer('hello \n possums')
+@pytest.mark.parametrize('text', ["hello \n possums"])
+def test_tokenizer_splits_newline_space_wrap(en_tokenizer, text):
+    tokens = en_tokenizer(text)
     assert len(tokens) == 3
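
These parametrized tests rely on an en_tokenizer fixture defined elsewhere in the test suite (typically a conftest.py). As a minimal, hypothetical sketch only, assuming a modern spaCy install rather than this project's actual test setup, a comparable fixture could be built from a blank English pipeline:

# conftest.py -- hypothetical sketch of an en_tokenizer fixture; the real
# spaCy test suite ships its own. Assumes spacy.blank("en") is available
# and returns a Language object whose .tokenizer splits raw strings.
import pytest
import spacy


@pytest.fixture
def en_tokenizer():
    # Return just the tokenizer so tests can call it directly on text.
    return spacy.blank("en").tokenizer

With @pytest.mark.parametrize, each input string is collected as its own test case, so further whitespace edge cases can be covered by extending the parameter lists instead of adding new test functions.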