# coding: utf-8
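# Regression tests for Issue #792: the tokenizer should preserve trailing
# whitespace, so that joining Token.text_with_ws over the Doc reproduces the
# original input text. Both tests rely on the en_tokenizer pytest fixture.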
from __future__ import unicode_literals
import pytest


@pytest.mark.parametrize('text', ["This is a string ", "This is a string\u0020"])
def test_issue792(en_tokenizer, text):
    """Test for Issue #792: trailing whitespace must be preserved after
    tokenization (the reported bug stripped it)."""
    doc = en_tokenizer(text)
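    # Token.text_with_ws is the token's text plus any trailing whitespace, so
    # joining it across the Doc should reproduce the input string exactly.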
    assert ''.join([token.text_with_ws for token in doc]) == text


@pytest.mark.parametrize('text', ["This is a string", "This is a string\n"])
def test_control_issue792(en_tokenizer, text):
    """Control cases for Issue #792: text without trailing whitespace and text
    ending in a newline."""
    doc = en_tokenizer(text)
    assert ''.join([token.text_with_ws for token in doc]) == text