mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-25 17:36:30 +03:00
Fix handling of trailing whitespace
Fix off-by-one error that meant trailing spaces were being dropped. Closes #792
This commit is contained in:
parent
77f0594761
commit
0ac3d27689
|
@ -4,9 +4,15 @@ from __future__ import unicode_literals
|
|||
import pytest
|
||||
|
||||
|
||||
@pytest.mark.xfail
|
||||
@pytest.mark.parametrize('text', ["This is a string ", "This is a string\u0020"])
|
||||
def test_issue792(en_tokenizer, text):
|
||||
"""Test for Issue #792: Trailing whitespace is removed after parsing."""
|
||||
"""Test for Issue #792: Trailing whitespace is removed after tokenization."""
|
||||
doc = en_tokenizer(text)
|
||||
assert doc.text_with_ws == text
|
||||
assert ''.join([token.text_with_ws for token in doc]) == text
|
||||
|
||||
|
||||
# Control case for Issue #792: inputs that do not end in a space character
# (one with no trailing whitespace at all, one ending in a newline).
@pytest.mark.parametrize('text', ["This is a string", "This is a string\n"])
|
||||
def test_control_issue792(en_tokenizer, text):
|
||||
"""Test base case for Issue #792: Non-trailing whitespace"""
|
||||
doc = en_tokenizer(text)
|
||||
# Concatenating every token's text_with_ws must reproduce the input exactly.
assert ''.join([token.text_with_ws for token in doc]) == text
|
||||
|
|
|
@ -163,7 +163,6 @@ cdef class Tokenizer:
|
|||
start = i
|
||||
in_ws = not in_ws
|
||||
i += 1
|
||||
i += 1
|
||||
if start < i:
|
||||
span = string[start:]
|
||||
key = hash_string(span)
|
||||
|
|
Loading…
Reference in New Issue
Block a user