diff --git a/requirements.txt b/requirements.txt
index 538862aed..4a75f6be3 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -10,3 +10,4 @@ six
 ujson>=1.35
 cloudpickle
 sputnik>=0.9.2,<0.10.0
+dill>=0.2,<0.3
diff --git a/setup.py b/setup.py
index fc316e72f..49ea639e2 100644
--- a/setup.py
+++ b/setup.py
@@ -241,7 +241,8 @@ def setup_package():
             'cloudpickle',
             'pathlib',
             'sputnik>=0.9.2,<0.10.0',
-            'ujson>=1.35'],
+            'ujson>=1.35',
+            'dill>=0.2,<0.3'],
         classifiers=[
             'Development Status :: 5 - Production/Stable',
             'Environment :: Console',
diff --git a/spacy/tests/regression/test_issue792.py b/spacy/tests/regression/test_issue792.py
index 563e061a6..df8b5ef50 100644
--- a/spacy/tests/regression/test_issue792.py
+++ b/spacy/tests/regression/test_issue792.py
@@ -4,9 +4,15 @@ from __future__ import unicode_literals
 import pytest
 
 
-@pytest.mark.xfail
 @pytest.mark.parametrize('text', ["This is a string ", "This is a string\u0020"])
 def test_issue792(en_tokenizer, text):
-    """Test for Issue #792: Trailing whitespace is removed after parsing."""
+    """Test for Issue #792: Trailing whitespace is removed after tokenization."""
     doc = en_tokenizer(text)
-    assert doc.text_with_ws == text
+    assert ''.join([token.text_with_ws for token in doc]) == text
+
+
+@pytest.mark.parametrize('text', ["This is a string", "This is a string\n"])
+def test_control_issue792(en_tokenizer, text):
+    """Test base case for Issue #792: Non-trailing whitespace"""
+    doc = en_tokenizer(text)
+    assert ''.join([token.text_with_ws for token in doc]) == text
diff --git a/spacy/tests/regression/test_issue859.py b/spacy/tests/regression/test_issue859.py
new file mode 100644
index 000000000..4a2d08df7
--- /dev/null
+++ b/spacy/tests/regression/test_issue859.py
@@ -0,0 +1,12 @@
+# encoding: utf8
+from __future__ import unicode_literals
+
+import pytest
+
+
+@pytest.mark.parametrize('text', ["aaabbb@ccc.com\nThank you!",
+                                  "aaabbb@ccc.com \nThank you!"])
+def test_issue859(en_tokenizer, text):
+    """Test that no extra space is added in doc.text method."""
+    doc = en_tokenizer(text)
+    assert doc.text == text
diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx
index 8f2f111e7..5a4eb844a 100644
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@@ -163,7 +163,6 @@ cdef class Tokenizer:
                     start = i
                 in_ws = not in_ws
             i += 1
-            i += 1
         if start < i:
             span = string[start:]
             key = hash_string(span)
@@ -275,7 +274,10 @@ cdef class Tokenizer:
         if cache_hit:
             pass
         elif self.token_match and self.token_match(string):
-            tokens.push_back(self.vocab.get(tokens.mem, string), not suffixes.size())
+            # We're always saying 'no' to spaces here -- the caller will
+            # fix up the outermost one, with reference to the original.
+            # See Issue #859
+            tokens.push_back(self.vocab.get(tokens.mem, string), False)
         else:
            matches = self.find_infix(string)
            if not matches:
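
A minimal sketch of the round-trip invariant these tests pin down. This assumes a spaCy 1.x install with an English model available; `spacy.load('en')` is used here for illustration, whereas the regression tests above rely on the lighter `en_tokenizer` fixture:

    import spacy

    nlp = spacy.load('en')  # assumption: English model installed
    text = "aaabbb@ccc.com \nThank you!"  # trailing space before newline, as in #859
    doc = nlp(text)
    # Doc.text concatenates each token's text_with_ws, so with this patch
    # tokenization round-trips the input exactly, whitespace included.
    assert ''.join(token.text_with_ws for token in doc) == text
    assert doc.text == text

Before the patch, the doubled `i += 1` and the unconditional `not suffixes.size()` space flag could insert a spurious space into `doc.text` for inputs matching `token_match` (e.g. the email address above), breaking the round-trip.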