Merge branch 'develop' of https://github.com/explosion/spaCy into develop

This commit is contained in:
Matthew Honnibal 2017-03-09 18:45:43 -06:00
commit b5247c49eb
5 changed files with 28 additions and 6 deletions

View File

@ -10,3 +10,4 @@ six
ujson>=1.35
cloudpickle
sputnik>=0.9.2,<0.10.0
dill>=0.2,<0.3

View File

@ -241,7 +241,8 @@ def setup_package():
'cloudpickle',
'pathlib',
'sputnik>=0.9.2,<0.10.0',
'ujson>=1.35'],
'ujson>=1.35',
'dill>=0.2,<0.3'],
classifiers=[
'Development Status :: 5 - Production/Stable',
'Environment :: Console',

View File

@ -4,9 +4,15 @@ from __future__ import unicode_literals
import pytest
@pytest.mark.xfail
@pytest.mark.parametrize('text', ["This is a string ", "This is a string\u0020"])
def test_issue792(en_tokenizer, text):
    """Test for Issue #792: Trailing whitespace is removed after tokenization."""
    # Removed the stale duplicate docstring ("...after parsing.") that was
    # left alongside the updated one; only the current wording remains.
    doc = en_tokenizer(text)
    # Both the Doc-level text and the per-token reconstruction must
    # round-trip the input exactly, including the trailing space.
    assert doc.text_with_ws == text
    assert ''.join([token.text_with_ws for token in doc]) == text
@pytest.mark.parametrize('text', ["This is a string", "This is a string\n"])
def test_control_issue792(en_tokenizer, text):
    """Test base case for Issue #792: Non-trailing whitespace"""
    # Concatenating each token's text_with_ws should reproduce the
    # original input exactly when there is no trailing space.
    tokens = en_tokenizer(text)
    reconstructed = ''.join(tok.text_with_ws for tok in tokens)
    assert reconstructed == text

View File

@ -0,0 +1,12 @@
# encoding: utf8
from __future__ import unicode_literals
import pytest
@pytest.mark.parametrize('text', ["aaabbb@ccc.com\nThank you!",
                                  "aaabbb@ccc.com \nThank you!"])
def test_issue859(en_tokenizer, text):
    """Test that no extra space is added in doc.text method."""
    # Tokenize, then check the Doc reproduces the raw input verbatim —
    # no whitespace should be inserted around the newline.
    processed = en_tokenizer(text)
    assert processed.text == text

View File

@ -163,7 +163,6 @@ cdef class Tokenizer:
start = i
in_ws = not in_ws
i += 1
i += 1
if start < i:
span = string[start:]
key = hash_string(span)
@ -275,7 +274,10 @@ cdef class Tokenizer:
if cache_hit:
pass
elif self.token_match and self.token_match(string):
tokens.push_back(self.vocab.get(tokens.mem, string), not suffixes.size())
# We're always saying 'no' to spaces here -- the caller will
# fix up the outermost one, with reference to the original.
# See Issue #859
tokens.push_back(self.vocab.get(tokens.mem, string), False)
else:
matches = self.find_infix(string)
if not matches: