Merge branch 'develop' of https://github.com/explosion/spaCy into develop

This commit is contained in:
Matthew Honnibal 2017-03-09 18:45:43 -06:00
commit b5247c49eb
5 changed files with 28 additions and 6 deletions

View File

@ -10,3 +10,4 @@ six
ujson>=1.35
cloudpickle
sputnik>=0.9.2,<0.10.0
dill>=0.2,<0.3

View File

@ -241,7 +241,8 @@ def setup_package():
'cloudpickle',
'pathlib',
'sputnik>=0.9.2,<0.10.0',
'ujson>=1.35'],
'ujson>=1.35',
'dill>=0.2,<0.3'],
classifiers=[
'Development Status :: 5 - Production/Stable',
'Environment :: Console',

View File

@ -4,9 +4,15 @@ from __future__ import unicode_literals
import pytest
@pytest.mark.xfail
@pytest.mark.parametrize('text', ["This is a string ", "This is a string\u0020"])
def test_issue792(en_tokenizer, text):
    """Test for Issue #792: Trailing whitespace is removed after tokenization."""
    # Removed the stale duplicate docstring ("...after parsing.") that was
    # left alongside the updated one; only the current wording remains.
    doc = en_tokenizer(text)
    # Both the Doc-level text and the per-token reconstruction must
    # round-trip the input exactly, including the trailing space.
    assert doc.text_with_ws == text
    assert ''.join([token.text_with_ws for token in doc]) == text
@pytest.mark.parametrize('text', ["This is a string", "This is a string\n"])
def test_control_issue792(en_tokenizer, text):
    """Test base case for Issue #792: Non-trailing whitespace"""
    # Concatenating each token's text_with_ws should reproduce the
    # original input exactly when there is no trailing space.
    tokens = en_tokenizer(text)
    reconstructed = ''.join(tok.text_with_ws for tok in tokens)
    assert reconstructed == text

View File

@ -0,0 +1,12 @@
# encoding: utf8
from __future__ import unicode_literals
import pytest
@pytest.mark.parametrize('text', ["aaabbb@ccc.com\nThank you!",
                                  "aaabbb@ccc.com \nThank you!"])
def test_issue859(en_tokenizer, text):
    """Test that no extra space is added in doc.text method."""
    # Tokenize, then check the Doc reproduces the raw input verbatim —
    # no whitespace should be inserted around the newline.
    processed = en_tokenizer(text)
    assert processed.text == text

View File

@ -163,7 +163,6 @@ cdef class Tokenizer:
start = i
in_ws = not in_ws
i += 1
i += 1
if start < i:
span = string[start:]
key = hash_string(span)
@ -275,7 +274,10 @@ cdef class Tokenizer:
if cache_hit:
pass
elif self.token_match and self.token_match(string):
tokens.push_back(self.vocab.get(tokens.mem, string), not suffixes.size())
# We're always saying 'no' to spaces here -- the caller will
# fix up the outermost one, with reference to the original.
# See Issue #859
tokens.push_back(self.vocab.get(tokens.mem, string), False)
else:
matches = self.find_infix(string)
if not matches: