Commit b5247c49eb
Merge branch 'develop' of https://github.com/explosion/spaCy into develop
requirements.txt

@@ -10,3 +10,4 @@ six
 ujson>=1.35
 cloudpickle
 sputnik>=0.9.2,<0.10.0
+dill>=0.2,<0.3
setup.py (3 changed lines)

@@ -241,7 +241,8 @@ def setup_package():
                 'cloudpickle',
                 'pathlib',
                 'sputnik>=0.9.2,<0.10.0',
-                'ujson>=1.35'],
+                'ujson>=1.35',
+                'dill>=0.2,<0.3'],
             classifiers=[
                 'Development Status :: 5 - Production/Stable',
                 'Environment :: Console',
spacy/tests/regression/test_issue792.py

@@ -4,9 +4,15 @@ from __future__ import unicode_literals
 import pytest
 
 
-@pytest.mark.xfail
 @pytest.mark.parametrize('text', ["This is a string ", "This is a string\u0020"])
 def test_issue792(en_tokenizer, text):
-    """Test for Issue #792: Trailing whitespace is removed after parsing."""
+    """Test for Issue #792: Trailing whitespace is removed after tokenization."""
     doc = en_tokenizer(text)
-    assert doc.text_with_ws == text
+    assert ''.join([token.text_with_ws for token in doc]) == text
+
+
+@pytest.mark.parametrize('text', ["This is a string", "This is a string\n"])
+def test_control_issue792(en_tokenizer, text):
+    """Test base case for Issue #792: Non-trailing whitespace"""
+    doc = en_tokenizer(text)
+    assert ''.join([token.text_with_ws for token in doc]) == text
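The new assertion encodes the whitespace contract the fix restores: concatenating each token's text_with_ws must reproduce the input exactly, trailing whitespace included. The snippet below is a minimal pure-Python sketch of that contract with a toy tokenizer; toy_tokenize is an invented name for illustration and is not spaCy's implementation.

import re

def toy_tokenize(text):
    """Toy tokenizer: a single ' ' after a word is folded into that word's
    text_with_ws; any other whitespace run becomes its own token. By
    construction the pieces concatenate back to the original input."""
    tokens = []
    for match in re.finditer(r'\S+ ?|\s', text):
        piece = match.group()
        word = piece.rstrip(' ') if piece.strip() else piece
        tokens.append((word, piece))          # (token text, text_with_ws)
    return tokens

for text in ["This is a string ", "This is a string\u0020", "This is a string\n"]:
    pieces = toy_tokenize(text)
    # Same round-trip property that test_issue792 asserts for spaCy tokens.
    assert ''.join(with_ws for _, with_ws in pieces) == text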
spacy/tests/regression/test_issue859.py (new file, 12 lines)

@@ -0,0 +1,12 @@
+# encoding: utf8
+from __future__ import unicode_literals
+
+import pytest
+
+
+@pytest.mark.parametrize('text', ["aaabbb@ccc.com\nThank you!",
+                                  "aaabbb@ccc.com \nThank you!"])
+def test_issue859(en_tokenizer, text):
+    """Test that no extra space is added in doc.text method."""
+    doc = en_tokenizer(text)
+    assert doc.text == text
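To reproduce this regression check outside the test suite, a rough equivalent is sketched below. It assumes a spaCy 1.x installation with the English model data available for spacy.load('en'); the en_tokenizer fixture in the test provides the same tokenizer without loading a full pipeline, so treat this as a sketch rather than a guaranteed recipe.

import spacy

# Assumes the English model data for spaCy 1.x is already installed.
nlp = spacy.load('en')

for text in ["aaabbb@ccc.com\nThank you!", "aaabbb@ccc.com \nThank you!"]:
    doc = nlp.tokenizer(text)      # tokenization only; no tagger/parser needed
    assert doc.text == text        # doc.text must not gain or lose a space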
spacy/tokenizer.pyx

@@ -163,7 +163,6 @@ cdef class Tokenizer:
                     start = i
                 in_ws = not in_ws
-                i += 1
             i += 1
         if start < i:
             span = string[start:]
             key = hash_string(span)
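The loop patched here walks the string once and closes off a span each time it crosses a boundary between whitespace and non-whitespace; the index must advance exactly once per character, so the duplicated i += 1 made the scan jump ahead whenever the state flipped. Below is a minimal pure-Python sketch of that scan (scan_spans is an invented name), not the actual Cython implementation.

def scan_spans(string):
    """Split a string into alternating runs of non-space and whitespace."""
    spans = []
    in_ws = False
    start = 0
    i = 0
    for uc in string:
        if uc.isspace() != in_ws:
            if start < i:
                spans.append(string[start:i])
            start = i
            in_ws = not in_ws
        i += 1          # advance exactly once per character
    if start < i:
        spans.append(string[start:])
    return spans

print(scan_spans("Hello  world\n"))   # ['Hello', '  ', 'world', '\n']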
@@ -275,7 +274,10 @@ cdef class Tokenizer:
             if cache_hit:
                 pass
             elif self.token_match and self.token_match(string):
-                tokens.push_back(self.vocab.get(tokens.mem, string), not suffixes.size())
+                # We're always saying 'no' to spaces here -- the caller will
+                # fix up the outermost one, with reference to the original.
+                # See Issue #859
+                tokens.push_back(self.vocab.get(tokens.mem, string), False)
             else:
                 matches = self.find_infix(string)
                 if not matches:
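The added comment spells out the contract for token_match hits: every sub-token is pushed with its trailing-space flag forced to False, and the caller, which still holds the original string, repairs the flag on the outermost token. A small hypothetical sketch of that division of labour follows; attach_tokens and tokenize_with_fixup are invented names, not spaCy internals.

def attach_tokens(pieces):
    """Push every matched sub-token with the trailing-space flag set to False."""
    return [(piece, False) for piece in pieces]

def tokenize_with_fixup(original, pieces):
    """Only the caller sees the original string, so only it can decide
    whether the outermost token was really followed by a space."""
    tokens = attach_tokens(pieces)
    if tokens and original.endswith(' '):
        text, _ = tokens[-1]
        tokens[-1] = (text, True)      # repair the outermost trailing-space flag
    return tokens

assert tokenize_with_fixup("aaabbb@ccc.com ", ["aaabbb@ccc.com"]) == \
    [("aaabbb@ccc.com", True)]
assert tokenize_with_fixup("aaabbb@ccc.com", ["aaabbb@ccc.com"]) == \
    [("aaabbb@ccc.com", False)]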