mirror of
https://github.com/explosion/spaCy.git
synced 2025-02-04 21:50:35 +03:00
Merge branch 'develop' of https://github.com/explosion/spaCy into develop
This commit is contained in:
commit
b5247c49eb
|
@ -10,3 +10,4 @@ six
|
||||||
ujson>=1.35
|
ujson>=1.35
|
||||||
cloudpickle
|
cloudpickle
|
||||||
sputnik>=0.9.2,<0.10.0
|
sputnik>=0.9.2,<0.10.0
|
||||||
|
dill>=0.2,<0.3
|
||||||
|
|
3
setup.py
3
setup.py
|
@ -241,7 +241,8 @@ def setup_package():
|
||||||
'cloudpickle',
|
'cloudpickle',
|
||||||
'pathlib',
|
'pathlib',
|
||||||
'sputnik>=0.9.2,<0.10.0',
|
'sputnik>=0.9.2,<0.10.0',
|
||||||
'ujson>=1.35'],
|
'ujson>=1.35',
|
||||||
|
'dill>=0.2,<0.3'],
|
||||||
classifiers=[
|
classifiers=[
|
||||||
'Development Status :: 5 - Production/Stable',
|
'Development Status :: 5 - Production/Stable',
|
||||||
'Environment :: Console',
|
'Environment :: Console',
|
||||||
|
|
|
@ -4,9 +4,15 @@ from __future__ import unicode_literals
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.xfail
|
|
||||||
@pytest.mark.parametrize('text', ["This is a string ", "This is a string\u0020"])
|
@pytest.mark.parametrize('text', ["This is a string ", "This is a string\u0020"])
|
||||||
def test_issue792(en_tokenizer, text):
|
def test_issue792(en_tokenizer, text):
|
||||||
"""Test for Issue #792: Trailing whitespace is removed after parsing."""
|
"""Test for Issue #792: Trailing whitespace is removed after tokenization."""
|
||||||
doc = en_tokenizer(text)
|
doc = en_tokenizer(text)
|
||||||
assert doc.text_with_ws == text
|
assert ''.join([token.text_with_ws for token in doc]) == text
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize('text', ["This is a string", "This is a string\n"])
|
||||||
|
def test_control_issue792(en_tokenizer, text):
|
||||||
|
"""Test base case for Issue #792: Non-trailing whitespace"""
|
||||||
|
doc = en_tokenizer(text)
|
||||||
|
assert ''.join([token.text_with_ws for token in doc]) == text
|
||||||
|
|
12
spacy/tests/regression/test_issue859.py
Normal file
12
spacy/tests/regression/test_issue859.py
Normal file
|
@ -0,0 +1,12 @@
|
||||||
|
# encoding: utf8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize('text', ["aaabbb@ccc.com\nThank you!",
|
||||||
|
"aaabbb@ccc.com \nThank you!"])
|
||||||
|
def test_issue859(en_tokenizer, text):
|
||||||
|
"""Test that no extra space is added in doc.text method."""
|
||||||
|
doc = en_tokenizer(text)
|
||||||
|
assert doc.text == text
|
|
@ -163,7 +163,6 @@ cdef class Tokenizer:
|
||||||
start = i
|
start = i
|
||||||
in_ws = not in_ws
|
in_ws = not in_ws
|
||||||
i += 1
|
i += 1
|
||||||
i += 1
|
|
||||||
if start < i:
|
if start < i:
|
||||||
span = string[start:]
|
span = string[start:]
|
||||||
key = hash_string(span)
|
key = hash_string(span)
|
||||||
|
@ -275,7 +274,10 @@ cdef class Tokenizer:
|
||||||
if cache_hit:
|
if cache_hit:
|
||||||
pass
|
pass
|
||||||
elif self.token_match and self.token_match(string):
|
elif self.token_match and self.token_match(string):
|
||||||
tokens.push_back(self.vocab.get(tokens.mem, string), not suffixes.size())
|
# We're always saying 'no' to spaces here -- the caller will
|
||||||
|
# fix up the outermost one, with reference to the original.
|
||||||
|
# See Issue #859
|
||||||
|
tokens.push_back(self.vocab.get(tokens.mem, string), False)
|
||||||
else:
|
else:
|
||||||
matches = self.find_infix(string)
|
matches = self.find_infix(string)
|
||||||
if not matches:
|
if not matches:
|
||||||
|
|
Loading…
Reference in New Issue
Block a user