mirror of
https://github.com/explosion/spaCy.git
synced 2025-07-10 16:22:29 +03:00
* Fix Issue #323: Incorrect semantics of Token.__str__ built-in. Add flag to allow users to switch the old semantics back on, to ease transition.
This commit is contained in:
parent
13a6899fc6
commit
6df3858dbc
|
@ -38,6 +38,12 @@ def test_single_token_string(EN):
|
||||||
assert tokens[0].text == 'foobar'
|
assert tokens[0].text == 'foobar'
|
||||||
|
|
||||||
|
|
||||||
|
def test_str_builtin(EN):
|
||||||
|
tokens = EN('one two')
|
||||||
|
assert str(tokens[0]) == u'one'
|
||||||
|
assert str(tokens[1]) == u'two'
|
||||||
|
|
||||||
|
|
||||||
def test_is_properties(EN):
|
def test_is_properties(EN):
|
||||||
Hi, comma, my, email, is_, addr = EN(u'Hi, my email is test@me.com')
|
Hi, comma, my, email, is_, addr = EN(u'Hi, my email is test@me.com')
|
||||||
assert Hi.is_title
|
assert Hi.is_title
|
||||||
|
|
|
@ -28,6 +28,27 @@ from ..attrs cimport IS_OOV
|
||||||
from ..lexeme cimport Lexeme
|
from ..lexeme cimport Lexeme
|
||||||
|
|
||||||
|
|
||||||
|
_STR_TRAILING_WHITESPACE = False
|
||||||
|
|
||||||
|
def use_deprecated_Token__str__semantics(value):
|
||||||
|
'''
|
||||||
|
Preserve deprecated semantics for Token.__str__ and Token.__unicode__ methods.
|
||||||
|
|
||||||
|
spaCy < 0.100.7 had a bug in the semantics of the Token.__str__ and Token.__unicode__
|
||||||
|
built-ins: they included a trailing space. To ease the transition to the
|
||||||
|
new semantics, you can use this function to switch the old semantics back on.
|
||||||
|
|
||||||
|
Example:
|
||||||
|
|
||||||
|
from spacy.tokens.token import use_deprecated_Token__str__semantics
|
||||||
|
use_deprecated_Token__str__semantics(True)
|
||||||
|
|
||||||
|
This function will not remain in future versions --- it's a temporary shim.
|
||||||
|
'''
|
||||||
|
global _STR_TRAILING_WHITESPACE
|
||||||
|
_STR_TRAILING_WHITESPACE = value
|
||||||
|
|
||||||
|
|
||||||
cdef class Token:
|
cdef class Token:
|
||||||
"""An individual token --- i.e. a word, a punctuation symbol, etc. Created
|
"""An individual token --- i.e. a word, a punctuation symbol, etc. Created
|
||||||
via Doc.__getitem__ and Doc.__iter__.
|
via Doc.__getitem__ and Doc.__iter__.
|
||||||
|
@ -43,10 +64,20 @@ cdef class Token:
|
||||||
return self.c.lex.length
|
return self.c.lex.length
|
||||||
|
|
||||||
def __unicode__(self):
|
def __unicode__(self):
|
||||||
return self.string
|
# Users can toggle this on to preserve former buggy semantics.
|
||||||
|
# Remove this in future versions.
|
||||||
|
if _STR_TRAILING_WHITESPACE:
|
||||||
|
return self.text_with_ws
|
||||||
|
else:
|
||||||
|
return self.text
|
||||||
|
|
||||||
def __bytes__(self):
|
def __bytes__(self):
|
||||||
return self.string.encode('utf-8')
|
# Users can toggle this on to preserve former buggy semantics.
|
||||||
|
# Remove this in future versions.
|
||||||
|
if _STR_TRAILING_WHITESPACE:
|
||||||
|
return self.text_with_ws.encode('utf8')
|
||||||
|
else:
|
||||||
|
return self.text.encode('utf8')
|
||||||
|
|
||||||
def __str__(self):
|
def __str__(self):
|
||||||
if six.PY3:
|
if six.PY3:
|
||||||
|
|
Loading…
Reference in New Issue
Block a user