mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 18:26:30 +03:00
* Fix Issue #323: Incorrect semantics of Token.__str__ built-in. Add flag to allow users to switch the old semantics back on, to ease transition.
This commit is contained in:
parent
13a6899fc6
commit
6df3858dbc
|
@ -38,6 +38,12 @@ def test_single_token_string(EN):
|
|||
assert tokens[0].text == 'foobar'
|
||||
|
||||
|
||||
def test_str_builtin(EN):
|
||||
tokens = EN('one two')
|
||||
assert str(tokens[0]) == u'one'
|
||||
assert str(tokens[1]) == u'two'
|
||||
|
||||
|
||||
def test_is_properties(EN):
|
||||
Hi, comma, my, email, is_, addr = EN(u'Hi, my email is test@me.com')
|
||||
assert Hi.is_title
|
||||
|
|
|
@ -28,6 +28,27 @@ from ..attrs cimport IS_OOV
|
|||
from ..lexeme cimport Lexeme
|
||||
|
||||
|
||||
_STR_TRAILING_WHITESPACE = False
|
||||
|
||||
def use_deprecated_Token__str__semantics(value):
|
||||
'''
|
||||
Preserve deprecated semantics for Token.__str__ and Token.__unicode__ methods.
|
||||
|
||||
spaCy < 0.100.7 had a bug in the semantics of the Token.__str__ and Token.__unicode__
|
||||
built-ins: they included a trailing space. To ease the transition to the
|
||||
new semantics, you can use this function to switch the old semantics back on.
|
||||
|
||||
Example:
|
||||
|
||||
from spacy.tokens.token import use_deprecated_Token__str__semantics
|
||||
use_deprecated_Token__str__semantics(True)
|
||||
|
||||
This function will not remain in future versions --- it's a temporary shim.
|
||||
'''
|
||||
global _STR_TRAILING_WHITESPACE
|
||||
_STR_TRAILING_WHITESPACE = value
|
||||
|
||||
|
||||
cdef class Token:
|
||||
"""An individual token --- i.e. a word, a punctuation symbol, etc. Created
|
||||
via Doc.__getitem__ and Doc.__iter__.
|
||||
|
@ -43,10 +64,20 @@ cdef class Token:
|
|||
return self.c.lex.length
|
||||
|
||||
def __unicode__(self):
|
||||
return self.string
|
||||
# Users can toggle this on to preserve former buggy semantics.
|
||||
# Remove this in future versions.
|
||||
if _STR_TRAILING_WHITESPACE:
|
||||
return self.text_with_ws
|
||||
else:
|
||||
return self.text
|
||||
|
||||
def __bytes__(self):
|
||||
return self.string.encode('utf-8')
|
||||
# Users can toggle this on to preserve former buggy semantics.
|
||||
# Remove this in future versions.
|
||||
if _STR_TRAILING_WHITESPACE:
|
||||
return self.text_with_ws.encode('utf8')
|
||||
else:
|
||||
return self.text.encode('utf8')
|
||||
|
||||
def __str__(self):
|
||||
if six.PY3:
|
||||
|
|
Loading…
Reference in New Issue
Block a user