* Fix Issue #323: Incorrect semantics of Token.__str__ built-in. Add flag to allow users to switch the old semantics back on, to ease transition.

This commit is contained in:
Matthew Honnibal 2016-04-12 13:17:59 +10:00
parent 13a6899fc6
commit 6df3858dbc
2 changed files with 39 additions and 2 deletions

View File

@ -38,6 +38,12 @@ def test_single_token_string(EN):
assert tokens[0].text == 'foobar'
def test_str_builtin(EN):
    """str() on a token must give the bare word, with no trailing space."""
    doc = EN('one two')
    # Index-by-index so a failure points at the offending token.
    for index, expected in enumerate((u'one', u'two')):
        assert str(doc[index]) == expected
def test_is_properties(EN):
Hi, comma, my, email, is_, addr = EN(u'Hi, my email is test@me.com')
assert Hi.is_title

View File

@ -28,6 +28,27 @@ from ..attrs cimport IS_OOV
from ..lexeme cimport Lexeme
# Module-level switch read by Token.__unicode__ and Token.__bytes__: when it is
# true they return `text_with_ws` instead of `text`. Off by default; changed
# through use_deprecated_Token__str__semantics().
_STR_TRAILING_WHITESPACE = False
def use_deprecated_Token__str__semantics(value):
    """Turn the deprecated Token.__str__ / Token.__unicode__ semantics on or off.

    spaCy < 0.100.7 had a bug in the semantics of the Token.__str__ and
    Token.__unicode__ built-ins: they included a trailing space. To ease the
    transition to the new semantics, you can use this function to switch the
    old semantics back on.

    Example:

        from spacy.tokens.token import use_deprecated_Token__str__semantics
        use_deprecated_Token__str__semantics(True)

    This function will not remain in future versions --- it's a temporary shim.

    Arguments:
        value: Truthy to restore the old (trailing-space) behaviour, falsy to
            use the corrected behaviour. Stored as given in the module-level
            flag `_STR_TRAILING_WHITESPACE`.
    """
    # The flag is module state, so the change applies to every Token.
    global _STR_TRAILING_WHITESPACE
    _STR_TRAILING_WHITESPACE = value
cdef class Token:
"""An individual token --- i.e. a word, a punctuation symbol, etc. Created
via Doc.__getitem__ and Doc.__iter__.
@ -43,10 +64,20 @@ cdef class Token:
return self.c.lex.length
def __unicode__(self):
    """Return the token as a unicode string.

    Returns `self.text` by default. While the module-level
    `_STR_TRAILING_WHITESPACE` flag is on, returns `self.text_with_ws`
    instead, which preserves the former (buggy) semantics.
    """
    # Users can toggle this on to preserve former buggy semantics.
    # Remove this in future versions.
    if _STR_TRAILING_WHITESPACE:
        return self.text_with_ws
    return self.text
def __bytes__(self):
    """Return the token as UTF-8 encoded bytes.

    Encodes `self.text` by default. While the module-level
    `_STR_TRAILING_WHITESPACE` flag is on, encodes `self.text_with_ws`
    instead, which preserves the former (buggy) semantics.
    """
    # Users can toggle this on to preserve former buggy semantics.
    # Remove this in future versions.
    if _STR_TRAILING_WHITESPACE:
        text = self.text_with_ws
    else:
        text = self.text
    return text.encode('utf8')
def __str__(self):
if six.PY3: