diff --git a/spacy/tests/tokens/test_token_api.py b/spacy/tests/tokens/test_token_api.py index fba8a4d67..0d9fca1db 100644 --- a/spacy/tests/tokens/test_token_api.py +++ b/spacy/tests/tokens/test_token_api.py @@ -38,6 +38,12 @@ def test_single_token_string(EN): assert tokens[0].text == 'foobar' +def test_str_builtin(EN): + tokens = EN('one two') + assert str(tokens[0]) == u'one' + assert str(tokens[1]) == u'two' + + def test_is_properties(EN): Hi, comma, my, email, is_, addr = EN(u'Hi, my email is test@me.com') assert Hi.is_title diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index 085fb30ec..0221a1eb9 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -28,6 +28,27 @@ from ..attrs cimport IS_OOV from ..lexeme cimport Lexeme +_STR_TRAILING_WHITESPACE = False + +def use_deprecated_Token__str__semantics(value): + ''' + Preserve deprecated semantics for Token.__str__ and Token.__unicode__ methods. + + spaCy < 0.100.7 had a bug in the semantics of the Token.__str__ and Token.__unicode__ + built-ins: they included a trailing space. To ease the transition to the + new semantics, you can use this function to switch the old semantics back on. + + Example: + + from spacy.tokens.token import use_deprecated_Token__str__semantics + use_deprecated_Token__str__semantics(True) + + This function will not remain in future versions --- it's a temporary shim. + ''' + global _STR_TRAILING_WHITESPACE + _STR_TRAILING_WHITESPACE = value + + cdef class Token: """An individual token --- i.e. a word, a punctuation symbol, etc. Created via Doc.__getitem__ and Doc.__iter__. @@ -43,10 +64,20 @@ cdef class Token: return self.c.lex.length def __unicode__(self): - return self.string + # Users can toggle this on to preserve former buggy semantics. + # Remove this in future versions. 
+ if _STR_TRAILING_WHITESPACE: + return self.text_with_ws + else: + return self.text def __bytes__(self): - return self.string.encode('utf-8') + # Users can toggle this on to preserve former buggy semantics. + # Remove this in future versions. + if _STR_TRAILING_WHITESPACE: + return self.text_with_ws.encode('utf8') + else: + return self.text.encode('utf8') def __str__(self): if six.PY3: