* Fix Issue #323: Incorrect semantics of Token.__str__ built-in. Add flag to allow users to switch the old semantics back on, to ease transition.

2025-11-06 10:57:34 +03:00 · 2016-04-12 13:17:59 +10:00 · 2016-04-12 13:17:59 +10:00 · 6df3858dbc
commit 6df3858dbc
parent 13a6899fc6
2 changed files with 39 additions and 2 deletions
--- a/spacy/tests/tokens/test_token_api.py
+++ b/spacy/tests/tokens/test_token_api.py
@ -38,6 +38,12 @@ def test_single_token_string(EN):
    assert tokens[0].text == 'foobar'
 def test_str_builtin(EN):
    # str() on each token must give the bare token text: the space that
    # separates 'one' from 'two' in the input must not appear in the result.
    doc = EN('one two')
    expected_texts = (u'one', u'two')
    for position, expected in enumerate(expected_texts):
        assert str(doc[position]) == expected
 def test_is_properties(EN):
    Hi, comma, my, email, is_, addr = EN(u'Hi, my email is test@me.com')
    assert Hi.is_title
--- a/spacy/tokens/token.pyx
+++ b/spacy/tokens/token.pyx
@ -28,6 +28,27 @@ from ..attrs cimport IS_OOV
 from ..lexeme cimport Lexeme
 _STR_TRAILING_WHITESPACE = False
 def use_deprecated_Token__str__semantics(value):
    '''
    Preserve deprecated semantics for Token.__str__ and Token.__unicode__ methods.

    spaCy < 0.100.7 had a bug in the semantics of the Token.__str__ and Token.__unicode__
    built-ins: they included a trailing space. To ease the transition to the
    new semantics, you can use this function to switch the old semantics back on.

    Args:
        value: Truthy to make Token.__unicode__ and Token.__bytes__ return
            text_with_ws (the old behaviour); falsy to make them return text
            (the new behaviour). The setting is stored in a module-level flag,
            so it is shared by all tokens rather than set per token.

    Example:

        from spacy.tokens.token import use_deprecated_Token__str__semantics
        use_deprecated_Token__str__semantics(True)

    This function will not remain in future versions --- it's a temporary shim.
    '''
    # Rebind the module-level flag that the Token string built-ins consult.
    global _STR_TRAILING_WHITESPACE
    _STR_TRAILING_WHITESPACE = value
 cdef class Token:
    """An individual token --- i.e. a word, a punctuation symbol, etc.  Created
    via Doc.__getitem__ and Doc.__iter__.
@ -43,10 +64,20 @@ cdef class Token:
        return self.c.lex.length
    def __unicode__(self):
-        return self.string
+        # Users can toggle this on to preserve former buggy semantics.
        # Remove this in future versions.
        if _STR_TRAILING_WHITESPACE:
            return self.text_with_ws
        else:
            return self.text
    def __bytes__(self):
-        return self.string.encode('utf-8')
+        # Users can toggle this on to preserve former buggy semantics.
        # Remove this in future versions.
        if _STR_TRAILING_WHITESPACE:
            return self.text_with_ws.encode('utf8')
        else:
            return self.text.encode('utf8')
    def __str__(self):
        if six.PY3: