* Fix Issue #323: Incorrect semantics of Token.__str__ built-in. Add flag to allow users to switch the old semantics back on, to ease transition.

2025-08-02 03:10:22 +03:00 · 2016-04-12 13:17:59 +10:00 · 2016-04-12 13:17:59 +10:00 · 6df3858dbc
commit 6df3858dbc
parent 13a6899fc6
2 changed files with 39 additions and 2 deletions
--- a/spacy/tests/tokens/test_token_api.py
+++ b/spacy/tests/tokens/test_token_api.py
@ -38,6 +38,12 @@ def test_single_token_string(EN):
    assert tokens[0].text == 'foobar'


+def test_str_builtin(EN):
+    tokens = EN('one two')
+    assert str(tokens[0]) == u'one'
+    assert str(tokens[1]) == u'two'
+
+
 def test_is_properties(EN):
    Hi, comma, my, email, is_, addr = EN(u'Hi, my email is test@me.com')
    assert Hi.is_title
--- a/spacy/tokens/token.pyx
+++ b/spacy/tokens/token.pyx
@ -28,6 +28,27 @@ from ..attrs cimport IS_OOV
 from ..lexeme cimport Lexeme


+_STR_TRAILING_WHITESPACE = False
+
+def use_deprecated_Token__str__semantics(value):
+    '''
+    Preserve deprecated semantics for Token.__str__ and Token.__unicode__ methods.
+    
+    spaCy < 0.100.7 had a bug in the semantics of the Token.__str__ and Token.__unicode__
+    built-ins: they included a trailing space. To ease the transition to the
+    new semantics, you can use this function to switch the old semantics back on.
+    
+    Example:
+
+        from spacy.tokens.token import use_deprecated_Token__str__semantics
+        use_deprecated_Token__str__semantics(True)
+
+    This function will not remain in future versions --- it's a temporary shim.
+    '''
+    global _STR_TRAILING_WHITESPACE
+    _STR_TRAILING_WHITESPACE = value
+
+
 cdef class Token:
    """An individual token --- i.e. a word, a punctuation symbol, etc.  Created
    via Doc.__getitem__ and Doc.__iter__.
@ -43,10 +64,20 @@ cdef class Token:
        return self.c.lex.length

    def __unicode__(self):
-        return self.string
+        # Users can toggle this on to preserve former buggy semantics.
+        # Remove this in future versions.
+        if _STR_TRAILING_WHITESPACE:
+            return self.text_with_ws
+        else:
+            return self.text

    def __bytes__(self):
-        return self.string.encode('utf-8')
+        # Users can toggle this on to preserve former buggy semantics.
+        # Remove this in future versions.
+        if _STR_TRAILING_WHITESPACE:
+            return self.text_with_ws.encode('utf8')
+        else:
+            return self.text.encode('utf8')

    def __str__(self):
        if six.PY3: