mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-11-04 01:48:04 +03:00 
			
		
		
		
	* Fix Issue #323: Incorrect semantics of Token.__str__ built-in. Add flag to allow users to switch the old semantics back on, to ease transition.
This commit is contained in:
		
							parent
							
								
									13a6899fc6
								
							
						
					
					
						commit
						6df3858dbc
					
				| 
						 | 
					@ -38,6 +38,12 @@ def test_single_token_string(EN):
 | 
				
			||||||
    assert tokens[0].text == 'foobar'
 | 
					    assert tokens[0].text == 'foobar'
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def test_str_builtin(EN):
 | 
				
			||||||
 | 
					    tokens = EN('one two')
 | 
				
			||||||
 | 
					    assert str(tokens[0]) == u'one'
 | 
				
			||||||
 | 
					    assert str(tokens[1]) == u'two'
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def test_is_properties(EN):
 | 
					def test_is_properties(EN):
 | 
				
			||||||
    Hi, comma, my, email, is_, addr = EN(u'Hi, my email is test@me.com')
 | 
					    Hi, comma, my, email, is_, addr = EN(u'Hi, my email is test@me.com')
 | 
				
			||||||
    assert Hi.is_title
 | 
					    assert Hi.is_title
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -28,6 +28,27 @@ from ..attrs cimport IS_OOV
 | 
				
			||||||
from ..lexeme cimport Lexeme
 | 
					from ..lexeme cimport Lexeme
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					_STR_TRAILING_WHITESPACE = False
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def use_deprecated_Token__str__semantics(value):
 | 
				
			||||||
 | 
					    '''
 | 
				
			||||||
 | 
					    Preserve deprecated semantics for Token.__str__ and Token.__unicode__ methods.
 | 
				
			||||||
 | 
					    
 | 
				
			||||||
 | 
					    spaCy < 0.100.7 had a bug in the semantics of the Token.__str__ and Token.__unicode__
 | 
				
			||||||
 | 
					    built-ins: they included a trailing space. To ease the transition to the
 | 
				
			||||||
 | 
					    new semantics, you can use this function to switch the old semantics back on.
 | 
				
			||||||
 | 
					    
 | 
				
			||||||
 | 
					    Example:
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        from spacy.tokens.token import use_deprecated_Token__str__semantics
 | 
				
			||||||
 | 
					        use_deprecated_Token__str__semantics(True)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    This function will not remain in future versions --- it's a temporary shim.
 | 
				
			||||||
 | 
					    '''
 | 
				
			||||||
 | 
					    global _STR_TRAILING_WHITESPACE
 | 
				
			||||||
 | 
					    _STR_TRAILING_WHITESPACE = value
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
cdef class Token:
 | 
					cdef class Token:
 | 
				
			||||||
    """An individual token --- i.e. a word, a punctuation symbol, etc.  Created
 | 
					    """An individual token --- i.e. a word, a punctuation symbol, etc.  Created
 | 
				
			||||||
    via Doc.__getitem__ and Doc.__iter__.
 | 
					    via Doc.__getitem__ and Doc.__iter__.
 | 
				
			||||||
| 
						 | 
					@ -43,10 +64,20 @@ cdef class Token:
 | 
				
			||||||
        return self.c.lex.length
 | 
					        return self.c.lex.length
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def __unicode__(self):
 | 
					    def __unicode__(self):
 | 
				
			||||||
        return self.string
 | 
					        # Users can toggle this on to preserve former buggy semantics.
 | 
				
			||||||
 | 
					        # Remove this in future versions.
 | 
				
			||||||
 | 
					        if _STR_TRAILING_WHITESPACE:
 | 
				
			||||||
 | 
					            return self.text_with_ws
 | 
				
			||||||
 | 
					        else:
 | 
				
			||||||
 | 
					            return self.text
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def __bytes__(self):
 | 
					    def __bytes__(self):
 | 
				
			||||||
        return self.string.encode('utf-8')
 | 
					        # Users can toggle this on to preserve former buggy semantics.
 | 
				
			||||||
 | 
					        # Remove this in future versions.
 | 
				
			||||||
 | 
					        if _STR_TRAILING_WHITESPACE:
 | 
				
			||||||
 | 
					            return self.text_with_ws.encode('utf8')
 | 
				
			||||||
 | 
					        else:
 | 
				
			||||||
 | 
					            return self.text.encode('utf8')
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def __str__(self):
 | 
					    def __str__(self):
 | 
				
			||||||
        if six.PY3:
 | 
					        if six.PY3:
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
		Loading…
	
		Reference in New Issue
	
	Block a user