mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-11-04 09:57:26 +03:00 
			
		
		
		
	* Fix Issue #323: Incorrect semantics of Token.__str__ built-in. Add flag to allow users to switch the old semantics back on, to ease transition.
This commit is contained in:
		
							parent
							
								
									13a6899fc6
								
							
						
					
					
						commit
						6df3858dbc
					
				| 
						 | 
				
			
			@ -38,6 +38,12 @@ def test_single_token_string(EN):
 | 
			
		|||
    assert tokens[0].text == 'foobar'
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def test_str_builtin(EN):
 | 
			
		||||
    tokens = EN('one two')
 | 
			
		||||
    assert str(tokens[0]) == u'one'
 | 
			
		||||
    assert str(tokens[1]) == u'two'
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def test_is_properties(EN):
 | 
			
		||||
    Hi, comma, my, email, is_, addr = EN(u'Hi, my email is test@me.com')
 | 
			
		||||
    assert Hi.is_title
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -28,6 +28,27 @@ from ..attrs cimport IS_OOV
 | 
			
		|||
from ..lexeme cimport Lexeme
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
_STR_TRAILING_WHITESPACE = False
 | 
			
		||||
 | 
			
		||||
def use_deprecated_Token__str__semantics(value):
 | 
			
		||||
    '''
 | 
			
		||||
    Preserve deprecated semantics for Token.__str__ and Token.__unicode__ methods.
 | 
			
		||||
    
 | 
			
		||||
    spaCy < 0.100.7 had a bug in the semantics of the Token.__str__ and Token.__unicode__
 | 
			
		||||
    built-ins: they included a trailing space. To ease the transition to the
 | 
			
		||||
    new semantics, you can use this function to switch the old semantics back on.
 | 
			
		||||
    
 | 
			
		||||
    Example:
 | 
			
		||||
 | 
			
		||||
        from spacy.tokens.token import use_deprecated_Token__str__semantics
 | 
			
		||||
        use_deprecated_Token__str__semantics(True)
 | 
			
		||||
 | 
			
		||||
    This function will not remain in future versions --- it's a temporary shim.
 | 
			
		||||
    '''
 | 
			
		||||
    global _STR_TRAILING_WHITESPACE
 | 
			
		||||
    _STR_TRAILING_WHITESPACE = value
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
cdef class Token:
 | 
			
		||||
    """An individual token --- i.e. a word, a punctuation symbol, etc.  Created
 | 
			
		||||
    via Doc.__getitem__ and Doc.__iter__.
 | 
			
		||||
| 
						 | 
				
			
			@ -43,10 +64,20 @@ cdef class Token:
 | 
			
		|||
        return self.c.lex.length
 | 
			
		||||
 | 
			
		||||
    def __unicode__(self):
 | 
			
		||||
        return self.string
 | 
			
		||||
        # Users can toggle this on to preserve former buggy semantics.
 | 
			
		||||
        # Remove this in future versions.
 | 
			
		||||
        if _STR_TRAILING_WHITESPACE:
 | 
			
		||||
            return self.text_with_ws
 | 
			
		||||
        else:
 | 
			
		||||
            return self.text
 | 
			
		||||
 | 
			
		||||
    def __bytes__(self):
 | 
			
		||||
        return self.string.encode('utf-8')
 | 
			
		||||
        # Users can toggle this on to preserve former buggy semantics.
 | 
			
		||||
        # Remove this in future versions.
 | 
			
		||||
        if _STR_TRAILING_WHITESPACE:
 | 
			
		||||
            return self.text_with_ws.encode('utf8')
 | 
			
		||||
        else:
 | 
			
		||||
            return self.text.encode('utf8')
 | 
			
		||||
 | 
			
		||||
    def __str__(self):
 | 
			
		||||
        if six.PY3:
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Loading…
	
		Reference in New Issue
	
	Block a user