mirror of https://github.com/explosion/spaCy.git (synced 2025-11-04 09:57:26 +03:00)
Tokenizer cache can have different keys than strings. That modification can slow down the tokenizer and needs to be measured.
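
A minimal sketch of how that slowdown could be measured, not part of the commit itself: the corpus, the repeat count, and the choice of a bare English() pipeline with tokenizer.pipe are all assumptions made here for illustration.

# Rough timing harness for tokenizer throughput (a sketch, not part of the commit).
# Assumptions: a bare English() pipeline isolates the tokenizer well enough, and
# the best wall-clock time over a few repeats is a fair proxy for its cost.
import time

from spacy.lang.en import English


def time_tokenizer(texts, repeats=5):
    nlp = English()  # tokenizer-only pipeline, no tagger/parser/ner
    best = float("inf")
    for _ in range(repeats):
        start = time.perf_counter()
        for _doc in nlp.tokenizer.pipe(texts):
            pass  # consume the generator so every text is actually tokenized
        best = min(best, time.perf_counter() - start)
    return best


if __name__ == "__main__":
    corpus = ["It's sentence produced by that bug."] * 10000
    print("best of %d runs: %.3fs" % (5, time_tokenizer(corpus)))

Running this once on the old cache keys and once on the new ones gives a rough before/after comparison for the concern raised in the commit message.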
36 lines | 918 B | Python
# coding: utf8
from __future__ import unicode_literals

import gc

from ...lang.en import English


def test_issue1506():
    nlp = English()

    def string_generator():
        # Yield several batches of 10,001 texts so the pipeline runs long
        # enough for the periodic string cleanup to kick in.
        for _ in range(10001):
            yield u"It's sentence produced by that bug."

        for _ in range(10001):
            yield u"I erase some hbdsaj lemmas."

        for _ in range(10001):
            yield u"I erase lemmas."

        for _ in range(10001):
            yield u"It's sentence produced by that bug."

        for _ in range(10001):
            yield u"It's sentence produced by that bug."

    for i, d in enumerate(nlp.pipe(string_generator())):
        # Cleanup has to run more than once to actually free data: the first
        # pass only marks strings as not recently used.
        if i == 10000 or i == 20000 or i == 30000:
            gc.collect()

        # Look up every lemma string; this is where the bug surfaced when
        # strings that were still in use had been evicted.
        for t in d:
            str(t.lemma_)
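
As a usage note: the test streams five batches of 10,001 short texts through nlp.pipe, forces gc.collect() at fixed checkpoints, and then touches t.lemma_ on every token, which is the behaviour the regression guards against. To run it on its own, something like pytest spacy/tests/regression/test_issue1506.py should work, assuming the file lives at the path implied by the relative import and spaCy's usual test layout.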