mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-25 21:21:10 +03:00 
			
		
		
		
	
		
			
				
	
	
		
			51 lines
		
	
	
		
			1.3 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			51 lines
		
	
	
		
			1.3 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| # coding: utf8
 | |
| from __future__ import unicode_literals, division, print_function
 | |
| 
 | |
| import plac
 | |
| from pathlib import Path
 | |
| import ujson
 | |
| import cProfile
 | |
| import pstats
 | |
| 
 | |
| import spacy
 | |
| import sys
 | |
| import tqdm
 | |
| import cytoolz
 | |
| import thinc.extra.datasets
 | |
| 
 | |
| 
 | |
def read_inputs(loc):
    """
    Yield the "text" field of every JSON record in a newline-delimited input.

    loc: Path of a file holding one JSON object per line, or None to read
        the records from standard input instead.
    YIELDS: The value stored under the "text" key of each record, in input
        order.
    RAISES: KeyError if a record has no "text" key; whatever the JSON parser
        raises for a line that is not valid JSON.
    """
    if loc is None:
        # Lines coming from stdin are encoded to UTF-8 before being parsed.
        for line in sys.stdin:
            yield ujson.loads(line.encode('utf8'))['text']
        return
    # The context manager closes the handle once the generator is exhausted
    # or finalized; previously the file object was never closed. Decoding is
    # pinned to UTF-8 so the result does not depend on the locale default
    # and matches the encoding used on the stdin branch.
    with Path(loc).open(encoding='utf8') as file_:
        for line in file_:
            yield ujson.loads(line)['text']
 | |
| 
 | |
| 
 | |
@plac.annotations(
    lang=("model/language", "positional", None, str),
    inputs=("Location of input file", "positional", None, read_inputs))
def profile(lang, inputs=None):
    """
    Profile a spaCy pipeline, to find out which functions take the most time.

    lang: Name passed to spacy.load() to obtain the pipeline.
    inputs: Iterable of texts to process. When None, texts from the IMDB
        training set provided by thinc.extra.datasets are used instead.

    At most 10000 texts are processed. The raw profile is written to
    "Profile.prof" in the current directory and the statistics, sorted by
    time, are printed afterwards.
    """
    if inputs is None:
        train_data, _dev_data = thinc.extra.datasets.imdb()
        all_texts, _labels = zip(*train_data)
        inputs = all_texts[:25000]
    # NOTE: the names `nlp` and `texts` must not be renamed -- the profiled
    # statement below is a string that looks them up through locals().
    nlp = spacy.load(lang)
    texts = list(cytoolz.take(10000, inputs))
    profile_path = "Profile.prof"
    cProfile.runctx("parse_texts(nlp, texts)", globals(), locals(),
                    profile_path)
    stats = pstats.Stats(profile_path)
    stats.strip_dirs()
    stats.sort_stats("time")
    stats.print_stats()
 | |
| 
 | |
| 
 | |
def parse_texts(nlp, texts):
    """
    Run every text through the pipeline and discard the results.

    nlp: Object exposing pipe(iterable, batch_size=...), which is called with
        a batch size of 16.
    texts: Iterable of texts; it is wrapped in tqdm.tqdm before being passed
        to the pipeline.
    """
    wrapped_texts = tqdm.tqdm(texts)
    docs = nlp.pipe(wrapped_texts, batch_size=16)
    # Only the work of producing the docs matters here, so each one is
    # dropped as soon as it has been yielded.
    for _doc in docs:
        continue
 |