diff --git a/spacy/cli/__init__.py b/spacy/cli/__init__.py
index 480b27a23..e58c94642 100644
--- a/spacy/cli/__init__.py
+++ b/spacy/cli/__init__.py
@@ -2,6 +2,7 @@ from .download import download
 from .info import info
 from .link import link
 from .package import package
+from .profile import profile
 from .train import train
 from .convert import convert
 from .model import model
diff --git a/spacy/cli/profile.py b/spacy/cli/profile.py
new file mode 100644
index 000000000..db6fc5b41
--- /dev/null
+++ b/spacy/cli/profile.py
@@ -0,0 +1,70 @@
+# coding: utf8
+from __future__ import unicode_literals, division, print_function
+
+import plac
+from pathlib import Path
+import ujson
+import cProfile
+import pstats
+
+import spacy
+import sys
+import tqdm
+import cytoolz
+
+
+def read_inputs(loc):
+    """
+    Yield the 'text' field of each JSON record, one record per input line.
+
+    loc: Path of the input file, or None to read the records from stdin.
+    """
+    if loc is None:
+        for text in _read_texts(sys.stdin):
+            yield text
+    else:
+        # Decode as UTF-8 rather than the locale's encoding, and close the
+        # file when the generator finishes or is discarded.
+        with Path(loc).open('r', encoding='utf8') as file_:
+            for text in _read_texts(file_):
+                yield text
+
+
+def _read_texts(lines):
+    """Parse each non-blank line as JSON and yield its 'text' value."""
+    for line in lines:
+        # Python 2 reads byte strings from stdin; calling .encode() on them
+        # would implicitly ASCII-decode first and fail on non-ASCII input.
+        if isinstance(line, bytes):
+            line = line.decode('utf8')
+        if not line.strip():
+            continue
+        yield ujson.loads(line)['text']
+
+
+@plac.annotations(
+    lang=("model/language", "positional", None, str),
+    inputs=("Location of input file", "positional", None, read_inputs)
+)
+def profile(cmd, lang, inputs=None):
+    """
+    Profile a spaCy pipeline, to find out which functions take the most time.
+    """
+    if inputs is None:
+        # An omitted argument arrives as the raw default, not as the output of
+        # the read_inputs converter, so fall back to stdin here.
+        inputs = read_inputs(None)
+    nlp = spacy.load(lang)
+    # Materialize at most 10000 texts up front so that reading and JSON
+    # parsing are not part of the profiled call.
+    texts = list(cytoolz.take(10000, inputs))
+    # runctx evaluates the statement string, so parse_texts must be resolvable
+    # from globals() and nlp/texts from locals().
+    cProfile.runctx("parse_texts(nlp, texts)", globals(), locals(), "Profile.prof")
+    s = pstats.Stats("Profile.prof")
+    s.strip_dirs().sort_stats("time").print_stats()
+
+
+def parse_texts(nlp, texts):
+    """Run every text through the pipeline, discarding the resulting docs."""
+    for doc in nlp.pipe(tqdm.tqdm(texts), batch_size=128):
+        pass