spaCy/spacy/cli/profile.py

# coding: utf8
from __future__ import unicode_literals, division, print_function

import plac
from pathlib import Path
import ujson
import cProfile
import pstats

import spacy
import sys
import tqdm
import cytoolz


def read_inputs(loc):
    """
    Yield the 'text' field of every JSON record found in a line-delimited
    JSON source.

    loc: Path of the input file, or None to read the records from sys.stdin.
    YIELDS: The value stored under the 'text' key of each parsed line.

    Lines containing only whitespace are skipped. A file opened by this
    function is closed once the generator is exhausted or closed; sys.stdin
    is never closed here.
    """
    if loc is None:
        # Lines are handed to the JSON parser as-is, exactly like the file
        # branch. Re-encoding them first broke on non-ASCII byte strings
        # under Python 2 and is not needed for the parser to accept them.
        file_ = sys.stdin
    else:
        # Be explicit about the encoding instead of relying on the locale.
        file_ = Path(loc).open(encoding='utf8')
    try:
        for line in file_:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            data = ujson.loads(line)
            yield data['text']
    finally:
        if loc is not None:
            file_.close()


@plac.annotations(
    lang=("model/language", "positional", None, str),
    inputs=("Location of input file", "positional", None, read_inputs)
)
def profile(cmd, lang, inputs=None):
    """
    Profile a spaCy pipeline, to find out which functions take the most time.

    cmd: Not used by this function; kept so the call signature is unchanged.
    lang: Model/language name passed to spacy.load.
    inputs: Iterable of texts. On the command line the annotation above
        converts the given file location with read_inputs. If omitted (None),
        the texts are read from sys.stdin via read_inputs(None).

    At most the first 10000 texts are used. The raw profiler data is written
    to "Profile.prof" in the current working directory, and the statistics,
    sorted by internal time, are printed.
    """
    if inputs is None:
        # An omitted positional arrives here as the plain None default, not
        # as a generator, so the stdin fallback has to be requested
        # explicitly; otherwise cytoolz.take would be handed None.
        inputs = read_inputs(None)
    nlp = spacy.load(lang)
    # Materialise the texts up front so that reading and JSON parsing are
    # not part of the profiled call.
    texts = list(cytoolz.take(10000, inputs))
    cProfile.runctx("parse_texts(nlp, texts)", globals(), locals(), "Profile.prof")
    s = pstats.Stats("Profile.prof")
    s.strip_dirs().sort_stats("time").print_stats()


def parse_texts(nlp, texts):
    """
    Run every text through nlp.pipe and throw the results away.

    nlp: Object exposing a pipe(texts, batch_size=...) method.
    texts: Iterable of texts; wrapped in a tqdm progress bar while consumed.

    Nothing is returned: the loop exists only to drive the pipeline to
    completion.
    """
    with_progress = tqdm.tqdm(texts)
    stream = nlp.pipe(with_progress, batch_size=128)
    for _ in stream:
        # Consuming the iterator is the whole point; each result is ignored.
        pass
Add profile function 2017-08-22 00:22:49 +03:00			`# coding: utf8`
			`from __future__ import unicode_literals, division, print_function`

			`import plac`
			`from pathlib import Path`
			`import ujson`
			`import cProfile`
			`import pstats`

			`import spacy`
			`import sys`
			`import tqdm`
			`import cytoolz`


			`def read_inputs(loc):`
			`if loc is None:`
			`file_ = sys.stdin`
			`file_ = (line.encode('utf8') for line in file_)`
			`else:`
			`file_ = Path(loc).open()`
			`for line in file_:`
			`data = ujson.loads(line)`
			`text = data['text']`
			`yield text`


			`@plac.annotations(`
			`lang=("model/language", "positional", None, str),`
			`inputs=("Location of input file", "positional", None, read_inputs)`
			`)`
			`def profile(cmd, lang, inputs=None):`
			`"""`
			`Profile a spaCy pipeline, to find out which functions take the most time.`
			`"""`
			`nlp = spacy.load(lang)`
			`texts = list(cytoolz.take(10000, inputs))`
			`cProfile.runctx("parse_texts(nlp, texts)", globals(), locals(), "Profile.prof")`
			`s = pstats.Stats("Profile.prof")`
			`s.strip_dirs().sort_stats("time").print_stats()`


			`def parse_texts(nlp, texts):`
			`for doc in nlp.pipe(tqdm.tqdm(texts), batch_size=128):`
			`pass`