# spaCy/spacy/cli/profile.py

# coding: utf8
from __future__ import unicode_literals, division, print_function

import plac
from pathlib import Path
import ujson
import cProfile
import pstats

import spacy
import sys
import tqdm
import cytoolz
import thinc.extra.datasets


def read_inputs(loc):
    """Yield the ``'text'`` field of every record in a JSON-lines source.

    Each non-blank input line is parsed as one JSON document and the value
    stored under its ``'text'`` key is yielded.

    loc: Path of the input file, or None to read from ``sys.stdin``.
    YIELDS: The ``'text'`` value of each parsed record, in input order.
    RAISES: KeyError if a parsed JSON object has no ``'text'`` key. Errors
        raised by ``ujson.loads`` for malformed lines are propagated.
    """
    if loc is None:
        # Lines read from stdin are handed to the parser UTF-8 encoded.
        lines = (line.encode('utf8') for line in sys.stdin)
        for text in _iter_texts(lines):
            yield text
    else:
        # JSON text is UTF-8, so decode explicitly rather than relying on the
        # locale default. The `with` block closes the handle once the
        # generator is exhausted or closed.
        with Path(loc).open(encoding='utf8') as file_:
            for text in _iter_texts(file_):
                yield text


def _iter_texts(lines):
    """Parse each non-blank line as JSON and yield its ``'text'`` value."""
    for line in lines:
        # Blank lines (e.g. a trailing empty line) carry no record.
        if not line.strip():
            continue
        yield ujson.loads(line)['text']


@plac.annotations(
    lang=("model/language", "positional", None, str),
    inputs=("Location of input file", "positional", None, read_inputs),
)
def profile(cmd, lang, inputs=None):
    """
    Profile a spaCy pipeline, to find out which functions take the most time.
    """
    # cmd is accepted but not used in this function.
    # lang is passed to spacy.load(); inputs is an iterable of texts (the
    # annotation above converts a given file location with read_inputs).
    if inputs is None:
        # No inputs supplied: fall back to the first 2000 entries of the
        # first column of thinc's IMDB training split.
        # NOTE(review): presumably (text, label) pairs - confirm against thinc.
        train_examples, _unused_split = thinc.extra.datasets.imdb()
        inputs, _unused_column = zip(*train_examples)
        inputs = inputs[:2000]

    nlp = spacy.load(lang)
    # Materialise at most 10000 texts so reading them is not profiled.
    texts = list(cytoolz.take(10000, inputs))

    # The statement is evaluated against this function's locals, so the names
    # `nlp` and `texts` above must match the ones used inside the string.
    stats_file = "Profile.prof"
    statement = "parse_texts(nlp, texts)"
    cProfile.runctx(statement, globals(), locals(), stats_file)

    # Load the dumped stats and print them sorted by cumulative time.
    stats = pstats.Stats(stats_file)
    stats.strip_dirs()
    stats.sort_stats("cumtime")
    stats.print_stats()


def parse_texts(nlp, texts):
    """Feed every text through ``nlp.pipe`` and discard the results.

    nlp: Object exposing a ``pipe(iterable, batch_size=...)`` method.
    texts: Iterable of texts; it is wrapped in ``tqdm.tqdm`` before being
        passed to ``nlp.pipe`` with ``batch_size=16``.
    """
    progress = tqdm.tqdm(texts)
    docs = nlp.pipe(progress, batch_size=16)
    # Only consuming the iterator matters; the yielded docs are not kept.
    for _doc in docs:
        continue
# Revision history (recovered from per-line blame annotations):
#   2017-08-22 00:22:49 +03:00  Add profile function
#       Initial module: imports, read_inputs, profile, parse_texts.
#   2017-10-27 15:38:39 +03:00  Tidy up CLI
#       `inputs` plac annotation, `nlp = spacy.load(lang)`, and the
#       `cProfile.runctx(...)` call writing "Profile.prof".
#   2017-11-15 15:51:25 +03:00  Improve profiling
#       `import thinc.extra.datasets`, the IMDB fallback when `inputs` is
#       None, stats sorted by "cumtime", and the tqdm-wrapped
#       `nlp.pipe(..., batch_size=16)` loop in parse_texts.