2020-06-21 22:35:01 +03:00
|
|
|
from typing import Optional, Sequence, Union, Iterator
|
2019-12-16 15:12:19 +03:00
|
|
|
import tqdm
|
2017-08-22 00:22:49 +03:00
|
|
|
from pathlib import Path
|
💫 Replace ujson, msgpack and dill/pickle/cloudpickle with srsly (#3003)
Remove hacks and wrappers, keep code in sync across our libraries and move spaCy a few steps closer to only depending on packages with binary wheels 🎉
See here: https://github.com/explosion/srsly
Serialization is hard, especially across Python versions and multiple platforms. After dealing with many subtle bugs over the years (encodings, locales, large files) our libraries like spaCy and Prodigy have steadily grown a number of utility functions to wrap the multiple serialization formats we need to support (especially json, msgpack and pickle). These wrapping functions ended up duplicated across our codebases, so we wanted to put them in one place.
At the same time, we noticed that having a lot of small dependencies was making maintenance harder, and making installation slower. To solve this, we've made srsly standalone, by including the component packages directly within it. This way we can provide all the serialization utilities we need in a single binary wheel.
srsly currently includes forks of the following packages:
ujson
msgpack
msgpack-numpy
cloudpickle
* WIP: replace json/ujson with srsly
* Replace ujson in examples
Use regular json instead of srsly to make code easier to read and follow
* Update requirements
* Fix imports
* Fix typos
* Replace msgpack with srsly
* Fix warning
2018-12-03 03:28:22 +03:00
|
|
|
import srsly
|
2017-08-22 00:22:49 +03:00
|
|
|
import cProfile
|
|
|
|
import pstats
|
|
|
|
import sys
|
2018-12-06 18:04:12 +03:00
|
|
|
import itertools
|
2020-06-21 22:35:01 +03:00
|
|
|
from wasabi import msg, Printer
|
2020-07-12 14:53:41 +03:00
|
|
|
import typer
|
2017-08-22 00:22:49 +03:00
|
|
|
|
2020-07-12 14:53:41 +03:00
|
|
|
from ._util import app, debug_cli, Arg, Opt, NAME
|
2020-06-21 22:35:01 +03:00
|
|
|
from ..language import Language
|
2018-11-30 22:16:14 +03:00
|
|
|
from ..util import load_model
|
2017-08-22 00:22:49 +03:00
|
|
|
|
|
|
|
|
2020-07-12 14:53:41 +03:00
|
|
|
# Registered twice: as "debug profile" and, hidden, as a top-level "profile"
# command. The body below warns when the top-level form is used.
@debug_cli.command("profile")
@app.command("profile", hidden=True)
def profile_cli(
    # fmt: off
    ctx: typer.Context,  # This is only used to read current calling context
    model: str = Arg(..., help="Model to load"),
    inputs: Optional[Path] = Arg(None, help="Location of input file. '-' for stdin.", exists=True, allow_dash=True),
    n_texts: int = Opt(10000, "--n-texts", "-n", help="Maximum number of texts to use if available"),
    # fmt: on
):
    """
    Profile a spaCy pipeline, to find out which functions take the most time.
    Input should be formatted as one JSON object per line with a key "text".
    It can either be provided as a JSONL file, or be read from sys.stdin.
    If no input file is specified, the IMDB dataset is loaded via ml_datasets.
    """
    # NOTE: the docstring above is kept free of parameter documentation on
    # purpose; it appears to double as the command's help text, and the
    # per-parameter help lives in the Arg/Opt declarations.
    if ctx.parent.command.name == NAME:  # called as top-level command
        msg.warn(
            "The profile command is now available via the 'debug profile' "
            "subcommand. You can run python -m spacy debug --help for an "
            "overview of the other available debugging commands."
        )
    # All real work happens in profile(); this wrapper only adapts the CLI.
    profile(model, inputs=inputs, n_texts=n_texts)
|
|
|
|
|
|
|
|
|
|
|
|
def profile(model: str, inputs: Optional[Path] = None, n_texts: int = 10000) -> None:
    """Profile a pipeline over a set of texts and print the cProfile stats.

    model: Name or path of the pipeline, passed to load_model.
    inputs: Location of a JSONL file with a "text" key per line, or "-" for
        stdin. If None, the IMDB training texts from ml_datasets are used.
    n_texts: Maximum number of texts to run through the pipeline.

    The raw profile is written to "Profile.prof" in the current working
    directory and then printed, sorted by time.
    """
    if inputs is not None:
        # Lazy generator: nothing is read until the texts are sliced below.
        text_stream = _read_inputs(inputs, msg)
    else:
        try:
            import ml_datasets
        except ImportError:
            # exits=1 is passed, so execution is expected to stop here.
            msg.fail(
                "This command, when run without an input file, "
                "requires the ml_datasets library to be installed: "
                "pip install ml_datasets",
                exits=1,
            )
        max_imdb_texts = 25000
        with msg.loading("Loading IMDB dataset via ml_datasets..."):
            imdb_train, _ = ml_datasets.imdb()
            imdb_texts, _ = zip(*imdb_train)
            imdb_texts = imdb_texts[:max_imdb_texts]
            # Report the number of texts that will actually be profiled,
            # not the upper bound on what was loaded.
            n_used = min(n_texts, len(imdb_texts))
            msg.info(f"Loaded IMDB dataset and using {n_used} examples")
        text_stream = imdb_texts
    with msg.loading(f"Loading model '{model}'..."):
        nlp = load_model(model)
    msg.good(f"Loaded model '{model}'")
    texts = list(itertools.islice(text_stream, n_texts))
    # The statement string is evaluated against globals()/locals(), so the
    # local names `nlp` and `texts` must keep exactly these spellings.
    cProfile.runctx("parse_texts(nlp, texts)", globals(), locals(), "Profile.prof")
    s = pstats.Stats("Profile.prof")
    msg.divider("Profile stats")
    s.strip_dirs().sort_stats("time").print_stats()
|
2017-08-22 00:22:49 +03:00
|
|
|
|
|
|
|
|
2020-06-21 22:35:01 +03:00
|
|
|
def parse_texts(nlp: Language, texts: Sequence[str]) -> None:
    """Run all texts through the pipeline and discard the results.

    nlp: The pipeline whose pipe() method is called.
    texts: The texts to process, wrapped in a tqdm progress bar.
    """
    with_progress = tqdm.tqdm(texts)
    docs = nlp.pipe(with_progress, batch_size=16)
    # Only the processing itself matters here, so the output is drained
    # without being kept.
    for _ in docs:
        pass
|
2018-11-30 22:16:14 +03:00
|
|
|
|
|
|
|
|
2020-06-21 22:35:01 +03:00
|
|
|
def _read_inputs(loc: Union[Path, str], msg: Printer) -> Iterator[str]:
    """Yield the "text" value of each JSON line read from a file or stdin.

    loc: Path to a JSONL file, or "-" (as a string or a Path) for stdin.
    msg: Printer used for status and error messages.
    YIELDS: The value stored under the "text" key of each non-blank line.

    This is a generator, so no validation or reading happens until it is
    first iterated.
    """
    # Compare via str(): a pathlib.Path never equals a plain string, so
    # `loc == "-"` would miss the dash when it arrives as Path("-").
    read_stdin = str(loc) == "-"
    if read_stdin:
        msg.info("Reading input from sys.stdin")
        file_ = sys.stdin
    else:
        input_path = Path(loc)
        if not input_path.exists() or not input_path.is_file():
            msg.fail("Not a valid input data file", loc, exits=1)
        msg.info(f"Using data from {input_path.parts[-1]}")
        # Explicit encoding so reading does not depend on the locale default.
        file_ = input_path.open(encoding="utf8")
    try:
        for line in file_:
            if not line.strip():
                # Tolerate blank lines, e.g. a trailing newline at the end.
                continue
            data = srsly.json_loads(line)
            yield data["text"]
    finally:
        # Close only what was opened here; stdin belongs to the process.
        if not read_stdin:
            file_.close()
|