Tests for CLI app - init config generates train-able config (#12173)

* remove migration support form

* initial test commit

* add fixture

* add combo test

* pull out parameter example data

* fix formatting on examples

* remove unused import

* remove unncessary fmt:off instructions

* only set logger level if verbose flag is explicitly set

---------

Co-authored-by: svlandeg <svlandeg@github.com>
This commit is contained in:
Peter Baumgartner 2023-07-31 08:45:04 -04:00 committed by GitHub
parent 186889ec9c
commit a0a195688f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 172 additions and 8 deletions

View File

@ -40,7 +40,8 @@ def assemble_cli(
DOCS: https://spacy.io/api/cli#assemble
"""
util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
if verbose:
util.logger.setLevel(logging.DEBUG)
# Make sure all files and paths exists if they are needed
if not config_path or (str(config_path) != "-" and not config_path.exists()):
msg.fail("Config file not found", config_path, exits=1)

View File

@ -52,8 +52,8 @@ def find_threshold_cli(
DOCS: https://spacy.io/api/cli#find-threshold
"""
util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
if verbose:
util.logger.setLevel(logging.DEBUG)
import_code(code_path)
find_threshold(
model=model,

View File

@ -39,7 +39,8 @@ def init_vectors_cli(
you can use in the [initialize] block of your config to initialize
a model with vectors.
"""
util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
if verbose:
util.logger.setLevel(logging.DEBUG)
msg.info(f"Creating blank nlp object for language '{lang}'")
nlp = util.get_lang_class(lang)()
if jsonl_loc is not None:
@ -87,7 +88,8 @@ def init_pipeline_cli(
use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU")
# fmt: on
):
util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
if verbose:
util.logger.setLevel(logging.DEBUG)
overrides = parse_config_overrides(ctx.args)
import_code(code_path)
setup_gpu(use_gpu)
@ -116,7 +118,8 @@ def init_labels_cli(
"""Generate JSON files for the labels in the data. This helps speed up the
training process, since spaCy won't have to preprocess the data to
extract the labels."""
util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
if verbose:
util.logger.setLevel(logging.DEBUG)
if not output_path.exists():
output_path.mkdir(parents=True)
overrides = parse_config_overrides(ctx.args)

View File

@ -47,7 +47,8 @@ def train_cli(
DOCS: https://spacy.io/api/cli#train
"""
util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
if verbose:
util.logger.setLevel(logging.DEBUG)
overrides = parse_config_overrides(ctx.args)
import_code(code_path)
train(config_path, output_path, use_gpu=use_gpu, overrides=overrides)

View File

@ -6,7 +6,7 @@ import srsly
from typer.testing import CliRunner
from spacy.cli._util import app, get_git_version
from spacy.tokens import Doc, DocBin
from spacy.tokens import Doc, DocBin, Span
from .util import make_tempdir, normalize_whitespace
@ -267,3 +267,162 @@ def test_find_function_invalid():
function = "spacy.TextCatBOW.v666"
result = CliRunner().invoke(app, ["find-function", function])
assert f"Couldn't find registered function: '{function}'" in result.stdout
example_words_1 = ["I", "like", "cats"]
example_words_2 = ["I", "like", "dogs"]
example_lemmas_1 = ["I", "like", "cat"]
example_lemmas_2 = ["I", "like", "dog"]
example_tags = ["PRP", "VBP", "NNS"]
example_morphs = [
"Case=Nom|Number=Sing|Person=1|PronType=Prs",
"Tense=Pres|VerbForm=Fin",
"Number=Plur",
]
example_deps = ["nsubj", "ROOT", "dobj"]
example_pos = ["PRON", "VERB", "NOUN"]
example_ents = ["O", "O", "I-ANIMAL"]
example_spans = [(2, 3, "ANIMAL")]
TRAIN_EXAMPLE_1 = dict(
words=example_words_1,
lemmas=example_lemmas_1,
tags=example_tags,
morphs=example_morphs,
deps=example_deps,
heads=[1, 1, 1],
pos=example_pos,
ents=example_ents,
spans=example_spans,
cats={"CAT": 1.0, "DOG": 0.0},
)
TRAIN_EXAMPLE_2 = dict(
words=example_words_2,
lemmas=example_lemmas_2,
tags=example_tags,
morphs=example_morphs,
deps=example_deps,
heads=[1, 1, 1],
pos=example_pos,
ents=example_ents,
spans=example_spans,
cats={"CAT": 0.0, "DOG": 1.0},
)
@pytest.mark.slow
@pytest.mark.parametrize(
"component,examples",
[
("tagger", [TRAIN_EXAMPLE_1, TRAIN_EXAMPLE_2]),
("morphologizer", [TRAIN_EXAMPLE_1, TRAIN_EXAMPLE_2]),
("trainable_lemmatizer", [TRAIN_EXAMPLE_1, TRAIN_EXAMPLE_2]),
("parser", [TRAIN_EXAMPLE_1] * 30),
("ner", [TRAIN_EXAMPLE_1, TRAIN_EXAMPLE_2]),
("spancat", [TRAIN_EXAMPLE_1, TRAIN_EXAMPLE_2]),
("textcat", [TRAIN_EXAMPLE_1, TRAIN_EXAMPLE_2]),
],
)
def test_init_config_trainable(component, examples, en_vocab):
if component == "textcat":
train_docs = []
for example in examples:
doc = Doc(en_vocab, words=example["words"])
doc.cats = example["cats"]
train_docs.append(doc)
elif component == "spancat":
train_docs = []
for example in examples:
doc = Doc(en_vocab, words=example["words"])
doc.spans["sc"] = [
Span(doc, start, end, label) for start, end, label in example["spans"]
]
train_docs.append(doc)
else:
train_docs = []
for example in examples:
# cats, spans are not valid kwargs for instantiating a Doc
example = {k: v for k, v in example.items() if k not in ("cats", "spans")}
doc = Doc(en_vocab, **example)
train_docs.append(doc)
with make_tempdir() as d_in:
train_bin = DocBin(docs=train_docs)
train_bin.to_disk(d_in / "train.spacy")
dev_bin = DocBin(docs=train_docs)
dev_bin.to_disk(d_in / "dev.spacy")
init_config_result = CliRunner().invoke(
app,
[
"init",
"config",
f"{d_in}/config.cfg",
"--lang",
"en",
"--pipeline",
component,
],
)
assert init_config_result.exit_code == 0
train_result = CliRunner().invoke(
app,
[
"train",
f"{d_in}/config.cfg",
"--paths.train",
f"{d_in}/train.spacy",
"--paths.dev",
f"{d_in}/dev.spacy",
"--output",
f"{d_in}/model",
],
)
assert train_result.exit_code == 0
assert Path(d_in / "model" / "model-last").exists()
@pytest.mark.slow
@pytest.mark.parametrize(
"component,examples",
[("tagger,parser,morphologizer", [TRAIN_EXAMPLE_1, TRAIN_EXAMPLE_2] * 15)],
)
def test_init_config_trainable_multiple(component, examples, en_vocab):
train_docs = []
for example in examples:
example = {k: v for k, v in example.items() if k not in ("cats", "spans")}
doc = Doc(en_vocab, **example)
train_docs.append(doc)
with make_tempdir() as d_in:
train_bin = DocBin(docs=train_docs)
train_bin.to_disk(d_in / "train.spacy")
dev_bin = DocBin(docs=train_docs)
dev_bin.to_disk(d_in / "dev.spacy")
init_config_result = CliRunner().invoke(
app,
[
"init",
"config",
f"{d_in}/config.cfg",
"--lang",
"en",
"--pipeline",
component,
],
)
assert init_config_result.exit_code == 0
train_result = CliRunner().invoke(
app,
[
"train",
f"{d_in}/config.cfg",
"--paths.train",
f"{d_in}/train.spacy",
"--paths.dev",
f"{d_in}/dev.spacy",
"--output",
f"{d_in}/model",
],
)
assert train_result.exit_code == 0
assert Path(d_in / "model" / "model-last").exists()