spaCy/spacy/tests/test_cli_app.py
Adriane Boyd 606273f7e4
Normalize whitespace in evaluate CLI output test (#12157)
* Normalize whitespace in evaluate CLI output test

Depending on terminal settings, lines may be padded to the screen width
so the comparison is too strict with only the command string replacement.

* Move to test util method

* Change to normalization method
2023-01-27 16:13:34 +01:00

92 lines
3.2 KiB
Python

import os
from pathlib import Path
from typer.testing import CliRunner
from spacy.tokens import DocBin, Doc
from spacy.cli._util import app
from .util import make_tempdir, normalize_whitespace
def test_convert_auto():
    """Check that `convert` works when only input and output dirs are given.

    Three empty `.iob` files are created in a temporary input directory; the
    command must report a generated output file and the output directory must
    end up holding one `.spacy` file per input.
    """
    input_names = ["data1.iob", "data2.iob", "data3.iob"]
    with make_tempdir() as d_in, make_tempdir() as d_out:
        for name in input_names:
            Path(d_in / name).touch()

        # Only the two directories are passed, so this exercises the
        # "automatic" suffix detection.
        runner = CliRunner()
        result = runner.invoke(app, ["convert", str(d_in), str(d_out)])
        assert "Generated output file" in result.stdout

        produced = os.listdir(d_out)
        assert len(produced) == 3
        for expected in ("data1.spacy", "data2.spacy", "data3.spacy"):
            assert expected in produced
def test_convert_auto_conflict():
    """Check that `convert` complains about mixed input suffixes.

    The input directory holds two `.iob` files and one `.json` file; the
    command must print the "same type" message and write nothing to the
    output directory.
    """
    input_names = ["data1.iob", "data2.iob", "data3.json"]
    with make_tempdir() as d_in, make_tempdir() as d_out:
        for name in input_names:
            Path(d_in / name).touch()

        # "Automatic" suffix detection should warn when the file types differ.
        runner = CliRunner()
        result = runner.invoke(app, ["convert", str(d_in), str(d_out)])
        assert "All input files must be same type" in result.stdout

        # Nothing may have been converted.
        assert len(os.listdir(d_out)) == 0
def test_benchmark_accuracy_alias():
    """Check that `evaluate` shows the same help text as `benchmark accuracy`."""
    benchmark_help = CliRunner().invoke(
        app, ["benchmark", "accuracy", "--help"]
    ).stdout
    evaluate_help = CliRunner().invoke(app, ["evaluate", "--help"]).stdout

    # The command name is the one expected difference, so rewrite it in the
    # alias output before comparing.
    evaluate_as_benchmark = evaluate_help.replace(
        "spacy evaluate", "spacy benchmark accuracy"
    )

    # Compare whitespace-normalized text: depending on terminal settings,
    # lines may be padded to the screen width.
    expected = normalize_whitespace(benchmark_help)
    actual = normalize_whitespace(evaluate_as_benchmark)
    assert expected == actual
def test_debug_data_trainable_lemmatizer_cli(en_vocab):
    """Check that `debug data` prints a trainable-lemmatizer section.

    Small train and dev corpora with lemma annotations are written to a
    temporary directory, a config with a `trainable_lemmatizer` pipeline is
    generated via `init config`, and `debug data` is run against it.

    en_vocab: vocab passed to every `Doc` built here (supplied by the test
        harness; presumably a pytest fixture).
    """
    train_docs = [
        Doc(en_vocab, words=["I", "like", "cats"], lemmas=["I", "like", "cat"]),
        Doc(
            en_vocab,
            words=["Dogs", "are", "great", "too"],
            lemmas=["dog", "be", "great", "too"],
        ),
    ]
    dev_docs = [
        Doc(en_vocab, words=["Cats", "are", "cute"], lemmas=["cat", "be", "cute"]),
        Doc(en_vocab, words=["Pets", "are", "great"], lemmas=["pet", "be", "great"]),
    ]
    with make_tempdir() as d_in:
        train_bin = DocBin(docs=train_docs)
        train_bin.to_disk(d_in / "train.spacy")
        dev_bin = DocBin(docs=dev_docs)
        dev_bin.to_disk(d_in / "dev.spacy")

        # Build each path string once; the values are the same as before.
        config_path = f"{d_in}/config.cfg"
        train_path = f"{d_in}/train.spacy"
        dev_path = f"{d_in}/dev.spacy"

        # `debug data` requires an input pipeline config
        result_init = CliRunner().invoke(
            app,
            [
                "init",
                "config",
                config_path,
                "--lang",
                "en",
                "--pipeline",
                "trainable_lemmatizer",
            ],
        )
        # Fail here, with the command's own output, if the config could not
        # be generated -- otherwise the problem only shows up below as a
        # missing section in the `debug data` output.
        assert result_init.exit_code == 0, result_init.stdout

        result_debug_data = CliRunner().invoke(
            app,
            [
                "debug",
                "data",
                config_path,
                "--paths.train",
                train_path,
                "--paths.dev",
                dev_path,
            ],
        )
        # Instead of checking specific wording of the output, which may change,
        # we'll check that this section of the debug output is present.
        assert "= Trainable Lemmatizer =" in result_debug_data.stdout