mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 10:16:27 +03:00
606273f7e4
* Normalize whitespace in evaluate CLI output test Depending on terminal settings, lines may be padded to the screen width so the comparison is too strict with only the command string replacement. * Move to test util method * Change to normalization method
92 lines
3.2 KiB
Python
92 lines
3.2 KiB
Python
import os
|
|
from pathlib import Path
|
|
from typer.testing import CliRunner
|
|
from spacy.tokens import DocBin, Doc
|
|
|
|
from spacy.cli._util import app
|
|
from .util import make_tempdir, normalize_whitespace
|
|
|
|
|
|
def test_convert_auto():
|
|
with make_tempdir() as d_in, make_tempdir() as d_out:
|
|
for f in ["data1.iob", "data2.iob", "data3.iob"]:
|
|
Path(d_in / f).touch()
|
|
|
|
# ensure that "automatic" suffix detection works
|
|
result = CliRunner().invoke(app, ["convert", str(d_in), str(d_out)])
|
|
assert "Generated output file" in result.stdout
|
|
out_files = os.listdir(d_out)
|
|
assert len(out_files) == 3
|
|
assert "data1.spacy" in out_files
|
|
assert "data2.spacy" in out_files
|
|
assert "data3.spacy" in out_files
|
|
|
|
|
|
def test_convert_auto_conflict():
|
|
with make_tempdir() as d_in, make_tempdir() as d_out:
|
|
for f in ["data1.iob", "data2.iob", "data3.json"]:
|
|
Path(d_in / f).touch()
|
|
|
|
# ensure that "automatic" suffix detection warns when there are different file types
|
|
result = CliRunner().invoke(app, ["convert", str(d_in), str(d_out)])
|
|
assert "All input files must be same type" in result.stdout
|
|
out_files = os.listdir(d_out)
|
|
assert len(out_files) == 0
|
|
|
|
|
|
def test_benchmark_accuracy_alias():
|
|
# Verify that the `evaluate` alias works correctly.
|
|
result_benchmark = CliRunner().invoke(app, ["benchmark", "accuracy", "--help"])
|
|
result_evaluate = CliRunner().invoke(app, ["evaluate", "--help"])
|
|
assert normalize_whitespace(result_benchmark.stdout) == normalize_whitespace(
|
|
result_evaluate.stdout.replace("spacy evaluate", "spacy benchmark accuracy")
|
|
)
|
|
|
|
|
|
def test_debug_data_trainable_lemmatizer_cli(en_vocab):
|
|
train_docs = [
|
|
Doc(en_vocab, words=["I", "like", "cats"], lemmas=["I", "like", "cat"]),
|
|
Doc(
|
|
en_vocab,
|
|
words=["Dogs", "are", "great", "too"],
|
|
lemmas=["dog", "be", "great", "too"],
|
|
),
|
|
]
|
|
dev_docs = [
|
|
Doc(en_vocab, words=["Cats", "are", "cute"], lemmas=["cat", "be", "cute"]),
|
|
Doc(en_vocab, words=["Pets", "are", "great"], lemmas=["pet", "be", "great"]),
|
|
]
|
|
with make_tempdir() as d_in:
|
|
train_bin = DocBin(docs=train_docs)
|
|
train_bin.to_disk(d_in / "train.spacy")
|
|
dev_bin = DocBin(docs=dev_docs)
|
|
dev_bin.to_disk(d_in / "dev.spacy")
|
|
# `debug data` requires an input pipeline config
|
|
CliRunner().invoke(
|
|
app,
|
|
[
|
|
"init",
|
|
"config",
|
|
f"{d_in}/config.cfg",
|
|
"--lang",
|
|
"en",
|
|
"--pipeline",
|
|
"trainable_lemmatizer",
|
|
],
|
|
)
|
|
result_debug_data = CliRunner().invoke(
|
|
app,
|
|
[
|
|
"debug",
|
|
"data",
|
|
f"{d_in}/config.cfg",
|
|
"--paths.train",
|
|
f"{d_in}/train.spacy",
|
|
"--paths.dev",
|
|
f"{d_in}/dev.spacy",
|
|
],
|
|
)
|
|
# Instead of checking specific wording of the output, which may change,
|
|
# we'll check that this section of the debug output is present.
|
|
assert "= Trainable Lemmatizer =" in result_debug_data.stdout
|