import os
import subprocess
import sys
from pathlib import Path

import pytest
import srsly
from typer.testing import CliRunner

import spacy
from spacy.cli._util import app, get_git_version
from spacy.tokens import Doc, DocBin, Span

from .util import make_tempdir, normalize_whitespace


def has_git():
    try:
        get_git_version()
        return True
    except RuntimeError:
        return False


def test_convert_auto():
    with make_tempdir() as d_in, make_tempdir() as d_out:
        for f in ["data1.iob", "data2.iob", "data3.iob"]:
            Path(d_in / f).touch()

        # ensure that "automatic" suffix detection works
        result = CliRunner().invoke(app, ["convert", str(d_in), str(d_out)])
        assert "Generated output file" in result.stdout
        out_files = os.listdir(d_out)
        assert len(out_files) == 3
        assert "data1.spacy" in out_files
        assert "data2.spacy" in out_files
        assert "data3.spacy" in out_files


def test_convert_auto_conflict():
    with make_tempdir() as d_in, make_tempdir() as d_out:
        for f in ["data1.iob", "data2.iob", "data3.json"]:
            Path(d_in / f).touch()

        # ensure that "automatic" suffix detection warns when there are different file types
        result = CliRunner().invoke(app, ["convert", str(d_in), str(d_out)])
        assert "All input files must be same type" in result.stdout
        out_files = os.listdir(d_out)
        assert len(out_files) == 0


NOOP_CONFIG = """
[paths]
train = null
dev = null
vectors = null
init_tok2vec = null

[system]
seed = 0
gpu_allocator = null

[nlp]
lang = "mul"
pipeline = ["noop", "noop2"]
disabled = []
before_creation = null
after_creation = null
after_pipeline_creation = null
batch_size = 1000
tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}

[components]

[components.noop]
factory = "noop"

[components.noop2]
factory = "noop2"

[corpora]

[corpora.dev]
@readers = "spacy.Corpus.v1"
path = ${paths.dev}
gold_preproc = false
max_length = 0
limit = 0
augmenter = null

[corpora.train]
@readers = "spacy.Corpus.v1"
path = ${paths.train}
gold_preproc = false
max_length = 0
limit = 0
augmenter = null

[training]
seed = ${system.seed}
gpu_allocator = ${system.gpu_allocator}
dropout = 0.1
accumulate_gradient = 1
patience = 1600
max_epochs = 0
max_steps = 100
eval_frequency = 200
frozen_components = []
annotating_components = []
dev_corpus = "corpora.dev"
train_corpus = "corpora.train"
before_to_disk = null
before_update = null

[training.batcher]
@batchers = "spacy.batch_by_words.v1"
discard_oversize = false
tolerance = 0.2
get_length = null

[training.batcher.size]
@schedules = "compounding.v1"
start = 100
stop = 1000
compound = 1.001
t = 0.0

[training.logger]
@loggers = "spacy.ConsoleLogger.v1"
progress_bar = false

[training.optimizer]
@optimizers = "Adam.v1"
beta1 = 0.9
beta2 = 0.999
L2_is_weight_decay = true
L2 = 0.01
grad_clip = 1.0
use_averages = false
eps = 0.00000001
learn_rate = 0.001

[training.score_weights]

[pretraining]

[initialize]
vectors = ${paths.vectors}
init_tok2vec = ${paths.init_tok2vec}
vocab_data = null
lookups = null
before_init = null
after_init = null

[initialize.components]

[initialize.tokenizer]
"""


@pytest.fixture
def data_paths():
    nlp = spacy.blank("mul")
    doc = nlp("ok")
    with make_tempdir() as tdir:
        db = DocBin()
        # debug data will *fail* if there aren't enough docs
        for ii in range(100):
            db.add(doc)
        fpath = tdir / "data.spacy"
        db.to_disk(fpath)

        args = [
            "--paths.train",
            str(fpath),
            "--paths.dev",
            str(fpath),
        ]
        yield args
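
# Overrides like --paths.train map onto the [paths] section of the config, so
# the args yielded above fill in the null train/dev paths in NOOP_CONFIG.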


@pytest.fixture
def code_paths():
    noop_base = """
from spacy.language import Language

@Language.component("{}")
def noop(doc):
    return doc
"""

    with make_tempdir() as temp_d:
        # write code files to load
        paths = []
        for ff in ["noop", "noop2"]:
            pyfile = temp_d / f"{ff}.py"
            pyfile.write_text(noop_base.format(ff))
            paths.append(pyfile)

        args = ["--code", ",".join([str(pp) for pp in paths])]
        yield args
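
# After formatting, the generated noop2.py, for example, reads:
#   from spacy.language import Language
#
#   @Language.component("noop2")
#   def noop(doc):
#       return doc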


@pytest.fixture
def noop_config():
    with make_tempdir() as temp_d:
        cfg = temp_d / "config.cfg"
        cfg.write_text(NOOP_CONFIG)

        yield cfg
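

# The multi-code tests below run the CLI via subprocess.run rather than
# CliRunner: the --code argument imports the given files, and anything they
# add to the registry is global state that CliRunner does not isolate. Since
# there is no standard way to remove entries from the registry again, a fresh
# process per command is the clean way to exercise these tests.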
@pytest.mark.slow
@pytest.mark.parametrize(
    "cmd",
    ["debug config", "debug data", "train", "assemble"],
)
def test_multi_code(cmd, code_paths, data_paths, noop_config):
    # check that it fails without the code arg
    cmd = cmd.split()
    output = ["."] if cmd[0] == "assemble" else []
    cmd = [sys.executable, "-m", "spacy"] + cmd
    result = subprocess.run([*cmd, str(noop_config), *output, *data_paths])
    assert result.returncode == 1

    # check that it succeeds with the code arg
    result = subprocess.run([*cmd, str(noop_config), *output, *data_paths, *code_paths])
    assert result.returncode == 0
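
# For reference, the passing invocation above corresponds to a shell command
# along the lines of (paths are illustrative):
#   python -m spacy train config.cfg --paths.train train.spacy \
#       --paths.dev dev.spacy --code noop.py,noop2.py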


@pytest.mark.slow
def test_multi_code_evaluate(code_paths, data_paths, noop_config):
    # Evaluation requires a model, not a config, so this works differently from
    # the other commands.

    # Train a model to evaluate
    cmd = f"{sys.executable} -m spacy train {noop_config} -o model".split()
    result = subprocess.run([*cmd, *data_paths, *code_paths])
    assert result.returncode == 0

    # now do the evaluation
    eval_data = data_paths[-1]
    cmd = f"{sys.executable} -m spacy evaluate model/model-best {eval_data}".split()

    # check that it fails without the code arg
    result = subprocess.run(cmd)
    assert result.returncode == 1

    # check that it succeeds with the code arg
    result = subprocess.run([*cmd, *code_paths])
    assert result.returncode == 0


def test_benchmark_accuracy_alias():
    # Verify that the `evaluate` alias works correctly.
    result_benchmark = CliRunner().invoke(app, ["benchmark", "accuracy", "--help"])
    result_evaluate = CliRunner().invoke(app, ["evaluate", "--help"])
    assert normalize_whitespace(result_benchmark.stdout) == normalize_whitespace(
        result_evaluate.stdout.replace("spacy evaluate", "spacy benchmark accuracy")
    )


def test_debug_data_trainable_lemmatizer_cli(en_vocab):
    train_docs = [
        Doc(en_vocab, words=["I", "like", "cats"], lemmas=["I", "like", "cat"]),
        Doc(
            en_vocab,
            words=["Dogs", "are", "great", "too"],
            lemmas=["dog", "be", "great", "too"],
        ),
    ]
    dev_docs = [
        Doc(en_vocab, words=["Cats", "are", "cute"], lemmas=["cat", "be", "cute"]),
        Doc(en_vocab, words=["Pets", "are", "great"], lemmas=["pet", "be", "great"]),
    ]
    with make_tempdir() as d_in:
        train_bin = DocBin(docs=train_docs)
        train_bin.to_disk(d_in / "train.spacy")
        dev_bin = DocBin(docs=dev_docs)
        dev_bin.to_disk(d_in / "dev.spacy")
        # `debug data` requires an input pipeline config
        CliRunner().invoke(
            app,
            [
                "init",
                "config",
                f"{d_in}/config.cfg",
                "--lang",
                "en",
                "--pipeline",
                "trainable_lemmatizer",
            ],
        )
        result_debug_data = CliRunner().invoke(
            app,
            [
                "debug",
                "data",
                f"{d_in}/config.cfg",
                "--paths.train",
                f"{d_in}/train.spacy",
                "--paths.dev",
                f"{d_in}/dev.spacy",
            ],
        )
        # Instead of checking specific wording of the output, which may change,
        # we'll check that this section of the debug output is present.
        assert "= Trainable Lemmatizer =" in result_debug_data.stdout


# project tests

CFG_FILE = "myconfig.cfg"

SAMPLE_PROJECT = {
    "title": "Sample project",
    "description": "This is a project for testing",
    "assets": [
        {
            "dest": "assets/spacy-readme.md",
            "url": "https://github.com/explosion/spaCy/raw/dec81508d28b47f09a06203c472b37f00db6c869/README.md",
            "checksum": "411b2c89ccf34288fae8ed126bf652f7",
        },
        {
            "dest": "assets/citation.cff",
            "url": "https://github.com/explosion/spaCy/raw/master/CITATION.cff",
            "checksum": "c996bfd80202d480eb2e592369714e5e",
            "extra": True,
        },
    ],
    "commands": [
        {
            "name": "ok",
            "help": "print ok",
            "script": ["python -c \"print('okokok')\""],
        },
        {
            "name": "create",
            "help": "make a file",
            "script": [f"python -m spacy init config {CFG_FILE}"],
            "outputs": [f"{CFG_FILE}"],
        },
    ],
}

SAMPLE_PROJECT_TEXT = srsly.yaml_dumps(SAMPLE_PROJECT)
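
# srsly.yaml_dumps serializes the dict above into the project.yml text that
# the project_dir fixture writes out; the result is plain YAML with the same
# keys (title, description, assets, commands).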


@pytest.fixture
def project_dir():
    with make_tempdir() as pdir:
        (pdir / "project.yml").write_text(SAMPLE_PROJECT_TEXT)
        yield pdir


def test_project_document(project_dir):
    readme_path = project_dir / "README.md"
    assert not readme_path.exists(), "README already exists"
    result = CliRunner().invoke(
        app, ["project", "document", str(project_dir), "-o", str(readme_path)]
    )
    assert result.exit_code == 0
    assert readme_path.is_file()
    text = readme_path.read_text("utf-8")
    assert SAMPLE_PROJECT["description"] in text


def test_project_assets(project_dir):
    asset_dir = project_dir / "assets"
    assert not asset_dir.exists(), "Assets dir is already present"
    result = CliRunner().invoke(app, ["project", "assets", str(project_dir)])
    assert result.exit_code == 0
    assert (asset_dir / "spacy-readme.md").is_file(), "Assets not downloaded"
    # check that extras work
    result = CliRunner().invoke(app, ["project", "assets", "--extra", str(project_dir)])
    assert result.exit_code == 0
    assert (asset_dir / "citation.cff").is_file(), "Extras not downloaded"


def test_project_run(project_dir):
    # make sure dry run works
    test_file = project_dir / CFG_FILE
    result = CliRunner().invoke(
        app, ["project", "run", "--dry", "create", str(project_dir)]
    )
    assert result.exit_code == 0
    assert not test_file.is_file()
    result = CliRunner().invoke(app, ["project", "run", "create", str(project_dir)])
    assert result.exit_code == 0
    assert test_file.is_file()
    result = CliRunner().invoke(app, ["project", "run", "ok", str(project_dir)])
    assert result.exit_code == 0
    assert "okokok" in result.stdout


@pytest.mark.skipif(not has_git(), reason="git not installed")
@pytest.mark.parametrize(
    "options",
    [
        "",
        # "--sparse",
        "--branch v3",
        "--repo https://github.com/explosion/projects --branch v3",
    ],
)
def test_project_clone(options):
    with make_tempdir() as workspace:
        out = workspace / "project"
        target = "benchmarks/ner_conll03"
        if not options:
            options = []
        else:
            options = options.split()
        result = CliRunner().invoke(
            app, ["project", "clone", target, *options, str(out)]
        )
        assert result.exit_code == 0
        assert (out / "README.md").is_file()


def test_project_push_pull(project_dir):
    proj = dict(SAMPLE_PROJECT)
    remote = "xyz"

    with make_tempdir() as remote_dir:
        proj["remotes"] = {remote: str(remote_dir)}
        proj_text = srsly.yaml_dumps(proj)
        (project_dir / "project.yml").write_text(proj_text)

        test_file = project_dir / CFG_FILE
        result = CliRunner().invoke(app, ["project", "run", "create", str(project_dir)])
        assert result.exit_code == 0
        assert test_file.is_file()
        result = CliRunner().invoke(app, ["project", "push", remote, str(project_dir)])
        assert result.exit_code == 0
        test_file.unlink()
        assert not test_file.exists()
        result = CliRunner().invoke(app, ["project", "pull", remote, str(project_dir)])
        assert result.exit_code == 0
        assert test_file.is_file()


def test_find_function_valid():
    # example of architecture in main code base
    function = "spacy.TextCatBOW.v3"
    result = CliRunner().invoke(app, ["find-function", function, "-r", "architectures"])
    assert f"Found registered function '{function}'" in result.stdout
    assert "textcat.py" in result.stdout

    result = CliRunner().invoke(app, ["find-function", function])
    assert f"Found registered function '{function}'" in result.stdout
    assert "textcat.py" in result.stdout

    # example of architecture in spacy-legacy
    function = "spacy.TextCatBOW.v1"
    result = CliRunner().invoke(app, ["find-function", function])
    assert f"Found registered function '{function}'" in result.stdout
    assert "spacy_legacy" in result.stdout
    assert "textcat.py" in result.stdout


def test_find_function_invalid():
    # invalid registry
    function = "spacy.TextCatBOW.v3"
    registry = "foobar"
    result = CliRunner().invoke(
        app, ["find-function", function, "--registry", registry]
    )
    assert f"Unknown function registry: '{registry}'" in result.stdout

    # invalid function
    function = "spacy.TextCatBOW.v666"
    result = CliRunner().invoke(app, ["find-function", function])
    assert f"Couldn't find registered function: '{function}'" in result.stdout


example_words_1 = ["I", "like", "cats"]
example_words_2 = ["I", "like", "dogs"]
example_lemmas_1 = ["I", "like", "cat"]
example_lemmas_2 = ["I", "like", "dog"]
example_tags = ["PRP", "VBP", "NNS"]
example_morphs = [
    "Case=Nom|Number=Sing|Person=1|PronType=Prs",
    "Tense=Pres|VerbForm=Fin",
    "Number=Plur",
]
example_deps = ["nsubj", "ROOT", "dobj"]
example_pos = ["PRON", "VERB", "NOUN"]
example_ents = ["O", "O", "I-ANIMAL"]
example_spans = [(2, 3, "ANIMAL")]

TRAIN_EXAMPLE_1 = dict(
    words=example_words_1,
    lemmas=example_lemmas_1,
    tags=example_tags,
    morphs=example_morphs,
    deps=example_deps,
    heads=[1, 1, 1],
    pos=example_pos,
    ents=example_ents,
    spans=example_spans,
    cats={"CAT": 1.0, "DOG": 0.0},
)
TRAIN_EXAMPLE_2 = dict(
    words=example_words_2,
    lemmas=example_lemmas_2,
    tags=example_tags,
    morphs=example_morphs,
    deps=example_deps,
    heads=[1, 1, 1],
    pos=example_pos,
    ents=example_ents,
    spans=example_spans,
    cats={"CAT": 0.0, "DOG": 1.0},
)
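
# In the shared annotations above, heads=[1, 1, 1] attaches every token to
# token 1 ("like"), the ROOT of each three-token sentence, consistent with
# deps=["nsubj", "ROOT", "dobj"].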


@pytest.mark.slow
@pytest.mark.parametrize(
    "component,examples",
    [
        ("tagger", [TRAIN_EXAMPLE_1, TRAIN_EXAMPLE_2]),
        ("morphologizer", [TRAIN_EXAMPLE_1, TRAIN_EXAMPLE_2]),
        ("trainable_lemmatizer", [TRAIN_EXAMPLE_1, TRAIN_EXAMPLE_2]),
        ("parser", [TRAIN_EXAMPLE_1] * 30),
        ("ner", [TRAIN_EXAMPLE_1, TRAIN_EXAMPLE_2]),
        ("spancat", [TRAIN_EXAMPLE_1, TRAIN_EXAMPLE_2]),
        ("textcat", [TRAIN_EXAMPLE_1, TRAIN_EXAMPLE_2]),
    ],
)
def test_init_config_trainable(component, examples, en_vocab):
    if component == "textcat":
        train_docs = []
        for example in examples:
            doc = Doc(en_vocab, words=example["words"])
            doc.cats = example["cats"]
            train_docs.append(doc)
    elif component == "spancat":
        train_docs = []
        for example in examples:
            doc = Doc(en_vocab, words=example["words"])
            doc.spans["sc"] = [
                Span(doc, start, end, label) for start, end, label in example["spans"]
            ]
            train_docs.append(doc)
    else:
        train_docs = []
        for example in examples:
            # cats, spans are not valid kwargs for instantiating a Doc
            example = {k: v for k, v in example.items() if k not in ("cats", "spans")}
            doc = Doc(en_vocab, **example)
            train_docs.append(doc)

    with make_tempdir() as d_in:
        train_bin = DocBin(docs=train_docs)
        train_bin.to_disk(d_in / "train.spacy")
        dev_bin = DocBin(docs=train_docs)
        dev_bin.to_disk(d_in / "dev.spacy")
        init_config_result = CliRunner().invoke(
            app,
            [
                "init",
                "config",
                f"{d_in}/config.cfg",
                "--lang",
                "en",
                "--pipeline",
                component,
            ],
        )
        assert init_config_result.exit_code == 0
        train_result = CliRunner().invoke(
            app,
            [
                "train",
                f"{d_in}/config.cfg",
                "--paths.train",
                f"{d_in}/train.spacy",
                "--paths.dev",
                f"{d_in}/dev.spacy",
                "--output",
                f"{d_in}/model",
            ],
        )
        assert train_result.exit_code == 0
        assert Path(d_in / "model" / "model-last").exists()


@pytest.mark.slow
@pytest.mark.parametrize(
    "component,examples",
    [("tagger,parser,morphologizer", [TRAIN_EXAMPLE_1, TRAIN_EXAMPLE_2] * 15)],
)
def test_init_config_trainable_multiple(component, examples, en_vocab):
    train_docs = []
    for example in examples:
        example = {k: v for k, v in example.items() if k not in ("cats", "spans")}
        doc = Doc(en_vocab, **example)
        train_docs.append(doc)

    with make_tempdir() as d_in:
        train_bin = DocBin(docs=train_docs)
        train_bin.to_disk(d_in / "train.spacy")
        dev_bin = DocBin(docs=train_docs)
        dev_bin.to_disk(d_in / "dev.spacy")
        init_config_result = CliRunner().invoke(
            app,
            [
                "init",
                "config",
                f"{d_in}/config.cfg",
                "--lang",
                "en",
                "--pipeline",
                component,
            ],
        )
        assert init_config_result.exit_code == 0
        train_result = CliRunner().invoke(
            app,
            [
                "train",
                f"{d_in}/config.cfg",
                "--paths.train",
                f"{d_in}/train.spacy",
                "--paths.dev",
                f"{d_in}/dev.spacy",
                "--output",
                f"{d_in}/model",
            ],
        )
        assert train_result.exit_code == 0
        assert Path(d_in / "model" / "model-last").exists()
|