From f40d8c4ce7f04b93f7aeae012be0123e4faba826 Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Fri, 23 Dec 2022 19:01:13 +0900 Subject: [PATCH 01/23] Add commands for automatically modifying configs This continues work started in https://github.com/explosion/projects/pull/147, which provides features for automatically manipulating pipelines and configs. The functions included are: - merge: combine components from two pipelines and handle listeners - use_transformer: use transformer as feature source - use_tok2vec: use CNN tok2vec as feature source - resume: make a version of a config for resuming training Currently these are all grouped under a new `spacy configure` command. That may not be the best place for them; in particular, `merge` may belong elsewhere, since it outputs a pipeline rather than a config. The current state of the PR is that the commands run, but there's only one small test, and docs haven't been written yet. Docs can be started but will depend somewhat on how the naming issues work out. 
--- spacy/cli/__init__.py | 2 + spacy/cli/_util.py | 3 + spacy/cli/configure.py | 330 ++++++++++++++++++++++++++++++++++++++++ spacy/tests/test_cli.py | 17 +++ 4 files changed, 352 insertions(+) create mode 100644 spacy/cli/configure.py diff --git a/spacy/cli/__init__.py b/spacy/cli/__init__.py index aabd1cfef..59c763b6b 100644 --- a/spacy/cli/__init__.py +++ b/spacy/cli/__init__.py @@ -29,6 +29,8 @@ from .project.push import project_push # noqa: F401 from .project.pull import project_pull # noqa: F401 from .project.document import project_document # noqa: F401 from .find_threshold import find_threshold # noqa: F401 +from .configure import merge, use_tok2vec, use_transformer # noqa: F401 +from .configure import configure_resume_cli # noqa: F401 @app.command("link", no_args_is_help=True, deprecated=True, hidden=True) diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py index c46abffe5..fa3d6e962 100644 --- a/spacy/cli/_util.py +++ b/spacy/cli/_util.py @@ -47,6 +47,7 @@ commands to check and validate your config files, training and evaluation data, and custom model implementations. """ INIT_HELP = """Commands for initializing configs and pipeline packages.""" +CONFIGURE_HELP = """Commands for automatically modifying configs.""" # Wrappers for Typer's annotations. Initially created to set defaults and to # keep the names short, but not needed at the moment. 
@@ -57,10 +58,12 @@ app = typer.Typer(name=NAME, help=HELP) project_cli = typer.Typer(name="project", help=PROJECT_HELP, no_args_is_help=True) debug_cli = typer.Typer(name="debug", help=DEBUG_HELP, no_args_is_help=True) init_cli = typer.Typer(name="init", help=INIT_HELP, no_args_is_help=True) +configure_cli = typer.Typer(name="configure", help=CONFIGURE_HELP, no_args_is_help=True) app.add_typer(project_cli) app.add_typer(debug_cli) app.add_typer(init_cli) +app.add_typer(configure_cli) def setup_cli() -> None: diff --git a/spacy/cli/configure.py b/spacy/cli/configure.py new file mode 100644 index 000000000..aa28da480 --- /dev/null +++ b/spacy/cli/configure.py @@ -0,0 +1,330 @@ +from pathlib import Path +import re +from wasabi import msg +import typer + +import spacy +from spacy.language import Language + +from ._util import configure_cli, Arg, Opt + +# These are the architectures that are recognized as tok2vec/feature sources. +TOK2VEC_ARCHS = [("spacy", "Tok2Vec"), ("spacy-transformers", "TransformerModel")] +# These are the listeners. +LISTENER_ARCHS = [ + ("spacy", "Tok2VecListener"), + ("spacy-transformers", "TransformerListener"), +] + + +def _deep_get(obj, key, default): + """Given a multi-part key, try to get the key. If at any point this isn't + possible, return the default. + """ + out = None + slot = obj + for notch in key: + if slot is None or notch not in slot: + return default + slot = slot[notch] + return slot + + +def _get_tok2vecs(config): + """Given a pipeline config, return the names of components that are + tok2vecs (or Transformers). 
+ """ + out = [] + for name, comp in config["components"].items(): + arch = _deep_get(comp, ("model", "@architectures"), False) + if not arch: + continue + + ns, model, ver = arch.split(".") + if (ns, model) in TOK2VEC_ARCHS: + out.append(name) + return out + + +def _has_listener(nlp, pipe_name): + """Given a pipeline and a component name, check if it has a listener.""" + arch = _deep_get( + nlp.config, + ("components", pipe_name, "model", "tok2vec", "@architectures"), + False, + ) + if not arch: + return False + ns, model, ver = arch.split(".") + return (ns, model) in LISTENER_ARCHS + + +def _get_listeners(nlp): + """Get the name of every component that contains a listener. + + Does not check that they listen to the same thing; assumes a pipeline has + only one feature source. + """ + out = [] + for name in nlp.pipe_names: + if _has_listener(nlp, name): + out.append(name) + return out + + +def _increment_suffix(name): + """Given a name, return an incremented version. + + If no numeric suffix is found, return the original with "2" appended. + + This is used to avoid name collisions in pipelines. + """ + + res = re.search(r"\d+$", name) + if res is None: + return f"{name}2" + else: + num = res.match + prefix = name[0 : -len(num)] + return f"{prefix}{int(num) + 1}" + + +def _check_single_tok2vec(name, config): + """Check if there is just one tok2vec in a config. + + A very simple check, but used in multiple functions. + """ + tok2vecs = _get_tok2vecs(config) + fail_msg = f""" + Can't handle pipelines with more than one feature source, + but {name} has {len(tok2vecs)}.""" + if len(tok2vecs) > 1: + msg.fail(fail_msg, exits=1) + + +def _check_pipeline_names(nlp, nlp2): + """Given two pipelines, try to rename any collisions in component names. + + If a simple increment of a numeric suffix doesn't work, will give up. + """ + + fail_msg = """ + Tried automatically renaming {name} to {new_name}, but still + had a collision, so bailing out. 
Please make your pipe names + more unique. + """ + + # map of components to be renamed + rename = {} + # check pipeline names + names = nlp.pipe_names + for name in nlp2.pipe_names: + if name in names: + inc = _increment_suffix(name) + # TODO Would it be better to just keep incrementing? + if inc in names or inc in nlp2.pipe_names: + msg.fail(fail_msg.format(name=name, new_name=inc), exits=1) + rename[name] = inc + return rename + + +@configure_cli.command("resume") +def configure_resume_cli( + # fmt: off + base_model: Path = Arg(..., help="Path or name of base model to use for config"), + output_path: Path = Arg(..., help="File to save the config to or - for stdout (will only output config and no additional logging info)", allow_dash=True), + # fmt: on +): + """Create a config for resuming training. + + A config for resuming training is the same as the input config, but with + all components sourced. + """ + + nlp = spacy.load(base_model) + conf = nlp.config + + # Paths are not JSON serializable + path_str = str(base_model) + + for comp in nlp.pipe_names: + conf["components"][comp] = {"source": path_str} + + if str(output_path) == "-": + print(conf.to_str()) + else: + conf.to_disk(output_path) + msg.good("Saved config", output_path) + + return conf + + +@configure_cli.command("transformer") +def use_transformer( + base_model: str, output_path: Path, transformer_name: str = "roberta-base" +): + """Replace pipeline tok2vec with transformer.""" + + # 1. identify tok2vec + # 2. replace tok2vec + # 3. replace listeners + nlp = spacy.load(base_model) + _check_single_tok2vec(base_model, nlp.config) + + tok2vecs = _get_tok2vecs(nlp.config) + assert len(tok2vecs) > 0, "Must have tok2vec to replace!" + + nlp.remove_pipe(tok2vecs[0]) + # the rest can be default values + trf_config = { + "model": { + "name": transformer_name, + } + } + trf = nlp.add_pipe("transformer", config=trf_config, first=True) + + # TODO maybe remove vectors? 
+ + # now update the listeners + listeners = _get_listeners(nlp) + for listener in listeners: + listener_config = { + "@architectures": "spacy-transformers.TransformerListener.v1", + "grad_factor": 1.0, + "upstream": "transformer", + "pooling": {"@layers": "reduce_mean.v1"}, + } + nlp.config["components"][listener]["model"]["tok2vec"] = listener_config + + if str(output_path) == "-": + print(nlp.config.to_str()) + else: + nlp.config.to_disk(output_path) + msg.good("Saved config", output_path) + + return nlp.config + + +@configure_cli.command("tok2vec") +def use_tok2vec(base_model: str, output_path: Path) -> Language: + """Replace pipeline tok2vec with CNN tok2vec.""" + nlp = spacy.load(base_model) + _check_single_tok2vec(base_model, nlp.config) + + tok2vecs = _get_tok2vecs(nlp.config) + assert len(tok2vecs) > 0, "Must have tok2vec to replace!" + + nlp.remove_pipe(tok2vecs[0]) + + tok2vec = nlp.add_pipe("tok2vec", first=True) + width = "${components.tok2vec.model.encode:width}" + + listeners = _get_listeners(nlp) + for listener in listeners: + listener_config = { + "@architectures": "spacy.Tok2VecListener.v1", + "width": width, + "upstream": "tok2vec", + } + nlp.config["components"][listener]["model"]["tok2vec"] = listener_config + + if str(output_path) == "-": + print(nlp.config.to_str()) + else: + nlp.config.to_disk(output_path) + msg.good("Saved config", output_path) + + return nlp.config + + +def _inner_merge(nlp, nlp2, replace_listeners=False) -> Language: + """Actually do the merge. + + nlp: Base pipeline to add components to. + nlp2: Pipeline to add components from. + replace_listeners (bool): Whether to replace listeners. Usually only true + if there's one listener. + returns: assembled pipeline. 
+ """ + + # we checked earlier, so there's definitely just one + tok2vec_name = _get_tok2vecs(nlp2.config)[0] + rename = _check_pipeline_names(nlp, nlp2) + + if len(_get_listeners(nlp2)) > 1: + if replace_listeners: + msg.warn( + """ + Replacing listeners for multiple components. Note this can make + your pipeline large and slow. Consider chaining pipelines (like + nlp2(nlp(text))) instead. + """ + ) + else: + # TODO provide a guide for what to do here + msg.warn( + """ + The result of this merge will have two feature sources + (tok2vecs) and multiple listeners. This will work for + inference, but will probably not work when training without + extra adjustment. If you continue to train the pipelines + separately this is not a problem. + """ + ) + + for comp in nlp2.pipe_names: + if replace_listeners and comp == tok2vec_name: + # the tok2vec should not be copied over + continue + if replace_listeners and _has_listener(nlp2, comp): + # TODO does "model.tok2vec" work for everything? + nlp2.replace_listeners(tok2vec_name, comp, ["model.tok2vec"]) + nlp.add_pipe(comp, source=nlp2, name=rename.get(comp, comp)) + if comp in rename: + msg.info(f"Renaming {comp} to {rename[comp]} to avoid collision...") + return nlp + + +@configure_cli.command("merge") +def merge_pipelines(base_model: str, added_model: str, output_path: Path) -> Language: + """Combine components from multiple pipelines.""" + nlp = spacy.load(base_model) + nlp2 = spacy.load(added_model) + + # to merge models: + # - lang must be the same + # - vectors must be the same + # - vocabs must be the same (how to check?) 
+ # - tokenizer must be the same (only partially checkable) + if nlp.lang != nlp2.lang: + msg.fail("Can't merge - languages don't match", exits=1) + + # check vector equality + if ( + nlp.vocab.vectors.shape != nlp2.vocab.vectors.shape + or nlp.vocab.vectors.key2row != nlp2.vocab.vectors.key2row + or nlp.vocab.vectors.to_bytes(exclude=["strings"]) + != nlp2.vocab.vectors.to_bytes(exclude=["strings"]) + ): + msg.fail("Can't merge - vectors don't match", exits=1) + + if nlp.config["nlp"]["tokenizer"] != nlp2.config["nlp"]["tokenizer"]: + msg.fail("Can't merge - tokenizers don't match", exits=1) + + # Check that each pipeline only has one feature source + _check_single_tok2vec(base_model, nlp.config) + _check_single_tok2vec(added_model, nlp2.config) + + # Check how many listeners there are and replace based on that + # TODO: option to recognize frozen tok2vecs + # TODO: take list of pipe names to copy + listeners = _get_listeners(nlp2) + replace_listeners = len(listeners) == 1 + print(replace_listeners, len(listeners)) + nlp_out = _inner_merge(nlp, nlp2, replace_listeners=replace_listeners) + + # write the final pipeline + nlp.to_disk(output_path) + msg.info(f"Saved pipeline to: {output_path}") + + return nlp diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index c6768a3fd..592999680 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -20,6 +20,7 @@ from spacy.cli._util import parse_config_overrides, string_to_list from spacy.cli._util import substitute_project_variables from spacy.cli._util import validate_project_commands from spacy.cli._util import upload_file, download_file +from spacy.cli.configure import configure_resume_cli from spacy.cli.debug_data import _compile_gold, _get_labels_from_model from spacy.cli.debug_data import _get_labels_from_spancat from spacy.cli.debug_data import _get_distribution, _get_kl_divergence @@ -1185,3 +1186,19 @@ def test_upload_download_local_file(): download_file(remote_file, local_file) with 
local_file.open(mode="r") as file_: assert file_.read() == content + + +def test_configure_resume(tmp_path): + nlp = spacy.blank("en") + nlp.add_pipe("ner") + nlp.add_pipe("textcat") + base_path = tmp_path / "base" + nlp.to_disk(base_path) + + out_path = tmp_path / "resume.cfg" + conf = configure_resume_cli(base_path, out_path) + + assert out_path.exists(), "Didn't save config file" + + for comp, val in conf["components"].items(): + assert "source" in val, f"Non-sourced component: {comp}" From ab2773e6b3b8ff2463c2c05061b5cc6e92cab1b4 Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Fri, 23 Dec 2022 19:31:21 +0900 Subject: [PATCH 02/23] Fix import --- spacy/cli/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/cli/__init__.py b/spacy/cli/__init__.py index 59c763b6b..383548057 100644 --- a/spacy/cli/__init__.py +++ b/spacy/cli/__init__.py @@ -29,7 +29,7 @@ from .project.push import project_push # noqa: F401 from .project.pull import project_pull # noqa: F401 from .project.document import project_document # noqa: F401 from .find_threshold import find_threshold # noqa: F401 -from .configure import merge, use_tok2vec, use_transformer # noqa: F401 +from .configure import merge_pipelines, use_tok2vec, use_transformer # noqa: F401 from .configure import configure_resume_cli # noqa: F401 From f3a928cb4b8967a0077c1762eced7932c5be1c3d Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Mon, 26 Dec 2022 14:55:40 +0900 Subject: [PATCH 03/23] Fix types --- spacy/cli/configure.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/spacy/cli/configure.py b/spacy/cli/configure.py index aa28da480..2d0f1ade2 100644 --- a/spacy/cli/configure.py +++ b/spacy/cli/configure.py @@ -2,6 +2,7 @@ from pathlib import Path import re from wasabi import msg import typer +from thinc.api import Config import spacy from spacy.language import Language @@ -162,7 +163,7 @@ def configure_resume_cli( @configure_cli.command("transformer") 
def use_transformer( base_model: str, output_path: Path, transformer_name: str = "roberta-base" -): +) -> Config: """Replace pipeline tok2vec with transformer.""" # 1. identify tok2vec @@ -206,7 +207,7 @@ def use_transformer( @configure_cli.command("tok2vec") -def use_tok2vec(base_model: str, output_path: Path) -> Language: +def use_tok2vec(base_model: str, output_path: Path) -> Config: """Replace pipeline tok2vec with CNN tok2vec.""" nlp = spacy.load(base_model) _check_single_tok2vec(base_model, nlp.config) From 836fd87b1e40e040b5df65b5c6da9a909001ae74 Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Tue, 27 Dec 2022 17:34:25 +0900 Subject: [PATCH 04/23] Add use_transformer test --- spacy/tests/test_cli.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index 592999680..3efd5b76c 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -20,7 +20,7 @@ from spacy.cli._util import parse_config_overrides, string_to_list from spacy.cli._util import substitute_project_variables from spacy.cli._util import validate_project_commands from spacy.cli._util import upload_file, download_file -from spacy.cli.configure import configure_resume_cli +from spacy.cli.configure import configure_resume_cli, use_transformer from spacy.cli.debug_data import _compile_gold, _get_labels_from_model from spacy.cli.debug_data import _get_labels_from_spancat from spacy.cli.debug_data import _get_distribution, _get_kl_divergence @@ -1202,3 +1202,16 @@ def test_configure_resume(tmp_path): for comp, val in conf["components"].items(): assert "source" in val, f"Non-sourced component: {comp}" + + +def test_use_transformer(tmp_path): + nlp = spacy.blank("en") + nlp.add_pipe("tok2vec") + base_path = tmp_path / "tok2vec_sample" + nlp.to_disk(base_path) + + out_path = tmp_path / "converted_to_trf" + conf = use_transformer(base_path, out_path) + assert out_path.exists(), "No model saved" + + assert 
"transformer" in conf["components"], "No transformer component" From dab7894cf38cfcd883fd98c69476be85a0d8fd6b Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Tue, 27 Dec 2022 17:34:34 +0900 Subject: [PATCH 05/23] Add HashEmbedCNN to list of tok2vec architectures --- spacy/cli/configure.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/spacy/cli/configure.py b/spacy/cli/configure.py index 2d0f1ade2..77d0dfe68 100644 --- a/spacy/cli/configure.py +++ b/spacy/cli/configure.py @@ -10,7 +10,11 @@ from spacy.language import Language from ._util import configure_cli, Arg, Opt # These are the architectures that are recognized as tok2vec/feature sources. -TOK2VEC_ARCHS = [("spacy", "Tok2Vec"), ("spacy-transformers", "TransformerModel")] +TOK2VEC_ARCHS = [ + ("spacy", "Tok2Vec"), + ("spacy", "HashEmbedCNN"), + ("spacy-transformers", "TransformerModel"), +] # These are the listeners. LISTENER_ARCHS = [ ("spacy", "Tok2VecListener"), From be95ef5221c81d1b83c6d8d3cba2a7ae9785aa48 Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Tue, 27 Dec 2022 17:55:08 +0900 Subject: [PATCH 06/23] TODO REVERT Try turning off batching Maybe this will fix the CI issue? --- azure-pipelines.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 0f7ea91f9..a792b04b6 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -1,5 +1,5 @@ trigger: - batch: true + batch: false branches: include: - "*" From a749d2def18586a38765f5514a56e23226b7226d Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Tue, 27 Dec 2022 18:38:29 +0900 Subject: [PATCH 07/23] Revert "TODO REVERT Try turning off batching" This reverts commit be95ef5221c81d1b83c6d8d3cba2a7ae9785aa48. 
--- azure-pipelines.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index a792b04b6..0f7ea91f9 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -1,5 +1,5 @@ trigger: - batch: false + batch: true branches: include: - "*" From 10bbb01bb67c0a3bacee45d398bc8c1b54be4b9d Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Tue, 27 Dec 2022 18:47:24 +0900 Subject: [PATCH 08/23] Test use_tok2vec, not use_transformer Adding the transformer component requires spacy-transformers, which isn't present in the normal test env. --- spacy/cli/configure.py | 9 ++++++++- spacy/tests/test_cli.py | 13 +++++++------ 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/spacy/cli/configure.py b/spacy/cli/configure.py index 77d0dfe68..052851b62 100644 --- a/spacy/cli/configure.py +++ b/spacy/cli/configure.py @@ -186,7 +186,14 @@ def use_transformer( "name": transformer_name, } } - trf = nlp.add_pipe("transformer", config=trf_config, first=True) + try: + trf = nlp.add_pipe("transformer", config=trf_config, first=True) + except ValueError: + fail_msg = ( + "Configuring a transformer requires spacy-transformers. " + "Install with: pip install spacy-transformers" + ) + msg.fail(fail_msg, exits=1) # TODO maybe remove vectors? 
diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index 3efd5b76c..627c59b1e 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -20,7 +20,7 @@ from spacy.cli._util import parse_config_overrides, string_to_list from spacy.cli._util import substitute_project_variables from spacy.cli._util import validate_project_commands from spacy.cli._util import upload_file, download_file -from spacy.cli.configure import configure_resume_cli, use_transformer +from spacy.cli.configure import configure_resume_cli, use_tok2vec from spacy.cli.debug_data import _compile_gold, _get_labels_from_model from spacy.cli.debug_data import _get_labels_from_spancat from spacy.cli.debug_data import _get_distribution, _get_kl_divergence @@ -1204,14 +1204,15 @@ def test_configure_resume(tmp_path): assert "source" in val, f"Non-sourced component: {comp}" -def test_use_transformer(tmp_path): +def test_use_tok2vec(tmp_path): + # Can't add a transformer here because spacy-transformers might not be present nlp = spacy.blank("en") nlp.add_pipe("tok2vec") - base_path = tmp_path / "tok2vec_sample" + base_path = tmp_path / "tok2vec_sample_2" nlp.to_disk(base_path) - out_path = tmp_path / "converted_to_trf" - conf = use_transformer(base_path, out_path) + out_path = tmp_path / "converted_to_tok2vec" + conf = use_tok2vec(base_path, out_path) assert out_path.exists(), "No model saved" - assert "transformer" in conf["components"], "No transformer component" + assert "tok2vec" in conf["components"], "No tok2vec component" From 2791f0b552f2c684206310d2d66ba5cf80e55a36 Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Tue, 27 Dec 2022 19:16:59 +0900 Subject: [PATCH 09/23] Add test for merging pipelines --- spacy/tests/test_cli.py | 37 ++++++++++++++++++++++++++++++++++++- 1 file changed, 36 insertions(+), 1 deletion(-) diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index 627c59b1e..7faab850a 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ 
-20,7 +20,7 @@ from spacy.cli._util import parse_config_overrides, string_to_list from spacy.cli._util import substitute_project_variables from spacy.cli._util import validate_project_commands from spacy.cli._util import upload_file, download_file -from spacy.cli.configure import configure_resume_cli, use_tok2vec +from spacy.cli.configure import configure_resume_cli, use_tok2vec, merge_pipelines from spacy.cli.debug_data import _compile_gold, _get_labels_from_model from spacy.cli.debug_data import _get_labels_from_spancat from spacy.cli.debug_data import _get_distribution, _get_kl_divergence @@ -1216,3 +1216,38 @@ def test_use_tok2vec(tmp_path): assert out_path.exists(), "No model saved" assert "tok2vec" in conf["components"], "No tok2vec component" + + +def test_merge_pipelines(tmp_path): + + # width is a placeholder, since we won't actually train this + listener_config = { + "model": { + "tok2vec": {"@architectures": "spacy.Tok2VecListener.v1", "width": "0"} + } + } + # base pipeline + base = spacy.blank("en") + base.add_pipe("tok2vec") + base.add_pipe("ner", config=listener_config) + base_path = tmp_path / "merge_base" + base.to_disk(base_path) + + # added pipeline + added = spacy.blank("en") + added.add_pipe("tok2vec") + added.add_pipe("ner", config=listener_config) + added_path = tmp_path / "merge_added" + added.to_disk(added_path) + + # these should combine and not have a name collision + out_path = tmp_path / "merge_result" + merged = merge_pipelines(base_path, added_path, out_path) + + # will give a key error if not present + merged.get_pipe("ner") + merged.get_pipe("ner2") + + ner2_conf = merged.config["components"]["ner2"] + arch = ner2_conf["model"]["tok2vec"]["@architectures"] + assert arch == "spacy.HashEmbedCNN.v2", "Wrong arch - listener not replaced?" 
From f2bbab46236ecb6fbdf38f44708a0313a7f02673 Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Wed, 11 Jan 2023 16:06:50 +0900 Subject: [PATCH 10/23] Add docs for configure command This also changes the `output_file` arg to match other commands. --- spacy/cli/configure.py | 30 +++++++------- website/docs/api/cli.md | 89 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 104 insertions(+), 15 deletions(-) diff --git a/spacy/cli/configure.py b/spacy/cli/configure.py index 052851b62..75d115ab7 100644 --- a/spacy/cli/configure.py +++ b/spacy/cli/configure.py @@ -137,7 +137,7 @@ def _check_pipeline_names(nlp, nlp2): def configure_resume_cli( # fmt: off base_model: Path = Arg(..., help="Path or name of base model to use for config"), - output_path: Path = Arg(..., help="File to save the config to or - for stdout (will only output config and no additional logging info)", allow_dash=True), + output_file: Path = Arg(..., help="File to save the config to or - for stdout (will only output config and no additional logging info)", allow_dash=True), # fmt: on ): """Create a config for resuming training. 
@@ -155,18 +155,18 @@ def configure_resume_cli( for comp in nlp.pipe_names: conf["components"][comp] = {"source": path_str} - if str(output_path) == "-": + if str(output_file) == "-": print(conf.to_str()) else: - conf.to_disk(output_path) - msg.good("Saved config", output_path) + conf.to_disk(output_file) + msg.good("Saved config", output_file) return conf @configure_cli.command("transformer") def use_transformer( - base_model: str, output_path: Path, transformer_name: str = "roberta-base" + base_model: str, output_file: Path, transformer_name: str = "roberta-base" ) -> Config: """Replace pipeline tok2vec with transformer.""" @@ -208,17 +208,17 @@ def use_transformer( } nlp.config["components"][listener]["model"]["tok2vec"] = listener_config - if str(output_path) == "-": + if str(output_file) == "-": print(nlp.config.to_str()) else: - nlp.config.to_disk(output_path) - msg.good("Saved config", output_path) + nlp.config.to_disk(output_file) + msg.good("Saved config", output_file) return nlp.config @configure_cli.command("tok2vec") -def use_tok2vec(base_model: str, output_path: Path) -> Config: +def use_tok2vec(base_model: str, output_file: Path) -> Config: """Replace pipeline tok2vec with CNN tok2vec.""" nlp = spacy.load(base_model) _check_single_tok2vec(base_model, nlp.config) @@ -240,11 +240,11 @@ def use_tok2vec(base_model: str, output_path: Path) -> Config: } nlp.config["components"][listener]["model"]["tok2vec"] = listener_config - if str(output_path) == "-": + if str(output_file) == "-": print(nlp.config.to_str()) else: - nlp.config.to_disk(output_path) - msg.good("Saved config", output_path) + nlp.config.to_disk(output_file) + msg.good("Saved config", output_file) return nlp.config @@ -298,7 +298,7 @@ def _inner_merge(nlp, nlp2, replace_listeners=False) -> Language: @configure_cli.command("merge") -def merge_pipelines(base_model: str, added_model: str, output_path: Path) -> Language: +def merge_pipelines(base_model: str, added_model: str, output_file: Path) -> 
Language: """Combine components from multiple pipelines.""" nlp = spacy.load(base_model) nlp2 = spacy.load(added_model) @@ -336,7 +336,7 @@ def merge_pipelines(base_model: str, added_model: str, output_path: Path) -> Lan nlp_out = _inner_merge(nlp, nlp2, replace_listeners=replace_listeners) # write the final pipeline - nlp.to_disk(output_path) - msg.info(f"Saved pipeline to: {output_path}") + nlp.to_disk(output_file) + msg.info(f"Saved pipeline to: {output_file}") return nlp diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md index 275e37ee0..c2ba9d933 100644 --- a/website/docs/api/cli.md +++ b/website/docs/api/cli.md @@ -7,6 +7,7 @@ menu: - ['info', 'info'] - ['validate', 'validate'] - ['init', 'init'] + - ['configure', 'configure'] - ['convert', 'convert'] - ['debug', 'debug'] - ['train', 'train'] @@ -249,6 +250,94 @@ $ python -m spacy init labels [config_path] [output_path] [--code] [--verbose] [ | overrides | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.train ./train.spacy`. ~~Any (option/flag)~~ | | **CREATES** | The label files. | +## configure {#configure new="TODO"} + +Modify or combine existing configs in high-level ways. Can be used to automate +config changes made as part of the development cycle. + +### configure resume {#configure-resume tag="command"} + +Modify the input config for use in resuming training. When resuming training, +all components are sourced from the previously trained pipeline. + +```cli +$ python -m spacy configure resume [base_model] [output_file] +``` + +| Name | Description | +| ------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `base_model` | A trained pipeline to resume training (package name or path). 
~~str (positional)~~ | +| `output_file` | Path to output `.cfg` file or `-` to write the config to stdout (so you can pipe it forward to a file or to the `train` command). Note that if you're writing to stdout, no additional logging info is printed. ~~Path (positional)~~ | + +### configure transformer {#configure-transformer tag="command"} + +Modify the base config to use a transformer component, optionally specifying the +base transformer to use. Useful for converting a CNN tok2vec pipeline to use +transformers. + +During development of a model, you can use a CNN tok2vec for faster training +time and reduced hardware requirements, and then use this command to convert +your pipeline to use a transformer once you've verified a proof of concept. This +can also help isolate whether any training issues are transformer-related or +not. + +```cli +$ python -m spacy configure transformer [base_model] [output_file] [--transformer_name] +``` + +| Name | Description | +| ------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `base_model` | A trained pipeline to resume training (package name or path). ~~str (positional)~~ | +| `output_file` | Path to output `.cfg` file or `-` to write the config to stdout (so you can pipe it forward to a file or to the `train` command). Note that if you're writing to stdout, no additional logging info is printed. ~~Path (positional)~~ | +| `transformer_name` | The name of the base HuggingFace model to use. Defaults to `roberta-base`. ~~str (option)~~ | + +### configure tok2vec {#configure-tok2vec tag="command"} + +Modify the base model config to use a CNN tok2vec component. Useful for +generating a config from a transformer-based model for faster training +iteration. 
+ +```cli +$ python -m spacy configure tok2vec [base_model] [output_file] +``` + +| Name | Description | +| ------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `base_model` | A trained pipeline to resume training (package name or path). ~~str (positional)~~ | +| `output_file` | Path to output `.cfg` file or `-` to write the config to stdout (so you can pipe it forward to a file or to the `train` command). Note that if you're writing to stdout, no additional logging info is printed. ~~Path (positional)~~ | + +### configure merge {#configure-merge tag="command"} + +Take two pipelines and create a new one with components from both of them, +handling the configuration of listeners. Note that unlike other commands, this +produces a whole pipeline, not just a config. + +Components in the final pipeline are in the same order as in the original +pipelines, with the base pipeline first and the added pipeline after. Because +pipeline names must be unique, if there is a name collision in components, the +later components will be automatically renamed. + +For components with listeners, the resulting pipeline structure depends on the +number of listeners. If the second pipeline has only one listener, then +[`replace_listeners`](https://spacy.io/api/language/#replace_listeners) will be +used. If there is more than one listener, `replace_listeners` will not be used. +In the multi-listener case, the resulting pipeline may require more adjustment +for training to work. + +This is useful if you have trained a specialized component, such as NER or +textcat, and want to provide it together with one of the official pretrained pipelines or +another pipeline. 
+ +```cli +$ python -m spacy configure tok2vec [base_model] [added_model] [output_file] +``` + +| Name | Description | +| ------------- | ---------------------------------------------------------------------------------------- | +| `base_model` | A trained pipeline (package name or path) to use as a base. ~~str (positional)~~ | +| `added_model` | A trained pipeline (package name or path) to combine with the base. ~~str (positional)~~ | +| `output_file` | Path to output pipeline. ~~Path (positional)~~ | + ## convert {#convert tag="command"} Convert files into spaCy's From 10adbcb86db07056f93eae6c24238fdd4e448397 Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Thu, 12 Jan 2023 13:53:58 +0900 Subject: [PATCH 11/23] Use new-style tags in docs --- website/docs/api/cli.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md index c2ba9d933..35965b466 100644 --- a/website/docs/api/cli.md +++ b/website/docs/api/cli.md @@ -250,12 +250,12 @@ $ python -m spacy init labels [config_path] [output_path] [--code] [--verbose] [ | overrides | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.train ./train.spacy`. ~~Any (option/flag)~~ | | **CREATES** | The label files. | -## configure {#configure new="TODO"} +## configure {id="configure", new="TODO"} Modify or combine existing configs in high-level ways. Can be used to automate config changes made as part of the development cycle. -### configure resume {#configure-resume tag="command"} +### configure resume {id="configure-resume", tag="command"} Modify the input config for use in resuming training. When resuming training, all components are sourced from the previously trained pipeline. @@ -269,7 +269,7 @@ $ python -m spacy configure resume [base_model] [output_file] | `base_model` | A trained pipeline to resume training (package name or path). 
~~str (positional)~~ | | `output_file` | Path to output `.cfg` file or `-` to write the config to stdout (so you can pipe it forward to a file or to the `train` command). Note that if you're writing to stdout, no additional logging info is printed. ~~Path (positional)~~ | -### configure transformer {#configure-transformer tag="command"} +### configure transformer {id="configure-transformer", tag="command"} Modify the base config to use a transformer component, optionally specifying the base transformer to use. Useful for converting a CNN tok2vec pipeline to use @@ -291,7 +291,7 @@ $ python -m spacy configure transformer [base_model] [output_file] [--transforme | `output_file` | Path to output `.cfg` file or `-` to write the config to stdout (so you can pipe it forward to a file or to the `train` command). Note that if you're writing to stdout, no additional logging info is printed. ~~Path (positional)~~ | | `transformer_name` | The name of the base HuggingFace model to use. Defaults to `roberta-base`. ~~str (option)~~ | -### configure tok2vec {#configure-tok2vec tag="command"} +### configure tok2vec {id="configure-tok2vec", tag="command"} Modify the base model config to use a CNN tok2vec component. Useful for generating a config from a transformer-based model for faster training @@ -306,7 +306,7 @@ $ python -m spacy configure tok2vec [base_model] [output_file] | `base_model` | A trained pipeline to resume training (package name or path). ~~str (positional)~~ | | `output_file` | Path to output `.cfg` file or `-` to write the config to stdout (so you can pipe it forward to a file or to the `train` command). Note that if you're writing to stdout, no additional logging info is printed. ~~Path (positional)~~ | -### configure merge {#configure-merge tag="command"} +### configure merge {id="configure-merge", tag="command"} Take two pipelines and create a new one with components from both of them, handling the configuration of listeners. 
Note that unlike other commands, this From 4bad2962938d70c6beba42bdbd977393922dbe24 Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Thu, 12 Jan 2023 17:30:59 +0900 Subject: [PATCH 12/23] Fix new-style header --- website/docs/api/cli.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/api/cli.mdx b/website/docs/api/cli.mdx index 08c422a17..0c982f389 100644 --- a/website/docs/api/cli.mdx +++ b/website/docs/api/cli.mdx @@ -250,7 +250,7 @@ $ python -m spacy init labels [config_path] [output_path] [--code] [--verbose] [ | overrides | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.train ./train.spacy`. ~~Any (option/flag)~~ | | **CREATES** | The label files. | -## configure {id="configure", new="TODO"} +## configure {id="configure", version="TODO"} Modify or combine existing configs in high-level ways. Can be used to automate config changes made as part of the development cycle. From 3fe723c1f99326e8ccf41bd299126572bf2b2bb2 Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Fri, 13 Jan 2023 14:10:18 +0900 Subject: [PATCH 13/23] Cleanup This removes one old print statement and some old TODOs. Some TODOs are left as future work. --- spacy/cli/configure.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/spacy/cli/configure.py b/spacy/cli/configure.py index 75d115ab7..7db39973a 100644 --- a/spacy/cli/configure.py +++ b/spacy/cli/configure.py @@ -126,7 +126,6 @@ def _check_pipeline_names(nlp, nlp2): for name in nlp2.pipe_names: if name in names: inc = _increment_suffix(name) - # TODO Would it be better to just keep incrementing? if inc in names or inc in nlp2.pipe_names: msg.fail(fail_msg.format(name=name, new_name=inc), exits=1) rename[name] = inc @@ -195,8 +194,6 @@ def use_transformer( ) msg.fail(fail_msg, exits=1) - # TODO maybe remove vectors? 
- # now update the listeners listeners = _get_listeners(nlp) for listener in listeners: @@ -289,7 +286,6 @@ def _inner_merge(nlp, nlp2, replace_listeners=False) -> Language: # the tok2vec should not be copied over continue if replace_listeners and _has_listener(nlp2, comp): - # TODO does "model.tok2vec" work for everything? nlp2.replace_listeners(tok2vec_name, comp, ["model.tok2vec"]) nlp.add_pipe(comp, source=nlp2, name=rename.get(comp, comp)) if comp in rename: @@ -306,7 +302,7 @@ def merge_pipelines(base_model: str, added_model: str, output_file: Path) -> Lan # to merge models: # - lang must be the same # - vectors must be the same - # - vocabs must be the same (how to check?) + # - vocabs must be the same # - tokenizer must be the same (only partially checkable) if nlp.lang != nlp2.lang: msg.fail("Can't merge - languages don't match", exits=1) @@ -329,10 +325,9 @@ def merge_pipelines(base_model: str, added_model: str, output_file: Path) -> Lan # Check how many listeners there are and replace based on that # TODO: option to recognize frozen tok2vecs - # TODO: take list of pipe names to copy + # TODO: take list of pipe names to copy, ignore others listeners = _get_listeners(nlp2) replace_listeners = len(listeners) == 1 - print(replace_listeners, len(listeners)) nlp_out = _inner_merge(nlp, nlp2, replace_listeners=replace_listeners) # write the final pipeline From 9d3e3e6be2658b046de5691343ed8cc2b7968e69 Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Tue, 31 Jan 2023 12:46:06 +0900 Subject: [PATCH 14/23] Add types to _deep_get --- spacy/cli/configure.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/spacy/cli/configure.py b/spacy/cli/configure.py index 7db39973a..488978827 100644 --- a/spacy/cli/configure.py +++ b/spacy/cli/configure.py @@ -3,6 +3,7 @@ import re from wasabi import msg import typer from thinc.api import Config +from typing import Any, Dict, Iterable import spacy from spacy.language import Language @@ -22,7 +23,7 @@ 
LISTENER_ARCHS = [ ] -def _deep_get(obj, key, default): +def _deep_get(obj: Dict[str, Any], key: Iterable[str], default: Any): """Given a multi-part key, try to get the key. If at any point this isn't possible, return the default. """ From 9d0ae2407b82f0b46b19a927b13ff743f9b3051b Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Tue, 31 Jan 2023 13:08:51 +0900 Subject: [PATCH 15/23] Add types to everything --- spacy/cli/configure.py | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/spacy/cli/configure.py b/spacy/cli/configure.py index 488978827..884ba68fe 100644 --- a/spacy/cli/configure.py +++ b/spacy/cli/configure.py @@ -3,7 +3,7 @@ import re from wasabi import msg import typer from thinc.api import Config -from typing import Any, Dict, Iterable +from typing import Any, Dict, Iterable, List, Union import spacy from spacy.language import Language @@ -23,7 +23,9 @@ LISTENER_ARCHS = [ ] -def _deep_get(obj: Dict[str, Any], key: Iterable[str], default: Any): +def _deep_get( + obj: Union[Dict[str, Any], Config], key: Iterable[str], default: Any +) -> Any: """Given a multi-part key, try to get the key. If at any point this isn't possible, return the default. """ @@ -36,7 +38,7 @@ def _deep_get(obj: Dict[str, Any], key: Iterable[str], default: Any): return slot -def _get_tok2vecs(config): +def _get_tok2vecs(config: Config) -> List[str]: """Given a pipeline config, return the names of components that are tok2vecs (or Transformers). """ @@ -52,7 +54,7 @@ def _get_tok2vecs(config): return out -def _has_listener(nlp, pipe_name): +def _has_listener(nlp: Language, pipe_name: str): """Given a pipeline and a component name, check if it has a listener.""" arch = _deep_get( nlp.config, @@ -65,7 +67,7 @@ def _has_listener(nlp, pipe_name): return (ns, model) in LISTENER_ARCHS -def _get_listeners(nlp): +def _get_listeners(nlp: Language) -> List[str]: """Get the name of every component that contains a listener. 
Does not check that they listen to the same thing; assumes a pipeline has @@ -78,7 +80,7 @@ def _get_listeners(nlp): return out -def _increment_suffix(name): +def _increment_suffix(name: str) -> str: """Given a name, return an incremented version. If no numeric suffix is found, return the original with "2" appended. @@ -90,12 +92,12 @@ def _increment_suffix(name): if res is None: return f"{name}2" else: - num = res.match + num = res.group() prefix = name[0 : -len(num)] return f"{prefix}{int(num) + 1}" -def _check_single_tok2vec(name, config): +def _check_single_tok2vec(name: str, config: Config) -> None: """Check if there is just one tok2vec in a config. A very simple check, but used in multiple functions. @@ -108,7 +110,7 @@ def _check_single_tok2vec(name, config): msg.fail(fail_msg, exits=1) -def _check_pipeline_names(nlp, nlp2): +def _check_pipeline_names(nlp: Language, nlp2: Language) -> Dict[str, str]: """Given two pipelines, try to rename any collisions in component names. If a simple increment of a numeric suffix doesn't work, will give up. @@ -139,7 +141,7 @@ def configure_resume_cli( base_model: Path = Arg(..., help="Path or name of base model to use for config"), output_file: Path = Arg(..., help="File to save the config to or - for stdout (will only output config and no additional logging info)", allow_dash=True), # fmt: on -): +) -> Config: """Create a config for resuming training. A config for resuming training is the same as the input config, but with @@ -247,7 +249,9 @@ def use_tok2vec(base_model: str, output_file: Path) -> Config: return nlp.config -def _inner_merge(nlp, nlp2, replace_listeners=False) -> Language: +def _inner_merge( + nlp: Language, nlp2: Language, replace_listeners: bool = False +) -> Language: """Actually do the merge. nlp: Base pipeline to add components to. 
From 03a0c2badcc3e0355d52110fa0c526e8bbb52324 Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Thu, 9 Feb 2023 15:16:50 +0900 Subject: [PATCH 16/23] Move merge to independent command --- spacy/cli/__init__.py | 3 +- spacy/cli/configure.py | 93 --------------------------------- spacy/cli/merge.py | 108 +++++++++++++++++++++++++++++++++++++++ spacy/tests/test_cli.py | 3 +- website/docs/api/cli.mdx | 8 +-- 5 files changed, 116 insertions(+), 99 deletions(-) create mode 100644 spacy/cli/merge.py diff --git a/spacy/cli/__init__.py b/spacy/cli/__init__.py index 383548057..4041e01b6 100644 --- a/spacy/cli/__init__.py +++ b/spacy/cli/__init__.py @@ -29,8 +29,9 @@ from .project.push import project_push # noqa: F401 from .project.pull import project_pull # noqa: F401 from .project.document import project_document # noqa: F401 from .find_threshold import find_threshold # noqa: F401 -from .configure import merge_pipelines, use_tok2vec, use_transformer # noqa: F401 +from .configure import use_tok2vec, use_transformer # noqa: F401 from .configure import configure_resume_cli # noqa: F401 +from .merge import merge_pipelines # noqa: F401 @app.command("link", no_args_is_help=True, deprecated=True, hidden=True) diff --git a/spacy/cli/configure.py b/spacy/cli/configure.py index 884ba68fe..5f31ca0b4 100644 --- a/spacy/cli/configure.py +++ b/spacy/cli/configure.py @@ -247,96 +247,3 @@ def use_tok2vec(base_model: str, output_file: Path) -> Config: msg.good("Saved config", output_file) return nlp.config - - -def _inner_merge( - nlp: Language, nlp2: Language, replace_listeners: bool = False -) -> Language: - """Actually do the merge. - - nlp: Base pipeline to add components to. - nlp2: Pipeline to add components from. - replace_listeners (bool): Whether to replace listeners. Usually only true - if there's one listener. - returns: assembled pipeline. 
- """ - - # we checked earlier, so there's definitely just one - tok2vec_name = _get_tok2vecs(nlp2.config)[0] - rename = _check_pipeline_names(nlp, nlp2) - - if len(_get_listeners(nlp2)) > 1: - if replace_listeners: - msg.warn( - """ - Replacing listeners for multiple components. Note this can make - your pipeline large and slow. Consider chaining pipelines (like - nlp2(nlp(text))) instead. - """ - ) - else: - # TODO provide a guide for what to do here - msg.warn( - """ - The result of this merge will have two feature sources - (tok2vecs) and multiple listeners. This will work for - inference, but will probably not work when training without - extra adjustment. If you continue to train the pipelines - separately this is not a problem. - """ - ) - - for comp in nlp2.pipe_names: - if replace_listeners and comp == tok2vec_name: - # the tok2vec should not be copied over - continue - if replace_listeners and _has_listener(nlp2, comp): - nlp2.replace_listeners(tok2vec_name, comp, ["model.tok2vec"]) - nlp.add_pipe(comp, source=nlp2, name=rename.get(comp, comp)) - if comp in rename: - msg.info(f"Renaming {comp} to {rename[comp]} to avoid collision...") - return nlp - - -@configure_cli.command("merge") -def merge_pipelines(base_model: str, added_model: str, output_file: Path) -> Language: - """Combine components from multiple pipelines.""" - nlp = spacy.load(base_model) - nlp2 = spacy.load(added_model) - - # to merge models: - # - lang must be the same - # - vectors must be the same - # - vocabs must be the same - # - tokenizer must be the same (only partially checkable) - if nlp.lang != nlp2.lang: - msg.fail("Can't merge - languages don't match", exits=1) - - # check vector equality - if ( - nlp.vocab.vectors.shape != nlp2.vocab.vectors.shape - or nlp.vocab.vectors.key2row != nlp2.vocab.vectors.key2row - or nlp.vocab.vectors.to_bytes(exclude=["strings"]) - != nlp2.vocab.vectors.to_bytes(exclude=["strings"]) - ): - msg.fail("Can't merge - vectors don't match", exits=1) - - 
if nlp.config["nlp"]["tokenizer"] != nlp2.config["nlp"]["tokenizer"]: - msg.fail("Can't merge - tokenizers don't match", exits=1) - - # Check that each pipeline only has one feature source - _check_single_tok2vec(base_model, nlp.config) - _check_single_tok2vec(added_model, nlp2.config) - - # Check how many listeners there are and replace based on that - # TODO: option to recognize frozen tok2vecs - # TODO: take list of pipe names to copy, ignore others - listeners = _get_listeners(nlp2) - replace_listeners = len(listeners) == 1 - nlp_out = _inner_merge(nlp, nlp2, replace_listeners=replace_listeners) - - # write the final pipeline - nlp.to_disk(output_file) - msg.info(f"Saved pipeline to: {output_file}") - - return nlp diff --git a/spacy/cli/merge.py b/spacy/cli/merge.py new file mode 100644 index 000000000..6fd9e8153 --- /dev/null +++ b/spacy/cli/merge.py @@ -0,0 +1,108 @@ +from pathlib import Path +from wasabi import msg + +import spacy +from spacy.language import Language + +from ._util import app, Arg, Opt +from .configure import _check_single_tok2vec, _get_listeners, _get_tok2vecs +from .configure import _check_pipeline_names, _has_listener + + +def _inner_merge( + nlp: Language, nlp2: Language, replace_listeners: bool = False +) -> Language: + """Actually do the merge. + + nlp: Base pipeline to add components to. + nlp2: Pipeline to add components from. + replace_listeners (bool): Whether to replace listeners. Usually only true + if there's one listener. + returns: assembled pipeline. + """ + + # we checked earlier, so there's definitely just one + tok2vec_name = _get_tok2vecs(nlp2.config)[0] + rename = _check_pipeline_names(nlp, nlp2) + + if len(_get_listeners(nlp2)) > 1: + if replace_listeners: + msg.warn( + """ + Replacing listeners for multiple components. Note this can make + your pipeline large and slow. Consider chaining pipelines (like + nlp2(nlp(text))) instead. 
+ """ + ) + else: + # TODO provide a guide for what to do here + msg.warn( + """ + The result of this merge will have two feature sources + (tok2vecs) and multiple listeners. This will work for + inference, but will probably not work when training without + extra adjustment. If you continue to train the pipelines + separately this is not a problem. + """ + ) + + for comp in nlp2.pipe_names: + if replace_listeners and comp == tok2vec_name: + # the tok2vec should not be copied over + continue + if replace_listeners and _has_listener(nlp2, comp): + nlp2.replace_listeners(tok2vec_name, comp, ["model.tok2vec"]) + nlp.add_pipe(comp, source=nlp2, name=rename.get(comp, comp)) + if comp in rename: + msg.info(f"Renaming {comp} to {rename[comp]} to avoid collision...") + return nlp + + +@app.command("merge") +def merge_pipelines( + # fmt: off + base_model: str = Arg(..., help="Name or path of base model"), + added_model: str = Arg(..., help="Name or path of model to be merged"), + output_file: Path = Arg(..., help="Path to save merged model") + # fmt: on +) -> Language: + """Combine components from multiple pipelines.""" + nlp = spacy.load(base_model) + nlp2 = spacy.load(added_model) + + # to merge models: + # - lang must be the same + # - vectors must be the same + # - vocabs must be the same + # - tokenizer must be the same (only partially checkable) + if nlp.lang != nlp2.lang: + msg.fail("Can't merge - languages don't match", exits=1) + + # check vector equality + if ( + nlp.vocab.vectors.shape != nlp2.vocab.vectors.shape + or nlp.vocab.vectors.key2row != nlp2.vocab.vectors.key2row + or nlp.vocab.vectors.to_bytes(exclude=["strings"]) + != nlp2.vocab.vectors.to_bytes(exclude=["strings"]) + ): + msg.fail("Can't merge - vectors don't match", exits=1) + + if nlp.config["nlp"]["tokenizer"] != nlp2.config["nlp"]["tokenizer"]: + msg.fail("Can't merge - tokenizers don't match", exits=1) + + # Check that each pipeline only has one feature source + _check_single_tok2vec(base_model, 
nlp.config) + _check_single_tok2vec(added_model, nlp2.config) + + # Check how many listeners there are and replace based on that + # TODO: option to recognize frozen tok2vecs + # TODO: take list of pipe names to copy, ignore others + listeners = _get_listeners(nlp2) + replace_listeners = len(listeners) == 1 + nlp_out = _inner_merge(nlp, nlp2, replace_listeners=replace_listeners) + + # write the final pipeline + nlp.to_disk(output_file) + msg.info(f"Saved pipeline to: {output_file}") + + return nlp diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index 8b7352048..242a3a885 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -21,7 +21,7 @@ from spacy.cli._util import parse_config_overrides, string_to_list from spacy.cli._util import substitute_project_variables from spacy.cli._util import validate_project_commands from spacy.cli._util import upload_file, download_file -from spacy.cli.configure import configure_resume_cli, use_tok2vec, merge_pipelines +from spacy.cli.configure import configure_resume_cli, use_tok2vec from spacy.cli.debug_data import _compile_gold, _get_labels_from_model from spacy.cli.debug_data import _get_labels_from_spancat from spacy.cli.debug_data import _get_distribution, _get_kl_divergence @@ -30,6 +30,7 @@ from spacy.cli.debug_data import _print_span_characteristics from spacy.cli.debug_data import _get_spans_length_freq_dist from spacy.cli.download import get_compatibility, get_version from spacy.cli.init_config import RECOMMENDATIONS, init_config, fill_config +from spacy.cli.merge import merge_pipelines from spacy.cli.package import get_third_party_dependencies from spacy.cli.package import _is_permitted_package_name from spacy.cli.project.remote_storage import RemoteStorage diff --git a/website/docs/api/cli.mdx b/website/docs/api/cli.mdx index 0c982f389..19f1ae3f8 100644 --- a/website/docs/api/cli.mdx +++ b/website/docs/api/cli.mdx @@ -8,6 +8,7 @@ menu: - ['validate', 'validate'] - ['init', 'init'] - 
['configure', 'configure'] + - ['merge', 'merge'] - ['convert', 'convert'] - ['debug', 'debug'] - ['train', 'train'] @@ -306,11 +307,10 @@ $ python -m spacy configure tok2vec [base_model] [output_file] | `base_model` | A trained pipeline to resume training (package name or path). ~~str (positional)~~ | | `output_file` | Path to output `.cfg` file or `-` to write the config to stdout (so you can pipe it forward to a file or to the `train` command). Note that if you're writing to stdout, no additional logging info is printed. ~~Path (positional)~~ | -### configure merge {id="configure-merge", tag="command"} +## merge {id="merge", tag="command"} Take two pipelines and create a new one with components from both of them, -handling the configuration of listeners. Note that unlike other commands, this -produces a whole pipeline, not just a config. +handling the configuration of listeners. The output is a serialized pipeline. Components in the final pipeline are in the same order as in the original pipelines, with the base pipeline first and the added pipeline after. Because @@ -329,7 +329,7 @@ textcat, and want to provide with one of the official pretrained pipelines or another pipeline. 
```cli -$ python -m spacy configure tok2vec [base_model] [added_model] [output_file] +$ python -m spacy merge [base_model] [added_model] [output_file] ``` | Name | Description | From a76fd0da99b7ee47f88abb7eabd6b0028af706cc Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Fri, 10 Feb 2023 14:17:51 +0900 Subject: [PATCH 17/23] Apply suggestions from code review Co-authored-by: Raphael Mitsch --- spacy/cli/configure.py | 2 +- spacy/cli/merge.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/spacy/cli/configure.py b/spacy/cli/configure.py index 5f31ca0b4..c7637cf04 100644 --- a/spacy/cli/configure.py +++ b/spacy/cli/configure.py @@ -119,7 +119,7 @@ def _check_pipeline_names(nlp: Language, nlp2: Language) -> Dict[str, str]: fail_msg = """ Tried automatically renaming {name} to {new_name}, but still had a collision, so bailing out. Please make your pipe names - more unique. + unique. """ # map of components to be renamed diff --git a/spacy/cli/merge.py b/spacy/cli/merge.py index 6fd9e8153..528237dae 100644 --- a/spacy/cli/merge.py +++ b/spacy/cli/merge.py @@ -14,8 +14,8 @@ def _inner_merge( ) -> Language: """Actually do the merge. - nlp: Base pipeline to add components to. - nlp2: Pipeline to add components from. + nlp (Language): Base pipeline to add components to. + nlp2 (Language): Pipeline to add components from. replace_listeners (bool): Whether to replace listeners. Usually only true if there's one listener. returns: assembled pipeline. 
From c0a3e9a44a3e043a830a76f2b5e9d44ac7aa0ec0 Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Fri, 10 Feb 2023 15:12:40 +0900 Subject: [PATCH 18/23] Code reorganization --- spacy/cli/__init__.py | 3 ++- spacy/cli/configure.py | 47 ++----------------------------------- spacy/cli/merge.py | 51 +++++++++++++++++++++++++++++++++++++---- spacy/tests/test_cli.py | 7 +++--- 4 files changed, 55 insertions(+), 53 deletions(-) diff --git a/spacy/cli/__init__.py b/spacy/cli/__init__.py index 4041e01b6..003859777 100644 --- a/spacy/cli/__init__.py +++ b/spacy/cli/__init__.py @@ -29,7 +29,8 @@ from .project.push import project_push # noqa: F401 from .project.pull import project_pull # noqa: F401 from .project.document import project_document # noqa: F401 from .find_threshold import find_threshold # noqa: F401 -from .configure import use_tok2vec, use_transformer # noqa: F401 +from .configure import configure_tok2vec_feature_source # noqa: F401 +from .configure import configure_transformer_feature_source # noqa: F401 from .configure import configure_resume_cli # noqa: F401 from .merge import merge_pipelines # noqa: F401 diff --git a/spacy/cli/configure.py b/spacy/cli/configure.py index c7637cf04..cc36924bc 100644 --- a/spacy/cli/configure.py +++ b/spacy/cli/configure.py @@ -1,5 +1,4 @@ from pathlib import Path -import re from wasabi import msg import typer from thinc.api import Config @@ -80,23 +79,6 @@ def _get_listeners(nlp: Language) -> List[str]: return out -def _increment_suffix(name: str) -> str: - """Given a name, return an incremented version. - - If no numeric suffix is found, return the original with "2" appended. - - This is used to avoid name collisions in pipelines. - """ - - res = re.search(r"\d+$", name) - if res is None: - return f"{name}2" - else: - num = res.group() - prefix = name[0 : -len(num)] - return f"{prefix}{int(num) + 1}" - - def _check_single_tok2vec(name: str, config: Config) -> None: """Check if there is just one tok2vec in a config. 
@@ -110,31 +92,6 @@ def _check_single_tok2vec(name: str, config: Config) -> None: msg.fail(fail_msg, exits=1) -def _check_pipeline_names(nlp: Language, nlp2: Language) -> Dict[str, str]: - """Given two pipelines, try to rename any collisions in component names. - - If a simple increment of a numeric suffix doesn't work, will give up. - """ - - fail_msg = """ - Tried automatically renaming {name} to {new_name}, but still - had a collision, so bailing out. Please make your pipe names - unique. - """ - - # map of components to be renamed - rename = {} - # check pipeline names - names = nlp.pipe_names - for name in nlp2.pipe_names: - if name in names: - inc = _increment_suffix(name) - if inc in names or inc in nlp2.pipe_names: - msg.fail(fail_msg.format(name=name, new_name=inc), exits=1) - rename[name] = inc - return rename - - @configure_cli.command("resume") def configure_resume_cli( # fmt: off @@ -167,7 +124,7 @@ def configure_resume_cli( @configure_cli.command("transformer") -def use_transformer( +def configure_transformer_feature_source( base_model: str, output_file: Path, transformer_name: str = "roberta-base" ) -> Config: """Replace pipeline tok2vec with transformer.""" @@ -218,7 +175,7 @@ def use_transformer( @configure_cli.command("tok2vec") -def use_tok2vec(base_model: str, output_file: Path) -> Config: +def configure_tok2vec_feature_source(base_model: str, output_file: Path) -> Config: """Replace pipeline tok2vec with CNN tok2vec.""" nlp = spacy.load(base_model) _check_single_tok2vec(base_model, nlp.config) diff --git a/spacy/cli/merge.py b/spacy/cli/merge.py index 528237dae..8ad909bfc 100644 --- a/spacy/cli/merge.py +++ b/spacy/cli/merge.py @@ -1,12 +1,55 @@ from pathlib import Path +import re from wasabi import msg import spacy from spacy.language import Language -from ._util import app, Arg, Opt +from ._util import app, Arg, Opt, Dict from .configure import _check_single_tok2vec, _get_listeners, _get_tok2vecs -from .configure import _check_pipeline_names, 
_has_listener +from .configure import _has_listener + + +def _increment_suffix(name: str) -> str: + """Given a name, return an incremented version. + + If no numeric suffix is found, return the original with "2" appended. + + This is used to avoid name collisions in pipelines. + """ + + res = re.search(r"\d+$", name) + if res is None: + return f"{name}2" + else: + num = res.group() + prefix = name[0 : -len(num)] + return f"{prefix}{int(num) + 1}" + + +def _make_unique_pipeline_names(nlp: Language, nlp2: Language) -> Dict[str, str]: + """Given two pipelines, try to rename any collisions in component names. + + If a simple increment of a numeric suffix doesn't work, will give up. + """ + + fail_msg = """ + Tried automatically renaming {name} to {new_name}, but still + had a collision, so bailing out. Please make your pipe names + unique. + """ + + # map of components to be renamed + rename = {} + # check pipeline names + names = nlp.pipe_names + for name in nlp2.pipe_names: + if name in names: + inc = _increment_suffix(name) + if inc in names or inc in nlp2.pipe_names: + msg.fail(fail_msg.format(name=name, new_name=inc), exits=1) + rename[name] = inc + return rename def _inner_merge( @@ -21,9 +64,9 @@ def _inner_merge( returns: assembled pipeline. 
""" - # we checked earlier, so there's definitely just one + # The outer merge already verified there was exactly one tok2vec tok2vec_name = _get_tok2vecs(nlp2.config)[0] - rename = _check_pipeline_names(nlp, nlp2) + rename = _make_unique_pipeline_names(nlp, nlp2) if len(_get_listeners(nlp2)) > 1: if replace_listeners: diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index 242a3a885..7ede6339f 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -21,7 +21,8 @@ from spacy.cli._util import parse_config_overrides, string_to_list from spacy.cli._util import substitute_project_variables from spacy.cli._util import validate_project_commands from spacy.cli._util import upload_file, download_file -from spacy.cli.configure import configure_resume_cli, use_tok2vec +from spacy.cli.configure import configure_resume_cli +from spacy.cli.configure import configure_tok2vec_feature_source from spacy.cli.debug_data import _compile_gold, _get_labels_from_model from spacy.cli.debug_data import _get_labels_from_spancat from spacy.cli.debug_data import _get_distribution, _get_kl_divergence @@ -1206,7 +1207,7 @@ def test_configure_resume(tmp_path): assert "source" in val, f"Non-sourced component: {comp}" -def test_use_tok2vec(tmp_path): +def test_configure_tok2vec_feature_source(tmp_path): # Can't add a transformer here because spacy-transformers might not be present nlp = spacy.blank("en") nlp.add_pipe("tok2vec") @@ -1214,7 +1215,7 @@ def test_use_tok2vec(tmp_path): nlp.to_disk(base_path) out_path = tmp_path / "converted_to_tok2vec" - conf = use_tok2vec(base_path, out_path) + conf = configure_tok2vec_feature_source(base_path, out_path) assert out_path.exists(), "No model saved" assert "tok2vec" in conf["components"], "No tok2vec component" From 2524d067fdcda8f581faca6966626f2572b63342 Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Fri, 10 Feb 2023 15:12:53 +0900 Subject: [PATCH 19/23] Add links to docs in docstrings --- spacy/cli/configure.py | 
12 ++++++++++-- spacy/cli/merge.py | 11 ++++++++++- 2 files changed, 20 insertions(+), 3 deletions(-) diff --git a/spacy/cli/configure.py b/spacy/cli/configure.py index cc36924bc..bfbcafb31 100644 --- a/spacy/cli/configure.py +++ b/spacy/cli/configure.py @@ -103,6 +103,8 @@ def configure_resume_cli( A config for resuming training is the same as the input config, but with all components sourced. + + DOCS: https://spacy.io/api/cli#configure-resume """ nlp = spacy.load(base_model) @@ -127,7 +129,10 @@ def configure_resume_cli( def configure_transformer_feature_source( base_model: str, output_file: Path, transformer_name: str = "roberta-base" ) -> Config: - """Replace pipeline tok2vec with transformer.""" + """Replace pipeline tok2vec with transformer. + + DOCS: https://spacy.io/api/cli#configure-transformer + """ # 1. identify tok2vec # 2. replace tok2vec @@ -176,7 +181,10 @@ def configure_transformer_feature_source( @configure_cli.command("tok2vec") def configure_tok2vec_feature_source(base_model: str, output_file: Path) -> Config: - """Replace pipeline tok2vec with CNN tok2vec.""" + """Replace pipeline tok2vec with CNN tok2vec. + + DOCS: https://spacy.io/api/cli#configure-tok2vec + """ nlp = spacy.load(base_model) _check_single_tok2vec(base_model, nlp.config) diff --git a/spacy/cli/merge.py b/spacy/cli/merge.py index 8ad909bfc..4ee2b3a00 100644 --- a/spacy/cli/merge.py +++ b/spacy/cli/merge.py @@ -109,7 +109,16 @@ def merge_pipelines( output_file: Path = Arg(..., help="Path to save merged model") # fmt: on ) -> Language: - """Combine components from multiple pipelines.""" + """Combine components from multiple pipelines. + + Given two pipelines, the components from them are merged into a single + pipeline. The exact way this works depends on whether the second pipeline + has one listener or more than one listener. In the single listener case + `replace_listeners` is used, otherwise components are simply appended to + the base pipeline. 
+ + DOCS: https://spacy.io/api/cli#merge + """ nlp = spacy.load(base_model) nlp2 = spacy.load(added_model) From b9537ec03d6c43dec97fc7334ad6fe4741b4afd8 Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Fri, 10 Feb 2023 17:04:18 +0900 Subject: [PATCH 20/23] rename to _make_unique_pipe_names --- spacy/cli/merge.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/cli/merge.py b/spacy/cli/merge.py index 4ee2b3a00..ec78ddbe5 100644 --- a/spacy/cli/merge.py +++ b/spacy/cli/merge.py @@ -27,7 +27,7 @@ def _increment_suffix(name: str) -> str: return f"{prefix}{int(num) + 1}" -def _make_unique_pipeline_names(nlp: Language, nlp2: Language) -> Dict[str, str]: +def _make_unique_pipe_names(nlp: Language, nlp2: Language) -> Dict[str, str]: """Given two pipelines, try to rename any collisions in component names. If a simple increment of a numeric suffix doesn't work, will give up. @@ -66,7 +66,7 @@ def _inner_merge( # The outer merge already verified there was exactly one tok2vec tok2vec_name = _get_tok2vecs(nlp2.config)[0] - rename = _make_unique_pipeline_names(nlp, nlp2) + rename = _make_unique_pipe_names(nlp, nlp2) if len(_get_listeners(nlp2)) > 1: if replace_listeners: From 4279c73b6d1796cf0c8d4417b8bdd869c605ca65 Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Mon, 13 Feb 2023 12:06:33 +0900 Subject: [PATCH 21/23] Update from code review --- website/docs/api/cli.mdx | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/website/docs/api/cli.mdx b/website/docs/api/cli.mdx index 19f1ae3f8..9fa576fa5 100644 --- a/website/docs/api/cli.mdx +++ b/website/docs/api/cli.mdx @@ -253,8 +253,11 @@ $ python -m spacy init labels [config_path] [output_path] [--code] [--verbose] [ ## configure {id="configure", version="TODO"} -Modify or combine existing configs in high-level ways. Can be used to automate -config changes made as part of the development cycle. +Modify or combine existing configs. 
Example uses include swapping feature +sources or creating configs to resume training. This may simplify parts of the +development cycle. For example, it allows starting off with a config for a +faster, less accurate model (using the CNN tok2vec) and conveniently switching +to transformers later, without having to manually adjust the config. ### configure resume {id="configure-resume", tag="command"} @@ -1283,7 +1286,6 @@ $ python -m spacy apply [model] [data-path] [output-file] [--code] [--text-key] | `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | | **CREATES** | A `DocBin` with the annotations from the `model` for all the files found in `data-path`. | - ## find-threshold {id="find-threshold",version="3.5",tag="command"} Runs prediction trials for a trained model with varying tresholds to maximize From 81276f21a52fa035935c1f96f59e7d9a77579bfb Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Mon, 13 Feb 2023 12:08:37 +0900 Subject: [PATCH 22/23] Change back to short names --- spacy/cli/__init__.py | 3 +-- spacy/cli/configure.py | 4 ++-- spacy/tests/test_cli.py | 3 +-- 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/spacy/cli/__init__.py b/spacy/cli/__init__.py index 003859777..47d05b5b6 100644 --- a/spacy/cli/__init__.py +++ b/spacy/cli/__init__.py @@ -29,8 +29,7 @@ from .project.push import project_push # noqa: F401 from .project.pull import project_pull # noqa: F401 from .project.document import project_document # noqa: F401 from .find_threshold import find_threshold # noqa: F401 -from .configure import configure_tok2vec_feature_source # noqa: F401 -from .configure import configure_transformer_feature_source # noqa: F401 +from .configure import use_transformer, use_tok2vec # noqa: F401 from .configure import configure_resume_cli # noqa: F401 from .merge import merge_pipelines # noqa: F401 diff --git a/spacy/cli/configure.py b/spacy/cli/configure.py index bfbcafb31..cd25ac351 100644 --- a/spacy/cli/configure.py 
+++ b/spacy/cli/configure.py @@ -126,7 +126,7 @@ def configure_resume_cli( @configure_cli.command("transformer") -def configure_transformer_feature_source( +def use_transformer( base_model: str, output_file: Path, transformer_name: str = "roberta-base" ) -> Config: """Replace pipeline tok2vec with transformer. @@ -180,7 +180,7 @@ def configure_transformer_feature_source( @configure_cli.command("tok2vec") -def configure_tok2vec_feature_source(base_model: str, output_file: Path) -> Config: +def use_tok2vec(base_model: str, output_file: Path) -> Config: """Replace pipeline tok2vec with CNN tok2vec. DOCS: https://spacy.io/api/cli#configure-tok2vec diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index 7ede6339f..10701263f 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -21,8 +21,7 @@ from spacy.cli._util import parse_config_overrides, string_to_list from spacy.cli._util import substitute_project_variables from spacy.cli._util import validate_project_commands from spacy.cli._util import upload_file, download_file -from spacy.cli.configure import configure_resume_cli -from spacy.cli.configure import configure_tok2vec_feature_source +from spacy.cli.configure import configure_resume_cli, use_tok2vec from spacy.cli.debug_data import _compile_gold, _get_labels_from_model from spacy.cli.debug_data import _get_labels_from_spancat from spacy.cli.debug_data import _get_distribution, _get_kl_divergence From e668c2c21f48f9de301e01340f5c32d5bd9a1d31 Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Mon, 13 Feb 2023 17:51:09 +0900 Subject: [PATCH 23/23] Fix function name in tests --- spacy/tests/test_cli.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index 1188e4a1b..4c6bd28b4 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -1198,7 +1198,7 @@ def test_configure_resume(tmp_path): assert "source" in val, f"Non-sourced component: {comp}" -def 
test_configure_tok2vec_feature_source(tmp_path): +def test_use_tok2vec(tmp_path): # Can't add a transformer here because spacy-transformers might not be present nlp = spacy.blank("en") nlp.add_pipe("tok2vec") @@ -1206,7 +1206,7 @@ def test_configure_tok2vec_feature_source(tmp_path): nlp.to_disk(base_path) out_path = tmp_path / "converted_to_tok2vec" - conf = configure_tok2vec_feature_source(base_path, out_path) + conf = use_tok2vec(base_path, out_path) assert out_path.exists(), "No model saved" assert "tok2vec" in conf["components"], "No tok2vec component"