mirror of
https://github.com/explosion/spaCy.git
synced 2025-07-13 09:42:26 +03:00
Move merge to independent command
This commit is contained in:
parent
9d0ae2407b
commit
03a0c2badc
|
@ -29,8 +29,9 @@ from .project.push import project_push # noqa: F401
|
|||
from .project.pull import project_pull # noqa: F401
|
||||
from .project.document import project_document # noqa: F401
|
||||
from .find_threshold import find_threshold # noqa: F401
|
||||
from .configure import merge_pipelines, use_tok2vec, use_transformer # noqa: F401
|
||||
from .configure import use_tok2vec, use_transformer # noqa: F401
|
||||
from .configure import configure_resume_cli # noqa: F401
|
||||
from .merge import merge_pipelines # noqa: F401
|
||||
|
||||
|
||||
@app.command("link", no_args_is_help=True, deprecated=True, hidden=True)
|
||||
|
|
|
@ -247,96 +247,3 @@ def use_tok2vec(base_model: str, output_file: Path) -> Config:
|
|||
msg.good("Saved config", output_file)
|
||||
|
||||
return nlp.config
|
||||
|
||||
|
||||
def _inner_merge(
    nlp: Language, nlp2: Language, replace_listeners: bool = False
) -> Language:
    """Actually do the merge.

    nlp: Base pipeline to add components to.
    nlp2: Pipeline to add components from.
    replace_listeners (bool): Whether to replace listeners. Usually only true
        if there's one listener.
    returns: assembled pipeline.
    """
    # we checked earlier, so there's definitely just one
    tok2vec_name = _get_tok2vecs(nlp2.config)[0]
    rename = _check_pipeline_names(nlp, nlp2)

    if len(_get_listeners(nlp2)) > 1:
        if replace_listeners:
            # Fix: single joined string instead of an indented triple-quoted
            # literal, which leaked source-file indentation and newlines into
            # the user-facing warning.
            msg.warn(
                "Replacing listeners for multiple components. Note this can "
                "make your pipeline large and slow. Consider chaining "
                "pipelines (like nlp2(nlp(text))) instead."
            )
        else:
            # TODO provide a guide for what to do here
            msg.warn(
                "The result of this merge will have two feature sources "
                "(tok2vecs) and multiple listeners. This will work for "
                "inference, but will probably not work when training without "
                "extra adjustment. If you continue to train the pipelines "
                "separately this is not a problem."
            )

    for comp in nlp2.pipe_names:
        if replace_listeners and comp == tok2vec_name:
            # the tok2vec should not be copied over
            continue
        if replace_listeners and _has_listener(nlp2, comp):
            nlp2.replace_listeners(tok2vec_name, comp, ["model.tok2vec"])
        # Rename on collision so components from both pipelines can coexist.
        nlp.add_pipe(comp, source=nlp2, name=rename.get(comp, comp))
        if comp in rename:
            msg.info(f"Renaming {comp} to {rename[comp]} to avoid collision...")
    return nlp
|
||||
|
||||
|
||||
@configure_cli.command("merge")
def merge_pipelines(base_model: str, added_model: str, output_file: Path) -> Language:
    """Combine components from multiple pipelines.

    base_model: Name or path of the base pipeline; its components come first.
    added_model: Name or path of the pipeline whose components are added.
    output_file: Directory to serialize the merged pipeline to.
    returns: the merged pipeline.
    """
    nlp = spacy.load(base_model)
    nlp2 = spacy.load(added_model)

    # to merge models:
    # - lang must be the same
    # - vectors must be the same
    # - vocabs must be the same
    # - tokenizer must be the same (only partially checkable)
    if nlp.lang != nlp2.lang:
        msg.fail("Can't merge - languages don't match", exits=1)

    # check vector equality
    if (
        nlp.vocab.vectors.shape != nlp2.vocab.vectors.shape
        or nlp.vocab.vectors.key2row != nlp2.vocab.vectors.key2row
        or nlp.vocab.vectors.to_bytes(exclude=["strings"])
        != nlp2.vocab.vectors.to_bytes(exclude=["strings"])
    ):
        msg.fail("Can't merge - vectors don't match", exits=1)

    if nlp.config["nlp"]["tokenizer"] != nlp2.config["nlp"]["tokenizer"]:
        msg.fail("Can't merge - tokenizers don't match", exits=1)

    # Check that each pipeline only has one feature source
    _check_single_tok2vec(base_model, nlp.config)
    _check_single_tok2vec(added_model, nlp2.config)

    # Check how many listeners there are and replace based on that
    # TODO: option to recognize frozen tok2vecs
    # TODO: take list of pipe names to copy, ignore others
    listeners = _get_listeners(nlp2)
    replace_listeners = len(listeners) == 1
    # Fix: use the result of _inner_merge consistently. The original bound it
    # to nlp_out but then saved and returned nlp, leaving nlp_out unused.
    nlp_out = _inner_merge(nlp, nlp2, replace_listeners=replace_listeners)

    # write the final pipeline
    nlp_out.to_disk(output_file)
    msg.info(f"Saved pipeline to: {output_file}")

    return nlp_out
|
||||
|
|
108
spacy/cli/merge.py
Normal file
108
spacy/cli/merge.py
Normal file
|
@ -0,0 +1,108 @@
|
|||
from pathlib import Path
|
||||
from wasabi import msg
|
||||
|
||||
import spacy
|
||||
from spacy.language import Language
|
||||
|
||||
from ._util import app, Arg, Opt
|
||||
from .configure import _check_single_tok2vec, _get_listeners, _get_tok2vecs
|
||||
from .configure import _check_pipeline_names, _has_listener
|
||||
|
||||
|
||||
def _inner_merge(
    nlp: Language, nlp2: Language, replace_listeners: bool = False
) -> Language:
    """Actually do the merge.

    nlp: Base pipeline to add components to.
    nlp2: Pipeline to add components from.
    replace_listeners (bool): Whether to replace listeners. Usually only true
        if there's one listener.
    returns: assembled pipeline.
    """
    # we checked earlier, so there's definitely just one
    tok2vec_name = _get_tok2vecs(nlp2.config)[0]
    rename = _check_pipeline_names(nlp, nlp2)

    if len(_get_listeners(nlp2)) > 1:
        if replace_listeners:
            # Fix: single joined string instead of an indented triple-quoted
            # literal, which leaked source-file indentation and newlines into
            # the user-facing warning.
            msg.warn(
                "Replacing listeners for multiple components. Note this can "
                "make your pipeline large and slow. Consider chaining "
                "pipelines (like nlp2(nlp(text))) instead."
            )
        else:
            # TODO provide a guide for what to do here
            msg.warn(
                "The result of this merge will have two feature sources "
                "(tok2vecs) and multiple listeners. This will work for "
                "inference, but will probably not work when training without "
                "extra adjustment. If you continue to train the pipelines "
                "separately this is not a problem."
            )

    for comp in nlp2.pipe_names:
        if replace_listeners and comp == tok2vec_name:
            # the tok2vec should not be copied over
            continue
        if replace_listeners and _has_listener(nlp2, comp):
            nlp2.replace_listeners(tok2vec_name, comp, ["model.tok2vec"])
        # Rename on collision so components from both pipelines can coexist.
        nlp.add_pipe(comp, source=nlp2, name=rename.get(comp, comp))
        if comp in rename:
            msg.info(f"Renaming {comp} to {rename[comp]} to avoid collision...")
    return nlp
|
||||
|
||||
|
||||
@app.command("merge")
def merge_pipelines(
    # fmt: off
    base_model: str = Arg(..., help="Name or path of base model"),
    added_model: str = Arg(..., help="Name or path of model to be merged"),
    output_file: Path = Arg(..., help="Path to save merged model")
    # fmt: on
) -> Language:
    """Combine components from multiple pipelines.

    Components from added_model are appended after those of base_model;
    names are adjusted on collision. The merged pipeline is serialized
    to output_file and also returned.
    """
    nlp = spacy.load(base_model)
    nlp2 = spacy.load(added_model)

    # to merge models:
    # - lang must be the same
    # - vectors must be the same
    # - vocabs must be the same
    # - tokenizer must be the same (only partially checkable)
    if nlp.lang != nlp2.lang:
        msg.fail("Can't merge - languages don't match", exits=1)

    # check vector equality
    if (
        nlp.vocab.vectors.shape != nlp2.vocab.vectors.shape
        or nlp.vocab.vectors.key2row != nlp2.vocab.vectors.key2row
        or nlp.vocab.vectors.to_bytes(exclude=["strings"])
        != nlp2.vocab.vectors.to_bytes(exclude=["strings"])
    ):
        msg.fail("Can't merge - vectors don't match", exits=1)

    if nlp.config["nlp"]["tokenizer"] != nlp2.config["nlp"]["tokenizer"]:
        msg.fail("Can't merge - tokenizers don't match", exits=1)

    # Check that each pipeline only has one feature source
    _check_single_tok2vec(base_model, nlp.config)
    _check_single_tok2vec(added_model, nlp2.config)

    # Check how many listeners there are and replace based on that
    # TODO: option to recognize frozen tok2vecs
    # TODO: take list of pipe names to copy, ignore others
    listeners = _get_listeners(nlp2)
    replace_listeners = len(listeners) == 1
    # Fix: use the result of _inner_merge consistently. The original bound it
    # to nlp_out but then saved and returned nlp, leaving nlp_out unused.
    nlp_out = _inner_merge(nlp, nlp2, replace_listeners=replace_listeners)

    # write the final pipeline
    nlp_out.to_disk(output_file)
    msg.info(f"Saved pipeline to: {output_file}")

    return nlp_out
|
|
@ -21,7 +21,7 @@ from spacy.cli._util import parse_config_overrides, string_to_list
|
|||
from spacy.cli._util import substitute_project_variables
|
||||
from spacy.cli._util import validate_project_commands
|
||||
from spacy.cli._util import upload_file, download_file
|
||||
from spacy.cli.configure import configure_resume_cli, use_tok2vec, merge_pipelines
|
||||
from spacy.cli.configure import configure_resume_cli, use_tok2vec
|
||||
from spacy.cli.debug_data import _compile_gold, _get_labels_from_model
|
||||
from spacy.cli.debug_data import _get_labels_from_spancat
|
||||
from spacy.cli.debug_data import _get_distribution, _get_kl_divergence
|
||||
|
@ -30,6 +30,7 @@ from spacy.cli.debug_data import _print_span_characteristics
|
|||
from spacy.cli.debug_data import _get_spans_length_freq_dist
|
||||
from spacy.cli.download import get_compatibility, get_version
|
||||
from spacy.cli.init_config import RECOMMENDATIONS, init_config, fill_config
|
||||
from spacy.cli.merge import merge_pipelines
|
||||
from spacy.cli.package import get_third_party_dependencies
|
||||
from spacy.cli.package import _is_permitted_package_name
|
||||
from spacy.cli.project.remote_storage import RemoteStorage
|
||||
|
|
|
@ -8,6 +8,7 @@ menu:
|
|||
- ['validate', 'validate']
|
||||
- ['init', 'init']
|
||||
- ['configure', 'configure']
|
||||
- ['merge', 'merge']
|
||||
- ['convert', 'convert']
|
||||
- ['debug', 'debug']
|
||||
- ['train', 'train']
|
||||
|
@ -306,11 +307,10 @@ $ python -m spacy configure tok2vec [base_model] [output_file]
|
|||
| `base_model` | A trained pipeline to resume training (package name or path). ~~str (positional)~~ |
|
||||
| `output_file` | Path to output `.cfg` file or `-` to write the config to stdout (so you can pipe it forward to a file or to the `train` command). Note that if you're writing to stdout, no additional logging info is printed. ~~Path (positional)~~ |
|
||||
|
||||
### configure merge {id="configure-merge", tag="command"}
|
||||
## merge {id="merge", tag="command"}
|
||||
|
||||
Take two pipelines and create a new one with components from both of them,
|
||||
handling the configuration of listeners. Note that unlike other commands, this
|
||||
produces a whole pipeline, not just a config.
|
||||
handling the configuration of listeners. The output is a serialized pipeline.
|
||||
|
||||
Components in the final pipeline are in the same order as in the original
|
||||
pipelines, with the base pipeline first and the added pipeline after. Because
|
||||
|
@ -329,7 +329,7 @@ textcat, and want to provide with one of the official pretrained pipelines or
|
|||
another pipeline.
|
||||
|
||||
```cli
|
||||
$ python -m spacy configure tok2vec [base_model] [added_model] [output_file]
|
||||
$ python -m spacy merge [base_model] [added_model] [output_file]
|
||||
```
|
||||
|
||||
| Name | Description |
|
||||
|
|
Loading…
Reference in New Issue
Block a user