from pathlib import Path
import re
from wasabi import msg
import typer
from thinc.api import Config

import spacy
from spacy.language import Language

from ._util import configure_cli, Arg, Opt

# These are the architectures that are recognized as tok2vec/feature sources.
TOK2VEC_ARCHS = [("spacy", "Tok2Vec"), ("spacy-transformers", "TransformerModel")]
# These are the listeners.
LISTENER_ARCHS = [
    ("spacy", "Tok2VecListener"),
    ("spacy-transformers", "TransformerListener"),
]
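
# For reference: an "@architectures" value such as "spacy.Tok2Vec.v2" splits
# into ("spacy", "Tok2Vec", "v2"); only the (namespace, model) pair is matched
# against the tables above, so any architecture version is accepted.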


def _deep_get(obj, key, default):
    """Given a multi-part key, try to get the key. If at any point this isn't
    possible, return the default.
    """
    slot = obj
    for notch in key:
        if slot is None or notch not in slot:
            return default
        slot = slot[notch]
    return slot
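
# Usage sketch (illustrative, not part of the original file):
#
#     >>> _deep_get({"a": {"b": 1}}, ("a", "b"), False)
#     1
#     >>> _deep_get({"a": {"b": 1}}, ("a", "x"), False)
#     False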


def _get_tok2vecs(config):
    """Given a pipeline config, return the names of components that are
    tok2vecs (or Transformers).
    """
    out = []
    for name, comp in config["components"].items():
        arch = _deep_get(comp, ("model", "@architectures"), False)
        if not arch:
            continue
        ns, model, ver = arch.split(".")
        if (ns, model) in TOK2VEC_ARCHS:
            out.append(name)
    return out
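
# Usage sketch (illustrative): for a pipeline whose "tok2vec" component uses
# the "spacy.Tok2Vec.v2" architecture, _get_tok2vecs(nlp.config) returns
# ["tok2vec"]; components without a recognized feature-source architecture
# are skipped.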


def _has_listener(nlp, pipe_name):
    """Given a pipeline and a component name, check if it has a listener."""
    arch = _deep_get(
        nlp.config,
        ("components", pipe_name, "model", "tok2vec", "@architectures"),
        False,
    )
    if not arch:
        return False
    ns, model, ver = arch.split(".")
    return (ns, model) in LISTENER_ARCHS


def _get_listeners(nlp):
    """Get the name of every component that contains a listener.

    Does not check that they listen to the same thing; assumes a pipeline has
    only one feature source.
    """
    out = []
    for name in nlp.pipe_names:
        if _has_listener(nlp, name):
            out.append(name)
    return out


def _increment_suffix(name):
    """Given a name, return an incremented version.

    If no numeric suffix is found, return the original with "2" appended.

    This is used to avoid name collisions in pipelines.
    """
    res = re.search(r"\d+$", name)
    if res is None:
        return f"{name}2"
    else:
        # res.group() is the matched numeric suffix; strip it and increment
        num = res.group()
        prefix = name[: -len(num)]
        return f"{prefix}{int(num) + 1}"
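
# Usage sketch (illustrative, not part of the original file):
#
#     >>> _increment_suffix("tagger")
#     'tagger2'
#     >>> _increment_suffix("tagger2")
#     'tagger3'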


def _check_single_tok2vec(name, config):
    """Check that a config has at most one tok2vec/feature source; fail
    otherwise.

    A very simple check, but used in multiple functions.
    """
    tok2vecs = _get_tok2vecs(config)
    fail_msg = f"""
        Can't handle pipelines with more than one feature source,
        but {name} has {len(tok2vecs)}."""
    if len(tok2vecs) > 1:
        msg.fail(fail_msg, exits=1)


def _check_pipeline_names(nlp, nlp2):
    """Given two pipelines, try to rename any collisions in component names.

    If a simple increment of a numeric suffix doesn't work, will give up.
    """
    fail_msg = """
        Tried automatically renaming {name} to {new_name}, but still
        had a collision, so bailing out. Please make your pipe names
        more unique.
        """

    # map of components to be renamed
    rename = {}
    # check pipeline names
    names = nlp.pipe_names
    for name in nlp2.pipe_names:
        if name in names:
            inc = _increment_suffix(name)
            # TODO Would it be better to just keep incrementing?
            if inc in names or inc in nlp2.pipe_names:
                msg.fail(fail_msg.format(name=name, new_name=inc), exits=1)
            rename[name] = inc
    return rename
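
# Usage sketch (illustrative): if both pipelines contain a component named
# "ner", _check_pipeline_names returns {"ner": "ner2"}, and the copy from
# nlp2 is later added under the incremented name.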


@configure_cli.command("resume")
def configure_resume_cli(
    # fmt: off
    base_model: Path = Arg(..., help="Path or name of base model to use for config"),
    output_path: Path = Arg(..., help="File to save the config to or - for stdout (will only output config and no additional logging info)", allow_dash=True),
    # fmt: on
):
    """Create a config for resuming training.

    A config for resuming training is the same as the input config, but with
    all components sourced.
    """
    nlp = spacy.load(base_model)
    conf = nlp.config

    # Paths are not JSON serializable
    path_str = str(base_model)

    for comp in nlp.pipe_names:
        conf["components"][comp] = {"source": path_str}

    if str(output_path) == "-":
        print(conf.to_str())
    else:
        conf.to_disk(output_path)
        msg.good("Saved config", output_path)

    return conf
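
# Example invocation (assuming this command group is registered with spaCy's
# CLI as "configure"; the paths are hypothetical):
#
#     spacy configure resume ./training/model-best ./resume.cfg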


@configure_cli.command("transformer")
def use_transformer(
    base_model: str, output_path: Path, transformer_name: str = "roberta-base"
) -> Config:
    """Replace pipeline tok2vec with transformer."""
    # 1. identify tok2vec
    # 2. replace tok2vec
    # 3. replace listeners
    nlp = spacy.load(base_model)
    _check_single_tok2vec(base_model, nlp.config)

    tok2vecs = _get_tok2vecs(nlp.config)
    assert len(tok2vecs) > 0, "Must have tok2vec to replace!"

    nlp.remove_pipe(tok2vecs[0])
    # the rest can be default values
    trf_config = {
        "model": {
            "name": transformer_name,
        }
    }
    nlp.add_pipe("transformer", config=trf_config, first=True)

    # TODO maybe remove vectors?

    # now update the listeners
    listeners = _get_listeners(nlp)
    for listener in listeners:
        listener_config = {
            "@architectures": "spacy-transformers.TransformerListener.v1",
            "grad_factor": 1.0,
            "upstream": "transformer",
            "pooling": {"@layers": "reduce_mean.v1"},
        }
        nlp.config["components"][listener]["model"]["tok2vec"] = listener_config

    if str(output_path) == "-":
        print(nlp.config.to_str())
    else:
        nlp.config.to_disk(output_path)
        msg.good("Saved config", output_path)

    return nlp.config
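
# Example invocation (hypothetical paths and flag spelling; the transformer
# name is any model loadable by spacy-transformers):
#
#     spacy configure transformer en_core_web_sm ./trf.cfg --transformer-name roberta-base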


@configure_cli.command("tok2vec")
def use_tok2vec(base_model: str, output_path: Path) -> Config:
    """Replace pipeline tok2vec with CNN tok2vec."""
    nlp = spacy.load(base_model)
    _check_single_tok2vec(base_model, nlp.config)

    tok2vecs = _get_tok2vecs(nlp.config)
    assert len(tok2vecs) > 0, "Must have tok2vec to replace!"

    nlp.remove_pipe(tok2vecs[0])

    nlp.add_pipe("tok2vec", first=True)
    # config variable, interpolated from the new tok2vec component at load time
    width = "${components.tok2vec.model.encode:width}"

    listeners = _get_listeners(nlp)
    for listener in listeners:
        listener_config = {
            "@architectures": "spacy.Tok2VecListener.v1",
            "width": width,
            "upstream": "tok2vec",
        }
        nlp.config["components"][listener]["model"]["tok2vec"] = listener_config

    if str(output_path) == "-":
        print(nlp.config.to_str())
    else:
        nlp.config.to_disk(output_path)
        msg.good("Saved config", output_path)

    return nlp.config
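
# Example invocation (hypothetical paths):
#
#     spacy configure tok2vec en_core_web_trf ./cnn.cfg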


def _inner_merge(nlp, nlp2, replace_listeners=False) -> Language:
    """Actually do the merge.

    nlp: Base pipeline to add components to.
    nlp2: Pipeline to add components from.
    replace_listeners (bool): Whether to replace listeners. Usually only true
        if there's one listener.
    returns: assembled pipeline.
    """
    # we checked earlier, so there's definitely just one
    tok2vec_name = _get_tok2vecs(nlp2.config)[0]
    rename = _check_pipeline_names(nlp, nlp2)

    if len(_get_listeners(nlp2)) > 1:
        if replace_listeners:
            msg.warn(
                """
                Replacing listeners for multiple components. Note this can make
                your pipeline large and slow. Consider chaining pipelines (like
                nlp2(nlp(text))) instead.
                """
            )
        else:
            # TODO provide a guide for what to do here
            msg.warn(
                """
                The result of this merge will have two feature sources
                (tok2vecs) and multiple listeners. This will work for
                inference, but will probably not work when training without
                extra adjustment. If you continue to train the pipelines
                separately this is not a problem.
                """
            )

    for comp in nlp2.pipe_names:
        if replace_listeners and comp == tok2vec_name:
            # the tok2vec should not be copied over
            continue
        if replace_listeners and _has_listener(nlp2, comp):
            # TODO does "model.tok2vec" work for everything?
            nlp2.replace_listeners(tok2vec_name, comp, ["model.tok2vec"])
        nlp.add_pipe(comp, source=nlp2, name=rename.get(comp, comp))
        if comp in rename:
            msg.info(f"Renaming {comp} to {rename[comp]} to avoid collision...")
    return nlp


@configure_cli.command("merge")
def merge_pipelines(base_model: str, added_model: str, output_path: Path) -> Language:
    """Combine components from multiple pipelines."""
    nlp = spacy.load(base_model)
    nlp2 = spacy.load(added_model)

    # to merge models:
    # - lang must be the same
    # - vectors must be the same
    # - vocabs must be the same (how to check?)
    # - tokenizer must be the same (only partially checkable)
    if nlp.lang != nlp2.lang:
        msg.fail("Can't merge - languages don't match", exits=1)

    # check vector equality
    if (
        nlp.vocab.vectors.shape != nlp2.vocab.vectors.shape
        or nlp.vocab.vectors.key2row != nlp2.vocab.vectors.key2row
        or nlp.vocab.vectors.to_bytes(exclude=["strings"])
        != nlp2.vocab.vectors.to_bytes(exclude=["strings"])
    ):
        msg.fail("Can't merge - vectors don't match", exits=1)

    if nlp.config["nlp"]["tokenizer"] != nlp2.config["nlp"]["tokenizer"]:
        msg.fail("Can't merge - tokenizers don't match", exits=1)

    # Check that each pipeline only has one feature source
    _check_single_tok2vec(base_model, nlp.config)
    _check_single_tok2vec(added_model, nlp2.config)

    # Check how many listeners there are and replace based on that
    # TODO: option to recognize frozen tok2vecs
    # TODO: take list of pipe names to copy
    listeners = _get_listeners(nlp2)
    replace_listeners = len(listeners) == 1
    nlp_out = _inner_merge(nlp, nlp2, replace_listeners=replace_listeners)

    # write the final pipeline
    nlp_out.to_disk(output_path)
    msg.info(f"Saved pipeline to: {output_path}")

    return nlp_out
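
# Example invocation (hypothetical pipeline names/paths): merge the components
# of two pipelines into one and write the result to disk:
#
#     spacy configure merge en_core_web_sm ./my_ner_pipeline ./merged
#
# The merged pipeline can then be loaded with spacy.load("./merged").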