This commit is contained in:
Matthew Honnibal 2023-07-15 08:42:57 +02:00 committed by GitHub
commit 708ee32f9f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 187 additions and 2 deletions

View File

@ -32,6 +32,7 @@ def load(
enable: Union[str, Iterable[str]] = util._DEFAULT_EMPTY_PIPES, enable: Union[str, Iterable[str]] = util._DEFAULT_EMPTY_PIPES,
exclude: Union[str, Iterable[str]] = util._DEFAULT_EMPTY_PIPES, exclude: Union[str, Iterable[str]] = util._DEFAULT_EMPTY_PIPES,
config: Union[Dict[str, Any], Config] = util.SimpleFrozenDict(), config: Union[Dict[str, Any], Config] = util.SimpleFrozenDict(),
pipe_instances: Dict[str, Any] = util.SimpleFrozenDict(),
) -> Language: ) -> Language:
"""Load a spaCy model from an installed package or a local path. """Load a spaCy model from an installed package or a local path.
@ -55,6 +56,7 @@ def load(
enable=enable, enable=enable,
exclude=exclude, exclude=exclude,
config=config, config=config,
pipe_instances=pipe_instances,
) )

View File

@ -219,6 +219,9 @@ class Warnings(metaclass=ErrorsWithCodes):
W125 = ("The StaticVectors key_attr is no longer used. To set a custom " W125 = ("The StaticVectors key_attr is no longer used. To set a custom "
"key attribute for vectors, configure it through Vectors(attr=) or " "key attribute for vectors, configure it through Vectors(attr=) or "
"'spacy init vectors --attr'") "'spacy init vectors --attr'")
W126 = ("Pipe instance '{name}' is being added with a vocab "
"instance that will not match other components. This is "
"usually an error.")
class Errors(metaclass=ErrorsWithCodes): class Errors(metaclass=ErrorsWithCodes):
@ -981,6 +984,7 @@ class Errors(metaclass=ErrorsWithCodes):
" 'min_length': {min_length}, 'max_length': {max_length}") " 'min_length': {min_length}, 'max_length': {max_length}")
E1054 = ("The text, including whitespace, must match between reference and " E1054 = ("The text, including whitespace, must match between reference and "
"predicted docs when training {component}.") "predicted docs when training {component}.")
E1055 = ("Cannot create Language instance from config: missing pipeline components. The following components were added by instance (rather than config) via the 'Language.add_pipe_instance()' method, but are not present in the 'pipe_instances' variable: {names}")
# Deprecated model shortcuts, only used in errors and warnings # Deprecated model shortcuts, only used in errors and warnings

View File

@ -75,6 +75,9 @@ DEFAULT_CONFIG = util.load_config(DEFAULT_CONFIG_PATH)
# This is the base config for the [pretraining] block and currently not included # This is the base config for the [pretraining] block and currently not included
# in the main config and only added via the 'init fill-config' command # in the main config and only added via the 'init fill-config' command
DEFAULT_CONFIG_PRETRAIN_PATH = Path(__file__).parent / "default_config_pretraining.cfg" DEFAULT_CONFIG_PRETRAIN_PATH = Path(__file__).parent / "default_config_pretraining.cfg"
# Factory name indicating that the component wasn't constructed by a factory,
# and was instead passed by instance
INSTANCE_FACTORY_NAME = "__added_by_instance__"
# Type variable for contexts piped with documents # Type variable for contexts piped with documents
_AnyContext = TypeVar("_AnyContext") _AnyContext = TypeVar("_AnyContext")
@ -771,6 +774,9 @@ class Language:
"""Add a component to the processing pipeline. Valid components are """Add a component to the processing pipeline. Valid components are
callables that take a `Doc` object, modify it and return it. Only one callables that take a `Doc` object, modify it and return it. Only one
of before/after/first/last can be set. Default behaviour is "last". of before/after/first/last can be set. Default behaviour is "last".
Components can be added either by factory name or by instance. If
an instance is supplied and you serialize the pipeline, you'll need
to also pass an instance into spacy.load() to construct the pipeline.
factory_name (str): Name of the component factory. factory_name (str): Name of the component factory.
name (str): Name of pipeline component. Overwrites existing name (str): Name of pipeline component. Overwrites existing
@ -818,12 +824,61 @@ class Language:
raw_config=raw_config, raw_config=raw_config,
validate=validate, validate=validate,
) )
pipe_index = self._get_pipe_index(before, after, first, last)
self._pipe_meta[name] = self.get_factory_meta(factory_name) self._pipe_meta[name] = self.get_factory_meta(factory_name)
pipe_index = self._get_pipe_index(before, after, first, last)
self._components.insert(pipe_index, (name, pipe_component)) self._components.insert(pipe_index, (name, pipe_component))
self._link_components() self._link_components()
return pipe_component return pipe_component
def add_pipe_instance(
    self,
    component: PipeCallable,
    name: Optional[str] = None,
    *,
    before: Optional[Union[str, int]] = None,
    after: Optional[Union[str, int]] = None,
    first: Optional[bool] = None,
    last: Optional[bool] = None,
) -> PipeCallable:
    """Add a component instance to the processing pipeline. Valid components
    are callables that take a `Doc` object, modify it and return it. Only one
    of before/after/first/last can be set. Default behaviour is "last".

    A limitation of this method is that spaCy will not know how to reconstruct
    your pipeline after you save it out (unlike the 'Language.add_pipe()' method,
    where you provide a config and let spaCy construct the instance). See 'spacy.load'
    for details of how to load back a pipeline with components added by instance.

    component (Callable[[Doc], Doc]): The component to add.
    name (str): Name of pipeline component. Overwrites existing
        component.name attribute if available. If no name is set and
        the component exposes no name attribute, component.__name__ is
        used. An error is raised if a name already exists in the pipeline.
    before (Union[str, int]): Name or index of the component to insert new
        component directly before.
    after (Union[str, int]): Name or index of the component to insert new
        component directly after.
    first (bool): If True, insert component first in the pipeline.
    last (bool): If True, insert component last in the pipeline.
    RETURNS (Callable[[Doc], Doc]): The pipeline component.

    DOCS: https://spacy.io/api/language#add_pipe_instance
    """
    if name is None:
        # Fall back to the component's own 'name' attribute, then to its
        # __name__, as documented above. Use a default so a component
        # without either attribute doesn't raise AttributeError here.
        name = getattr(component, "name", None)
    if name is None:
        name = getattr(component, "__name__", None)
    if name is None:
        raise ValueError(
            "Could not determine a name for the pipe instance. Pass an "
            "explicit 'name' argument, or use a component that exposes a "
            "'name' or '__name__' attribute."
        )
    if name in self.component_names:
        raise ValueError(Errors.E007.format(name=name, opts=self.component_names))
    # It would be possible to take arguments for the FactoryMeta here, but we'll then have
    # a problem on deserialization: where will the data be coming from?
    # I think if someone wants that, they should register a component function.
    self._pipe_meta[name] = FactoryMeta(INSTANCE_FACTORY_NAME)
    self._pipe_configs[name] = Config()
    pipe_index = self._get_pipe_index(before, after, first, last)
    self._components.insert(pipe_index, (name, component))
    return component
def _get_pipe_index( def _get_pipe_index(
self, self,
before: Optional[Union[str, int]] = None, before: Optional[Union[str, int]] = None,
@ -1735,6 +1790,7 @@ class Language:
meta: Dict[str, Any] = SimpleFrozenDict(), meta: Dict[str, Any] = SimpleFrozenDict(),
auto_fill: bool = True, auto_fill: bool = True,
validate: bool = True, validate: bool = True,
pipe_instances: Dict[str, Any] = SimpleFrozenDict(),
) -> "Language": ) -> "Language":
"""Create the nlp object from a loaded config. Will set up the tokenizer """Create the nlp object from a loaded config. Will set up the tokenizer
and language data, add pipeline components etc. If no config is provided, and language data, add pipeline components etc. If no config is provided,
@ -1810,6 +1866,11 @@ class Language:
# Warn about require_gpu usage in jupyter notebook # Warn about require_gpu usage in jupyter notebook
warn_if_jupyter_cupy() warn_if_jupyter_cupy()
# If we've been passed pipe instances, check whether
# they have a Vocab instance, and if they do, use
# that one. This also performs some additional checks and
# warns if there's a mismatch.
vocab = _get_instantiated_vocab(vocab, pipe_instances)
# Note that we don't load vectors here, instead they get loaded explicitly # Note that we don't load vectors here, instead they get loaded explicitly
# inside stuff like the spacy train function. If we loaded them here, # inside stuff like the spacy train function. If we loaded them here,
@ -1826,6 +1887,11 @@ class Language:
interpolated = filled.interpolate() if not filled.is_interpolated else filled interpolated = filled.interpolate() if not filled.is_interpolated else filled
pipeline = interpolated.get("components", {}) pipeline = interpolated.get("components", {})
sourced = util.get_sourced_components(interpolated) sourced = util.get_sourced_components(interpolated)
# Check for components that aren't in the pipe_instances dict, aren't disabled,
# and aren't built by factory.
missing_components = _find_missing_components(pipeline, pipe_instances, exclude)
if missing_components:
raise ValueError(Errors.E1055.format(names=", ".join(missing_components)))
# If components are loaded from a source (existing models), we cache # If components are loaded from a source (existing models), we cache
# them here so they're only loaded once # them here so they're only loaded once
source_nlps = {} source_nlps = {}
@ -1835,6 +1901,16 @@ class Language:
if pipe_name not in pipeline: if pipe_name not in pipeline:
opts = ", ".join(pipeline.keys()) opts = ", ".join(pipeline.keys())
raise ValueError(Errors.E956.format(name=pipe_name, opts=opts)) raise ValueError(Errors.E956.format(name=pipe_name, opts=opts))
if pipe_name in pipe_instances:
if pipe_name in exclude:
continue
else:
nlp.add_pipe_instance(pipe_instances[pipe_name])
# Is it important that we instantiate pipes that
# aren't excluded? It seems like we would want
# the exclude check above. I've left it how it
# is though, in case there's some sort of crazy
# load-bearing side-effects someone is relying on?
pipe_cfg = util.copy_config(pipeline[pipe_name]) pipe_cfg = util.copy_config(pipeline[pipe_name])
raw_config = Config(filled["components"][pipe_name]) raw_config = Config(filled["components"][pipe_name])
if pipe_name not in exclude: if pipe_name not in exclude:
@ -2337,3 +2413,46 @@ class _Sender:
if self.count >= self.chunk_size: if self.count >= self.chunk_size:
self.count = 0 self.count = 0
self.send() self.send()
def _get_instantiated_vocab(
    vocab: Union[bool, Vocab], pipe_instances: Dict[str, Any]
) -> Union[bool, Vocab]:
    """Choose the Vocab to construct the pipeline with, preferring an explicit
    caller-supplied Vocab and otherwise adopting the first Vocab found on a
    pipe instance. Emits W126 for each instance whose Vocab mismatches the
    chosen one, since mixed vocabs are usually an error.
    """
    # Collect the Vocab of every instance that actually carries one.
    instance_vocabs = {
        name: instance.vocab
        for name, instance in pipe_instances.items()
        if hasattr(instance, "vocab") and isinstance(instance.vocab, Vocab)
    }
    if not instance_vocabs:
        return vocab
    if isinstance(vocab, Vocab):
        # The caller's Vocab wins; flag any instance that disagrees with it.
        for name, pipe_vocab in instance_vocabs.items():
            if pipe_vocab is not vocab:
                warnings.warn(Warnings.W126.format(name=name))
        return vocab
    # No caller Vocab: adopt the first instance Vocab, warn about the rest.
    chosen: Optional[Vocab] = None
    for name, pipe_vocab in instance_vocabs.items():
        if chosen is None:
            chosen = pipe_vocab
        elif pipe_vocab is not chosen:
            warnings.warn(Warnings.W126.format(name=name))
    # Only here when instance_vocabs is non-empty, so this always holds;
    # the assert is for the type checker.
    assert chosen is not None
    return chosen
def _find_missing_components(
    pipeline: Dict[str, Dict[str, Any]],
    pipe_instances: Dict[str, Any],
    exclude: Iterable[str],
) -> List[str]:
    """Return the names of pipeline components that were added by instance
    (their config carries the INSTANCE_FACTORY_NAME sentinel) but are neither
    supplied in 'pipe_instances' nor excluded — i.e. components the config
    alone cannot reconstruct.
    """
    return [
        name
        for name, component_cfg in pipeline.items()
        if component_cfg.get("factory") == INSTANCE_FACTORY_NAME
        and name not in pipe_instances
        and name not in exclude
    ]

View File

@ -800,3 +800,40 @@ def test_component_return():
nlp.add_pipe("test_component_bad_pipe") nlp.add_pipe("test_component_bad_pipe")
with pytest.raises(ValueError, match="instead of a Doc"): with pytest.raises(ValueError, match="instead of a Doc"):
nlp("text") nlp("text")
@pytest.mark.parametrize(
    "components,kwargs,position",
    [
        (["t1", "t2"], {"before": "t1"}, 0),
        # NOTE: the duplicate ({"after": "t1"}, 1) row was removed — it ran
        # the identical case twice.
        (["t1", "t2"], {"after": "t1"}, 1),
        (["t1", "t2"], {"first": True}, 0),
        (["t1", "t2"], {"last": True}, 2),
        (["t1", "t2"], {"last": False}, 2),
        (["t1", "t2"], {"first": False}, ValueError),
    ],
)
def test_add_pipe_instance(components, kwargs, position):
    """Check that add_pipe_instance inserts the instance at the expected
    pipeline position, returns it, and raises ValueError for invalid
    placement arguments.
    """
    nlp = Language()
    for name in components:
        nlp.add_pipe("textcat", name=name)
    pipe_names = list(nlp.pipe_names)
    if isinstance(position, int):
        result = nlp.add_pipe_instance(evil_component, name="new_component", **kwargs)
        assert result is evil_component
        pipe_names.insert(position, "new_component")
        assert nlp.pipe_names == pipe_names
    else:
        with pytest.raises(ValueError):
            # No need to bind the return value: the call must raise.
            nlp.add_pipe_instance(evil_component, name="new_component", **kwargs)
def test_add_pipe_instance_to_bytes():
    """Serializing a pipeline that contains an instance-added component must
    not crash and must yield a non-empty bytes payload. Previously this test
    asserted nothing after the to_bytes() call.
    """
    nlp = Language()
    nlp.add_pipe("textcat", name="t1")
    nlp.add_pipe("textcat", name="t2")
    nlp.add_pipe_instance(evil_component, name="new_component")
    b = nlp.to_bytes()
    assert isinstance(b, bytes) and b

View File

@ -438,6 +438,7 @@ def load_model(
enable: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES, enable: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES,
exclude: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES, exclude: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES,
config: Union[Dict[str, Any], Config] = SimpleFrozenDict(), config: Union[Dict[str, Any], Config] = SimpleFrozenDict(),
pipe_instances: Dict[str, Any] = SimpleFrozenDict(),
) -> "Language": ) -> "Language":
"""Load a model from a package or data path. """Load a model from a package or data path.
@ -449,6 +450,9 @@ def load_model(
exclude (Union[str, Iterable[str]]): Name(s) of pipeline component(s) to exclude. exclude (Union[str, Iterable[str]]): Name(s) of pipeline component(s) to exclude.
config (Dict[str, Any] / Config): Config overrides as nested dict or dict config (Dict[str, Any] / Config): Config overrides as nested dict or dict
keyed by section values in dot notation. keyed by section values in dot notation.
pipe_instances (Dict[str, Any]): Dictionary of components
to be added to the pipeline directly (not created from
config)
RETURNS (Language): The loaded nlp object. RETURNS (Language): The loaded nlp object.
""" """
kwargs = { kwargs = {
@ -457,6 +461,7 @@ def load_model(
"enable": enable, "enable": enable,
"exclude": exclude, "exclude": exclude,
"config": config, "config": config,
"pipe_instances": pipe_instances,
} }
if isinstance(name, str): # name or string path if isinstance(name, str): # name or string path
if name.startswith("blank:"): # shortcut for blank model if name.startswith("blank:"): # shortcut for blank model
@ -480,6 +485,7 @@ def load_model_from_package(
enable: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES, enable: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES,
exclude: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES, exclude: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES,
config: Union[Dict[str, Any], Config] = SimpleFrozenDict(), config: Union[Dict[str, Any], Config] = SimpleFrozenDict(),
pipe_instances: Dict[str, Any] = SimpleFrozenDict(),
) -> "Language": ) -> "Language":
"""Load a model from an installed package. """Load a model from an installed package.
@ -495,10 +501,13 @@ def load_model_from_package(
components won't be loaded. components won't be loaded.
config (Dict[str, Any] / Config): Config overrides as nested dict or dict config (Dict[str, Any] / Config): Config overrides as nested dict or dict
keyed by section values in dot notation. keyed by section values in dot notation.
pipe_instances (Dict[str, Any]): Dictionary of components
to be added to the pipeline directly (not created from
config)
RETURNS (Language): The loaded nlp object. RETURNS (Language): The loaded nlp object.
""" """
cls = importlib.import_module(name) cls = importlib.import_module(name)
return cls.load(vocab=vocab, disable=disable, enable=enable, exclude=exclude, config=config) # type: ignore[attr-defined] return cls.load(vocab=vocab, disable=disable, enable=enable, exclude=exclude, config=config, pipe_instances=pipe_instances) # type: ignore[attr-defined]
def load_model_from_path( def load_model_from_path(
@ -510,6 +519,7 @@ def load_model_from_path(
enable: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES, enable: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES,
exclude: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES, exclude: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES,
config: Union[Dict[str, Any], Config] = SimpleFrozenDict(), config: Union[Dict[str, Any], Config] = SimpleFrozenDict(),
pipe_instances: Dict[str, Any] = SimpleFrozenDict(),
) -> "Language": ) -> "Language":
"""Load a model from a data directory path. Creates Language class with """Load a model from a data directory path. Creates Language class with
pipeline from config.cfg and then calls from_disk() with path. pipeline from config.cfg and then calls from_disk() with path.
@ -527,6 +537,9 @@ def load_model_from_path(
components won't be loaded. components won't be loaded.
config (Dict[str, Any] / Config): Config overrides as nested dict or dict config (Dict[str, Any] / Config): Config overrides as nested dict or dict
keyed by section values in dot notation. keyed by section values in dot notation.
pipe_instances (Dict[str, Any]): Dictionary of components
to be added to the pipeline directly (not created from
config)
RETURNS (Language): The loaded nlp object. RETURNS (Language): The loaded nlp object.
""" """
if not model_path.exists(): if not model_path.exists():
@ -543,6 +556,7 @@ def load_model_from_path(
enable=enable, enable=enable,
exclude=exclude, exclude=exclude,
meta=meta, meta=meta,
pipe_instances=pipe_instances,
) )
return nlp.from_disk(model_path, exclude=exclude, overrides=overrides) return nlp.from_disk(model_path, exclude=exclude, overrides=overrides)
@ -557,6 +571,7 @@ def load_model_from_config(
exclude: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES, exclude: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES,
auto_fill: bool = False, auto_fill: bool = False,
validate: bool = True, validate: bool = True,
pipe_instances: Dict[str, Any] = SimpleFrozenDict(),
) -> "Language": ) -> "Language":
"""Create an nlp object from a config. Expects the full config file including """Create an nlp object from a config. Expects the full config file including
a section "nlp" containing the settings for the nlp object. a section "nlp" containing the settings for the nlp object.
@ -574,6 +589,9 @@ def load_model_from_config(
components won't be loaded. components won't be loaded.
auto_fill (bool): Whether to auto-fill config with missing defaults. auto_fill (bool): Whether to auto-fill config with missing defaults.
validate (bool): Whether to show config validation errors. validate (bool): Whether to show config validation errors.
pipe_instances (Dict[str, Any]): Dictionary of components
to be added to the pipeline directly (not created from
config)
RETURNS (Language): The loaded nlp object. RETURNS (Language): The loaded nlp object.
""" """
if "nlp" not in config: if "nlp" not in config:
@ -593,6 +611,7 @@ def load_model_from_config(
auto_fill=auto_fill, auto_fill=auto_fill,
validate=validate, validate=validate,
meta=meta, meta=meta,
pipe_instances=pipe_instances,
) )
return nlp return nlp
@ -656,6 +675,7 @@ def load_model_from_init_py(
enable: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES, enable: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES,
exclude: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES, exclude: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES,
config: Union[Dict[str, Any], Config] = SimpleFrozenDict(), config: Union[Dict[str, Any], Config] = SimpleFrozenDict(),
pipe_instances: Dict[str, Any] = SimpleFrozenDict(),
) -> "Language": ) -> "Language":
"""Helper function to use in the `load()` method of a model package's """Helper function to use in the `load()` method of a model package's
__init__.py. __init__.py.
@ -671,6 +691,9 @@ def load_model_from_init_py(
components won't be loaded. components won't be loaded.
config (Dict[str, Any] / Config): Config overrides as nested dict or dict config (Dict[str, Any] / Config): Config overrides as nested dict or dict
keyed by section values in dot notation. keyed by section values in dot notation.
pipe_instances (Dict[str, Any]): Dictionary of components
to be added to the pipeline directly (not created from
config)
RETURNS (Language): The loaded nlp object. RETURNS (Language): The loaded nlp object.
""" """
model_path = Path(init_file).parent model_path = Path(init_file).parent