From 6f821efaf31880600f20a0a153f2ffb085c34be1 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 10 Jun 2023 16:53:59 +0200 Subject: [PATCH 1/9] Add errors for pipe instance problems --- spacy/errors.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/spacy/errors.py b/spacy/errors.py index 40cfa8d92..f9bee07c9 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -215,6 +215,11 @@ class Warnings(metaclass=ErrorsWithCodes): W123 = ("Argument `enable` with value {enable} does not contain all values specified in the config option " "`enabled` ({enabled}). Be aware that this might affect other components in your pipeline.") W124 = ("{host}:{port} is already in use, using the nearest available port {serve_port} as an alternative.") + W125 = ( + "Pipe instance '{name}' is being added with a vocab " + "instance that will not match other components. This is " + "usually an error." + ) class Errors(metaclass=ErrorsWithCodes): @@ -970,6 +975,7 @@ class Errors(metaclass=ErrorsWithCodes): E1050 = ("Port {port} is already in use. Please specify an available port with `displacy.serve(doc, port=port)` " "or use `auto_select_port=True` to pick an available port automatically.") E1051 = ("'allow_overlap' can only be False when max_positive is 1, but found 'max_positive': {max_positive}.") + E1052 = ("Cannot create Language instance from config: missing pipeline components. 
The following components were added by instance (rather than config) via the 'Language.add_pipe_instance()' method, but are not present in the 'pipe_instances' variable: {names}") # Deprecated model shortcuts, only used in errors and warnings From aa0d747739bf361844d38311be6834b19d6bf5ca Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 10 Jun 2023 16:55:13 +0200 Subject: [PATCH 2/9] Support adding pipeline component by instance --- spacy/language.py | 115 ++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 112 insertions(+), 3 deletions(-) diff --git a/spacy/language.py b/spacy/language.py index 9fdcf6328..7e52c60c3 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -1,6 +1,6 @@ from typing import Iterator, Optional, Any, Dict, Callable, Iterable -from typing import Union, Tuple, List, Set, Pattern, Sequence -from typing import NoReturn, TYPE_CHECKING, TypeVar, cast, overload +from typing import Union, Tuple, List, Set, Pattern, Sequence, overload +from typing import NoReturn, TYPE_CHECKING, TypeVar, cast from dataclasses import dataclass import random @@ -52,6 +52,9 @@ DEFAULT_CONFIG = util.load_config(DEFAULT_CONFIG_PATH) # This is the base config for the [pretraining] block and currently not included # in the main config and only added via the 'init fill-config' command DEFAULT_CONFIG_PRETRAIN_PATH = Path(__file__).parent / "default_config_pretraining.cfg" +# Factory name indicating that the component wasn't constructed by a factory, +# and was instead passed by instance +INSTANCE_FACTORY_NAME = "__added_by_instance__" # Type variable for contexts piped with documents _AnyContext = TypeVar("_AnyContext") @@ -743,6 +746,9 @@ class Language: """Add a component to the processing pipeline. Valid components are callables that take a `Doc` object, modify it and return it. Only one of before/after/first/last can be set. Default behaviour is "last". + Components can be added either by factory name or by instance. 
If + an instance is supplied and you serialize the pipeline, you'll need + to also pass an instance into spacy.load() to construct the pipeline. factory_name (str): Name of the component factory. name (str): Name of pipeline component. Overwrites existing @@ -790,11 +796,58 @@ class Language: raw_config=raw_config, validate=validate, ) - pipe_index = self._get_pipe_index(before, after, first, last) self._pipe_meta[name] = self.get_factory_meta(factory_name) + pipe_index = self._get_pipe_index(before, after, first, last) self._components.insert(pipe_index, (name, pipe_component)) return pipe_component + def add_pipe_instance(self, component: PipeCallable, + /, name: Optional[str] = None, + *, + before: Optional[Union[str, int]] = None, + after: Optional[Union[str, int]] = None, + first: Optional[bool] = None, + last: Optional[bool] = None, + ) -> PipeCallable: + """Add a component instance to the processing pipeline. Valid components + are callables that take a `Doc` object, modify it and return it. Only one + of before/after/first/last can be set. Default behaviour is "last". + + A limitation of this method is that spaCy will not know how to reconstruct + your pipeline after you save it out (unlike the 'Language.add_pipe()' method, + where you provide a config and let spaCy construct the instance). See 'spacy.load' + for details of how to load back a pipeline with components added by instance. + + component (Callable[[Doc], Doc]): The component to add. + name (str): Name of pipeline component. Overwrites existing + component.name attribute if available. If no name is set and + the component exposes no name attribute, component.__name__ is + used. An error is raised if a name already exists in the pipeline. + before (Union[str, int]): Name or index of the component to insert new + component directly before. + after (Union[str, int]): Name or index of the component to insert new + component directly after. 
+ first (bool): If True, insert component first in the pipeline. + last (bool): If True, insert component last in the pipeline. + RETURNS (Callable[[Doc], Doc]): The pipeline component. + + DOCS: https://spacy.io/api/language#add_pipe_instance + """ + name = name if name is not None else getattr(component, "name") + if name is None: + raise ValueError("TODO error") + if name in self.component_names: + raise ValueError(Errors.E007.format(name=name, opts=self.component_names)) + + # It would be possible to take arguments for the FactoryMeta here, but we'll then have + # a problem on deserialization: where will the data be coming from? + # I think if someone wants that, they should register a component function. + self._pipe_meta[name] = FactoryMeta(INSTANCE_FACTORY_NAME) + self._pipe_configs[name] = Config() + pipe_index = self._get_pipe_index(before, after, first, last) + self._components.insert(pipe_index, (name, component)) + return component + def _get_pipe_index( self, before: Optional[Union[str, int]] = None, @@ -1690,6 +1743,7 @@ class Language: meta: Dict[str, Any] = SimpleFrozenDict(), auto_fill: bool = True, validate: bool = True, + pipe_instances: Dict[str, Any] = SimpleFrozenDict() ) -> "Language": """Create the nlp object from a loaded config. Will set up the tokenizer and language data, add pipeline components etc. If no config is provided, @@ -1765,6 +1819,11 @@ class Language: # Warn about require_gpu usage in jupyter notebook warn_if_jupyter_cupy() + # If we've been passed pipe instances, check whether + # they have a Vocab instance, and if they do, use + # that one. This also performs some additional checks and + # warns if there's a mismatch. + vocab = _get_instantiated_vocab(vocab, pipe_instances) # Note that we don't load vectors here, instead they get loaded explicitly # inside stuff like the spacy train function. 
If we loaded them here, @@ -1781,6 +1840,11 @@ class Language: interpolated = filled.interpolate() if not filled.is_interpolated else filled pipeline = interpolated.get("components", {}) sourced = util.get_sourced_components(interpolated) + # Check for components that aren't in the pipe_instances dict, aren't disabled, + # and aren't built by factory. + missing_components = _find_missing_components(pipeline, pipe_instances, exclude) + if missing_components: + raise ValueError(Errors.E1052.format(", ",join(missing_components))) # If components are loaded from a source (existing models), we cache # them here so they're only loaded once source_nlps = {} @@ -1790,6 +1854,18 @@ class Language: if pipe_name not in pipeline: opts = ", ".join(pipeline.keys()) raise ValueError(Errors.E956.format(name=pipe_name, opts=opts)) + if pipe_name in pipe_instances: + if pipe_name in exclude: + continue + else: + nlp.add_pipe_instance( + pipe_instances[pipe_name] + ) + # Is it important that we instantiate pipes that + # aren't excluded? It seems like we would want + # the exclude check above. I've left it how it + # is though, in case there's some sort of crazy + # load-bearing side-effects someone is relying on? 
pipe_cfg = util.copy_config(pipeline[pipe_name]) raw_config = Config(filled["components"][pipe_name]) if pipe_name not in exclude: @@ -2306,3 +2382,36 @@ class _Sender: if self.count >= self.chunk_size: self.count = 0 self.send() + + +def _get_instantiated_vocab(vocab: Union[bool, Vocab], pipe_instances: Dict[str, Any]) -> Union[bool, Vocab]: + vocab_instances = {} + for name, instance in pipe_instances.items(): + if hasattr(instance, "vocab") and isinstance(instance.vocab, Vocab): + vocab_instances[name] = instance.vocab + if not vocab_instances: + return vocab + elif isinstance(vocab, Vocab): + for name, inst_voc in vocab_instances.items(): + if inst_voc is not vocab: + warnings.warn(Warnings.W125.format(name=name)) + return vocab + else: + resolved_vocab = None + for name, inst_voc in vocab_instances.items(): + if resolved_vocab is None: + resolved_vocab = inst_voc + elif inst_voc is not resolved_vocab: + warnings.warn(Warnings.W125.format(name=name)) + # This is supposed to only be for the type checker -- + # it should be unreachable + assert resolved_vocab is not None + return resolved_vocab + + +def _find_missing_components( + pipeline: List[str], + pipe_instances: Dict[str, Any], + exclude: List[str] +) -> List[str]: + return [name for name in pipeline if name not in pipe_instances and name not in exclude] From 4332d12ce28807f1102f5dc81a285c959ce72fad Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 10 Jun 2023 16:55:52 +0200 Subject: [PATCH 3/9] Support adding pipeline component by instance --- spacy/__init__.py | 2 ++ spacy/tests/test_language.py | 33 +++++++++++++++++++++++++++++++++ spacy/util.py | 21 ++++++++++++++++++++- 3 files changed, 55 insertions(+), 1 deletion(-) diff --git a/spacy/__init__.py b/spacy/__init__.py index c3568bc5c..995f965ae 100644 --- a/spacy/__init__.py +++ b/spacy/__init__.py @@ -35,6 +35,7 @@ def load( enable: Union[str, Iterable[str]] = util._DEFAULT_EMPTY_PIPES, exclude: Union[str, Iterable[str]] = 
util._DEFAULT_EMPTY_PIPES, config: Union[Dict[str, Any], Config] = util.SimpleFrozenDict(), + pipe_instances: Dict[str, Any] = util.SimpleFrozenDict(), ) -> Language: """Load a spaCy model from an installed package or a local path. @@ -58,6 +59,7 @@ def load( enable=enable, exclude=exclude, config=config, + pipe_instances=pipe_instances, ) diff --git a/spacy/tests/test_language.py b/spacy/tests/test_language.py index 236856dad..02e58d0a0 100644 --- a/spacy/tests/test_language.py +++ b/spacy/tests/test_language.py @@ -799,3 +799,36 @@ def test_component_return(): nlp.add_pipe("test_component_bad_pipe") with pytest.raises(ValueError, match="instead of a Doc"): nlp("text") + + +@pytest.mark.parametrize("components,kwargs,position", [ + (["t1", "t2"], {"before": "t1"}, 0), + (["t1", "t2"], {"after": "t1"}, 1), + (["t1", "t2"], {"after": "t1"}, 1), + (["t1", "t2"], {"first": True}, 0), + (["t1", "t2"], {"last": True}, 2), + (["t1", "t2"], {"last": False}, 2), + (["t1", "t2"], {"first": False}, ValueError), +]) +def test_add_pipe_instance(components, kwargs, position): + nlp = Language() + for name in components: + nlp.add_pipe("textcat", name=name) + pipe_names = list(nlp.pipe_names) + if isinstance(position, int): + result = nlp.add_pipe_instance(evil_component, name="new_component", **kwargs) + assert result is evil_component + pipe_names.insert(position, "new_component") + assert nlp.pipe_names == pipe_names + else: + with pytest.raises(ValueError): + result = nlp.add_pipe_instance(evil_component, name="new_component", **kwargs) + + +def test_add_pipe_instance_to_bytes(): + nlp = Language() + nlp.add_pipe("textcat", name="t1") + nlp.add_pipe("textcat", name="t2") + nlp.add_pipe_instance(evil_component, name="new_component") + b = nlp.to_bytes() + diff --git a/spacy/util.py b/spacy/util.py index 8cc89217d..fce3f73be 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -415,6 +415,7 @@ def load_model( enable: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES, exclude: 
Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES, config: Union[Dict[str, Any], Config] = SimpleFrozenDict(), + pipe_instances: Dict[str, Any] = SimpleFrozenDict() ) -> "Language": """Load a model from a package or data path. @@ -426,6 +427,9 @@ def load_model( exclude (Union[str, Iterable[str]]): Name(s) of pipeline component(s) to exclude. config (Dict[str, Any] / Config): Config overrides as nested dict or dict keyed by section values in dot notation. + pipe_instances (Dict[str, Any]): Dictionary of components + to be added to the pipeline directly (not created from + config) RETURNS (Language): The loaded nlp object. """ kwargs = { @@ -434,6 +438,7 @@ def load_model( "enable": enable, "exclude": exclude, "config": config, + "pipe_instances": pipe_instances } if isinstance(name, str): # name or string path if name.startswith("blank:"): # shortcut for blank model @@ -457,6 +462,7 @@ def load_model_from_package( enable: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES, exclude: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES, config: Union[Dict[str, Any], Config] = SimpleFrozenDict(), + pipe_instances: Dict[str, Any] = SimpleFrozenDict() ) -> "Language": """Load a model from an installed package. @@ -472,10 +478,13 @@ def load_model_from_package( components won't be loaded. config (Dict[str, Any] / Config): Config overrides as nested dict or dict keyed by section values in dot notation. + pipe_instances (Dict[str, Any]): Dictionary of components + to be added to the pipeline directly (not created from + config) RETURNS (Language): The loaded nlp object. 
""" cls = importlib.import_module(name) - return cls.load(vocab=vocab, disable=disable, enable=enable, exclude=exclude, config=config) # type: ignore[attr-defined] + return cls.load(vocab=vocab, disable=disable, enable=enable, exclude=exclude, config=config, pipe_instances=pipe_instances) # type: ignore[attr-defined] def load_model_from_path( @@ -487,6 +496,7 @@ def load_model_from_path( enable: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES, exclude: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES, config: Union[Dict[str, Any], Config] = SimpleFrozenDict(), + pipe_instances: Dict[str, Any] = SimpleFrozenDict() ) -> "Language": """Load a model from a data directory path. Creates Language class with pipeline from config.cfg and then calls from_disk() with path. @@ -504,6 +514,9 @@ def load_model_from_path( components won't be loaded. config (Dict[str, Any] / Config): Config overrides as nested dict or dict keyed by section values in dot notation. + pipe_instances (Dict[str, Any]): Dictionary of components + to be added to the pipeline directly (not created from + config) RETURNS (Language): The loaded nlp object. """ if not model_path.exists(): @@ -520,6 +533,7 @@ def load_model_from_path( enable=enable, exclude=exclude, meta=meta, + pipe_instances=pipe_instances ) return nlp.from_disk(model_path, exclude=exclude, overrides=overrides) @@ -534,6 +548,7 @@ def load_model_from_config( exclude: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES, auto_fill: bool = False, validate: bool = True, + pipe_instances: Dict[str, Any] = SimpleFrozenDict() ) -> "Language": """Create an nlp object from a config. Expects the full config file including a section "nlp" containing the settings for the nlp object. @@ -551,6 +566,9 @@ def load_model_from_config( components won't be loaded. auto_fill (bool): Whether to auto-fill config with missing defaults. validate (bool): Whether to show config validation errors. 
+ pipe_instances (Dict[str, Any]): Dictionary of components + to be added to the pipeline directly (not created from + config) RETURNS (Language): The loaded nlp object. """ if "nlp" not in config: @@ -570,6 +588,7 @@ def load_model_from_config( auto_fill=auto_fill, validate=validate, meta=meta, + pipe_instances=pipe_instances ) return nlp From b9730a64cb3a620be6635433f93ee3d6d870edef Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 10 Jun 2023 16:56:10 +0200 Subject: [PATCH 4/9] Format --- spacy/language.py | 27 +++++++++++++++------------ spacy/tests/test_language.py | 26 +++++++++++++++----------- spacy/util.py | 18 +++++++++--------- 3 files changed, 39 insertions(+), 32 deletions(-) diff --git a/spacy/language.py b/spacy/language.py index 7e52c60c3..03d8f4dee 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -801,8 +801,11 @@ class Language: self._components.insert(pipe_index, (name, pipe_component)) return pipe_component - def add_pipe_instance(self, component: PipeCallable, - /, name: Optional[str] = None, + def add_pipe_instance( + self, + component: PipeCallable, + /, + name: Optional[str] = None, *, before: Optional[Union[str, int]] = None, after: Optional[Union[str, int]] = None, @@ -1743,7 +1746,7 @@ class Language: meta: Dict[str, Any] = SimpleFrozenDict(), auto_fill: bool = True, validate: bool = True, - pipe_instances: Dict[str, Any] = SimpleFrozenDict() + pipe_instances: Dict[str, Any] = SimpleFrozenDict(), ) -> "Language": """Create the nlp object from a loaded config. Will set up the tokenizer and language data, add pipeline components etc. If no config is provided, @@ -1844,7 +1847,7 @@ class Language: # and aren't built by factory. 
missing_components = _find_missing_components(pipeline, pipe_instances, exclude) if missing_components: - raise ValueError(Errors.E1052.format(", ",join(missing_components))) + raise ValueError(Errors.E1052.format(", ", join(missing_components))) # If components are loaded from a source (existing models), we cache # them here so they're only loaded once source_nlps = {} @@ -1858,9 +1861,7 @@ class Language: if pipe_name in exclude: continue else: - nlp.add_pipe_instance( - pipe_instances[pipe_name] - ) + nlp.add_pipe_instance(pipe_instances[pipe_name]) # Is it important that we instantiate pipes that # aren't excluded? It seems like we would want # the exclude check above. I've left it how it @@ -2384,7 +2385,9 @@ class _Sender: self.send() -def _get_instantiated_vocab(vocab: Union[bool, Vocab], pipe_instances: Dict[str, Any]) -> Union[bool, Vocab]: +def _get_instantiated_vocab( + vocab: Union[bool, Vocab], pipe_instances: Dict[str, Any] +) -> Union[bool, Vocab]: vocab_instances = {} for name, instance in pipe_instances.items(): if hasattr(instance, "vocab") and isinstance(instance.vocab, Vocab): @@ -2410,8 +2413,8 @@ def _get_instantiated_vocab(vocab: Union[bool, Vocab], pipe_instances: Dict[str, def _find_missing_components( - pipeline: List[str], - pipe_instances: Dict[str, Any], - exclude: List[str] + pipeline: List[str], pipe_instances: Dict[str, Any], exclude: List[str] ) -> List[str]: - return [name for name in pipeline if name not in pipe_instances and name not in exclude] + return [ + name for name in pipeline if name not in pipe_instances and name not in exclude + ] diff --git a/spacy/tests/test_language.py b/spacy/tests/test_language.py index 02e58d0a0..9c1d63e74 100644 --- a/spacy/tests/test_language.py +++ b/spacy/tests/test_language.py @@ -801,15 +801,18 @@ def test_component_return(): nlp("text") -@pytest.mark.parametrize("components,kwargs,position", [ - (["t1", "t2"], {"before": "t1"}, 0), - (["t1", "t2"], {"after": "t1"}, 1), - (["t1", "t2"], 
{"after": "t1"}, 1), - (["t1", "t2"], {"first": True}, 0), - (["t1", "t2"], {"last": True}, 2), - (["t1", "t2"], {"last": False}, 2), - (["t1", "t2"], {"first": False}, ValueError), -]) +@pytest.mark.parametrize( + "components,kwargs,position", + [ + (["t1", "t2"], {"before": "t1"}, 0), + (["t1", "t2"], {"after": "t1"}, 1), + (["t1", "t2"], {"after": "t1"}, 1), + (["t1", "t2"], {"first": True}, 0), + (["t1", "t2"], {"last": True}, 2), + (["t1", "t2"], {"last": False}, 2), + (["t1", "t2"], {"first": False}, ValueError), + ], +) def test_add_pipe_instance(components, kwargs, position): nlp = Language() for name in components: @@ -822,7 +825,9 @@ def test_add_pipe_instance(components, kwargs, position): assert nlp.pipe_names == pipe_names else: with pytest.raises(ValueError): - result = nlp.add_pipe_instance(evil_component, name="new_component", **kwargs) + result = nlp.add_pipe_instance( + evil_component, name="new_component", **kwargs + ) def test_add_pipe_instance_to_bytes(): @@ -831,4 +836,3 @@ def test_add_pipe_instance_to_bytes(): nlp.add_pipe("textcat", name="t2") nlp.add_pipe_instance(evil_component, name="new_component") b = nlp.to_bytes() - diff --git a/spacy/util.py b/spacy/util.py index fce3f73be..75602dfbf 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -415,7 +415,7 @@ def load_model( enable: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES, exclude: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES, config: Union[Dict[str, Any], Config] = SimpleFrozenDict(), - pipe_instances: Dict[str, Any] = SimpleFrozenDict() + pipe_instances: Dict[str, Any] = SimpleFrozenDict(), ) -> "Language": """Load a model from a package or data path. @@ -427,7 +427,7 @@ def load_model( exclude (Union[str, Iterable[str]]): Name(s) of pipeline component(s) to exclude. config (Dict[str, Any] / Config): Config overrides as nested dict or dict keyed by section values in dot notation. 
- pipe_instances (Dict[str, Any]): Dictionary of components + pipe_instances (Dict[str, Any]): Dictionary of components to be added to the pipeline directly (not created from config) RETURNS (Language): The loaded nlp object. @@ -438,7 +438,7 @@ def load_model( "enable": enable, "exclude": exclude, "config": config, - "pipe_instances": pipe_instances + "pipe_instances": pipe_instances, } if isinstance(name, str): # name or string path if name.startswith("blank:"): # shortcut for blank model @@ -462,7 +462,7 @@ def load_model_from_package( enable: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES, exclude: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES, config: Union[Dict[str, Any], Config] = SimpleFrozenDict(), - pipe_instances: Dict[str, Any] = SimpleFrozenDict() + pipe_instances: Dict[str, Any] = SimpleFrozenDict(), ) -> "Language": """Load a model from an installed package. @@ -478,7 +478,7 @@ def load_model_from_package( components won't be loaded. config (Dict[str, Any] / Config): Config overrides as nested dict or dict keyed by section values in dot notation. - pipe_instances (Dict[str, Any]): Dictionary of components + pipe_instances (Dict[str, Any]): Dictionary of components to be added to the pipeline directly (not created from config) RETURNS (Language): The loaded nlp object. @@ -496,7 +496,7 @@ def load_model_from_path( enable: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES, exclude: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES, config: Union[Dict[str, Any], Config] = SimpleFrozenDict(), - pipe_instances: Dict[str, Any] = SimpleFrozenDict() + pipe_instances: Dict[str, Any] = SimpleFrozenDict(), ) -> "Language": """Load a model from a data directory path. Creates Language class with pipeline from config.cfg and then calls from_disk() with path. 
@@ -533,7 +533,7 @@ def load_model_from_path( enable=enable, exclude=exclude, meta=meta, - pipe_instances=pipe_instances + pipe_instances=pipe_instances, ) return nlp.from_disk(model_path, exclude=exclude, overrides=overrides) @@ -548,7 +548,7 @@ def load_model_from_config( exclude: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES, auto_fill: bool = False, validate: bool = True, - pipe_instances: Dict[str, Any] = SimpleFrozenDict() + pipe_instances: Dict[str, Any] = SimpleFrozenDict(), ) -> "Language": """Create an nlp object from a config. Expects the full config file including a section "nlp" containing the settings for the nlp object. @@ -588,7 +588,7 @@ def load_model_from_config( auto_fill=auto_fill, validate=validate, meta=meta, - pipe_instances=pipe_instances + pipe_instances=pipe_instances, ) return nlp From afbdd8259a12ada463ef5319f963f5e217670dc3 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 10 Jun 2023 17:54:37 +0200 Subject: [PATCH 5/9] Fix find missing pipes --- spacy/language.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/spacy/language.py b/spacy/language.py index 03d8f4dee..c4962ea45 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -1847,7 +1847,7 @@ class Language: # and aren't built by factory. 
missing_components = _find_missing_components(pipeline, pipe_instances, exclude) if missing_components: - raise ValueError(Errors.E1052.format(", ", join(missing_components))) + raise ValueError(Errors.E1052.format(names=", ".join(missing_components))) # If components are loaded from a source (existing models), we cache # them here so they're only loaded once source_nlps = {} @@ -2413,8 +2413,10 @@ def _get_instantiated_vocab( def _find_missing_components( - pipeline: List[str], pipe_instances: Dict[str, Any], exclude: List[str] + pipeline: Dict[str, Dict[str, Any]], pipe_instances: Dict[str, Any], exclude: Iterable[str] ) -> List[str]: - return [ - name for name in pipeline if name not in pipe_instances and name not in exclude - ] + missing = [] + for name, config in pipeline.items(): + if config.get("factory") == INSTANCE_FACTORY_NAME and name not in pipe_instances and name not in exclude: + missing.append(name) + return missing From 77a08591ad48e0503012881b289391492e03df38 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 13 Jun 2023 10:06:00 +0200 Subject: [PATCH 6/9] Format --- spacy/language.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/spacy/language.py b/spacy/language.py index 6d867820a..dc4896e11 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -2419,10 +2419,16 @@ def _get_instantiated_vocab( def _find_missing_components( - pipeline: Dict[str, Dict[str, Any]], pipe_instances: Dict[str, Any], exclude: Iterable[str] + pipeline: Dict[str, Dict[str, Any]], + pipe_instances: Dict[str, Any], + exclude: Iterable[str], ) -> List[str]: missing = [] for name, config in pipeline.items(): - if config.get("factory") == INSTANCE_FACTORY_NAME and name not in pipe_instances and name not in exclude: + if ( + config.get("factory") == INSTANCE_FACTORY_NAME + and name not in pipe_instances + and name not in exclude + ): missing.append(name) return missing From 4cc5bd3ef5c471b8891c0398a047e4848ed6e8ee Mon Sep 17 00:00:00 
2001 From: svlandeg Date: Mon, 26 Jun 2023 11:25:16 +0200 Subject: [PATCH 7/9] fix --- spacy/language.py | 1 - 1 file changed, 1 deletion(-) diff --git a/spacy/language.py b/spacy/language.py index ab4637c79..0b50b058a 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -827,7 +827,6 @@ class Language: def add_pipe_instance( self, component: PipeCallable, - /, name: Optional[str] = None, *, before: Optional[Union[str, int]] = None, From 9fcbc8eb67e0f088bd40dcdddd2c0fd187916a76 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Mon, 26 Jun 2023 11:46:09 +0200 Subject: [PATCH 8/9] add pipe_instances also to load_model_from_init_py --- spacy/util.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/spacy/util.py b/spacy/util.py index 35278c848..24be79fa6 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -675,6 +675,7 @@ def load_model_from_init_py( enable: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES, exclude: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES, config: Union[Dict[str, Any], Config] = SimpleFrozenDict(), + pipe_instances: Dict[str, Any] = SimpleFrozenDict(), ) -> "Language": """Helper function to use in the `load()` method of a model package's __init__.py. @@ -690,6 +691,9 @@ def load_model_from_init_py( components won't be loaded. config (Dict[str, Any] / Config): Config overrides as nested dict or dict keyed by section values in dot notation. + pipe_instances (Dict[str, Any]): Dictionary of components + to be added to the pipeline directly (not created from + config) RETURNS (Language): The loaded nlp object. 
""" model_path = Path(init_file).parent From 0fd797e33c6f4839389215bf1c83683229a49ca6 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 5 Jul 2023 10:35:41 +0200 Subject: [PATCH 9/9] fix warning numbers --- spacy/errors.py | 8 +++----- spacy/language.py | 4 ++-- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/spacy/errors.py b/spacy/errors.py index b6f45dc0f..2a6653a5a 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -219,11 +219,9 @@ class Warnings(metaclass=ErrorsWithCodes): W125 = ("The StaticVectors key_attr is no longer used. To set a custom " "key attribute for vectors, configure it through Vectors(attr=) or " "'spacy init vectors --attr'") - W126 = ( - "Pipe instance '{name}' is being added with a vocab " - "instance that will not match other components. This is " - "usually an error." - ) + W126 = ("Pipe instance '{name}' is being added with a vocab " + "instance that will not match other components. This is " + "usually an error.") class Errors(metaclass=ErrorsWithCodes): diff --git a/spacy/language.py b/spacy/language.py index 439a8df87..da4530f53 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -2427,7 +2427,7 @@ def _get_instantiated_vocab( elif isinstance(vocab, Vocab): for name, inst_voc in vocab_instances.items(): if inst_voc is not vocab: - warnings.warn(Warnings.W125.format(name=name)) + warnings.warn(Warnings.W126.format(name=name)) return vocab else: resolved_vocab = None @@ -2435,7 +2435,7 @@ def _get_instantiated_vocab( if resolved_vocab is None: resolved_vocab = inst_voc elif inst_voc is not resolved_vocab: - warnings.warn(Warnings.W125.format(name=name)) + warnings.warn(Warnings.W126.format(name=name)) # This is supposed to only be for the type checker -- # it should be unreachable assert resolved_vocab is not None