This commit is contained in:
Matthew Honnibal 2023-07-15 08:42:57 +02:00 committed by GitHub
commit 708ee32f9f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 187 additions and 2 deletions

View File

@ -32,6 +32,7 @@ def load(
enable: Union[str, Iterable[str]] = util._DEFAULT_EMPTY_PIPES, enable: Union[str, Iterable[str]] = util._DEFAULT_EMPTY_PIPES,
exclude: Union[str, Iterable[str]] = util._DEFAULT_EMPTY_PIPES, exclude: Union[str, Iterable[str]] = util._DEFAULT_EMPTY_PIPES,
config: Union[Dict[str, Any], Config] = util.SimpleFrozenDict(), config: Union[Dict[str, Any], Config] = util.SimpleFrozenDict(),
pipe_instances: Dict[str, Any] = util.SimpleFrozenDict(),
) -> Language: ) -> Language:
"""Load a spaCy model from an installed package or a local path. """Load a spaCy model from an installed package or a local path.
@ -55,6 +56,7 @@ def load(
enable=enable, enable=enable,
exclude=exclude, exclude=exclude,
config=config, config=config,
pipe_instances=pipe_instances,
) )

View File

@ -219,6 +219,9 @@ class Warnings(metaclass=ErrorsWithCodes):
W125 = ("The StaticVectors key_attr is no longer used. To set a custom " W125 = ("The StaticVectors key_attr is no longer used. To set a custom "
"key attribute for vectors, configure it through Vectors(attr=) or " "key attribute for vectors, configure it through Vectors(attr=) or "
"'spacy init vectors --attr'") "'spacy init vectors --attr'")
W126 = ("Pipe instance '{name}' is being added with a vocab "
"instance that will not match other components. This is "
"usually an error.")
class Errors(metaclass=ErrorsWithCodes): class Errors(metaclass=ErrorsWithCodes):
@ -981,6 +984,7 @@ class Errors(metaclass=ErrorsWithCodes):
" 'min_length': {min_length}, 'max_length': {max_length}") " 'min_length': {min_length}, 'max_length': {max_length}")
E1054 = ("The text, including whitespace, must match between reference and " E1054 = ("The text, including whitespace, must match between reference and "
"predicted docs when training {component}.") "predicted docs when training {component}.")
E1055 = ("Cannot create Language instance from config: missing pipeline components. The following components were added by instance (rather than config) via the 'Language.add_pipe_instance()' method, but are not present in the 'pipe_instances' variable: {names}")
# Deprecated model shortcuts, only used in errors and warnings # Deprecated model shortcuts, only used in errors and warnings

View File

@ -75,6 +75,9 @@ DEFAULT_CONFIG = util.load_config(DEFAULT_CONFIG_PATH)
# This is the base config for the [pretraining] block and currently not included # This is the base config for the [pretraining] block and currently not included
# in the main config and only added via the 'init fill-config' command # in the main config and only added via the 'init fill-config' command
DEFAULT_CONFIG_PRETRAIN_PATH = Path(__file__).parent / "default_config_pretraining.cfg" DEFAULT_CONFIG_PRETRAIN_PATH = Path(__file__).parent / "default_config_pretraining.cfg"
# Factory name indicating that the component wasn't constructed by a factory,
# and was instead passed by instance
INSTANCE_FACTORY_NAME = "__added_by_instance__"
# Type variable for contexts piped with documents # Type variable for contexts piped with documents
_AnyContext = TypeVar("_AnyContext") _AnyContext = TypeVar("_AnyContext")
@ -771,6 +774,9 @@ class Language:
"""Add a component to the processing pipeline. Valid components are """Add a component to the processing pipeline. Valid components are
callables that take a `Doc` object, modify it and return it. Only one callables that take a `Doc` object, modify it and return it. Only one
of before/after/first/last can be set. Default behaviour is "last". of before/after/first/last can be set. Default behaviour is "last".
Components can be added either by factory name or by instance. If
an instance is supplied and you serialize the pipeline, you'll need
to also pass an instance into spacy.load() to construct the pipeline.
factory_name (str): Name of the component factory. factory_name (str): Name of the component factory.
name (str): Name of pipeline component. Overwrites existing name (str): Name of pipeline component. Overwrites existing
@ -818,12 +824,61 @@ class Language:
raw_config=raw_config, raw_config=raw_config,
validate=validate, validate=validate,
) )
pipe_index = self._get_pipe_index(before, after, first, last)
self._pipe_meta[name] = self.get_factory_meta(factory_name) self._pipe_meta[name] = self.get_factory_meta(factory_name)
pipe_index = self._get_pipe_index(before, after, first, last)
self._components.insert(pipe_index, (name, pipe_component)) self._components.insert(pipe_index, (name, pipe_component))
self._link_components() self._link_components()
return pipe_component return pipe_component
def add_pipe_instance(
    self,
    component: PipeCallable,
    name: Optional[str] = None,
    *,
    before: Optional[Union[str, int]] = None,
    after: Optional[Union[str, int]] = None,
    first: Optional[bool] = None,
    last: Optional[bool] = None,
) -> PipeCallable:
    """Add a component instance to the processing pipeline. Valid components
    are callables that take a `Doc` object, modify it and return it. Only one
    of before/after/first/last can be set. Default behaviour is "last".

    A limitation of this method is that spaCy will not know how to reconstruct
    your pipeline after you save it out (unlike the 'Language.add_pipe()' method,
    where you provide a config and let spaCy construct the instance). See 'spacy.load'
    for details of how to load back a pipeline with components added by instance.

    component (Callable[[Doc], Doc]): The component to add.
    name (str): Name of pipeline component. Overwrites existing
        component.name attribute if available. If no name is set and
        the component exposes no name attribute, component.__name__ is
        used. An error is raised if a name already exists in the pipeline.
    before (Union[str, int]): Name or index of the component to insert new
        component directly before.
    after (Union[str, int]): Name or index of the component to insert new
        component directly after.
    first (bool): If True, insert component first in the pipeline.
    last (bool): If True, insert component last in the pipeline.
    RETURNS (Callable[[Doc], Doc]): The pipeline component.

    DOCS: https://spacy.io/api/language#add_pipe_instance
    """
    if name is None:
        # Fall back to the component's own 'name' attribute, then to its
        # __name__, as documented above. Use a default so a component
        # without either attribute doesn't raise AttributeError here.
        name = getattr(component, "name", None)
    if name is None:
        name = getattr(component, "__name__", None)
    if name is None:
        raise ValueError(
            "Could not determine a name for the pipe instance. Pass an "
            "explicit 'name' argument, or use a component that exposes a "
            "'name' or '__name__' attribute."
        )
    if name in self.component_names:
        raise ValueError(Errors.E007.format(name=name, opts=self.component_names))
    # It would be possible to take arguments for the FactoryMeta here, but we'll then have
    # a problem on deserialization: where will the data be coming from?
    # I think if someone wants that, they should register a component function.
    self._pipe_meta[name] = FactoryMeta(INSTANCE_FACTORY_NAME)
    self._pipe_configs[name] = Config()
    pipe_index = self._get_pipe_index(before, after, first, last)
    self._components.insert(pipe_index, (name, component))
    return component
def _get_pipe_index( def _get_pipe_index(
self, self,
before: Optional[Union[str, int]] = None, before: Optional[Union[str, int]] = None,
@ -1735,6 +1790,7 @@ class Language:
meta: Dict[str, Any] = SimpleFrozenDict(), meta: Dict[str, Any] = SimpleFrozenDict(),
auto_fill: bool = True, auto_fill: bool = True,
validate: bool = True, validate: bool = True,
pipe_instances: Dict[str, Any] = SimpleFrozenDict(),
) -> "Language": ) -> "Language":
"""Create the nlp object from a loaded config. Will set up the tokenizer """Create the nlp object from a loaded config. Will set up the tokenizer
and language data, add pipeline components etc. If no config is provided, and language data, add pipeline components etc. If no config is provided,
@ -1810,6 +1866,11 @@ class Language:
# Warn about require_gpu usage in jupyter notebook # Warn about require_gpu usage in jupyter notebook
warn_if_jupyter_cupy() warn_if_jupyter_cupy()
# If we've been passed pipe instances, check whether
# they have a Vocab instance, and if they do, use
# that one. This also performs some additional checks and
# warns if there's a mismatch.
vocab = _get_instantiated_vocab(vocab, pipe_instances)
# Note that we don't load vectors here, instead they get loaded explicitly # Note that we don't load vectors here, instead they get loaded explicitly
# inside stuff like the spacy train function. If we loaded them here, # inside stuff like the spacy train function. If we loaded them here,
@ -1826,6 +1887,11 @@ class Language:
interpolated = filled.interpolate() if not filled.is_interpolated else filled interpolated = filled.interpolate() if not filled.is_interpolated else filled
pipeline = interpolated.get("components", {}) pipeline = interpolated.get("components", {})
sourced = util.get_sourced_components(interpolated) sourced = util.get_sourced_components(interpolated)
# Check for components that aren't in the pipe_instances dict, aren't disabled,
# and aren't built by factory.
missing_components = _find_missing_components(pipeline, pipe_instances, exclude)
if missing_components:
raise ValueError(Errors.E1055.format(names=", ".join(missing_components)))
# If components are loaded from a source (existing models), we cache # If components are loaded from a source (existing models), we cache
# them here so they're only loaded once # them here so they're only loaded once
source_nlps = {} source_nlps = {}
@ -1835,6 +1901,16 @@ class Language:
if pipe_name not in pipeline: if pipe_name not in pipeline:
opts = ", ".join(pipeline.keys()) opts = ", ".join(pipeline.keys())
raise ValueError(Errors.E956.format(name=pipe_name, opts=opts)) raise ValueError(Errors.E956.format(name=pipe_name, opts=opts))
if pipe_name in pipe_instances:
if pipe_name in exclude:
continue
else:
nlp.add_pipe_instance(pipe_instances[pipe_name])
# Is it important that we instantiate pipes that
# aren't excluded? It seems like we would want
# the exclude check above. I've left it how it
# is though, in case there's some sort of crazy
# load-bearing side-effects someone is relying on?
pipe_cfg = util.copy_config(pipeline[pipe_name]) pipe_cfg = util.copy_config(pipeline[pipe_name])
raw_config = Config(filled["components"][pipe_name]) raw_config = Config(filled["components"][pipe_name])
if pipe_name not in exclude: if pipe_name not in exclude:
@ -2337,3 +2413,46 @@ class _Sender:
if self.count >= self.chunk_size: if self.count >= self.chunk_size:
self.count = 0 self.count = 0
self.send() self.send()
def _get_instantiated_vocab(
    vocab: Union[bool, Vocab], pipe_instances: Dict[str, Any]
) -> Union[bool, Vocab]:
    """Choose the Vocab to construct the pipeline with, preferring an explicit
    caller-supplied Vocab and otherwise adopting the first Vocab found on a
    pipe instance. Emits W126 for each instance whose Vocab mismatches the
    chosen one, since mixed vocabs are usually an error.
    """
    # Collect the Vocab of every instance that actually carries one.
    instance_vocabs = {
        name: instance.vocab
        for name, instance in pipe_instances.items()
        if hasattr(instance, "vocab") and isinstance(instance.vocab, Vocab)
    }
    if not instance_vocabs:
        return vocab
    if isinstance(vocab, Vocab):
        # The caller's Vocab wins; flag any instance that disagrees with it.
        for name, pipe_vocab in instance_vocabs.items():
            if pipe_vocab is not vocab:
                warnings.warn(Warnings.W126.format(name=name))
        return vocab
    # No caller Vocab: adopt the first instance Vocab, warn about the rest.
    chosen: Optional[Vocab] = None
    for name, pipe_vocab in instance_vocabs.items():
        if chosen is None:
            chosen = pipe_vocab
        elif pipe_vocab is not chosen:
            warnings.warn(Warnings.W126.format(name=name))
    # Only here when instance_vocabs is non-empty, so this always holds;
    # the assert is for the type checker.
    assert chosen is not None
    return chosen
def _find_missing_components(
    pipeline: Dict[str, Dict[str, Any]],
    pipe_instances: Dict[str, Any],
    exclude: Iterable[str],
) -> List[str]:
    """Return the names of pipeline components that were added by instance
    (their config carries the INSTANCE_FACTORY_NAME sentinel) but are neither
    supplied in 'pipe_instances' nor excluded — i.e. components the config
    alone cannot reconstruct.
    """
    return [
        name
        for name, component_cfg in pipeline.items()
        if component_cfg.get("factory") == INSTANCE_FACTORY_NAME
        and name not in pipe_instances
        and name not in exclude
    ]

View File

@ -800,3 +800,40 @@ def test_component_return():
nlp.add_pipe("test_component_bad_pipe") nlp.add_pipe("test_component_bad_pipe")
with pytest.raises(ValueError, match="instead of a Doc"): with pytest.raises(ValueError, match="instead of a Doc"):
nlp("text") nlp("text")
@pytest.mark.parametrize(
    "components,kwargs,position",
    [
        (["t1", "t2"], {"before": "t1"}, 0),
        # NOTE: the duplicate ({"after": "t1"}, 1) row was removed — it ran
        # the identical case twice.
        (["t1", "t2"], {"after": "t1"}, 1),
        (["t1", "t2"], {"first": True}, 0),
        (["t1", "t2"], {"last": True}, 2),
        (["t1", "t2"], {"last": False}, 2),
        (["t1", "t2"], {"first": False}, ValueError),
    ],
)
def test_add_pipe_instance(components, kwargs, position):
    """Check that add_pipe_instance inserts the instance at the expected
    pipeline position, returns it, and raises ValueError for invalid
    placement arguments.
    """
    nlp = Language()
    for name in components:
        nlp.add_pipe("textcat", name=name)
    pipe_names = list(nlp.pipe_names)
    if isinstance(position, int):
        result = nlp.add_pipe_instance(evil_component, name="new_component", **kwargs)
        assert result is evil_component
        pipe_names.insert(position, "new_component")
        assert nlp.pipe_names == pipe_names
    else:
        with pytest.raises(ValueError):
            # No need to bind the return value: the call must raise.
            nlp.add_pipe_instance(evil_component, name="new_component", **kwargs)
def test_add_pipe_instance_to_bytes():
    """Serializing a pipeline that contains an instance-added component must
    not crash and must yield a non-empty bytes payload. Previously this test
    asserted nothing after the to_bytes() call.
    """
    nlp = Language()
    nlp.add_pipe("textcat", name="t1")
    nlp.add_pipe("textcat", name="t2")
    nlp.add_pipe_instance(evil_component, name="new_component")
    b = nlp.to_bytes()
    assert isinstance(b, bytes) and b

View File

@ -438,6 +438,7 @@ def load_model(
enable: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES, enable: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES,
exclude: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES, exclude: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES,
config: Union[Dict[str, Any], Config] = SimpleFrozenDict(), config: Union[Dict[str, Any], Config] = SimpleFrozenDict(),
pipe_instances: Dict[str, Any] = SimpleFrozenDict(),
) -> "Language": ) -> "Language":
"""Load a model from a package or data path. """Load a model from a package or data path.
@ -449,6 +450,9 @@ def load_model(
exclude (Union[str, Iterable[str]]): Name(s) of pipeline component(s) to exclude. exclude (Union[str, Iterable[str]]): Name(s) of pipeline component(s) to exclude.
config (Dict[str, Any] / Config): Config overrides as nested dict or dict config (Dict[str, Any] / Config): Config overrides as nested dict or dict
keyed by section values in dot notation. keyed by section values in dot notation.
pipe_instances (Dict[str, Any]): Dictionary of components
to be added to the pipeline directly (not created from
config)
RETURNS (Language): The loaded nlp object. RETURNS (Language): The loaded nlp object.
""" """
kwargs = { kwargs = {
@ -457,6 +461,7 @@ def load_model(
"enable": enable, "enable": enable,
"exclude": exclude, "exclude": exclude,
"config": config, "config": config,
"pipe_instances": pipe_instances,
} }
if isinstance(name, str): # name or string path if isinstance(name, str): # name or string path
if name.startswith("blank:"): # shortcut for blank model if name.startswith("blank:"): # shortcut for blank model
@ -480,6 +485,7 @@ def load_model_from_package(
enable: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES, enable: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES,
exclude: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES, exclude: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES,
config: Union[Dict[str, Any], Config] = SimpleFrozenDict(), config: Union[Dict[str, Any], Config] = SimpleFrozenDict(),
pipe_instances: Dict[str, Any] = SimpleFrozenDict(),
) -> "Language": ) -> "Language":
"""Load a model from an installed package. """Load a model from an installed package.
@ -495,10 +501,13 @@ def load_model_from_package(
components won't be loaded. components won't be loaded.
config (Dict[str, Any] / Config): Config overrides as nested dict or dict config (Dict[str, Any] / Config): Config overrides as nested dict or dict
keyed by section values in dot notation. keyed by section values in dot notation.
pipe_instances (Dict[str, Any]): Dictionary of components
to be added to the pipeline directly (not created from
config)
RETURNS (Language): The loaded nlp object. RETURNS (Language): The loaded nlp object.
""" """
cls = importlib.import_module(name) cls = importlib.import_module(name)
return cls.load(vocab=vocab, disable=disable, enable=enable, exclude=exclude, config=config) # type: ignore[attr-defined] return cls.load(vocab=vocab, disable=disable, enable=enable, exclude=exclude, config=config, pipe_instances=pipe_instances) # type: ignore[attr-defined]
def load_model_from_path( def load_model_from_path(
@ -510,6 +519,7 @@ def load_model_from_path(
enable: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES, enable: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES,
exclude: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES, exclude: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES,
config: Union[Dict[str, Any], Config] = SimpleFrozenDict(), config: Union[Dict[str, Any], Config] = SimpleFrozenDict(),
pipe_instances: Dict[str, Any] = SimpleFrozenDict(),
) -> "Language": ) -> "Language":
"""Load a model from a data directory path. Creates Language class with """Load a model from a data directory path. Creates Language class with
pipeline from config.cfg and then calls from_disk() with path. pipeline from config.cfg and then calls from_disk() with path.
@ -527,6 +537,9 @@ def load_model_from_path(
components won't be loaded. components won't be loaded.
config (Dict[str, Any] / Config): Config overrides as nested dict or dict config (Dict[str, Any] / Config): Config overrides as nested dict or dict
keyed by section values in dot notation. keyed by section values in dot notation.
pipe_instances (Dict[str, Any]): Dictionary of components
to be added to the pipeline directly (not created from
config)
RETURNS (Language): The loaded nlp object. RETURNS (Language): The loaded nlp object.
""" """
if not model_path.exists(): if not model_path.exists():
@ -543,6 +556,7 @@ def load_model_from_path(
enable=enable, enable=enable,
exclude=exclude, exclude=exclude,
meta=meta, meta=meta,
pipe_instances=pipe_instances,
) )
return nlp.from_disk(model_path, exclude=exclude, overrides=overrides) return nlp.from_disk(model_path, exclude=exclude, overrides=overrides)
@ -557,6 +571,7 @@ def load_model_from_config(
exclude: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES, exclude: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES,
auto_fill: bool = False, auto_fill: bool = False,
validate: bool = True, validate: bool = True,
pipe_instances: Dict[str, Any] = SimpleFrozenDict(),
) -> "Language": ) -> "Language":
"""Create an nlp object from a config. Expects the full config file including """Create an nlp object from a config. Expects the full config file including
a section "nlp" containing the settings for the nlp object. a section "nlp" containing the settings for the nlp object.
@ -574,6 +589,9 @@ def load_model_from_config(
components won't be loaded. components won't be loaded.
auto_fill (bool): Whether to auto-fill config with missing defaults. auto_fill (bool): Whether to auto-fill config with missing defaults.
validate (bool): Whether to show config validation errors. validate (bool): Whether to show config validation errors.
pipe_instances (Dict[str, Any]): Dictionary of components
to be added to the pipeline directly (not created from
config)
RETURNS (Language): The loaded nlp object. RETURNS (Language): The loaded nlp object.
""" """
if "nlp" not in config: if "nlp" not in config:
@ -593,6 +611,7 @@ def load_model_from_config(
auto_fill=auto_fill, auto_fill=auto_fill,
validate=validate, validate=validate,
meta=meta, meta=meta,
pipe_instances=pipe_instances,
) )
return nlp return nlp
@ -656,6 +675,7 @@ def load_model_from_init_py(
enable: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES, enable: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES,
exclude: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES, exclude: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES,
config: Union[Dict[str, Any], Config] = SimpleFrozenDict(), config: Union[Dict[str, Any], Config] = SimpleFrozenDict(),
pipe_instances: Dict[str, Any] = SimpleFrozenDict(),
) -> "Language": ) -> "Language":
"""Helper function to use in the `load()` method of a model package's """Helper function to use in the `load()` method of a model package's
__init__.py. __init__.py.
@ -671,6 +691,9 @@ def load_model_from_init_py(
components won't be loaded. components won't be loaded.
config (Dict[str, Any] / Config): Config overrides as nested dict or dict config (Dict[str, Any] / Config): Config overrides as nested dict or dict
keyed by section values in dot notation. keyed by section values in dot notation.
pipe_instances (Dict[str, Any]): Dictionary of components
to be added to the pipeline directly (not created from
config)
RETURNS (Language): The loaded nlp object. RETURNS (Language): The loaded nlp object.
""" """
model_path = Path(init_file).parent model_path = Path(init_file).parent