Merge pull request #5993 from explosion/feature/disabled-components

This commit is contained in:
Ines Montani 2020-08-29 15:58:41 +02:00 committed by GitHub
commit 45f46a5c85
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
22 changed files with 667 additions and 224 deletions

View File

@ -27,18 +27,23 @@ if sys.maxunicode == 65535:
def load( def load(
name: Union[str, Path], name: Union[str, Path],
disable: Iterable[str] = tuple(), disable: Iterable[str] = util.SimpleFrozenList(),
exclude: Iterable[str] = util.SimpleFrozenList(),
config: Union[Dict[str, Any], Config] = util.SimpleFrozenDict(), config: Union[Dict[str, Any], Config] = util.SimpleFrozenDict(),
) -> Language: ) -> Language:
"""Load a spaCy model from an installed package or a local path. """Load a spaCy model from an installed package or a local path.
name (str): Package name or model path. name (str): Package name or model path.
disable (Iterable[str]): Names of pipeline components to disable. disable (Iterable[str]): Names of pipeline components to disable. Disabled
pipes will be loaded but they won't be run unless you explicitly
enable them by calling nlp.enable_pipe.
exclude (Iterable[str]): Names of pipeline components to exclude. Excluded
components won't be loaded.
config (Dict[str, Any] / Config): Config overrides as nested dict or dict config (Dict[str, Any] / Config): Config overrides as nested dict or dict
keyed by section values in dot notation. keyed by section values in dot notation.
RETURNS (Language): The loaded nlp object. RETURNS (Language): The loaded nlp object.
""" """
return util.load_model(name, disable=disable, config=config) return util.load_model(name, disable=disable, exclude=exclude, config=config)
def blank(name: str, **overrides) -> Language: def blank(name: str, **overrides) -> Language:

View File

@ -1,6 +1,6 @@
"""This module contains helpers and subcommands for integrating spaCy projects """This module contains helpers and subcommands for integrating spaCy projects
with Data Version Control (DVC). https://dvc.org""" with Data Version Control (DVC). https://dvc.org"""
from typing import Dict, Any, List, Optional from typing import Dict, Any, List, Optional, Iterable
import subprocess import subprocess
from pathlib import Path from pathlib import Path
from wasabi import msg from wasabi import msg
@ -8,6 +8,7 @@ from wasabi import msg
from .._util import PROJECT_FILE, load_project_config, get_hash, project_cli from .._util import PROJECT_FILE, load_project_config, get_hash, project_cli
from .._util import Arg, Opt, NAME, COMMAND from .._util import Arg, Opt, NAME, COMMAND
from ...util import working_dir, split_command, join_command, run_command from ...util import working_dir, split_command, join_command, run_command
from ...util import SimpleFrozenList
DVC_CONFIG = "dvc.yaml" DVC_CONFIG = "dvc.yaml"
@ -130,7 +131,7 @@ def update_dvc_config(
def run_dvc_commands( def run_dvc_commands(
commands: List[str] = tuple(), flags: Dict[str, bool] = {}, commands: Iterable[str] = SimpleFrozenList(), flags: Dict[str, bool] = {},
) -> None: ) -> None:
"""Run a sequence of DVC commands in a subprocess, in order. """Run a sequence of DVC commands in a subprocess, in order.

View File

@ -1,10 +1,11 @@
from typing import Optional, List, Dict, Sequence, Any from typing import Optional, List, Dict, Sequence, Any, Iterable
from pathlib import Path from pathlib import Path
from wasabi import msg from wasabi import msg
import sys import sys
import srsly import srsly
from ...util import working_dir, run_command, split_command, is_cwd, join_command from ...util import working_dir, run_command, split_command, is_cwd, join_command
from ...util import SimpleFrozenList
from .._util import PROJECT_FILE, PROJECT_LOCK, load_project_config, get_hash from .._util import PROJECT_FILE, PROJECT_LOCK, load_project_config, get_hash
from .._util import get_checksum, project_cli, Arg, Opt, COMMAND from .._util import get_checksum, project_cli, Arg, Opt, COMMAND
@ -115,7 +116,9 @@ def print_run_help(project_dir: Path, subcommand: Optional[str] = None) -> None:
def run_commands( def run_commands(
commands: List[str] = tuple(), silent: bool = False, dry: bool = False, commands: Iterable[str] = SimpleFrozenList(),
silent: bool = False,
dry: bool = False,
) -> None: ) -> None:
"""Run a sequence of commands in a subprocess, in order. """Run a sequence of commands in a subprocess, in order.

View File

@ -11,6 +11,7 @@ use_pytorch_for_gpu_memory = false
[nlp] [nlp]
lang = null lang = null
pipeline = [] pipeline = []
disabled = []
load_vocab_data = true load_vocab_data = true
before_creation = null before_creation = null
after_creation = null after_creation = null

View File

@ -137,11 +137,10 @@ class Errors:
"after (component name or index), first (True) or last (True). " "after (component name or index), first (True) or last (True). "
"Invalid configuration: {args}. Existing components: {opts}") "Invalid configuration: {args}. Existing components: {opts}")
E007 = ("'{name}' already exists in pipeline. Existing names: {opts}") E007 = ("'{name}' already exists in pipeline. Existing names: {opts}")
E008 = ("Some current components would be lost when restoring previous " E008 = ("Can't restore disabled pipeline component '{name}' because it "
"pipeline state. If you added components after calling " "doesn't exist in the pipeline anymore. If you want to remove "
"`nlp.select_pipes()`, you should remove them explicitly with " "components from the pipeline, you should do it before calling "
"`nlp.remove_pipe()` before the pipeline is restored. Names of " "`nlp.select_pipes()` or after restoring the disabled components.")
"the new components: {names}")
E010 = ("Word vectors set to length 0. This may be because you don't have " E010 = ("Word vectors set to length 0. This may be because you don't have "
"a model installed or loaded, or because your model doesn't " "a model installed or loaded, or because your model doesn't "
"include word vectors. For more info, see the docs:\n" "include word vectors. For more info, see the docs:\n"
@ -474,6 +473,13 @@ class Errors:
E199 = ("Unable to merge 0-length span at doc[{start}:{end}].") E199 = ("Unable to merge 0-length span at doc[{start}:{end}].")
# TODO: fix numbering after merging develop into master # TODO: fix numbering after merging develop into master
# Message attached to the frozen lists returned by the computed properties
# on the nlp object; {attr} is filled in with the property name.
E926 = ("It looks like you're trying to modify nlp.{attr} directly. This "
        "doesn't work because it's an immutable computed property. If you "
        "need to modify the pipeline, use the built-in methods like "
        "nlp.add_pipe, nlp.remove_pipe, nlp.disable_pipe or nlp.enable_pipe "
        "instead.")
# Message for writes to a frozen list (presumably the fallback used when no
# more specific message is supplied -- confirm against SimpleFrozenList).
E927 = ("Can't write to frozen list. Maybe you're trying to modify a computed "
        "property or default function argument?")
E928 = ("A 'KnowledgeBase' should be written to / read from a file, but the " E928 = ("A 'KnowledgeBase' should be written to / read from a file, but the "
"provided argument {loc} is an existing directory.") "provided argument {loc} is an existing directory.")
E929 = ("A 'KnowledgeBase' could not be read from {loc} - the path does " E929 = ("A 'KnowledgeBase' could not be read from {loc} - the path does "

View File

@ -6,7 +6,7 @@ import itertools
import weakref import weakref
import functools import functools
from contextlib import contextmanager from contextlib import contextmanager
from copy import copy, deepcopy from copy import deepcopy
from pathlib import Path from pathlib import Path
import warnings import warnings
from thinc.api import get_current_ops, Config, require_gpu, Optimizer from thinc.api import get_current_ops, Config, require_gpu, Optimizer
@ -20,7 +20,7 @@ from .vocab import Vocab, create_vocab
from .pipe_analysis import validate_attrs, analyze_pipes, print_pipe_analysis from .pipe_analysis import validate_attrs, analyze_pipes, print_pipe_analysis
from .gold import Example, validate_examples from .gold import Example, validate_examples
from .scorer import Scorer from .scorer import Scorer
from .util import create_default_optimizer, registry from .util import create_default_optimizer, registry, SimpleFrozenList
from .util import SimpleFrozenDict, combine_score_weights, CONFIG_SECTION_ORDER from .util import SimpleFrozenDict, combine_score_weights, CONFIG_SECTION_ORDER
from .lang.tokenizer_exceptions import URL_MATCH, BASE_EXCEPTIONS from .lang.tokenizer_exceptions import URL_MATCH, BASE_EXCEPTIONS
from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
@ -159,7 +159,8 @@ class Language:
self.vocab: Vocab = vocab self.vocab: Vocab = vocab
if self.lang is None: if self.lang is None:
self.lang = self.vocab.lang self.lang = self.vocab.lang
self.pipeline = [] self._components = []
self._disabled = set()
self.max_length = max_length self.max_length = max_length
self.resolved = {} self.resolved = {}
# Create the default tokenizer from the default config # Create the default tokenizer from the default config
@ -206,10 +207,11 @@ class Language:
"keys": self.vocab.vectors.n_keys, "keys": self.vocab.vectors.n_keys,
"name": self.vocab.vectors.name, "name": self.vocab.vectors.name,
} }
self._meta["labels"] = self.pipe_labels self._meta["labels"] = dict(self.pipe_labels)
# TODO: Adding this back to prevent breaking people's code etc., but # TODO: Adding this back to prevent breaking people's code etc., but
# we should consider removing it # we should consider removing it
self._meta["pipeline"] = self.pipe_names self._meta["pipeline"] = list(self.pipe_names)
self._meta["disabled"] = list(self.disabled)
return self._meta return self._meta
@meta.setter @meta.setter
@ -232,13 +234,14 @@ class Language:
# we can populate the config again later # we can populate the config again later
pipeline = {} pipeline = {}
score_weights = [] score_weights = []
for pipe_name in self.pipe_names: for pipe_name in self.component_names:
pipe_meta = self.get_pipe_meta(pipe_name) pipe_meta = self.get_pipe_meta(pipe_name)
pipe_config = self.get_pipe_config(pipe_name) pipe_config = self.get_pipe_config(pipe_name)
pipeline[pipe_name] = {"factory": pipe_meta.factory, **pipe_config} pipeline[pipe_name] = {"factory": pipe_meta.factory, **pipe_config}
if pipe_meta.default_score_weights: if pipe_meta.default_score_weights:
score_weights.append(pipe_meta.default_score_weights) score_weights.append(pipe_meta.default_score_weights)
self._config["nlp"]["pipeline"] = self.pipe_names self._config["nlp"]["pipeline"] = list(self.component_names)
self._config["nlp"]["disabled"] = list(self.disabled)
self._config["components"] = pipeline self._config["components"] = pipeline
self._config["training"]["score_weights"] = combine_score_weights(score_weights) self._config["training"]["score_weights"] = combine_score_weights(score_weights)
if not srsly.is_json_serializable(self._config): if not srsly.is_json_serializable(self._config):
@ -249,21 +252,64 @@ class Language:
def config(self, value: Config) -> None: def config(self, value: Config) -> None:
self._config = value self._config = value
@property
def disabled(self) -> List[str]:
    """Get the names of all disabled components.

    RETURNS (List[str]): The disabled components, in pipeline order.
    """
    # The disabled names live in an unordered set, so walk the component
    # list to report them in the order they appear in the pipeline.
    ordered = [
        pipe_name
        for pipe_name, _ in self._components
        if pipe_name in self._disabled
    ]
    message = Errors.E926.format(attr="disabled")
    return SimpleFrozenList(ordered, error=message)
@property @property
def factory_names(self) -> List[str]: def factory_names(self) -> List[str]:
"""Get names of all available factories. """Get names of all available factories.
RETURNS (List[str]): The factory names. RETURNS (List[str]): The factory names.
""" """
return list(self.factories.keys()) names = list(self.factories.keys())
return SimpleFrozenList(names)
@property @property
def pipe_names(self) -> List[str]: def components(self) -> List[Tuple[str, Callable[[Doc], Doc]]]:
"""Get names of available pipeline components. """Get all (name, component) tuples in the pipeline, including the
currently disabled components.
"""
return SimpleFrozenList(
self._components, error=Errors.E926.format(attr="components")
)
@property
def component_names(self) -> List[str]:
"""Get the names of the available pipeline components. Includes all
active and inactive pipeline components.
RETURNS (List[str]): List of component name strings, in order. RETURNS (List[str]): List of component name strings, in order.
""" """
return [pipe_name for pipe_name, _ in self.pipeline] names = [pipe_name for pipe_name, _ in self._components]
return SimpleFrozenList(names, error=Errors.E926.format(attr="component_names"))
@property
def pipeline(self) -> List[Tuple[str, Callable[[Doc], Doc]]]:
    """The processing pipeline consisting of (name, component) tuples. The
    components are called on the Doc in order as it passes through the
    pipeline. Components whose name is in the disabled set are left out.

    RETURNS (List[Tuple[str, Callable[[Doc], Doc]]]): The pipeline.
    """
    active = []
    for name, proc in self._components:
        if name in self._disabled:
            # Disabled components stay on the object but are not exposed here
            continue
        active.append((name, proc))
    return SimpleFrozenList(active, error=Errors.E926.format(attr="pipeline"))
@property
def pipe_names(self) -> List[str]:
    """Get names of available active pipeline components.

    RETURNS (List[str]): List of component name strings, in order.
    """
    # Derived from self.pipeline, so disabled components are not included
    active_names = [name for name, _component in self.pipeline]
    message = Errors.E926.format(attr="pipe_names")
    return SimpleFrozenList(active_names, error=message)
@property @property
def pipe_factories(self) -> Dict[str, str]: def pipe_factories(self) -> Dict[str, str]:
@ -272,9 +318,9 @@ class Language:
RETURNS (Dict[str, str]): Factory names, keyed by component names. RETURNS (Dict[str, str]): Factory names, keyed by component names.
""" """
factories = {} factories = {}
for pipe_name, pipe in self.pipeline: for pipe_name, pipe in self._components:
factories[pipe_name] = self.get_pipe_meta(pipe_name).factory factories[pipe_name] = self.get_pipe_meta(pipe_name).factory
return factories return SimpleFrozenDict(factories)
@property @property
def pipe_labels(self) -> Dict[str, List[str]]: def pipe_labels(self) -> Dict[str, List[str]]:
@ -284,10 +330,10 @@ class Language:
RETURNS (Dict[str, List[str]]): Labels keyed by component name. RETURNS (Dict[str, List[str]]): Labels keyed by component name.
""" """
labels = {} labels = {}
for name, pipe in self.pipeline: for name, pipe in self._components:
if hasattr(pipe, "labels"): if hasattr(pipe, "labels"):
labels[name] = list(pipe.labels) labels[name] = list(pipe.labels)
return labels return SimpleFrozenDict(labels)
@classmethod @classmethod
def has_factory(cls, name: str) -> bool: def has_factory(cls, name: str) -> bool:
@ -358,10 +404,10 @@ class Language:
name: str, name: str,
*, *,
default_config: Dict[str, Any] = SimpleFrozenDict(), default_config: Dict[str, Any] = SimpleFrozenDict(),
assigns: Iterable[str] = tuple(), assigns: Iterable[str] = SimpleFrozenList(),
requires: Iterable[str] = tuple(), requires: Iterable[str] = SimpleFrozenList(),
retokenizes: bool = False, retokenizes: bool = False,
scores: Iterable[str] = tuple(), scores: Iterable[str] = SimpleFrozenList(),
default_score_weights: Dict[str, float] = SimpleFrozenDict(), default_score_weights: Dict[str, float] = SimpleFrozenDict(),
func: Optional[Callable] = None, func: Optional[Callable] = None,
) -> Callable: ) -> Callable:
@ -447,8 +493,8 @@ class Language:
cls, cls,
name: Optional[str] = None, name: Optional[str] = None,
*, *,
assigns: Iterable[str] = tuple(), assigns: Iterable[str] = SimpleFrozenList(),
requires: Iterable[str] = tuple(), requires: Iterable[str] = SimpleFrozenList(),
retokenizes: bool = False, retokenizes: bool = False,
func: Optional[Callable[[Doc], Doc]] = None, func: Optional[Callable[[Doc], Doc]] = None,
) -> Callable: ) -> Callable:
@ -535,10 +581,10 @@ class Language:
DOCS: https://spacy.io/api/language#get_pipe DOCS: https://spacy.io/api/language#get_pipe
""" """
for pipe_name, component in self.pipeline: for pipe_name, component in self._components:
if pipe_name == name: if pipe_name == name:
return component return component
raise KeyError(Errors.E001.format(name=name, opts=self.pipe_names)) raise KeyError(Errors.E001.format(name=name, opts=self.component_names))
def create_pipe( def create_pipe(
self, self,
@ -683,8 +729,8 @@ class Language:
err = Errors.E966.format(component=bad_val, name=name) err = Errors.E966.format(component=bad_val, name=name)
raise ValueError(err) raise ValueError(err)
name = name if name is not None else factory_name name = name if name is not None else factory_name
if name in self.pipe_names: if name in self.component_names:
raise ValueError(Errors.E007.format(name=name, opts=self.pipe_names)) raise ValueError(Errors.E007.format(name=name, opts=self.component_names))
if source is not None: if source is not None:
# We're loading the component from a model. After loading the # We're loading the component from a model. After loading the
# component, we know its real factory name # component, we know its real factory name
@ -709,7 +755,7 @@ class Language:
) )
pipe_index = self._get_pipe_index(before, after, first, last) pipe_index = self._get_pipe_index(before, after, first, last)
self._pipe_meta[name] = self.get_factory_meta(factory_name) self._pipe_meta[name] = self.get_factory_meta(factory_name)
self.pipeline.insert(pipe_index, (name, pipe_component)) self._components.insert(pipe_index, (name, pipe_component))
return pipe_component return pipe_component
def _get_pipe_index( def _get_pipe_index(
@ -730,32 +776,42 @@ class Language:
""" """
all_args = {"before": before, "after": after, "first": first, "last": last} all_args = {"before": before, "after": after, "first": first, "last": last}
if sum(arg is not None for arg in [before, after, first, last]) >= 2: if sum(arg is not None for arg in [before, after, first, last]) >= 2:
raise ValueError(Errors.E006.format(args=all_args, opts=self.pipe_names)) raise ValueError(
Errors.E006.format(args=all_args, opts=self.component_names)
)
if last or not any(value is not None for value in [first, before, after]): if last or not any(value is not None for value in [first, before, after]):
return len(self.pipeline) return len(self._components)
elif first: elif first:
return 0 return 0
elif isinstance(before, str): elif isinstance(before, str):
if before not in self.pipe_names: if before not in self.component_names:
raise ValueError(Errors.E001.format(name=before, opts=self.pipe_names)) raise ValueError(
return self.pipe_names.index(before) Errors.E001.format(name=before, opts=self.component_names)
)
return self.component_names.index(before)
elif isinstance(after, str): elif isinstance(after, str):
if after not in self.pipe_names: if after not in self.component_names:
raise ValueError(Errors.E001.format(name=after, opts=self.pipe_names)) raise ValueError(
return self.pipe_names.index(after) + 1 Errors.E001.format(name=after, opts=self.component_names)
)
return self.component_names.index(after) + 1
# We're only accepting indices referring to components that exist # We're only accepting indices referring to components that exist
# (can't just do isinstance here because bools are instance of int, too) # (can't just do isinstance here because bools are instance of int, too)
elif type(before) == int: elif type(before) == int:
if before >= len(self.pipeline) or before < 0: if before >= len(self._components) or before < 0:
err = Errors.E959.format(dir="before", idx=before, opts=self.pipe_names) err = Errors.E959.format(
dir="before", idx=before, opts=self.component_names
)
raise ValueError(err) raise ValueError(err)
return before return before
elif type(after) == int: elif type(after) == int:
if after >= len(self.pipeline) or after < 0: if after >= len(self._components) or after < 0:
err = Errors.E959.format(dir="after", idx=after, opts=self.pipe_names) err = Errors.E959.format(
dir="after", idx=after, opts=self.component_names
)
raise ValueError(err) raise ValueError(err)
return after + 1 return after + 1
raise ValueError(Errors.E006.format(args=all_args, opts=self.pipe_names)) raise ValueError(Errors.E006.format(args=all_args, opts=self.component_names))
def has_pipe(self, name: str) -> bool: def has_pipe(self, name: str) -> bool:
"""Check if a component name is present in the pipeline. Equivalent to """Check if a component name is present in the pipeline. Equivalent to
@ -796,7 +852,7 @@ class Language:
# to Language.pipeline to make sure the configs are handled correctly # to Language.pipeline to make sure the configs are handled correctly
pipe_index = self.pipe_names.index(name) pipe_index = self.pipe_names.index(name)
self.remove_pipe(name) self.remove_pipe(name)
if not len(self.pipeline) or pipe_index == len(self.pipeline): if not len(self._components) or pipe_index == len(self._components):
# we have no components to insert before/after, or we're replacing the last component # we have no components to insert before/after, or we're replacing the last component
self.add_pipe(factory_name, name=name, config=config, validate=validate) self.add_pipe(factory_name, name=name, config=config, validate=validate)
else: else:
@ -816,12 +872,16 @@ class Language:
DOCS: https://spacy.io/api/language#rename_pipe DOCS: https://spacy.io/api/language#rename_pipe
""" """
if old_name not in self.pipe_names: if old_name not in self.component_names:
raise ValueError(Errors.E001.format(name=old_name, opts=self.pipe_names)) raise ValueError(
if new_name in self.pipe_names: Errors.E001.format(name=old_name, opts=self.component_names)
raise ValueError(Errors.E007.format(name=new_name, opts=self.pipe_names)) )
i = self.pipe_names.index(old_name) if new_name in self.component_names:
self.pipeline[i] = (new_name, self.pipeline[i][1]) raise ValueError(
Errors.E007.format(name=new_name, opts=self.component_names)
)
i = self.component_names.index(old_name)
self._components[i] = (new_name, self._components[i][1])
self._pipe_meta[new_name] = self._pipe_meta.pop(old_name) self._pipe_meta[new_name] = self._pipe_meta.pop(old_name)
self._pipe_configs[new_name] = self._pipe_configs.pop(old_name) self._pipe_configs[new_name] = self._pipe_configs.pop(old_name)
@ -833,20 +893,45 @@ class Language:
DOCS: https://spacy.io/api/language#remove_pipe DOCS: https://spacy.io/api/language#remove_pipe
""" """
if name not in self.pipe_names: if name not in self.component_names:
raise ValueError(Errors.E001.format(name=name, opts=self.pipe_names)) raise ValueError(Errors.E001.format(name=name, opts=self.component_names))
removed = self.pipeline.pop(self.pipe_names.index(name)) removed = self._components.pop(self.component_names.index(name))
# We're only removing the component itself from the metas/configs here # We're only removing the component itself from the metas/configs here
# because factory may be used for something else # because factory may be used for something else
self._pipe_meta.pop(name) self._pipe_meta.pop(name)
self._pipe_configs.pop(name) self._pipe_configs.pop(name)
# Make sure the name is also removed from the set of disabled components
if name in self.disabled:
self._disabled.remove(name)
return removed return removed
def disable_pipe(self, name: str) -> None:
    """Disable a pipeline component. The component will still exist on
    the nlp object, but it won't be run as part of the pipeline. Does
    nothing if the component is already disabled. Raises a ValueError if
    no component with that name exists.

    name (str): The name of the component to disable.
    """
    if name in self.component_names:
        # Adding to a set is a no-op when the name is already present
        self._disabled.add(name)
        return
    raise ValueError(Errors.E001.format(name=name, opts=self.component_names))
def enable_pipe(self, name: str) -> None:
    """Enable a previously disabled pipeline component so it's run as part
    of the pipeline. Does nothing if the component is already enabled.
    Raises a ValueError if no component with that name exists.

    name (str): The name of the component to enable.
    """
    if name not in self.component_names:
        raise ValueError(Errors.E001.format(name=name, opts=self.component_names))
    # discard() is a no-op for names that aren't disabled, so there is no
    # need to build the ordered list of disabled names just to test membership.
    self._disabled.discard(name)
def __call__( def __call__(
self, self,
text: str, text: str,
*, *,
disable: Iterable[str] = tuple(), disable: Iterable[str] = SimpleFrozenList(),
component_cfg: Optional[Dict[str, Dict[str, Any]]] = None, component_cfg: Optional[Dict[str, Dict[str, Any]]] = None,
) -> Doc: ) -> Doc:
"""Apply the pipeline to some text. The text can span multiple sentences, """Apply the pipeline to some text. The text can span multiple sentences,
@ -892,7 +977,7 @@ class Language:
warnings.warn(Warnings.W096, DeprecationWarning) warnings.warn(Warnings.W096, DeprecationWarning)
if len(names) == 1 and isinstance(names[0], (list, tuple)): if len(names) == 1 and isinstance(names[0], (list, tuple)):
names = names[0] # support list of names instead of spread names = names[0] # support list of names instead of spread
return DisabledPipes(self, names) return self.select_pipes(disable=names)
def select_pipes( def select_pipes(
self, self,
@ -945,7 +1030,7 @@ class Language:
sgd: Optional[Optimizer] = None, sgd: Optional[Optimizer] = None,
losses: Optional[Dict[str, float]] = None, losses: Optional[Dict[str, float]] = None,
component_cfg: Optional[Dict[str, Dict[str, Any]]] = None, component_cfg: Optional[Dict[str, Dict[str, Any]]] = None,
exclude: Iterable[str] = tuple(), exclude: Iterable[str] = SimpleFrozenList(),
): ):
"""Update the models in the pipeline. """Update the models in the pipeline.
@ -999,7 +1084,7 @@ class Language:
sgd: Optional[Optimizer] = None, sgd: Optional[Optimizer] = None,
losses: Optional[Dict[str, float]] = None, losses: Optional[Dict[str, float]] = None,
component_cfg: Optional[Dict[str, Dict[str, Any]]] = None, component_cfg: Optional[Dict[str, Dict[str, Any]]] = None,
exclude: Iterable[str] = tuple(), exclude: Iterable[str] = SimpleFrozenList(),
) -> Dict[str, float]: ) -> Dict[str, float]:
"""Make a "rehearsal" update to the models in the pipeline, to prevent """Make a "rehearsal" update to the models in the pipeline, to prevent
forgetting. Rehearsal updates run an initial copy of the model over some forgetting. Rehearsal updates run an initial copy of the model over some
@ -1228,7 +1313,7 @@ class Language:
*, *,
as_tuples: bool = False, as_tuples: bool = False,
batch_size: int = 1000, batch_size: int = 1000,
disable: Iterable[str] = tuple(), disable: Iterable[str] = SimpleFrozenList(),
cleanup: bool = False, cleanup: bool = False,
component_cfg: Optional[Dict[str, Dict[str, Any]]] = None, component_cfg: Optional[Dict[str, Dict[str, Any]]] = None,
n_process: int = 1, n_process: int = 1,
@ -1388,7 +1473,8 @@ class Language:
config: Union[Dict[str, Any], Config] = {}, config: Union[Dict[str, Any], Config] = {},
*, *,
vocab: Union[Vocab, bool] = True, vocab: Union[Vocab, bool] = True,
disable: Iterable[str] = tuple(), disable: Iterable[str] = SimpleFrozenList(),
exclude: Iterable[str] = SimpleFrozenList(),
auto_fill: bool = True, auto_fill: bool = True,
validate: bool = True, validate: bool = True,
) -> "Language": ) -> "Language":
@ -1398,7 +1484,11 @@ class Language:
config (Dict[str, Any] / Config): The loaded config. config (Dict[str, Any] / Config): The loaded config.
vocab (Vocab): A Vocab object. If True, a vocab is created. vocab (Vocab): A Vocab object. If True, a vocab is created.
disable (Iterable[str]): List of pipeline component names to disable. disable (Iterable[str]): Names of pipeline components to disable.
Disabled pipes will be loaded but they won't be run unless you
explicitly enable them by calling nlp.enable_pipe.
exclude (Iterable[str]): Names of pipeline components to exclude.
Excluded components won't be loaded.
auto_fill (bool): Automatically fill in missing values in config based auto_fill (bool): Automatically fill in missing values in config based
on defaults and function argument annotations. on defaults and function argument annotations.
validate (bool): Validate the component config and arguments against validate (bool): Validate the component config and arguments against
@ -1471,7 +1561,7 @@ class Language:
raise ValueError(Errors.E956.format(name=pipe_name, opts=opts)) raise ValueError(Errors.E956.format(name=pipe_name, opts=opts))
pipe_cfg = util.copy_config(pipeline[pipe_name]) pipe_cfg = util.copy_config(pipeline[pipe_name])
raw_config = Config(filled["components"][pipe_name]) raw_config = Config(filled["components"][pipe_name])
if pipe_name not in disable: if pipe_name not in exclude:
if "factory" not in pipe_cfg and "source" not in pipe_cfg: if "factory" not in pipe_cfg and "source" not in pipe_cfg:
err = Errors.E984.format(name=pipe_name, config=pipe_cfg) err = Errors.E984.format(name=pipe_name, config=pipe_cfg)
raise ValueError(err) raise ValueError(err)
@ -1496,6 +1586,8 @@ class Language:
) )
source_name = pipe_cfg.get("component", pipe_name) source_name = pipe_cfg.get("component", pipe_name)
nlp.add_pipe(source_name, source=source_nlps[model], name=pipe_name) nlp.add_pipe(source_name, source=source_nlps[model], name=pipe_name)
disabled_pipes = [*config["nlp"]["disabled"], *disable]
nlp._disabled = set(p for p in disabled_pipes if p not in exclude)
nlp.config = filled if auto_fill else config nlp.config = filled if auto_fill else config
nlp.resolved = resolved nlp.resolved = resolved
if after_pipeline_creation is not None: if after_pipeline_creation is not None:
@ -1507,7 +1599,7 @@ class Language:
return nlp return nlp
def to_disk( def to_disk(
self, path: Union[str, Path], *, exclude: Iterable[str] = tuple() self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
) -> None: ) -> None:
"""Save the current state to a directory. If a model is loaded, this """Save the current state to a directory. If a model is loaded, this
will include the model. will include the model.
@ -1525,9 +1617,7 @@ class Language:
) )
serializers["meta.json"] = lambda p: srsly.write_json(p, self.meta) serializers["meta.json"] = lambda p: srsly.write_json(p, self.meta)
serializers["config.cfg"] = lambda p: self.config.to_disk(p) serializers["config.cfg"] = lambda p: self.config.to_disk(p)
for name, proc in self.pipeline: for name, proc in self._components:
if not hasattr(proc, "name"):
continue
if name in exclude: if name in exclude:
continue continue
if not hasattr(proc, "to_disk"): if not hasattr(proc, "to_disk"):
@ -1537,7 +1627,7 @@ class Language:
util.to_disk(path, serializers, exclude) util.to_disk(path, serializers, exclude)
def from_disk( def from_disk(
self, path: Union[str, Path], *, exclude: Iterable[str] = tuple() self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
) -> "Language": ) -> "Language":
"""Loads state from a directory. Modifies the object in place and """Loads state from a directory. Modifies the object in place and
returns it. If the saved `Language` object contains a model, the returns it. If the saved `Language` object contains a model, the
@ -1573,7 +1663,7 @@ class Language:
deserializers["tokenizer"] = lambda p: self.tokenizer.from_disk( deserializers["tokenizer"] = lambda p: self.tokenizer.from_disk(
p, exclude=["vocab"] p, exclude=["vocab"]
) )
for name, proc in self.pipeline: for name, proc in self._components:
if name in exclude: if name in exclude:
continue continue
if not hasattr(proc, "from_disk"): if not hasattr(proc, "from_disk"):
@ -1589,7 +1679,7 @@ class Language:
self._link_components() self._link_components()
return self return self
def to_bytes(self, *, exclude: Iterable[str] = tuple()) -> bytes: def to_bytes(self, *, exclude: Iterable[str] = SimpleFrozenList()) -> bytes:
"""Serialize the current state to a binary string. """Serialize the current state to a binary string.
exclude (list): Names of components or serialization fields to exclude. exclude (list): Names of components or serialization fields to exclude.
@ -1602,7 +1692,7 @@ class Language:
serializers["tokenizer"] = lambda: self.tokenizer.to_bytes(exclude=["vocab"]) serializers["tokenizer"] = lambda: self.tokenizer.to_bytes(exclude=["vocab"])
serializers["meta.json"] = lambda: srsly.json_dumps(self.meta) serializers["meta.json"] = lambda: srsly.json_dumps(self.meta)
serializers["config.cfg"] = lambda: self.config.to_bytes() serializers["config.cfg"] = lambda: self.config.to_bytes()
for name, proc in self.pipeline: for name, proc in self._components:
if name in exclude: if name in exclude:
continue continue
if not hasattr(proc, "to_bytes"): if not hasattr(proc, "to_bytes"):
@ -1611,7 +1701,7 @@ class Language:
return util.to_bytes(serializers, exclude) return util.to_bytes(serializers, exclude)
def from_bytes( def from_bytes(
self, bytes_data: bytes, *, exclude: Iterable[str] = tuple() self, bytes_data: bytes, *, exclude: Iterable[str] = SimpleFrozenList()
) -> "Language": ) -> "Language":
"""Load state from a binary string. """Load state from a binary string.
@ -1638,7 +1728,7 @@ class Language:
deserializers["tokenizer"] = lambda b: self.tokenizer.from_bytes( deserializers["tokenizer"] = lambda b: self.tokenizer.from_bytes(
b, exclude=["vocab"] b, exclude=["vocab"]
) )
for name, proc in self.pipeline: for name, proc in self._components:
if name in exclude: if name in exclude:
continue continue
if not hasattr(proc, "from_bytes"): if not hasattr(proc, "from_bytes"):
@ -1674,14 +1764,10 @@ class DisabledPipes(list):
def __init__(self, nlp: Language, names: List[str]) -> None: def __init__(self, nlp: Language, names: List[str]) -> None:
self.nlp = nlp self.nlp = nlp
self.names = names self.names = names
# Important! Not deep copy -- we just want the container (but we also for name in self.names:
# want to support people providing arbitrarily typed nlp.pipeline self.nlp.disable_pipe(name)
# objects.)
self.original_pipeline = copy(nlp.pipeline)
self.metas = {name: nlp.get_pipe_meta(name) for name in names}
self.configs = {name: nlp.get_pipe_config(name) for name in names}
list.__init__(self) list.__init__(self)
self.extend(nlp.remove_pipe(name) for name in names) self.extend(self.names)
def __enter__(self): def __enter__(self):
return self return self
@ -1691,14 +1777,10 @@ class DisabledPipes(list):
def restore(self) -> None: def restore(self) -> None:
"""Restore the pipeline to its state when DisabledPipes was created.""" """Restore the pipeline to its state when DisabledPipes was created."""
current, self.nlp.pipeline = self.nlp.pipeline, self.original_pipeline for name in self.names:
unexpected = [name for name, pipe in current if not self.nlp.has_pipe(name)] if name not in self.nlp.component_names:
if unexpected: raise ValueError(Errors.E008.format(name=name))
# Don't change the pipeline if we're raising an error. self.nlp.enable_pipe(name)
self.nlp.pipeline = current
raise ValueError(Errors.E008.format(names=unexpected))
self.nlp._pipe_meta.update(self.metas)
self.nlp._pipe_configs.update(self.configs)
self[:] = [] self[:] = []

View File

@ -12,6 +12,7 @@ from ..symbols import IDS, TAG, POS, MORPH, LEMMA
from ..tokens import Doc, Span from ..tokens import Doc, Span
from ..tokens._retokenize import normalize_token_attrs, set_token_attrs from ..tokens._retokenize import normalize_token_attrs, set_token_attrs
from ..vocab import Vocab from ..vocab import Vocab
from ..util import SimpleFrozenList
from .. import util from .. import util
@ -220,7 +221,7 @@ class AttributeRuler(Pipe):
results.update(Scorer.score_token_attr(examples, "lemma", **kwargs)) results.update(Scorer.score_token_attr(examples, "lemma", **kwargs))
return results return results
def to_bytes(self, exclude: Iterable[str] = tuple()) -> bytes: def to_bytes(self, exclude: Iterable[str] = SimpleFrozenList()) -> bytes:
"""Serialize the AttributeRuler to a bytestring. """Serialize the AttributeRuler to a bytestring.
exclude (Iterable[str]): String names of serialization fields to exclude. exclude (Iterable[str]): String names of serialization fields to exclude.
@ -233,7 +234,9 @@ class AttributeRuler(Pipe):
serialize["patterns"] = lambda: srsly.msgpack_dumps(self.patterns) serialize["patterns"] = lambda: srsly.msgpack_dumps(self.patterns)
return util.to_bytes(serialize, exclude) return util.to_bytes(serialize, exclude)
def from_bytes(self, bytes_data: bytes, exclude: Iterable[str] = tuple()): def from_bytes(
self, bytes_data: bytes, exclude: Iterable[str] = SimpleFrozenList()
):
"""Load the AttributeRuler from a bytestring. """Load the AttributeRuler from a bytestring.
bytes_data (bytes): The data to load. bytes_data (bytes): The data to load.
@ -254,7 +257,9 @@ class AttributeRuler(Pipe):
return self return self
def to_disk(self, path: Union[Path, str], exclude: Iterable[str] = tuple()) -> None: def to_disk(
self, path: Union[Path, str], exclude: Iterable[str] = SimpleFrozenList()
) -> None:
"""Serialize the AttributeRuler to disk. """Serialize the AttributeRuler to disk.
path (Union[Path, str]): A path to a directory. path (Union[Path, str]): A path to a directory.
@ -268,7 +273,7 @@ class AttributeRuler(Pipe):
util.to_disk(path, serialize, exclude) util.to_disk(path, serialize, exclude)
def from_disk( def from_disk(
self, path: Union[Path, str], exclude: Iterable[str] = tuple() self, path: Union[Path, str], exclude: Iterable[str] = SimpleFrozenList()
) -> None: ) -> None:
"""Load the AttributeRuler from disk. """Load the AttributeRuler from disk.

View File

@ -13,6 +13,7 @@ from ..language import Language
from ..vocab import Vocab from ..vocab import Vocab
from ..gold import Example, validate_examples from ..gold import Example, validate_examples
from ..errors import Errors, Warnings from ..errors import Errors, Warnings
from ..util import SimpleFrozenList
from .. import util from .. import util
@ -404,7 +405,7 @@ class EntityLinker(Pipe):
token.ent_kb_id_ = kb_id token.ent_kb_id_ = kb_id
def to_disk( def to_disk(
self, path: Union[str, Path], *, exclude: Iterable[str] = tuple() self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList(),
) -> None: ) -> None:
"""Serialize the pipe to disk. """Serialize the pipe to disk.
@ -421,7 +422,7 @@ class EntityLinker(Pipe):
util.to_disk(path, serialize, exclude) util.to_disk(path, serialize, exclude)
def from_disk( def from_disk(
self, path: Union[str, Path], *, exclude: Iterable[str] = tuple() self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList(),
) -> "EntityLinker": ) -> "EntityLinker":
"""Load the pipe from disk. Modifies the object in place and returns it. """Load the pipe from disk. Modifies the object in place and returns it.

View File

@ -5,7 +5,7 @@ import srsly
from ..language import Language from ..language import Language
from ..errors import Errors from ..errors import Errors
from ..util import ensure_path, to_disk, from_disk from ..util import ensure_path, to_disk, from_disk, SimpleFrozenList
from ..tokens import Doc, Span from ..tokens import Doc, Span
from ..matcher import Matcher, PhraseMatcher from ..matcher import Matcher, PhraseMatcher
from ..scorer import Scorer from ..scorer import Scorer
@ -317,7 +317,7 @@ class EntityRuler:
return Scorer.score_spans(examples, "ents", **kwargs) return Scorer.score_spans(examples, "ents", **kwargs)
def from_bytes( def from_bytes(
self, patterns_bytes: bytes, *, exclude: Iterable[str] = tuple() self, patterns_bytes: bytes, *, exclude: Iterable[str] = SimpleFrozenList()
) -> "EntityRuler": ) -> "EntityRuler":
"""Load the entity ruler from a bytestring. """Load the entity ruler from a bytestring.
@ -341,7 +341,7 @@ class EntityRuler:
self.add_patterns(cfg) self.add_patterns(cfg)
return self return self
def to_bytes(self, *, exclude: Iterable[str] = tuple()) -> bytes: def to_bytes(self, *, exclude: Iterable[str] = SimpleFrozenList()) -> bytes:
"""Serialize the entity ruler patterns to a bytestring. """Serialize the entity ruler patterns to a bytestring.
RETURNS (bytes): The serialized patterns. RETURNS (bytes): The serialized patterns.
@ -357,7 +357,7 @@ class EntityRuler:
return srsly.msgpack_dumps(serial) return srsly.msgpack_dumps(serial)
def from_disk( def from_disk(
self, path: Union[str, Path], *, exclude: Iterable[str] = tuple() self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
) -> "EntityRuler": ) -> "EntityRuler":
"""Load the entity ruler from a file. Expects a file containing """Load the entity ruler from a file. Expects a file containing
newline-delimited JSON (JSONL) with one entry per line. newline-delimited JSON (JSONL) with one entry per line.
@ -394,7 +394,7 @@ class EntityRuler:
return self return self
def to_disk( def to_disk(
self, path: Union[str, Path], *, exclude: Iterable[str] = tuple() self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
) -> None: ) -> None:
"""Save the entity ruler patterns to a directory. The patterns will be """Save the entity ruler patterns to a directory. The patterns will be
saved as newline-delimited JSON (JSONL). saved as newline-delimited JSON (JSONL).

View File

@ -223,6 +223,7 @@ class ConfigSchemaNlp(BaseModel):
# fmt: off # fmt: off
lang: StrictStr = Field(..., title="The base language to use") lang: StrictStr = Field(..., title="The base language to use")
pipeline: List[StrictStr] = Field(..., title="The pipeline component names in order") pipeline: List[StrictStr] = Field(..., title="The pipeline component names in order")
disabled: List[StrictStr] = Field(..., title="Pipeline components to disable by default")
tokenizer: Callable = Field(..., title="The tokenizer to use") tokenizer: Callable = Field(..., title="The tokenizer to use")
load_vocab_data: StrictBool = Field(..., title="Whether to load additional vocab data from spacy-lookups-data") load_vocab_data: StrictBool = Field(..., title="Whether to load additional vocab data from spacy-lookups-data")
before_creation: Optional[Callable[[Type["Language"]], Type["Language"]]] = Field(..., title="Optional callback to modify Language class before initialization") before_creation: Optional[Callable[[Type["Language"]], Type["Language"]]] = Field(..., title="Optional callback to modify Language class before initialization")

View File

@ -1,10 +1,10 @@
from typing import Optional, Iterable, Dict, Any, Callable, Tuple, TYPE_CHECKING from typing import Optional, Iterable, Dict, Any, Callable, TYPE_CHECKING
import numpy as np import numpy as np
from .gold import Example from .gold import Example
from .tokens import Token, Doc, Span from .tokens import Token, Doc, Span
from .errors import Errors from .errors import Errors
from .util import get_lang_class from .util import get_lang_class, SimpleFrozenList
from .morphology import Morphology from .morphology import Morphology
if TYPE_CHECKING: if TYPE_CHECKING:
@ -317,7 +317,7 @@ class Scorer:
attr: str, attr: str,
*, *,
getter: Callable[[Doc, str], Any] = getattr, getter: Callable[[Doc, str], Any] = getattr,
labels: Iterable[str] = tuple(), labels: Iterable[str] = SimpleFrozenList(),
multi_label: bool = True, multi_label: bool = True,
positive_label: Optional[str] = None, positive_label: Optional[str] = None,
threshold: Optional[float] = None, threshold: Optional[float] = None,
@ -447,7 +447,7 @@ class Scorer:
getter: Callable[[Token, str], Any] = getattr, getter: Callable[[Token, str], Any] = getattr,
head_attr: str = "head", head_attr: str = "head",
head_getter: Callable[[Token, str], Token] = getattr, head_getter: Callable[[Token, str], Token] = getattr,
ignore_labels: Tuple[str] = tuple(), ignore_labels: Iterable[str] = SimpleFrozenList(),
**cfg, **cfg,
) -> Dict[str, Any]: ) -> Dict[str, Any]:
"""Returns the UAS, LAS, and LAS per type scores for dependency """Returns the UAS, LAS, and LAS per type scores for dependency

View File

@ -1,5 +1,6 @@
import pytest import pytest
from spacy.language import Language from spacy.language import Language
from spacy.util import SimpleFrozenList
@pytest.fixture @pytest.fixture
@ -181,6 +182,11 @@ def test_select_pipes_errors(nlp):
with pytest.raises(ValueError): with pytest.raises(ValueError):
nlp.select_pipes(enable=[], disable=["c3"]) nlp.select_pipes(enable=[], disable=["c3"])
disabled = nlp.select_pipes(disable=["c2"])
nlp.remove_pipe("c2")
with pytest.raises(ValueError):
disabled.restore()
@pytest.mark.parametrize("n_pipes", [100]) @pytest.mark.parametrize("n_pipes", [100])
def test_add_lots_of_pipes(nlp, n_pipes): def test_add_lots_of_pipes(nlp, n_pipes):
@ -249,3 +255,94 @@ def test_add_pipe_before_after():
nlp.add_pipe("entity_ruler", before=True) nlp.add_pipe("entity_ruler", before=True)
with pytest.raises(ValueError): with pytest.raises(ValueError):
nlp.add_pipe("entity_ruler", first=False) nlp.add_pipe("entity_ruler", first=False)
def test_disable_enable_pipes():
    """Exercise disable_pipe / enable_pipe / remove_pipe / rename_pipe and
    check how each call is reflected in the pipeline, the component lists,
    the disabled list and the config.
    """
    name = "test_disable_enable_pipes"
    first, second = f"{name}1", f"{name}2"
    results = {}

    def make_component(key):
        # Start from an empty string so we can tell whether the component ran.
        results[key] = ""

        def component(doc):
            results[key] = doc.text
            return doc

        return component

    c1 = Language.component(first, func=make_component(first))
    c2 = Language.component(second, func=make_component(second))
    nlp = Language()
    nlp.add_pipe(first)
    nlp.add_pipe(second)
    assert results[first] == ""
    assert results[second] == ""
    assert nlp.pipeline == [(first, c1), (second, c2)]
    assert nlp.pipe_names == [first, second]

    # A disabled component is still listed as a component, but not as a pipe.
    nlp.disable_pipe(first)
    assert nlp.disabled == [first]
    assert nlp.component_names == [first, second]
    assert nlp.pipe_names == [second]
    assert nlp.config["nlp"]["disabled"] == [first]
    nlp("hello")
    assert results[first] == ""  # didn't run
    assert results[second] == "hello"  # ran

    # Re-enabling puts the component back into the running pipeline.
    nlp.enable_pipe(first)
    assert nlp.disabled == []
    assert nlp.pipe_names == [first, second]
    assert nlp.config["nlp"]["disabled"] == []
    nlp("world")
    assert results[first] == "world"
    assert results[second] == "world"

    # Removing a disabled component also clears it from the disabled list.
    nlp.disable_pipe(second)
    nlp.remove_pipe(second)
    assert nlp.components == [(first, c1)]
    assert nlp.pipeline == [(first, c1)]
    assert nlp.component_names == [first]
    assert nlp.pipe_names == [first]
    assert nlp.disabled == []
    assert nlp.config["nlp"]["disabled"] == []

    # After renaming, the component still writes to its original results key.
    nlp.rename_pipe(first, name)
    assert nlp.components == [(name, c1)]
    assert nlp.component_names == [name]
    nlp("!")
    assert results[first] == "!"
    assert results[second] == "world"

    # The removed component can no longer be disabled.
    with pytest.raises(ValueError):
        nlp.disable_pipe(second)

    nlp.disable_pipe(name)
    assert nlp.component_names == [name]
    assert nlp.pipe_names == []
    assert nlp.config["nlp"]["disabled"] == [name]
    nlp("?")
    assert results[first] == "!"
def test_pipe_methods_frozen():
    """Check that the list-valued pipeline properties on the nlp object are
    "frozen": they are still real lists (for backwards compatibility), but
    mutating them, e.g. appending to nlp.pipeline, raises an error.
    """
    nlp = Language()
    ner = nlp.add_pipe("ner")
    assert nlp.pipe_names == ["ner"]
    frozen_attrs = (
        "pipeline",
        "pipe_names",
        "components",
        "component_names",
        "disabled",
        "factory_names",
    )
    for attr in frozen_attrs:
        value = getattr(nlp, attr)
        assert isinstance(value, list)
        assert isinstance(value, SimpleFrozenList)
    # Each property is probed with a different mutating method.
    with pytest.raises(NotImplementedError):
        nlp.pipeline.append(("ner2", ner))
    with pytest.raises(NotImplementedError):
        nlp.pipe_names.pop()
    with pytest.raises(NotImplementedError):
        nlp.components.sort()
    with pytest.raises(NotImplementedError):
        nlp.component_names.clear()

View File

@ -161,6 +161,7 @@ def test_issue4674():
assert kb2.get_size_entities() == 1 assert kb2.get_size_entities() == 1
@pytest.mark.skip(reason="API change: disable just disables, new exclude arg")
def test_issue4707(): def test_issue4707():
"""Tests that disabled component names are also excluded from nlp.from_disk """Tests that disabled component names are also excluded from nlp.from_disk
by default when loading a model. by default when loading a model.

View File

@ -6,6 +6,8 @@ from spacy.pipeline.dep_parser import DEFAULT_PARSER_MODEL
from spacy.pipeline.tagger import DEFAULT_TAGGER_MODEL from spacy.pipeline.tagger import DEFAULT_TAGGER_MODEL
from spacy.pipeline.textcat import DEFAULT_TEXTCAT_MODEL from spacy.pipeline.textcat import DEFAULT_TEXTCAT_MODEL
from spacy.pipeline.senter import DEFAULT_SENTER_MODEL from spacy.pipeline.senter import DEFAULT_SENTER_MODEL
from spacy.lang.en import English
import spacy
from ..util import make_tempdir from ..util import make_tempdir
@ -173,3 +175,34 @@ def test_serialize_sentencerecognizer(en_vocab):
sr_b = sr.to_bytes() sr_b = sr.to_bytes()
sr_d = SentenceRecognizer(en_vocab, model).from_bytes(sr_b) sr_d = SentenceRecognizer(en_vocab, model).from_bytes(sr_b)
assert sr.to_bytes() == sr_d.to_bytes() assert sr.to_bytes() == sr_d.to_bytes()
def test_serialize_pipeline_disable_enable():
    """Check that the disabled state of components survives a round trip
    through the config and through disk, and that `disable` and `exclude`
    passed to spacy.load behave differently.
    """
    nlp = English()
    for component in ("ner", "tagger"):
        nlp.add_pipe(component)
    nlp.disable_pipe("tagger")
    assert nlp.config["nlp"]["disabled"] == ["tagger"]

    # Round trip through the config: the tagger stays loaded but disabled.
    from_config = English.from_config(nlp.config.copy())
    assert from_config.pipe_names == ["ner"]
    assert from_config.component_names == ["ner", "tagger"]
    assert from_config.disabled == ["tagger"]
    assert from_config.config["nlp"]["disabled"] == ["tagger"]

    # Round trip through disk with no extra arguments.
    with make_tempdir() as tmp_dir:
        from_config.to_disk(tmp_dir)
        reloaded = spacy.load(tmp_dir)
    assert reloaded.pipe_names == ["ner"]
    assert reloaded.component_names == ["ner", "tagger"]

    # disable=... adds to the components that were already disabled on disk.
    with make_tempdir() as tmp_dir:
        reloaded.to_disk(tmp_dir)
        with_disable = spacy.load(tmp_dir, disable=["ner"])
    assert with_disable.pipe_names == []
    assert with_disable.component_names == ["ner", "tagger"]
    assert with_disable.disabled == ["ner", "tagger"]

    # exclude=... drops the component entirely, so it isn't listed as disabled.
    with make_tempdir() as tmp_dir:
        nlp.to_disk(tmp_dir)
        with_exclude = spacy.load(tmp_dir, exclude=["tagger"])
    assert with_exclude.pipe_names == ["ner"]
    assert with_exclude.component_names == ["ner"]
    assert with_exclude.disabled == []

View File

@ -3,10 +3,9 @@ import pytest
from .util import get_random_doc from .util import get_random_doc
from spacy import util from spacy import util
from spacy.util import dot_to_object from spacy.util import dot_to_object, SimpleFrozenList
from thinc.api import Config, Optimizer from thinc.api import Config, Optimizer
from spacy.gold.batchers import minibatch_by_words from spacy.gold.batchers import minibatch_by_words
from ..lang.en import English from ..lang.en import English
from ..lang.nl import Dutch from ..lang.nl import Dutch
from ..language import DEFAULT_CONFIG_PATH from ..language import DEFAULT_CONFIG_PATH
@ -106,3 +105,20 @@ def test_util_dot_section():
assert not dot_to_object(en_config, "nlp.load_vocab_data") assert not dot_to_object(en_config, "nlp.load_vocab_data")
assert dot_to_object(nl_config, "nlp.load_vocab_data") assert dot_to_object(nl_config, "nlp.load_vocab_data")
assert isinstance(dot_to_object(nl_config, "training.optimizer"), Optimizer) assert isinstance(dot_to_object(nl_config, "training.optimizer"), Optimizer)
def test_simple_frozen_list():
    """Check that SimpleFrozenList compares and reads like a list but raises
    NotImplementedError on mutating methods, with or without a custom error.
    """
    frozen = SimpleFrozenList(["foo", "bar"])
    assert frozen == ["foo", "bar"]
    assert frozen.index("bar") == 1  # okay method
    mutating_calls = (
        lambda: frozen.append("baz"),
        lambda: frozen.sort(),
        lambda: frozen.extend(["baz"]),
        lambda: frozen.pop(),
    )
    for mutate in mutating_calls:
        with pytest.raises(NotImplementedError):
            mutate()
    # A custom error message doesn't change the exception type.
    custom = SimpleFrozenList(["foo", "bar"], error="Error!")
    with pytest.raises(NotImplementedError):
        custom.append("baz")

View File

@ -10,7 +10,7 @@ from ..vocab import Vocab
from ..compat import copy_reg from ..compat import copy_reg
from ..attrs import SPACY, ORTH, intify_attr from ..attrs import SPACY, ORTH, intify_attr
from ..errors import Errors from ..errors import Errors
from ..util import ensure_path from ..util import ensure_path, SimpleFrozenList
# fmt: off # fmt: off
ALL_ATTRS = ("ORTH", "TAG", "HEAD", "DEP", "ENT_IOB", "ENT_TYPE", "ENT_KB_ID", "LEMMA", "MORPH", "POS") ALL_ATTRS = ("ORTH", "TAG", "HEAD", "DEP", "ENT_IOB", "ENT_TYPE", "ENT_KB_ID", "LEMMA", "MORPH", "POS")
@ -52,7 +52,7 @@ class DocBin:
self, self,
attrs: Iterable[str] = ALL_ATTRS, attrs: Iterable[str] = ALL_ATTRS,
store_user_data: bool = False, store_user_data: bool = False,
docs: Iterable[Doc] = tuple(), docs: Iterable[Doc] = SimpleFrozenList(),
) -> None: ) -> None:
"""Create a DocBin object to hold serialized annotations. """Create a DocBin object to hold serialized annotations.

View File

@ -120,6 +120,47 @@ class SimpleFrozenDict(dict):
raise NotImplementedError(self.error) raise NotImplementedError(self.error)
class SimpleFrozenList(list):
    """Wrapper class around a list that lets us raise custom errors if certain
    attributes/methods are accessed. Mostly used for properties like
    Language.pipeline that return an immutable list (and that we don't want to
    convert to a tuple to not break too much backwards compatibility). If a user
    accidentally calls nlp.pipeline.append(), we can raise a more helpful error.

    Only the named mutating methods defined below are blocked; everything else
    is inherited from list unchanged.
    """

    def __init__(self, *args, error: str = Errors.E927) -> None:
        """Initialize the frozen list.

        error (str): The error message when user tries to mutate the list.
        """
        self.error = error
        super().__init__(*args)

    def __reduce__(self):
        """Describe how to rebuild the list for copy.copy, copy.deepcopy and
        pickle.

        The default reconstruction of a list subclass creates an empty
        instance and re-adds the items via append()/extend(), which are
        blocked here. Passing the items to the constructor avoids that.

        RETURNS (tuple): The class, its constructor arguments and the
            instance attributes (including the error message) to restore.
        """
        return (self.__class__, (list(self),), self.__dict__.copy())

    def append(self, *args, **kwargs):
        raise NotImplementedError(self.error)

    def clear(self, *args, **kwargs):
        raise NotImplementedError(self.error)

    def extend(self, *args, **kwargs):
        raise NotImplementedError(self.error)

    def insert(self, *args, **kwargs):
        raise NotImplementedError(self.error)

    def pop(self, *args, **kwargs):
        raise NotImplementedError(self.error)

    def remove(self, *args, **kwargs):
        raise NotImplementedError(self.error)

    def reverse(self, *args, **kwargs):
        raise NotImplementedError(self.error)

    def sort(self, *args, **kwargs):
        raise NotImplementedError(self.error)
def lang_class_is_loaded(lang: str) -> bool: def lang_class_is_loaded(lang: str) -> bool:
"""Check whether a Language class is already loaded. Language classes are """Check whether a Language class is already loaded. Language classes are
loaded lazily, to avoid expensive setup code associated with the language loaded lazily, to avoid expensive setup code associated with the language
@ -215,7 +256,8 @@ def load_model(
name: Union[str, Path], name: Union[str, Path],
*, *,
vocab: Union["Vocab", bool] = True, vocab: Union["Vocab", bool] = True,
disable: Iterable[str] = tuple(), disable: Iterable[str] = SimpleFrozenList(),
exclude: Iterable[str] = SimpleFrozenList(),
config: Union[Dict[str, Any], Config] = SimpleFrozenDict(), config: Union[Dict[str, Any], Config] = SimpleFrozenDict(),
) -> "Language": ) -> "Language":
"""Load a model from a package or data path. """Load a model from a package or data path.
@ -228,7 +270,7 @@ def load_model(
keyed by section values in dot notation. keyed by section values in dot notation.
RETURNS (Language): The loaded nlp object. RETURNS (Language): The loaded nlp object.
""" """
kwargs = {"vocab": vocab, "disable": disable, "config": config} kwargs = {"vocab": vocab, "disable": disable, "exclude": exclude, "config": config}
if isinstance(name, str): # name or string path if isinstance(name, str): # name or string path
if name.startswith("blank:"): # shortcut for blank model if name.startswith("blank:"): # shortcut for blank model
return get_lang_class(name.replace("blank:", ""))() return get_lang_class(name.replace("blank:", ""))()
@ -247,7 +289,8 @@ def load_model_from_package(
name: str, name: str,
*, *,
vocab: Union["Vocab", bool] = True, vocab: Union["Vocab", bool] = True,
disable: Iterable[str] = tuple(), disable: Iterable[str] = SimpleFrozenList(),
exclude: Iterable[str] = SimpleFrozenList(),
config: Union[Dict[str, Any], Config] = SimpleFrozenDict(), config: Union[Dict[str, Any], Config] = SimpleFrozenDict(),
) -> "Language": ) -> "Language":
"""Load a model from an installed package. """Load a model from an installed package.
@ -255,13 +298,17 @@ def load_model_from_package(
name (str): The package name. name (str): The package name.
vocab (Vocab / True): Optional vocab to pass in on initialization. If True, vocab (Vocab / True): Optional vocab to pass in on initialization. If True,
a new Vocab object will be created. a new Vocab object will be created.
disable (Iterable[str]): Names of pipeline components to disable. disable (Iterable[str]): Names of pipeline components to disable. Disabled
pipes will be loaded but they won't be run unless you explicitly
enable them by calling nlp.enable_pipe.
exclude (Iterable[str]): Names of pipeline components to exclude. Excluded
components won't be loaded.
config (Dict[str, Any] / Config): Config overrides as nested dict or dict config (Dict[str, Any] / Config): Config overrides as nested dict or dict
keyed by section values in dot notation. keyed by section values in dot notation.
RETURNS (Language): The loaded nlp object. RETURNS (Language): The loaded nlp object.
""" """
cls = importlib.import_module(name) cls = importlib.import_module(name)
return cls.load(vocab=vocab, disable=disable, config=config) return cls.load(vocab=vocab, disable=disable, exclude=exclude, config=config)
def load_model_from_path( def load_model_from_path(
@ -269,7 +316,8 @@ def load_model_from_path(
*, *,
meta: Optional[Dict[str, Any]] = None, meta: Optional[Dict[str, Any]] = None,
vocab: Union["Vocab", bool] = True, vocab: Union["Vocab", bool] = True,
disable: Iterable[str] = tuple(), disable: Iterable[str] = SimpleFrozenList(),
exclude: Iterable[str] = SimpleFrozenList(),
config: Union[Dict[str, Any], Config] = SimpleFrozenDict(), config: Union[Dict[str, Any], Config] = SimpleFrozenDict(),
) -> "Language": ) -> "Language":
"""Load a model from a data directory path. Creates Language class with """Load a model from a data directory path. Creates Language class with
@ -279,7 +327,11 @@ def load_model_from_path(
meta (Dict[str, Any]): Optional model meta. meta (Dict[str, Any]): Optional model meta.
vocab (Vocab / True): Optional vocab to pass in on initialization. If True, vocab (Vocab / True): Optional vocab to pass in on initialization. If True,
a new Vocab object will be created. a new Vocab object will be created.
disable (Iterable[str]): Names of pipeline components to disable. disable (Iterable[str]): Names of pipeline components to disable. Disabled
pipes will be loaded but they won't be run unless you explicitly
enable them by calling nlp.enable_pipe.
exclude (Iterable[str]): Names of pipeline components to exclude. Excluded
components won't be loaded.
config (Dict[str, Any] / Config): Config overrides as nested dict or dict config (Dict[str, Any] / Config): Config overrides as nested dict or dict
keyed by section values in dot notation. keyed by section values in dot notation.
RETURNS (Language): The loaded nlp object. RETURNS (Language): The loaded nlp object.
@ -290,15 +342,18 @@ def load_model_from_path(
meta = get_model_meta(model_path) meta = get_model_meta(model_path)
config_path = model_path / "config.cfg" config_path = model_path / "config.cfg"
config = load_config(config_path, overrides=dict_to_dot(config)) config = load_config(config_path, overrides=dict_to_dot(config))
nlp, _ = load_model_from_config(config, vocab=vocab, disable=disable) nlp, _ = load_model_from_config(
return nlp.from_disk(model_path, exclude=disable) config, vocab=vocab, disable=disable, exclude=exclude
)
return nlp.from_disk(model_path, exclude=exclude)
def load_model_from_config( def load_model_from_config(
config: Union[Dict[str, Any], Config], config: Union[Dict[str, Any], Config],
*, *,
vocab: Union["Vocab", bool] = True, vocab: Union["Vocab", bool] = True,
disable: Iterable[str] = tuple(), disable: Iterable[str] = SimpleFrozenList(),
exclude: Iterable[str] = SimpleFrozenList(),
auto_fill: bool = False, auto_fill: bool = False,
validate: bool = True, validate: bool = True,
) -> Tuple["Language", Config]: ) -> Tuple["Language", Config]:
@ -309,7 +364,11 @@ def load_model_from_config(
meta (Dict[str, Any]): Optional model meta. meta (Dict[str, Any]): Optional model meta.
vocab (Vocab / True): Optional vocab to pass in on initialization. If True, vocab (Vocab / True): Optional vocab to pass in on initialization. If True,
a new Vocab object will be created. a new Vocab object will be created.
disable (Iterable[str]): Names of pipeline components to disable. disable (Iterable[str]): Names of pipeline components to disable. Disabled
pipes will be loaded but they won't be run unless you explicitly
enable them by calling nlp.enable_pipe.
exclude (Iterable[str]): Names of pipeline components to exclude. Excluded
components won't be loaded.
auto_fill (bool): Whether to auto-fill config with missing defaults. auto_fill (bool): Whether to auto-fill config with missing defaults.
validate (bool): Whether to show config validation errors. validate (bool): Whether to show config validation errors.
RETURNS (Language): The loaded nlp object. RETURNS (Language): The loaded nlp object.
@ -323,7 +382,12 @@ def load_model_from_config(
# registry, including custom subclasses provided via entry points # registry, including custom subclasses provided via entry points
lang_cls = get_lang_class(nlp_config["lang"]) lang_cls = get_lang_class(nlp_config["lang"])
nlp = lang_cls.from_config( nlp = lang_cls.from_config(
config, vocab=vocab, disable=disable, auto_fill=auto_fill, validate=validate, config,
vocab=vocab,
disable=disable,
exclude=exclude,
auto_fill=auto_fill,
validate=validate,
) )
return nlp, nlp.resolved return nlp, nlp.resolved
@ -332,7 +396,8 @@ def load_model_from_init_py(
init_file: Union[Path, str], init_file: Union[Path, str],
*, *,
vocab: Union["Vocab", bool] = True, vocab: Union["Vocab", bool] = True,
disable: Iterable[str] = tuple(), disable: Iterable[str] = SimpleFrozenList(),
exclude: Iterable[str] = SimpleFrozenList(),
config: Union[Dict[str, Any], Config] = SimpleFrozenDict(), config: Union[Dict[str, Any], Config] = SimpleFrozenDict(),
) -> "Language": ) -> "Language":
"""Helper function to use in the `load()` method of a model package's """Helper function to use in the `load()` method of a model package's
@ -340,7 +405,11 @@ def load_model_from_init_py(
vocab (Vocab / True): Optional vocab to pass in on initialization. If True, vocab (Vocab / True): Optional vocab to pass in on initialization. If True,
a new Vocab object will be created. a new Vocab object will be created.
disable (Iterable[str]): Names of pipeline components to disable. disable (Iterable[str]): Names of pipeline components to disable. Disabled
pipes will be loaded but they won't be run unless you explicitly
enable them by calling nlp.enable_pipe.
exclude (Iterable[str]): Names of pipeline components to exclude. Excluded
components won't be loaded.
config (Dict[str, Any] / Config): Config overrides as nested dict or dict config (Dict[str, Any] / Config): Config overrides as nested dict or dict
keyed by section values in dot notation. keyed by section values in dot notation.
RETURNS (Language): The loaded nlp object. RETURNS (Language): The loaded nlp object.
@ -352,7 +421,12 @@ def load_model_from_init_py(
if not model_path.exists(): if not model_path.exists():
raise IOError(Errors.E052.format(path=data_path)) raise IOError(Errors.E052.format(path=data_path))
return load_model_from_path( return load_model_from_path(
data_path, vocab=vocab, meta=meta, disable=disable, config=config data_path,
vocab=vocab,
meta=meta,
disable=disable,
exclude=exclude,
config=config,
) )

View File

@ -75,9 +75,10 @@ Defines the `nlp` object, its tokenizer and
[processing pipeline](/usage/processing-pipelines) component names. [processing pipeline](/usage/processing-pipelines) component names.
| Name | Description | | Name | Description |
| ------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `lang` | Model language [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes). Defaults to `null`. ~~str~~ | | `lang` | Model language [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes). Defaults to `null`. ~~str~~ |
| `pipeline` | Names of pipeline components in order. Should correspond to sections in the `[components]` block, e.g. `[components.ner]`. See docs on [defining components](/usage/training#config-components). Defaults to `[]`. ~~List[str]~~ | | `pipeline` | Names of pipeline components in order. Should correspond to sections in the `[components]` block, e.g. `[components.ner]`. See docs on [defining components](/usage/training#config-components). Defaults to `[]`. ~~List[str]~~ |
| `disabled` | Names of pipeline components that are loaded but disabled by default and not run as part of the pipeline. Should correspond to components listed in `pipeline`. After a model is loaded, disabled components can be enabled using [`Language.enable_pipe`](/api/language#enable_pipe). ~~List[str]~~ |
| `load_vocab_data` | Whether to load additional lexeme and vocab data from [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) if available. Defaults to `true`. ~~bool~~ | | `load_vocab_data` | Whether to load additional lexeme and vocab data from [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) if available. Defaults to `true`. ~~bool~~ |
| `before_creation` | Optional [callback](/usage/training#custom-code-nlp-callbacks) to modify `Language` subclass before it's initialized. Defaults to `null`. ~~Optional[Callable[[Type[Language]], Type[Language]]]~~ | | `before_creation` | Optional [callback](/usage/training#custom-code-nlp-callbacks) to modify `Language` subclass before it's initialized. Defaults to `null`. ~~Optional[Callable[[Type[Language]], Type[Language]]]~~ |
| `after_creation` | Optional [callback](/usage/training#custom-code-nlp-callbacks) to modify `nlp` object right after it's initialized. Defaults to `null`. ~~Optional[Callable[[Language], Language]]~~ | | `after_creation` | Optional [callback](/usage/training#custom-code-nlp-callbacks) to modify `nlp` object right after it's initialized. Defaults to `null`. ~~Optional[Callable[[Language], Language]]~~ |

View File

@ -357,35 +357,6 @@ their original weights after the block.
| -------- | ------------------------------------------------------ | | -------- | ------------------------------------------------------ |
| `params` | A dictionary of parameters keyed by model ID. ~~dict~~ | | `params` | A dictionary of parameters keyed by model ID. ~~dict~~ |
## Language.create_pipe {#create_pipe tag="method" new="2"}
Create a pipeline component from a factory.
<Infobox title="Changed in v3.0" variant="warning">
As of v3.0, the [`Language.add_pipe`](/api/language#add_pipe) method also takes
the string name of the factory, creates the component, adds it to the pipeline
and returns it. The `Language.create_pipe` method is now mostly used internally.
To create a component and add it to the pipeline, you should always use
`Language.add_pipe`.
</Infobox>
> #### Example
>
> ```python
> parser = nlp.create_pipe("parser")
> ```
| Name | Description |
| ------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `factory_name` | Name of the registered component factory. ~~str~~ |
| `name` | Optional unique name of pipeline component instance. If not set, the factory name is used. An error is raised if the name already exists in the pipeline. ~~Optional[str]~~ |
| _keyword-only_ | |
| `config` <Tag variant="new">3</Tag> | Optional config parameters to use for this component. Will be merged with the `default_config` specified by the component factory. ~~Optional[Dict[str, Any]]~~ |
| `validate` <Tag variant="new">3</Tag> | Whether to validate the component config and arguments against the types expected by the factory. Defaults to `True`. ~~bool~~ |
| **RETURNS** | The pipeline component. ~~Callable[[Doc], Doc]~~ |
## Language.add_pipe {#add_pipe tag="method" new="2"} ## Language.add_pipe {#add_pipe tag="method" new="2"}
Add a component to the processing pipeline. Expects a name that maps to a Add a component to the processing pipeline. Expects a name that maps to a
@ -434,6 +405,35 @@ component, adds it to the pipeline and returns it.
| `validate` <Tag variant="new">3</Tag> | Whether to validate the component config and arguments against the types expected by the factory. Defaults to `True`. ~~bool~~ | | `validate` <Tag variant="new">3</Tag> | Whether to validate the component config and arguments against the types expected by the factory. Defaults to `True`. ~~bool~~ |
| **RETURNS** | The pipeline component. ~~Callable[[Doc], Doc]~~ | | **RETURNS** | The pipeline component. ~~Callable[[Doc], Doc]~~ |
## Language.create_pipe {#create_pipe tag="method" new="2"}
Create a pipeline component from a factory.
<Infobox title="Changed in v3.0" variant="warning">
As of v3.0, the [`Language.add_pipe`](/api/language#add_pipe) method also takes
the string name of the factory, creates the component, adds it to the pipeline
and returns it. The `Language.create_pipe` method is now mostly used internally.
To create a component and add it to the pipeline, you should always use
`Language.add_pipe`.
</Infobox>
> #### Example
>
> ```python
> parser = nlp.create_pipe("parser")
> ```
| Name | Description |
| ------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `factory_name` | Name of the registered component factory. ~~str~~ |
| `name` | Optional unique name of pipeline component instance. If not set, the factory name is used. An error is raised if the name already exists in the pipeline. ~~Optional[str]~~ |
| _keyword-only_ | |
| `config` <Tag variant="new">3</Tag> | Optional config parameters to use for this component. Will be merged with the `default_config` specified by the component factory. ~~Optional[Dict[str, Any]]~~ |
| `validate` <Tag variant="new">3</Tag> | Whether to validate the component config and arguments against the types expected by the factory. Defaults to `True`. ~~bool~~ |
| **RETURNS** | The pipeline component. ~~Callable[[Doc], Doc]~~ |
## Language.has_factory {#has_factory tag="classmethod" new="3"} ## Language.has_factory {#has_factory tag="classmethod" new="3"}
Check whether a factory name is registered on the `Language` class or subclass. Check whether a factory name is registered on the `Language` class or subclass.
@ -561,6 +561,54 @@ component function.
| `name` | Name of the component to remove. ~~str~~ | | `name` | Name of the component to remove. ~~str~~ |
| **RETURNS** | A `(name, component)` tuple of the removed component. ~~Tuple[str, Callable[[Doc], Doc]]~~ | | **RETURNS** | A `(name, component)` tuple of the removed component. ~~Tuple[str, Callable[[Doc], Doc]]~~ |
## Language.disable_pipe {#disable_pipe tag="method" new="3"}
Temporarily disable a pipeline component so it's not run as part of the
pipeline. Disabled components are listed in
[`nlp.disabled`](/api/language#attributes) and included in
[`nlp.components`](/api/language#attributes), but not in
[`nlp.pipeline`](/api/language#pipeline), so they're not run when you process a
`Doc` with the `nlp` object. If the component is already disabled, this method
does nothing.
> #### Example
>
> ```python
> nlp.add_pipe("ner")
> nlp.add_pipe("textcat")
> assert nlp.pipe_names == ["ner", "textcat"]
> nlp.disable_pipe("ner")
> assert nlp.pipe_names == ["textcat"]
> assert nlp.component_names == ["ner", "textcat"]
> assert nlp.disabled == ["ner"]
> ```
| Name | Description |
| ------ | ----------------------------------------- |
| `name` | Name of the component to disable. ~~str~~ |
## Language.enable_pipe {#enable_pipe tag="method" new="3"}
Enable a previously disabled component (e.g. via
[`Language.disable_pipe`](/api/language#disable_pipe)) so it's run as part of
the pipeline, [`nlp.pipeline`](/api/language#pipeline). If the component is
already enabled, this method does nothing.
> #### Example
>
> ```python
> nlp.disable_pipe("ner")
> assert "ner" in nlp.disabled
> assert not "ner" in nlp.pipe_names
> nlp.enable_pipe("ner")
> assert not "ner" in nlp.disabled
> assert "ner" in nlp.pipe_names
> ```
| Name | Description |
| ------ | ---------------------------------------- |
| `name` | Name of the component to enable. ~~str~~ |
## Language.select_pipes {#select_pipes tag="contextmanager, method" new="3"} ## Language.select_pipes {#select_pipes tag="contextmanager, method" new="3"}
Disable one or more pipeline components. If used as a context manager, the Disable one or more pipeline components. If used as a context manager, the
@ -568,7 +616,9 @@ pipeline will be restored to the initial state at the end of the block.
Otherwise, a `DisabledPipes` object is returned, that has a `.restore()` method Otherwise, a `DisabledPipes` object is returned, that has a `.restore()` method
you can use to undo your changes. You can specify either `disable` (as a list or you can use to undo your changes. You can specify either `disable` (as a list or
string), or `enable`. In the latter case, all components not in the `enable` string), or `enable`. In the latter case, all components not in the `enable`
list, will be disabled. list, will be disabled. Under the hood, this method calls into
[`disable_pipe`](/api/language#disable_pipe) and
[`enable_pipe`](/api/language#enable_pipe).
> #### Example > #### Example
> >
@ -861,16 +911,19 @@ available to the loaded object.
## Attributes {#attributes} ## Attributes {#attributes}
| Name | Description | | Name | Description |
| --------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------- | | --------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- |
| `vocab` | A container for the lexical types. ~~Vocab~~ | | `vocab` | A container for the lexical types. ~~Vocab~~ |
| `tokenizer` | The tokenizer. ~~Tokenizer~~ | | `tokenizer` | The tokenizer. ~~Tokenizer~~ |
| `make_doc` | Callable that takes a string and returns a `Doc`. ~~Callable[[str], Doc]~~ | | `make_doc` | Callable that takes a string and returns a `Doc`. ~~Callable[[str], Doc]~~ |
| `pipeline` | List of `(name, component)` tuples describing the current processing pipeline, in order. ~~List[str, Callable[[Doc], Doc]]~~ | | `pipeline` | List of `(name, component)` tuples describing the current processing pipeline, in order. ~~List[Tuple[str, Callable[[Doc], Doc]]]~~ |
| `pipe_names` <Tag variant="new">2</Tag> | List of pipeline component names, in order. ~~List[str]~~ | | `pipe_names` <Tag variant="new">2</Tag> | List of pipeline component names, in order. ~~List[str]~~ |
| `pipe_labels` <Tag variant="new">2.2</Tag> | List of labels set by the pipeline components, if available, keyed by component name. ~~Dict[str, List[str]]~~ | | `pipe_labels` <Tag variant="new">2.2</Tag> | List of labels set by the pipeline components, if available, keyed by component name. ~~Dict[str, List[str]]~~ |
| `pipe_factories` <Tag variant="new">2.2</Tag> | Dictionary of pipeline component names, mapped to their factory names. ~~Dict[str, str]~~ | | `pipe_factories` <Tag variant="new">2.2</Tag> | Dictionary of pipeline component names, mapped to their factory names. ~~Dict[str, str]~~ |
| `factories` | All available factory functions, keyed by name. ~~Dict[str, Callable[[...], Callable[[Doc], Doc]]]~~ | | `factories` | All available factory functions, keyed by name. ~~Dict[str, Callable[[...], Callable[[Doc], Doc]]]~~ |
| `factory_names` <Tag variant="new">3</Tag> | List of all available factory names. ~~List[str]~~ | | `factory_names` <Tag variant="new">3</Tag> | List of all available factory names. ~~List[str]~~ |
| `components` <Tag variant="new">3</Tag> | List of all available `(name, component)` tuples, including components that are currently disabled. ~~List[Tuple[str, Callable[[Doc], Doc]]]~~ |
| `component_names` <Tag variant="new">3</Tag> | List of all available component names, including components that are currently disabled. ~~List[str]~~ |
| `disabled` <Tag variant="new">3</Tag> | Names of components that are currently disabled and don't run as part of the pipeline. ~~List[str]~~ |
| `path` <Tag variant="new">2</Tag> | Path to the model data directory, if a model is loaded. Otherwise `None`. ~~Optional[Path]~~ | | `path` <Tag variant="new">2</Tag> | Path to the model data directory, if a model is loaded. Otherwise `None`. ~~Optional[Path]~~ |
## Class attributes {#class-attributes} ## Class attributes {#class-attributes}

View File

@ -23,6 +23,14 @@ path, spaCy will assume it's a data directory, load its
information to construct the `Language` class. The data will be loaded in via information to construct the `Language` class. The data will be loaded in via
[`Language.from_disk`](/api/language#from_disk). [`Language.from_disk`](/api/language#from_disk).
<Infobox variant="warning" title="Changed in v3.0">
As of v3.0, the `disable` keyword argument specifies components to load but
disable, instead of components to not load at all. Those components can now be
specified separately using the new `exclude` keyword argument.
</Infobox>
> #### Example > #### Example
> >
> ```python > ```python
@ -30,14 +38,15 @@ information to construct the `Language` class. The data will be loaded in via
> nlp = spacy.load("/path/to/en") # string path > nlp = spacy.load("/path/to/en") # string path
> nlp = spacy.load(Path("/path/to/en")) # pathlib Path > nlp = spacy.load(Path("/path/to/en")) # pathlib Path
> >
> nlp = spacy.load("en_core_web_sm", disable=["parser", "tagger"]) > nlp = spacy.load("en_core_web_sm", exclude=["parser", "tagger"])
> ``` > ```
| Name | Description | | Name | Description |
| ----------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------ | | ------------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `name` | Model to load, i.e. package name or path. ~~Union[str, Path]~~ | | `name` | Model to load, i.e. package name or path. ~~Union[str, Path]~~ |
| _keyword-only_ | | | _keyword-only_ | |
| `disable` | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). ~~List[str]~~ | | `disable` | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). Disabled pipes will be loaded but they won't be run unless you explicitly enable them by calling [nlp.enable_pipe](/api/language#enable_pipe). ~~List[str]~~ |
| `exclude` <Tag variant="new">3</Tag> | Names of pipeline components to [exclude](/usage/processing-pipelines#disabling). Excluded components won't be loaded. ~~List[str]~~ |
| `config` <Tag variant="new">3</Tag> | Optional config overrides, either as nested dict or dict keyed by section value in dot notation, e.g. `"components.name.value"`. ~~Union[Dict[str, Any], Config]~~ | | `config` <Tag variant="new">3</Tag> | Optional config overrides, either as nested dict or dict keyed by section value in dot notation, e.g. `"components.name.value"`. ~~Union[Dict[str, Any], Config]~~ |
| **RETURNS** | A `Language` object with the loaded model. ~~Language~~ | | **RETURNS** | A `Language` object with the loaded model. ~~Language~~ |
@ -562,15 +571,16 @@ and create a `Language` object. The model data will then be loaded in via
> >
> ```python > ```python
> nlp = util.load_model("en_core_web_sm") > nlp = util.load_model("en_core_web_sm")
> nlp = util.load_model("en_core_web_sm", disable=["ner"]) > nlp = util.load_model("en_core_web_sm", exclude=["ner"])
> nlp = util.load_model("/path/to/data") > nlp = util.load_model("/path/to/data")
> ``` > ```
| Name | Description | | Name | Description |
| ----------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- | | ------------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `name` | Package name or model path. ~~str~~ | | `name` | Package name or model path. ~~str~~ |
| `vocab` <Tag variant="new">3</Tag> | Optional shared vocab to pass in on initialization. If `True` (default), a new `Vocab` object will be created. ~~Union[Vocab, bool]~~. | | `vocab` <Tag variant="new">3</Tag> | Optional shared vocab to pass in on initialization. If `True` (default), a new `Vocab` object will be created. ~~Union[Vocab, bool]~~. |
| `disable` | Names of pipeline components to disable. ~~Iterable[str]~~ | | `disable` | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). Disabled pipes will be loaded but they won't be run unless you explicitly enable them by calling [nlp.enable_pipe](/api/language#enable_pipe). ~~List[str]~~ |
| `exclude` <Tag variant="new">3</Tag> | Names of pipeline components to [exclude](/usage/processing-pipelines#disabling). Excluded components won't be loaded. ~~List[str]~~ |
| `config` <Tag variant="new">3</Tag> | Config overrides as nested dict or flat dict keyed by section values in dot notation, e.g. `"nlp.pipeline"`. ~~Union[Dict[str, Any], Config]~~ | | `config` <Tag variant="new">3</Tag> | Config overrides as nested dict or flat dict keyed by section values in dot notation, e.g. `"nlp.pipeline"`. ~~Union[Dict[str, Any], Config]~~ |
| **RETURNS** | `Language` class with the loaded model. ~~Language~~ | | **RETURNS** | `Language` class with the loaded model. ~~Language~~ |
@ -589,10 +599,11 @@ A helper function to use in the `load()` method of a model package's
> ``` > ```
| Name | Description | | Name | Description |
| ----------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- | | ------------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `init_file` | Path to model's `__init__.py`, i.e. `__file__`. ~~Union[str, Path]~~ | | `init_file` | Path to model's `__init__.py`, i.e. `__file__`. ~~Union[str, Path]~~ |
| `vocab` <Tag variant="new">3</Tag> | Optional shared vocab to pass in on initialization. If `True` (default), a new `Vocab` object will be created. ~~Union[Vocab, bool]~~. | | `vocab` <Tag variant="new">3</Tag> | Optional shared vocab to pass in on initialization. If `True` (default), a new `Vocab` object will be created. ~~Union[Vocab, bool]~~. |
| `disable` | Names of pipeline components to disable. ~~Iterable[str]~~ | | `disable` | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). Disabled pipes will be loaded but they won't be run unless you explicitly enable them by calling [nlp.enable_pipe](/api/language#enable_pipe). ~~List[str]~~ |
| `exclude` <Tag variant="new">3</Tag> | Names of pipeline components to [exclude](/usage/processing-pipelines#disabling). Excluded components won't be loaded. ~~List[str]~~ |
| `config` <Tag variant="new">3</Tag> | Config overrides as nested dict or flat dict keyed by section values in dot notation, e.g. `"nlp.pipeline"`. ~~Union[Dict[str, Any], Config]~~ | | `config` <Tag variant="new">3</Tag> | Config overrides as nested dict or flat dict keyed by section values in dot notation, e.g. `"nlp.pipeline"`. ~~Union[Dict[str, Any], Config]~~ |
| **RETURNS** | `Language` class with the loaded model. ~~Language~~ | | **RETURNS** | `Language` class with the loaded model. ~~Language~~ |

View File

@ -235,38 +235,54 @@ available pipeline components and component functions.
| `tok2vec` | [`Tok2Vec`](/api/tok2vec) | Assign token-to-vector embeddings. | | `tok2vec` | [`Tok2Vec`](/api/tok2vec) | Assign token-to-vector embeddings. |
| `transformer` | [`Transformer`](/api/transformer) | Assign the tokens and outputs of a transformer model. | | `transformer` | [`Transformer`](/api/transformer) | Assign the tokens and outputs of a transformer model. |
### Disabling and modifying pipeline components {#disabling} ### Disabling, excluding and modifying components {#disabling}
If you don't need a particular component of the pipeline – for example, the If you don't need a particular component of the pipeline – for example, the
tagger or the parser, you can **disable loading** it. This can sometimes make a tagger or the parser, you can **disable or exclude** it. This can sometimes make
big difference and improve loading speed. Disabled component names can be a big difference and improve loading and inference speed. There are two
provided to [`spacy.load`](/api/top-level#spacy.load), different mechanisms you can use:
[`Language.from_disk`](/api/language#from_disk) or the `nlp` object itself as a
list: 1. **Disable:** The component and its data will be loaded with the model, but it
will be disabled by default and not run as part of the processing pipeline.
To run it, you can explicitly enable it by calling
[`nlp.enable_pipe`](/api/language#enable_pipe). When you save out the `nlp`
object, the disabled component will be included but disabled by default.
2. **Exclude:** Don't load the component and its data with the model. Once the
model is loaded, there will be no reference to the excluded component.
Disabled and excluded component names can be provided to
[`spacy.load`](/api/top-level#spacy.load) as a list.
<!-- TODO: update with info on our models shipped with optional components -->
> #### 💡 Models with optional components
>
> The `disable` mechanism makes it easy to distribute models with optional
> components that you can enable or disable at runtime. For instance, your model
> may include a statistical _and_ a rule-based component for sentence
> segmentation, and you can choose which one to run depending on your use case.
```python ```python
### Disable loading # Load the model without the entity recognizer
nlp = spacy.load("en_core_web_sm", exclude=["ner"])
# Load the tagger and parser but don't enable them
nlp = spacy.load("en_core_web_sm", disable=["tagger", "parser"]) nlp = spacy.load("en_core_web_sm", disable=["tagger", "parser"])
# Explicitly enable the tagger later on
nlp.enable_pipe("tagger")
``` ```
In some cases, you do want to load all pipeline components and their weights, <Infobox variant="warning" title="Changed in v3.0">
because you need them at different points in your application. However, if you
only need a `Doc` object with named entities, there's no need to run all
pipeline components on it that can potentially make processing much slower.
Instead, you can use the `disable` keyword argument on
[`nlp.pipe`](/api/language#pipe) to temporarily disable the components **during
processing**:
```python As of v3.0, the `disable` keyword argument specifies components to load but
### Disable for processing disable, instead of components to not load at all. Those components can now be
for doc in nlp.pipe(texts, disable=["tagger", "parser"]): specified separately using the new `exclude` keyword argument.
# Do something with the doc here
```
If you need to **execute more code** with components disabled e.g. to reset </Infobox>
the weights or update only some components during training you can use the
[`nlp.select_pipes`](/api/language#select_pipes) context manager. At the end of As a shortcut, you can use the [`nlp.select_pipes`](/api/language#select_pipes)
the `with` block, the disabled pipeline components will be restored context manager to temporarily disable certain components for a given block. At
the end of the `with` block, the disabled pipeline components will be restored
automatically. Alternatively, `select_pipes` returns an object that lets you automatically. Alternatively, `select_pipes` returns an object that lets you
call its `restore()` method to restore the disabled components when needed. This call its `restore()` method to restore the disabled components when needed. This
can be useful if you want to prevent unnecessary code indentation of large can be useful if you want to prevent unnecessary code indentation of large
@ -295,6 +311,14 @@ with nlp.select_pipes(enable="parser"):
doc = nlp("I will only be parsed") doc = nlp("I will only be parsed")
``` ```
The [`nlp.pipe`](/api/language#pipe) method also supports a `disable` keyword
argument if you only want to disable components during processing:
```python
for doc in nlp.pipe(texts, disable=["tagger", "parser"]):
# Do something with the doc here
```
Finally, you can also use the [`remove_pipe`](/api/language#remove_pipe) method Finally, you can also use the [`remove_pipe`](/api/language#remove_pipe) method
to remove pipeline components from an existing pipeline, the to remove pipeline components from an existing pipeline, the
[`rename_pipe`](/api/language#rename_pipe) method to rename them, or the [`rename_pipe`](/api/language#rename_pipe) method to rename them, or the
@ -308,6 +332,31 @@ nlp.rename_pipe("ner", "entityrecognizer")
nlp.replace_pipe("tagger", my_custom_tagger) nlp.replace_pipe("tagger", my_custom_tagger)
``` ```
The `Language` object exposes different [attributes](/api/language#attributes)
that let you inspect all available components and the components that currently
run as part of the pipeline.
> #### Example
>
> ```python
> nlp = spacy.blank("en")
> nlp.add_pipe("ner")
> nlp.add_pipe("textcat")
> assert nlp.pipe_names == ["ner", "textcat"]
> nlp.disable_pipe("ner")
> assert nlp.pipe_names == ["textcat"]
> assert nlp.component_names == ["ner", "textcat"]
> assert nlp.disabled == ["ner"]
> ```
| Name | Description |
| --------------------- | ---------------------------------------------------------------- |
| `nlp.pipeline` | `(name, component)` tuples of the processing pipeline, in order. |
| `nlp.pipe_names` | Pipeline component names, in order. |
| `nlp.components` | All `(name, component)` tuples, including disabled components. |
| `nlp.component_names` | All component names, including disabled components. |
| `nlp.disabled` | Names of components that are currently disabled. |
### Sourcing pipeline components from existing models {#sourced-components new="3"} ### Sourcing pipeline components from existing models {#sourced-components new="3"}
Pipeline components that are independent can also be reused across models. Pipeline components that are independent can also be reused across models.

View File

@ -254,12 +254,15 @@ The following methods, attributes and commands are new in spaCy v3.0.
| [`Token.lex`](/api/token#attributes) | Access a token's [`Lexeme`](/api/lexeme). |
| [`Token.morph`](/api/token#attributes) [`Token.morph_`](/api/token#attributes) | Access a token's morphological analysis. |
| [`Language.select_pipes`](/api/language#select_pipes) | Context manager for enabling or disabling specific pipeline components for a block. |
| [`Language.disable_pipe`](/api/language#disable_pipe) [`Language.enable_pipe`](/api/language#enable_pipe) | Disable or enable a loaded pipeline component (but don't remove it). |
| [`Language.analyze_pipes`](/api/language#analyze_pipes) | [Analyze](/usage/processing-pipelines#analysis) components and their interdependencies. |
| [`Language.resume_training`](/api/language#resume_training) | Experimental: continue training a pretrained model and initialize "rehearsal" for components that implement a `rehearse` method to prevent catastrophic forgetting. |
| [`@Language.factory`](/api/language#factory) [`@Language.component`](/api/language#component) | Decorators for [registering](/usage/processing-pipelines#custom-components) pipeline component factories and simple stateless component functions. |
| [`Language.has_factory`](/api/language#has_factory) | Check whether a component factory is registered on a language class. |
| [`Language.get_factory_meta`](/api/language#get_factory_meta) [`Language.get_pipe_meta`](/api/language#get_pipe_meta) | Get the [`FactoryMeta`](/api/language#factorymeta) with component metadata for a factory or instance name. |
| [`Language.config`](/api/language#config) | The [config](/usage/training#config) used to create the current `nlp` object. It is an instance of [`Config`](https://thinc.ai/docs/api-config#config) and can be saved to disk and used for training. |
| [`Language.components`](/api/language#attributes) [`Language.component_names`](/api/language#attributes) | All available components and component names, including disabled components that are not run as part of the pipeline. |
| [`Language.disabled`](/api/language#attributes) | Names of disabled components that are not run as part of the pipeline. |
| [`Pipe.score`](/api/pipe#score) | Method on trainable pipeline components that returns a dictionary of evaluation scores. |
| [`registry`](/api/top-level#registry) | Function registry to map functions to string names that can be referenced in [configs](/usage/training#config). |
| [`util.load_meta`](/api/top-level#util.load_meta) [`util.load_config`](/api/top-level#util.load_config) | Updated helpers for loading a model's [`meta.json`](/api/data-formats#meta) and [`config.cfg`](/api/data-formats#config). |