Merge pull request #5993 from explosion/feature/disabled-components

This commit is contained in:
Ines Montani 2020-08-29 15:58:41 +02:00 committed by GitHub
commit 45f46a5c85
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
22 changed files with 667 additions and 224 deletions

View File

@ -27,18 +27,23 @@ if sys.maxunicode == 65535:
def load(
name: Union[str, Path],
disable: Iterable[str] = tuple(),
disable: Iterable[str] = util.SimpleFrozenList(),
exclude: Iterable[str] = util.SimpleFrozenList(),
config: Union[Dict[str, Any], Config] = util.SimpleFrozenDict(),
) -> Language:
"""Load a spaCy model from an installed package or a local path.
name (str): Package name or model path.
disable (Iterable[str]): Names of pipeline components to disable.
disable (Iterable[str]): Names of pipeline components to disable. Disabled
pipes will be loaded but they won't be run unless you explicitly
enable them by calling nlp.enable_pipe.
exclude (Iterable[str]): Names of pipeline components to exclude. Excluded
components won't be loaded.
config (Dict[str, Any] / Config): Config overrides as nested dict or dict
keyed by section values in dot notation.
RETURNS (Language): The loaded nlp object.
"""
return util.load_model(name, disable=disable, config=config)
return util.load_model(name, disable=disable, exclude=exclude, config=config)
def blank(name: str, **overrides) -> Language:

View File

@ -1,6 +1,6 @@
"""This module contains helpers and subcommands for integrating spaCy projects
with Data Version Control (DVC). https://dvc.org"""
from typing import Dict, Any, List, Optional
from typing import Dict, Any, List, Optional, Iterable
import subprocess
from pathlib import Path
from wasabi import msg
@ -8,6 +8,7 @@ from wasabi import msg
from .._util import PROJECT_FILE, load_project_config, get_hash, project_cli
from .._util import Arg, Opt, NAME, COMMAND
from ...util import working_dir, split_command, join_command, run_command
from ...util import SimpleFrozenList
DVC_CONFIG = "dvc.yaml"
@ -130,7 +131,7 @@ def update_dvc_config(
def run_dvc_commands(
commands: List[str] = tuple(), flags: Dict[str, bool] = {},
commands: Iterable[str] = SimpleFrozenList(), flags: Dict[str, bool] = {},
) -> None:
"""Run a sequence of DVC commands in a subprocess, in order.

View File

@ -1,10 +1,11 @@
from typing import Optional, List, Dict, Sequence, Any
from typing import Optional, List, Dict, Sequence, Any, Iterable
from pathlib import Path
from wasabi import msg
import sys
import srsly
from ...util import working_dir, run_command, split_command, is_cwd, join_command
from ...util import SimpleFrozenList
from .._util import PROJECT_FILE, PROJECT_LOCK, load_project_config, get_hash
from .._util import get_checksum, project_cli, Arg, Opt, COMMAND
@ -115,7 +116,9 @@ def print_run_help(project_dir: Path, subcommand: Optional[str] = None) -> None:
def run_commands(
commands: List[str] = tuple(), silent: bool = False, dry: bool = False,
commands: Iterable[str] = SimpleFrozenList(),
silent: bool = False,
dry: bool = False,
) -> None:
"""Run a sequence of commands in a subprocess, in order.

View File

@ -11,6 +11,7 @@ use_pytorch_for_gpu_memory = false
[nlp]
lang = null
pipeline = []
disabled = []
load_vocab_data = true
before_creation = null
after_creation = null

View File

@ -137,11 +137,10 @@ class Errors:
"after (component name or index), first (True) or last (True). "
"Invalid configuration: {args}. Existing components: {opts}")
E007 = ("'{name}' already exists in pipeline. Existing names: {opts}")
E008 = ("Some current components would be lost when restoring previous "
"pipeline state. If you added components after calling "
"`nlp.select_pipes()`, you should remove them explicitly with "
"`nlp.remove_pipe()` before the pipeline is restored. Names of "
"the new components: {names}")
E008 = ("Can't restore disabled pipeline component '{name}' because it "
"doesn't exist in the pipeline anymore. If you want to remove "
"components from the pipeline, you should do it before calling "
"`nlp.select_pipes()` or after restoring the disabled components.")
E010 = ("Word vectors set to length 0. This may be because you don't have "
"a model installed or loaded, or because your model doesn't "
"include word vectors. For more info, see the docs:\n"
@ -474,6 +473,13 @@ class Errors:
E199 = ("Unable to merge 0-length span at doc[{start}:{end}].")
# TODO: fix numbering after merging develop into master
E926 = ("It looks like you're trying to modify nlp.{attr} directly. This "
"doesn't work because it's an immutable computed property. If you "
"need to modify the pipeline, use the built-in methods like "
"nlp.add_pipe, nlp.remove_pipe, nlp.disable_pipe or nlp.enable_pipe "
"instead.")
E927 = ("Can't write to frozen list. Maybe you're trying to modify a computed "
"property or default function argument?")
E928 = ("A 'KnowledgeBase' should be written to / read from a file, but the "
"provided argument {loc} is an existing directory.")
E929 = ("A 'KnowledgeBase' could not be read from {loc} - the path does "

View File

@ -6,7 +6,7 @@ import itertools
import weakref
import functools
from contextlib import contextmanager
from copy import copy, deepcopy
from copy import deepcopy
from pathlib import Path
import warnings
from thinc.api import get_current_ops, Config, require_gpu, Optimizer
@ -20,7 +20,7 @@ from .vocab import Vocab, create_vocab
from .pipe_analysis import validate_attrs, analyze_pipes, print_pipe_analysis
from .gold import Example, validate_examples
from .scorer import Scorer
from .util import create_default_optimizer, registry
from .util import create_default_optimizer, registry, SimpleFrozenList
from .util import SimpleFrozenDict, combine_score_weights, CONFIG_SECTION_ORDER
from .lang.tokenizer_exceptions import URL_MATCH, BASE_EXCEPTIONS
from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
@ -159,7 +159,8 @@ class Language:
self.vocab: Vocab = vocab
if self.lang is None:
self.lang = self.vocab.lang
self.pipeline = []
self._components = []
self._disabled = set()
self.max_length = max_length
self.resolved = {}
# Create the default tokenizer from the default config
@ -206,10 +207,11 @@ class Language:
"keys": self.vocab.vectors.n_keys,
"name": self.vocab.vectors.name,
}
self._meta["labels"] = self.pipe_labels
self._meta["labels"] = dict(self.pipe_labels)
# TODO: Adding this back to prevent breaking people's code etc., but
# we should consider removing it
self._meta["pipeline"] = self.pipe_names
self._meta["pipeline"] = list(self.pipe_names)
self._meta["disabled"] = list(self.disabled)
return self._meta
@meta.setter
@ -232,13 +234,14 @@ class Language:
# we can populate the config again later
pipeline = {}
score_weights = []
for pipe_name in self.pipe_names:
for pipe_name in self.component_names:
pipe_meta = self.get_pipe_meta(pipe_name)
pipe_config = self.get_pipe_config(pipe_name)
pipeline[pipe_name] = {"factory": pipe_meta.factory, **pipe_config}
if pipe_meta.default_score_weights:
score_weights.append(pipe_meta.default_score_weights)
self._config["nlp"]["pipeline"] = self.pipe_names
self._config["nlp"]["pipeline"] = list(self.component_names)
self._config["nlp"]["disabled"] = list(self.disabled)
self._config["components"] = pipeline
self._config["training"]["score_weights"] = combine_score_weights(score_weights)
if not srsly.is_json_serializable(self._config):
@ -249,21 +252,64 @@ class Language:
def config(self, value: Config) -> None:
self._config = value
@property
def disabled(self) -> List[str]:
    """Names of the components that are currently disabled.

    The names are gathered by walking self._components rather than by
    iterating self._disabled directly, so the result follows the order of
    the components instead of the arbitrary order of the set.

    RETURNS (List[str]): The disabled component names, wrapped in a
        SimpleFrozenList built with the E926 message for "disabled".
    """
    ordered_names = []
    for component_name, _ in self._components:
        if component_name in self._disabled:
            ordered_names.append(component_name)
    frozen_message = Errors.E926.format(attr="disabled")
    return SimpleFrozenList(ordered_names, error=frozen_message)
@property
def factory_names(self) -> List[str]:
"""Get names of all available factories.
RETURNS (List[str]): The factory names.
"""
return list(self.factories.keys())
names = list(self.factories.keys())
return SimpleFrozenList(names)
@property
def pipe_names(self) -> List[str]:
"""Get names of available pipeline components.
def components(self) -> List[Tuple[str, Callable[[Doc], Doc]]]:
"""Get all (name, component) tuples in the pipeline, including the
currently disabled components.
"""
return SimpleFrozenList(
self._components, error=Errors.E926.format(attr="components")
)
@property
def component_names(self) -> List[str]:
"""Get the names of the available pipeline components. Includes all
active and inactive pipeline components.
RETURNS (List[str]): List of component name strings, in order.
"""
return [pipe_name for pipe_name, _ in self.pipeline]
names = [pipe_name for pipe_name, _ in self._components]
return SimpleFrozenList(names, error=Errors.E926.format(attr="component_names"))
@property
def pipeline(self) -> List[Tuple[str, Callable[[Doc], Doc]]]:
    """The (name, component) tuples of all components that are not disabled.

    Every entry of self._components whose name is not in self._disabled is
    kept, in the same order as in self._components.

    RETURNS (List[Tuple[str, Callable[[Doc], Doc]]]): The pipeline, wrapped
        in a SimpleFrozenList built with the E926 message for "pipeline".
    """
    active = []
    for pipe_name, component in self._components:
        if pipe_name in self._disabled:
            # Disabled components stay in self._components but are left out
            # of this view
            continue
        active.append((pipe_name, component))
    frozen_message = Errors.E926.format(attr="pipeline")
    return SimpleFrozenList(active, error=frozen_message)
@property
def pipe_names(self) -> List[str]:
    """Names of the components returned by self.pipeline.

    RETURNS (List[str]): The component name strings, in the order they
        appear in self.pipeline, wrapped in a SimpleFrozenList built with
        the E926 message for "pipe_names".
    """
    active_names = []
    for active_name, _ in self.pipeline:
        active_names.append(active_name)
    frozen_message = Errors.E926.format(attr="pipe_names")
    return SimpleFrozenList(active_names, error=frozen_message)
@property
def pipe_factories(self) -> Dict[str, str]:
@ -272,9 +318,9 @@ class Language:
RETURNS (Dict[str, str]): Factory names, keyed by component names.
"""
factories = {}
for pipe_name, pipe in self.pipeline:
for pipe_name, pipe in self._components:
factories[pipe_name] = self.get_pipe_meta(pipe_name).factory
return factories
return SimpleFrozenDict(factories)
@property
def pipe_labels(self) -> Dict[str, List[str]]:
@ -284,10 +330,10 @@ class Language:
RETURNS (Dict[str, List[str]]): Labels keyed by component name.
"""
labels = {}
for name, pipe in self.pipeline:
for name, pipe in self._components:
if hasattr(pipe, "labels"):
labels[name] = list(pipe.labels)
return labels
return SimpleFrozenDict(labels)
@classmethod
def has_factory(cls, name: str) -> bool:
@ -358,10 +404,10 @@ class Language:
name: str,
*,
default_config: Dict[str, Any] = SimpleFrozenDict(),
assigns: Iterable[str] = tuple(),
requires: Iterable[str] = tuple(),
assigns: Iterable[str] = SimpleFrozenList(),
requires: Iterable[str] = SimpleFrozenList(),
retokenizes: bool = False,
scores: Iterable[str] = tuple(),
scores: Iterable[str] = SimpleFrozenList(),
default_score_weights: Dict[str, float] = SimpleFrozenDict(),
func: Optional[Callable] = None,
) -> Callable:
@ -447,8 +493,8 @@ class Language:
cls,
name: Optional[str] = None,
*,
assigns: Iterable[str] = tuple(),
requires: Iterable[str] = tuple(),
assigns: Iterable[str] = SimpleFrozenList(),
requires: Iterable[str] = SimpleFrozenList(),
retokenizes: bool = False,
func: Optional[Callable[[Doc], Doc]] = None,
) -> Callable:
@ -535,10 +581,10 @@ class Language:
DOCS: https://spacy.io/api/language#get_pipe
"""
for pipe_name, component in self.pipeline:
for pipe_name, component in self._components:
if pipe_name == name:
return component
raise KeyError(Errors.E001.format(name=name, opts=self.pipe_names))
raise KeyError(Errors.E001.format(name=name, opts=self.component_names))
def create_pipe(
self,
@ -683,8 +729,8 @@ class Language:
err = Errors.E966.format(component=bad_val, name=name)
raise ValueError(err)
name = name if name is not None else factory_name
if name in self.pipe_names:
raise ValueError(Errors.E007.format(name=name, opts=self.pipe_names))
if name in self.component_names:
raise ValueError(Errors.E007.format(name=name, opts=self.component_names))
if source is not None:
# We're loading the component from a model. After loading the
# component, we know its real factory name
@ -709,7 +755,7 @@ class Language:
)
pipe_index = self._get_pipe_index(before, after, first, last)
self._pipe_meta[name] = self.get_factory_meta(factory_name)
self.pipeline.insert(pipe_index, (name, pipe_component))
self._components.insert(pipe_index, (name, pipe_component))
return pipe_component
def _get_pipe_index(
@ -730,32 +776,42 @@ class Language:
"""
all_args = {"before": before, "after": after, "first": first, "last": last}
if sum(arg is not None for arg in [before, after, first, last]) >= 2:
raise ValueError(Errors.E006.format(args=all_args, opts=self.pipe_names))
raise ValueError(
Errors.E006.format(args=all_args, opts=self.component_names)
)
if last or not any(value is not None for value in [first, before, after]):
return len(self.pipeline)
return len(self._components)
elif first:
return 0
elif isinstance(before, str):
if before not in self.pipe_names:
raise ValueError(Errors.E001.format(name=before, opts=self.pipe_names))
return self.pipe_names.index(before)
if before not in self.component_names:
raise ValueError(
Errors.E001.format(name=before, opts=self.component_names)
)
return self.component_names.index(before)
elif isinstance(after, str):
if after not in self.pipe_names:
raise ValueError(Errors.E001.format(name=after, opts=self.pipe_names))
return self.pipe_names.index(after) + 1
if after not in self.component_names:
raise ValueError(
Errors.E001.format(name=after, opts=self.component_names)
)
return self.component_names.index(after) + 1
# We're only accepting indices referring to components that exist
# (can't just do isinstance here because bools are instance of int, too)
elif type(before) == int:
if before >= len(self.pipeline) or before < 0:
err = Errors.E959.format(dir="before", idx=before, opts=self.pipe_names)
if before >= len(self._components) or before < 0:
err = Errors.E959.format(
dir="before", idx=before, opts=self.component_names
)
raise ValueError(err)
return before
elif type(after) == int:
if after >= len(self.pipeline) or after < 0:
err = Errors.E959.format(dir="after", idx=after, opts=self.pipe_names)
if after >= len(self._components) or after < 0:
err = Errors.E959.format(
dir="after", idx=after, opts=self.component_names
)
raise ValueError(err)
return after + 1
raise ValueError(Errors.E006.format(args=all_args, opts=self.pipe_names))
raise ValueError(Errors.E006.format(args=all_args, opts=self.component_names))
def has_pipe(self, name: str) -> bool:
"""Check if a component name is present in the pipeline. Equivalent to
@ -796,7 +852,7 @@ class Language:
# to Language.pipeline to make sure the configs are handled correctly
pipe_index = self.pipe_names.index(name)
self.remove_pipe(name)
if not len(self.pipeline) or pipe_index == len(self.pipeline):
if not len(self._components) or pipe_index == len(self._components):
# we have no components to insert before/after, or we're replacing the last component
self.add_pipe(factory_name, name=name, config=config, validate=validate)
else:
@ -816,12 +872,16 @@ class Language:
DOCS: https://spacy.io/api/language#rename_pipe
"""
if old_name not in self.pipe_names:
raise ValueError(Errors.E001.format(name=old_name, opts=self.pipe_names))
if new_name in self.pipe_names:
raise ValueError(Errors.E007.format(name=new_name, opts=self.pipe_names))
i = self.pipe_names.index(old_name)
self.pipeline[i] = (new_name, self.pipeline[i][1])
if old_name not in self.component_names:
raise ValueError(
Errors.E001.format(name=old_name, opts=self.component_names)
)
if new_name in self.component_names:
raise ValueError(
Errors.E007.format(name=new_name, opts=self.component_names)
)
i = self.component_names.index(old_name)
self._components[i] = (new_name, self._components[i][1])
self._pipe_meta[new_name] = self._pipe_meta.pop(old_name)
self._pipe_configs[new_name] = self._pipe_configs.pop(old_name)
@ -833,20 +893,45 @@ class Language:
DOCS: https://spacy.io/api/language#remove_pipe
"""
if name not in self.pipe_names:
raise ValueError(Errors.E001.format(name=name, opts=self.pipe_names))
removed = self.pipeline.pop(self.pipe_names.index(name))
if name not in self.component_names:
raise ValueError(Errors.E001.format(name=name, opts=self.component_names))
removed = self._components.pop(self.component_names.index(name))
# We're only removing the component itself from the metas/configs here
# because factory may be used for something else
self._pipe_meta.pop(name)
self._pipe_configs.pop(name)
# Make sure the name is also removed from the set of disabled components
if name in self.disabled:
self._disabled.remove(name)
return removed
def disable_pipe(self, name: str) -> None:
    """Mark a pipeline component as disabled.

    The name is added to the self._disabled set; the component itself is not
    removed from the nlp object. Adding a name that is already in the set
    changes nothing.

    name (str): The name of the component to disable.
    RAISES (ValueError): If the name is not in self.component_names (E001).
    """
    known_names = self.component_names
    if name in known_names:
        self._disabled.add(name)
        return
    raise ValueError(Errors.E001.format(name=name, opts=known_names))
def enable_pipe(self, name: str) -> None:
    """Re-enable a pipeline component that was previously disabled.

    The name is dropped from the self._disabled set. If the component is not
    currently disabled, nothing changes.

    name (str): The name of the component to enable.
    RAISES (ValueError): If the name is not in self.component_names (E001).
    """
    known_names = self.component_names
    if name not in known_names:
        raise ValueError(Errors.E001.format(name=name, opts=known_names))
    # discard() is a no-op for names that are not in the set, which covers
    # the "already enabled" case without a separate membership check
    self._disabled.discard(name)
def __call__(
self,
text: str,
*,
disable: Iterable[str] = tuple(),
disable: Iterable[str] = SimpleFrozenList(),
component_cfg: Optional[Dict[str, Dict[str, Any]]] = None,
) -> Doc:
"""Apply the pipeline to some text. The text can span multiple sentences,
@ -892,7 +977,7 @@ class Language:
warnings.warn(Warnings.W096, DeprecationWarning)
if len(names) == 1 and isinstance(names[0], (list, tuple)):
names = names[0] # support list of names instead of spread
return DisabledPipes(self, names)
return self.select_pipes(disable=names)
def select_pipes(
self,
@ -945,7 +1030,7 @@ class Language:
sgd: Optional[Optimizer] = None,
losses: Optional[Dict[str, float]] = None,
component_cfg: Optional[Dict[str, Dict[str, Any]]] = None,
exclude: Iterable[str] = tuple(),
exclude: Iterable[str] = SimpleFrozenList(),
):
"""Update the models in the pipeline.
@ -999,7 +1084,7 @@ class Language:
sgd: Optional[Optimizer] = None,
losses: Optional[Dict[str, float]] = None,
component_cfg: Optional[Dict[str, Dict[str, Any]]] = None,
exclude: Iterable[str] = tuple(),
exclude: Iterable[str] = SimpleFrozenList(),
) -> Dict[str, float]:
"""Make a "rehearsal" update to the models in the pipeline, to prevent
forgetting. Rehearsal updates run an initial copy of the model over some
@ -1228,7 +1313,7 @@ class Language:
*,
as_tuples: bool = False,
batch_size: int = 1000,
disable: Iterable[str] = tuple(),
disable: Iterable[str] = SimpleFrozenList(),
cleanup: bool = False,
component_cfg: Optional[Dict[str, Dict[str, Any]]] = None,
n_process: int = 1,
@ -1388,7 +1473,8 @@ class Language:
config: Union[Dict[str, Any], Config] = {},
*,
vocab: Union[Vocab, bool] = True,
disable: Iterable[str] = tuple(),
disable: Iterable[str] = SimpleFrozenList(),
exclude: Iterable[str] = SimpleFrozenList(),
auto_fill: bool = True,
validate: bool = True,
) -> "Language":
@ -1398,7 +1484,11 @@ class Language:
config (Dict[str, Any] / Config): The loaded config.
vocab (Vocab): A Vocab object. If True, a vocab is created.
disable (Iterable[str]): List of pipeline component names to disable.
disable (Iterable[str]): Names of pipeline components to disable.
Disabled pipes will be loaded but they won't be run unless you
explicitly enable them by calling nlp.enable_pipe.
exclude (Iterable[str]): Names of pipeline components to exclude.
Excluded components won't be loaded.
auto_fill (bool): Automatically fill in missing values in config based
on defaults and function argument annotations.
validate (bool): Validate the component config and arguments against
@ -1471,7 +1561,7 @@ class Language:
raise ValueError(Errors.E956.format(name=pipe_name, opts=opts))
pipe_cfg = util.copy_config(pipeline[pipe_name])
raw_config = Config(filled["components"][pipe_name])
if pipe_name not in disable:
if pipe_name not in exclude:
if "factory" not in pipe_cfg and "source" not in pipe_cfg:
err = Errors.E984.format(name=pipe_name, config=pipe_cfg)
raise ValueError(err)
@ -1496,6 +1586,8 @@ class Language:
)
source_name = pipe_cfg.get("component", pipe_name)
nlp.add_pipe(source_name, source=source_nlps[model], name=pipe_name)
disabled_pipes = [*config["nlp"]["disabled"], *disable]
nlp._disabled = set(p for p in disabled_pipes if p not in exclude)
nlp.config = filled if auto_fill else config
nlp.resolved = resolved
if after_pipeline_creation is not None:
@ -1507,7 +1599,7 @@ class Language:
return nlp
def to_disk(
self, path: Union[str, Path], *, exclude: Iterable[str] = tuple()
self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
) -> None:
"""Save the current state to a directory. If a model is loaded, this
will include the model.
@ -1525,9 +1617,7 @@ class Language:
)
serializers["meta.json"] = lambda p: srsly.write_json(p, self.meta)
serializers["config.cfg"] = lambda p: self.config.to_disk(p)
for name, proc in self.pipeline:
if not hasattr(proc, "name"):
continue
for name, proc in self._components:
if name in exclude:
continue
if not hasattr(proc, "to_disk"):
@ -1537,7 +1627,7 @@ class Language:
util.to_disk(path, serializers, exclude)
def from_disk(
self, path: Union[str, Path], *, exclude: Iterable[str] = tuple()
self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
) -> "Language":
"""Loads state from a directory. Modifies the object in place and
returns it. If the saved `Language` object contains a model, the
@ -1573,7 +1663,7 @@ class Language:
deserializers["tokenizer"] = lambda p: self.tokenizer.from_disk(
p, exclude=["vocab"]
)
for name, proc in self.pipeline:
for name, proc in self._components:
if name in exclude:
continue
if not hasattr(proc, "from_disk"):
@ -1589,7 +1679,7 @@ class Language:
self._link_components()
return self
def to_bytes(self, *, exclude: Iterable[str] = tuple()) -> bytes:
def to_bytes(self, *, exclude: Iterable[str] = SimpleFrozenList()) -> bytes:
"""Serialize the current state to a binary string.
exclude (list): Names of components or serialization fields to exclude.
@ -1602,7 +1692,7 @@ class Language:
serializers["tokenizer"] = lambda: self.tokenizer.to_bytes(exclude=["vocab"])
serializers["meta.json"] = lambda: srsly.json_dumps(self.meta)
serializers["config.cfg"] = lambda: self.config.to_bytes()
for name, proc in self.pipeline:
for name, proc in self._components:
if name in exclude:
continue
if not hasattr(proc, "to_bytes"):
@ -1611,7 +1701,7 @@ class Language:
return util.to_bytes(serializers, exclude)
def from_bytes(
self, bytes_data: bytes, *, exclude: Iterable[str] = tuple()
self, bytes_data: bytes, *, exclude: Iterable[str] = SimpleFrozenList()
) -> "Language":
"""Load state from a binary string.
@ -1638,7 +1728,7 @@ class Language:
deserializers["tokenizer"] = lambda b: self.tokenizer.from_bytes(
b, exclude=["vocab"]
)
for name, proc in self.pipeline:
for name, proc in self._components:
if name in exclude:
continue
if not hasattr(proc, "from_bytes"):
@ -1674,14 +1764,10 @@ class DisabledPipes(list):
def __init__(self, nlp: Language, names: List[str]) -> None:
self.nlp = nlp
self.names = names
# Important! Not deep copy -- we just want the container (but we also
# want to support people providing arbitrarily typed nlp.pipeline
# objects.)
self.original_pipeline = copy(nlp.pipeline)
self.metas = {name: nlp.get_pipe_meta(name) for name in names}
self.configs = {name: nlp.get_pipe_config(name) for name in names}
for name in self.names:
self.nlp.disable_pipe(name)
list.__init__(self)
self.extend(nlp.remove_pipe(name) for name in names)
self.extend(self.names)
def __enter__(self):
return self
@ -1691,14 +1777,10 @@ class DisabledPipes(list):
def restore(self) -> None:
"""Restore the pipeline to its state when DisabledPipes was created."""
current, self.nlp.pipeline = self.nlp.pipeline, self.original_pipeline
unexpected = [name for name, pipe in current if not self.nlp.has_pipe(name)]
if unexpected:
# Don't change the pipeline if we're raising an error.
self.nlp.pipeline = current
raise ValueError(Errors.E008.format(names=unexpected))
self.nlp._pipe_meta.update(self.metas)
self.nlp._pipe_configs.update(self.configs)
for name in self.names:
if name not in self.nlp.component_names:
raise ValueError(Errors.E008.format(name=name))
self.nlp.enable_pipe(name)
self[:] = []

View File

@ -12,6 +12,7 @@ from ..symbols import IDS, TAG, POS, MORPH, LEMMA
from ..tokens import Doc, Span
from ..tokens._retokenize import normalize_token_attrs, set_token_attrs
from ..vocab import Vocab
from ..util import SimpleFrozenList
from .. import util
@ -220,7 +221,7 @@ class AttributeRuler(Pipe):
results.update(Scorer.score_token_attr(examples, "lemma", **kwargs))
return results
def to_bytes(self, exclude: Iterable[str] = tuple()) -> bytes:
def to_bytes(self, exclude: Iterable[str] = SimpleFrozenList()) -> bytes:
"""Serialize the AttributeRuler to a bytestring.
exclude (Iterable[str]): String names of serialization fields to exclude.
@ -233,7 +234,9 @@ class AttributeRuler(Pipe):
serialize["patterns"] = lambda: srsly.msgpack_dumps(self.patterns)
return util.to_bytes(serialize, exclude)
def from_bytes(self, bytes_data: bytes, exclude: Iterable[str] = tuple()):
def from_bytes(
self, bytes_data: bytes, exclude: Iterable[str] = SimpleFrozenList()
):
"""Load the AttributeRuler from a bytestring.
bytes_data (bytes): The data to load.
@ -254,7 +257,9 @@ class AttributeRuler(Pipe):
return self
def to_disk(self, path: Union[Path, str], exclude: Iterable[str] = tuple()) -> None:
def to_disk(
self, path: Union[Path, str], exclude: Iterable[str] = SimpleFrozenList()
) -> None:
"""Serialize the AttributeRuler to disk.
path (Union[Path, str]): A path to a directory.
@ -268,7 +273,7 @@ class AttributeRuler(Pipe):
util.to_disk(path, serialize, exclude)
def from_disk(
self, path: Union[Path, str], exclude: Iterable[str] = tuple()
self, path: Union[Path, str], exclude: Iterable[str] = SimpleFrozenList()
) -> None:
"""Load the AttributeRuler from disk.

View File

@ -13,6 +13,7 @@ from ..language import Language
from ..vocab import Vocab
from ..gold import Example, validate_examples
from ..errors import Errors, Warnings
from ..util import SimpleFrozenList
from .. import util
@ -404,7 +405,7 @@ class EntityLinker(Pipe):
token.ent_kb_id_ = kb_id
def to_disk(
self, path: Union[str, Path], *, exclude: Iterable[str] = tuple()
self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList(),
) -> None:
"""Serialize the pipe to disk.
@ -421,7 +422,7 @@ class EntityLinker(Pipe):
util.to_disk(path, serialize, exclude)
def from_disk(
self, path: Union[str, Path], *, exclude: Iterable[str] = tuple()
self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList(),
) -> "EntityLinker":
"""Load the pipe from disk. Modifies the object in place and returns it.

View File

@ -5,7 +5,7 @@ import srsly
from ..language import Language
from ..errors import Errors
from ..util import ensure_path, to_disk, from_disk
from ..util import ensure_path, to_disk, from_disk, SimpleFrozenList
from ..tokens import Doc, Span
from ..matcher import Matcher, PhraseMatcher
from ..scorer import Scorer
@ -317,7 +317,7 @@ class EntityRuler:
return Scorer.score_spans(examples, "ents", **kwargs)
def from_bytes(
self, patterns_bytes: bytes, *, exclude: Iterable[str] = tuple()
self, patterns_bytes: bytes, *, exclude: Iterable[str] = SimpleFrozenList()
) -> "EntityRuler":
"""Load the entity ruler from a bytestring.
@ -341,7 +341,7 @@ class EntityRuler:
self.add_patterns(cfg)
return self
def to_bytes(self, *, exclude: Iterable[str] = tuple()) -> bytes:
def to_bytes(self, *, exclude: Iterable[str] = SimpleFrozenList()) -> bytes:
"""Serialize the entity ruler patterns to a bytestring.
RETURNS (bytes): The serialized patterns.
@ -357,7 +357,7 @@ class EntityRuler:
return srsly.msgpack_dumps(serial)
def from_disk(
self, path: Union[str, Path], *, exclude: Iterable[str] = tuple()
self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
) -> "EntityRuler":
"""Load the entity ruler from a file. Expects a file containing
newline-delimited JSON (JSONL) with one entry per line.
@ -394,7 +394,7 @@ class EntityRuler:
return self
def to_disk(
self, path: Union[str, Path], *, exclude: Iterable[str] = tuple()
self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
) -> None:
"""Save the entity ruler patterns to a directory. The patterns will be
saved as newline-delimited JSON (JSONL).

View File

@ -223,6 +223,7 @@ class ConfigSchemaNlp(BaseModel):
# fmt: off
lang: StrictStr = Field(..., title="The base language to use")
pipeline: List[StrictStr] = Field(..., title="The pipeline component names in order")
disabled: List[StrictStr] = Field(..., title="Pipeline components to disable by default")
tokenizer: Callable = Field(..., title="The tokenizer to use")
load_vocab_data: StrictBool = Field(..., title="Whether to load additional vocab data from spacy-lookups-data")
before_creation: Optional[Callable[[Type["Language"]], Type["Language"]]] = Field(..., title="Optional callback to modify Language class before initialization")

View File

@ -1,10 +1,10 @@
from typing import Optional, Iterable, Dict, Any, Callable, Tuple, TYPE_CHECKING
from typing import Optional, Iterable, Dict, Any, Callable, TYPE_CHECKING
import numpy as np
from .gold import Example
from .tokens import Token, Doc, Span
from .errors import Errors
from .util import get_lang_class
from .util import get_lang_class, SimpleFrozenList
from .morphology import Morphology
if TYPE_CHECKING:
@ -317,7 +317,7 @@ class Scorer:
attr: str,
*,
getter: Callable[[Doc, str], Any] = getattr,
labels: Iterable[str] = tuple(),
labels: Iterable[str] = SimpleFrozenList(),
multi_label: bool = True,
positive_label: Optional[str] = None,
threshold: Optional[float] = None,
@ -447,7 +447,7 @@ class Scorer:
getter: Callable[[Token, str], Any] = getattr,
head_attr: str = "head",
head_getter: Callable[[Token, str], Token] = getattr,
ignore_labels: Tuple[str] = tuple(),
ignore_labels: Iterable[str] = SimpleFrozenList(),
**cfg,
) -> Dict[str, Any]:
"""Returns the UAS, LAS, and LAS per type scores for dependency

View File

@ -1,5 +1,6 @@
import pytest
from spacy.language import Language
from spacy.util import SimpleFrozenList
@pytest.fixture
@ -181,6 +182,11 @@ def test_select_pipes_errors(nlp):
with pytest.raises(ValueError):
nlp.select_pipes(enable=[], disable=["c3"])
disabled = nlp.select_pipes(disable=["c2"])
nlp.remove_pipe("c2")
with pytest.raises(ValueError):
disabled.restore()
@pytest.mark.parametrize("n_pipes", [100])
def test_add_lots_of_pipes(nlp, n_pipes):
@ -249,3 +255,94 @@ def test_add_pipe_before_after():
nlp.add_pipe("entity_ruler", before=True)
with pytest.raises(ValueError):
nlp.add_pipe("entity_ruler", first=False)
def test_disable_enable_pipes():
# Walks two components through disable_pipe / enable_pipe / remove_pipe /
# rename_pipe and checks pipeline, pipe_names, components, component_names,
# disabled and config["nlp"]["disabled"] after each step.
name = "test_disable_enable_pipes"
# Maps a component's registration name to the text of the last doc it saw
results = {}
def make_component(name):
# Start with an empty marker so "never ran" can be told apart from "ran"
results[name] = ""
def component(doc):
nonlocal results
results[name] = doc.text
return doc
return component
c1 = Language.component(f"{name}1", func=make_component(f"{name}1"))
c2 = Language.component(f"{name}2", func=make_component(f"{name}2"))
nlp = Language()
nlp.add_pipe(f"{name}1")
nlp.add_pipe(f"{name}2")
# Adding the components must not run them
assert results[f"{name}1"] == ""
assert results[f"{name}2"] == ""
assert nlp.pipeline == [(f"{name}1", c1), (f"{name}2", c2)]
assert nlp.pipe_names == [f"{name}1", f"{name}2"]
# Disabling keeps the component in component_names but drops it from pipe_names
nlp.disable_pipe(f"{name}1")
assert nlp.disabled == [f"{name}1"]
assert nlp.component_names == [f"{name}1", f"{name}2"]
assert nlp.pipe_names == [f"{name}2"]
assert nlp.config["nlp"]["disabled"] == [f"{name}1"]
nlp("hello")
assert results[f"{name}1"] == "" # didn't run
assert results[f"{name}2"] == "hello" # ran
# Re-enabling puts the component back into the active pipeline
nlp.enable_pipe(f"{name}1")
assert nlp.disabled == []
assert nlp.pipe_names == [f"{name}1", f"{name}2"]
assert nlp.config["nlp"]["disabled"] == []
nlp("world")
assert results[f"{name}1"] == "world"
assert results[f"{name}2"] == "world"
# Removing a disabled component must also clear it from the disabled names
nlp.disable_pipe(f"{name}2")
nlp.remove_pipe(f"{name}2")
assert nlp.components == [(f"{name}1", c1)]
assert nlp.pipeline == [(f"{name}1", c1)]
assert nlp.component_names == [f"{name}1"]
assert nlp.pipe_names == [f"{name}1"]
assert nlp.disabled == []
assert nlp.config["nlp"]["disabled"] == []
# After the rename the component function still writes to its original
# results key, because the closure captured the name it was created with
nlp.rename_pipe(f"{name}1", name)
assert nlp.components == [(name, c1)]
assert nlp.component_names == [name]
nlp("!")
assert results[f"{name}1"] == "!"
assert results[f"{name}2"] == "world"
# The second component was removed above, so it can no longer be disabled
with pytest.raises(ValueError):
nlp.disable_pipe(f"{name}2")
nlp.disable_pipe(name)
assert nlp.component_names == [name]
assert nlp.pipe_names == []
assert nlp.config["nlp"]["disabled"] == [name]
# With the only component disabled, calling nlp leaves results unchanged
nlp("?")
assert results[f"{name}1"] == "!"
def test_pipe_methods_frozen():
    """Check that the list-valued pipeline properties are "frozen": they are
    still real lists (for backwards compatibility), but trying to mutate them
    in place, e.g. appending to nlp.pipeline, raises an error."""
    nlp = Language()
    ner_pipe = nlp.add_pipe("ner")
    assert nlp.pipe_names == ["ner"]
    frozen_props = (
        nlp.pipeline,
        nlp.pipe_names,
        nlp.components,
        nlp.component_names,
        nlp.disabled,
        nlp.factory_names,
    )
    for frozen in frozen_props:
        assert isinstance(frozen, list)
        assert isinstance(frozen, SimpleFrozenList)
    # Each callable attempts one in-place mutation on a frozen property.
    mutations = (
        lambda: nlp.pipeline.append(("ner2", ner_pipe)),
        lambda: nlp.pipe_names.pop(),
        lambda: nlp.components.sort(),
        lambda: nlp.component_names.clear(),
    )
    for mutate in mutations:
        with pytest.raises(NotImplementedError):
            mutate()

View File

@ -161,6 +161,7 @@ def test_issue4674():
assert kb2.get_size_entities() == 1
@pytest.mark.skip(reason="API change: disable just disables, new exclude arg")
def test_issue4707():
"""Tests that disabled component names are also excluded from nlp.from_disk
by default when loading a model.

View File

@ -6,6 +6,8 @@ from spacy.pipeline.dep_parser import DEFAULT_PARSER_MODEL
from spacy.pipeline.tagger import DEFAULT_TAGGER_MODEL
from spacy.pipeline.textcat import DEFAULT_TEXTCAT_MODEL
from spacy.pipeline.senter import DEFAULT_SENTER_MODEL
from spacy.lang.en import English
import spacy
from ..util import make_tempdir
@ -173,3 +175,34 @@ def test_serialize_sentencerecognizer(en_vocab):
sr_b = sr.to_bytes()
sr_d = SentenceRecognizer(en_vocab, model).from_bytes(sr_b)
assert sr.to_bytes() == sr_d.to_bytes()
def test_serialize_pipeline_disable_enable():
    """Check that the disabled state of components survives config and disk
    round-trips, and that spacy.load distinguishes between disable (loaded
    but not run) and exclude (not loaded at all)."""
    nlp = English()
    for pipe_name in ("ner", "tagger"):
        nlp.add_pipe(pipe_name)
    nlp.disable_pipe("tagger")
    assert nlp.config["nlp"]["disabled"] == ["tagger"]

    # Round-trip via the config only.
    from_config = English.from_config(nlp.config.copy())
    assert from_config.pipe_names == ["ner"]
    assert from_config.component_names == ["ner", "tagger"]
    assert from_config.disabled == ["tagger"]
    assert from_config.config["nlp"]["disabled"] == ["tagger"]

    # Round-trip via disk: the disabled component stays disabled.
    with make_tempdir() as tmp_dir:
        from_config.to_disk(tmp_dir)
        from_disk = spacy.load(tmp_dir)
    assert from_disk.pipe_names == ["ner"]
    assert from_disk.component_names == ["ner", "tagger"]

    # Disabling on load adds to the components already disabled in the config.
    with make_tempdir() as tmp_dir:
        from_disk.to_disk(tmp_dir)
        extra_disabled = spacy.load(tmp_dir, disable=["ner"])
    assert extra_disabled.pipe_names == []
    assert extra_disabled.component_names == ["ner", "tagger"]
    assert extra_disabled.disabled == ["ner", "tagger"]

    # Excluding on load drops the component entirely.
    with make_tempdir() as tmp_dir:
        nlp.to_disk(tmp_dir)
        excluded = spacy.load(tmp_dir, exclude=["tagger"])
    assert excluded.pipe_names == ["ner"]
    assert excluded.component_names == ["ner"]
    assert excluded.disabled == []

View File

@ -3,10 +3,9 @@ import pytest
from .util import get_random_doc
from spacy import util
from spacy.util import dot_to_object
from spacy.util import dot_to_object, SimpleFrozenList
from thinc.api import Config, Optimizer
from spacy.gold.batchers import minibatch_by_words
from ..lang.en import English
from ..lang.nl import Dutch
from ..language import DEFAULT_CONFIG_PATH
@ -106,3 +105,20 @@ def test_util_dot_section():
assert not dot_to_object(en_config, "nlp.load_vocab_data")
assert dot_to_object(nl_config, "nlp.load_vocab_data")
assert isinstance(dot_to_object(nl_config, "training.optimizer"), Optimizer)
def test_simple_frozen_list():
    """Check that SimpleFrozenList behaves like a regular list for read-only
    access, that every in-place mutator it overrides raises
    NotImplementedError and leaves the data untouched, and that a custom
    error message is the one that's actually raised."""
    t = SimpleFrozenList(["foo", "bar"])
    assert t == ["foo", "bar"]
    assert t.index("bar") == 1  # okay method
    # One attempt per overridden mutator, so none of them is left untested.
    mutations = (
        lambda: t.append("baz"),
        lambda: t.sort(),
        lambda: t.extend(["baz"]),
        lambda: t.pop(),
        lambda: t.clear(),
        lambda: t.insert(0, "baz"),
        lambda: t.remove("foo"),
        lambda: t.reverse(),
    )
    for mutate in mutations:
        with pytest.raises(NotImplementedError):
            mutate()
        # A rejected mutation must not have changed the list.
        assert t == ["foo", "bar"]
    t = SimpleFrozenList(["foo", "bar"], error="Error!")
    # The custom message should be what the user sees, not the default one.
    with pytest.raises(NotImplementedError, match="Error!"):
        t.append("baz")

View File

@ -10,7 +10,7 @@ from ..vocab import Vocab
from ..compat import copy_reg
from ..attrs import SPACY, ORTH, intify_attr
from ..errors import Errors
from ..util import ensure_path
from ..util import ensure_path, SimpleFrozenList
# fmt: off
ALL_ATTRS = ("ORTH", "TAG", "HEAD", "DEP", "ENT_IOB", "ENT_TYPE", "ENT_KB_ID", "LEMMA", "MORPH", "POS")
@ -52,7 +52,7 @@ class DocBin:
self,
attrs: Iterable[str] = ALL_ATTRS,
store_user_data: bool = False,
docs: Iterable[Doc] = tuple(),
docs: Iterable[Doc] = SimpleFrozenList(),
) -> None:
"""Create a DocBin object to hold serialized annotations.

View File

@ -120,6 +120,47 @@ class SimpleFrozenDict(dict):
raise NotImplementedError(self.error)
class SimpleFrozenList(list):
    """List subclass whose in-place mutator methods raise a custom error.
    Mostly used for properties like Language.pipeline that should be treated
    as immutable, but that we don't want to turn into tuples to avoid breaking
    backwards compatibility. If a user accidentally calls
    nlp.pipeline.append(), they'll see a more helpful error message.
    """

    def __init__(self, *args, error: str = Errors.E927) -> None:
        """Initialize the frozen list.

        *args: Positional arguments passed on to the list constructor.
        error (str): The error message when user tries to mutate the list.
        """
        self.error = error
        super().__init__(*args)

    def _raise_frozen(self, *args, **kwargs):
        """Reject a mutation attempt, whatever arguments it was called with."""
        raise NotImplementedError(self.error)

    # All overridden mutators share the same implementation.
    append = _raise_frozen
    clear = _raise_frozen
    extend = _raise_frozen
    insert = _raise_frozen
    pop = _raise_frozen
    remove = _raise_frozen
    reverse = _raise_frozen
    sort = _raise_frozen
def lang_class_is_loaded(lang: str) -> bool:
"""Check whether a Language class is already loaded. Language classes are
loaded lazily, to avoid expensive setup code associated with the language
@ -215,7 +256,8 @@ def load_model(
name: Union[str, Path],
*,
vocab: Union["Vocab", bool] = True,
disable: Iterable[str] = tuple(),
disable: Iterable[str] = SimpleFrozenList(),
exclude: Iterable[str] = SimpleFrozenList(),
config: Union[Dict[str, Any], Config] = SimpleFrozenDict(),
) -> "Language":
"""Load a model from a package or data path.
@ -228,7 +270,7 @@ def load_model(
keyed by section values in dot notation.
RETURNS (Language): The loaded nlp object.
"""
kwargs = {"vocab": vocab, "disable": disable, "config": config}
kwargs = {"vocab": vocab, "disable": disable, "exclude": exclude, "config": config}
if isinstance(name, str): # name or string path
if name.startswith("blank:"): # shortcut for blank model
return get_lang_class(name.replace("blank:", ""))()
@ -247,7 +289,8 @@ def load_model_from_package(
name: str,
*,
vocab: Union["Vocab", bool] = True,
disable: Iterable[str] = tuple(),
disable: Iterable[str] = SimpleFrozenList(),
exclude: Iterable[str] = SimpleFrozenList(),
config: Union[Dict[str, Any], Config] = SimpleFrozenDict(),
) -> "Language":
"""Load a model from an installed package.
@ -255,13 +298,17 @@ def load_model_from_package(
name (str): The package name.
vocab (Vocab / True): Optional vocab to pass in on initialization. If True,
a new Vocab object will be created.
disable (Iterable[str]): Names of pipeline components to disable.
disable (Iterable[str]): Names of pipeline components to disable. Disabled
pipes will be loaded but they won't be run unless you explicitly
enable them by calling nlp.enable_pipe.
exclude (Iterable[str]): Names of pipeline components to exclude. Excluded
components won't be loaded.
config (Dict[str, Any] / Config): Config overrides as nested dict or dict
keyed by section values in dot notation.
RETURNS (Language): The loaded nlp object.
"""
cls = importlib.import_module(name)
return cls.load(vocab=vocab, disable=disable, config=config)
return cls.load(vocab=vocab, disable=disable, exclude=exclude, config=config)
def load_model_from_path(
@ -269,7 +316,8 @@ def load_model_from_path(
*,
meta: Optional[Dict[str, Any]] = None,
vocab: Union["Vocab", bool] = True,
disable: Iterable[str] = tuple(),
disable: Iterable[str] = SimpleFrozenList(),
exclude: Iterable[str] = SimpleFrozenList(),
config: Union[Dict[str, Any], Config] = SimpleFrozenDict(),
) -> "Language":
"""Load a model from a data directory path. Creates Language class with
@ -279,7 +327,11 @@ def load_model_from_path(
meta (Dict[str, Any]): Optional model meta.
vocab (Vocab / True): Optional vocab to pass in on initialization. If True,
a new Vocab object will be created.
disable (Iterable[str]): Names of pipeline components to disable.
disable (Iterable[str]): Names of pipeline components to disable. Disabled
pipes will be loaded but they won't be run unless you explicitly
enable them by calling nlp.enable_pipe.
exclude (Iterable[str]): Names of pipeline components to exclude. Excluded
components won't be loaded.
config (Dict[str, Any] / Config): Config overrides as nested dict or dict
keyed by section values in dot notation.
RETURNS (Language): The loaded nlp object.
@ -290,15 +342,18 @@ def load_model_from_path(
meta = get_model_meta(model_path)
config_path = model_path / "config.cfg"
config = load_config(config_path, overrides=dict_to_dot(config))
nlp, _ = load_model_from_config(config, vocab=vocab, disable=disable)
return nlp.from_disk(model_path, exclude=disable)
nlp, _ = load_model_from_config(
config, vocab=vocab, disable=disable, exclude=exclude
)
return nlp.from_disk(model_path, exclude=exclude)
def load_model_from_config(
config: Union[Dict[str, Any], Config],
*,
vocab: Union["Vocab", bool] = True,
disable: Iterable[str] = tuple(),
disable: Iterable[str] = SimpleFrozenList(),
exclude: Iterable[str] = SimpleFrozenList(),
auto_fill: bool = False,
validate: bool = True,
) -> Tuple["Language", Config]:
@ -309,7 +364,11 @@ def load_model_from_config(
meta (Dict[str, Any]): Optional model meta.
vocab (Vocab / True): Optional vocab to pass in on initialization. If True,
a new Vocab object will be created.
disable (Iterable[str]): Names of pipeline components to disable.
disable (Iterable[str]): Names of pipeline components to disable. Disabled
pipes will be loaded but they won't be run unless you explicitly
enable them by calling nlp.enable_pipe.
exclude (Iterable[str]): Names of pipeline components to exclude. Excluded
components won't be loaded.
auto_fill (bool): Whether to auto-fill config with missing defaults.
validate (bool): Whether to show config validation errors.
RETURNS (Language): The loaded nlp object.
@ -323,7 +382,12 @@ def load_model_from_config(
# registry, including custom subclasses provided via entry points
lang_cls = get_lang_class(nlp_config["lang"])
nlp = lang_cls.from_config(
config, vocab=vocab, disable=disable, auto_fill=auto_fill, validate=validate,
config,
vocab=vocab,
disable=disable,
exclude=exclude,
auto_fill=auto_fill,
validate=validate,
)
return nlp, nlp.resolved
@ -332,7 +396,8 @@ def load_model_from_init_py(
init_file: Union[Path, str],
*,
vocab: Union["Vocab", bool] = True,
disable: Iterable[str] = tuple(),
disable: Iterable[str] = SimpleFrozenList(),
exclude: Iterable[str] = SimpleFrozenList(),
config: Union[Dict[str, Any], Config] = SimpleFrozenDict(),
) -> "Language":
"""Helper function to use in the `load()` method of a model package's
@ -340,7 +405,11 @@ def load_model_from_init_py(
vocab (Vocab / True): Optional vocab to pass in on initialization. If True,
a new Vocab object will be created.
disable (Iterable[str]): Names of pipeline components to disable.
disable (Iterable[str]): Names of pipeline components to disable. Disabled
pipes will be loaded but they won't be run unless you explicitly
enable them by calling nlp.enable_pipe.
exclude (Iterable[str]): Names of pipeline components to exclude. Excluded
components won't be loaded.
config (Dict[str, Any] / Config): Config overrides as nested dict or dict
keyed by section values in dot notation.
RETURNS (Language): The loaded nlp object.
@ -352,7 +421,12 @@ def load_model_from_init_py(
if not model_path.exists():
raise IOError(Errors.E052.format(path=data_path))
return load_model_from_path(
data_path, vocab=vocab, meta=meta, disable=disable, config=config
data_path,
vocab=vocab,
meta=meta,
disable=disable,
exclude=exclude,
config=config,
)

View File

@ -75,9 +75,10 @@ Defines the `nlp` object, its tokenizer and
[processing pipeline](/usage/processing-pipelines) component names.
| Name | Description |
| ------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| ------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `lang` | Model language [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes). Defaults to `null`. ~~str~~ |
| `pipeline` | Names of pipeline components in order. Should correspond to sections in the `[components]` block, e.g. `[components.ner]`. See docs on [defining components](/usage/training#config-components). Defaults to `[]`. ~~List[str]~~ |
| `disabled` | Names of pipeline components that are loaded but disabled by default and not run as part of the pipeline. Should correspond to components listed in `pipeline`. After a model is loaded, disabled components can be enabled using [`Language.enable_pipe`](/api/language#enable_pipe). ~~List[str]~~ |
| `load_vocab_data` | Whether to load additional lexeme and vocab data from [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) if available. Defaults to `true`. ~~bool~~ |
| `before_creation` | Optional [callback](/usage/training#custom-code-nlp-callbacks) to modify `Language` subclass before it's initialized. Defaults to `null`. ~~Optional[Callable[[Type[Language]], Type[Language]]]~~ |
| `after_creation` | Optional [callback](/usage/training#custom-code-nlp-callbacks) to modify `nlp` object right after it's initialized. Defaults to `null`. ~~Optional[Callable[[Language], Language]]~~ |

View File

@ -357,35 +357,6 @@ their original weights after the block.
| -------- | ------------------------------------------------------ |
| `params` | A dictionary of parameters keyed by model ID. ~~dict~~ |
## Language.create_pipe {#create_pipe tag="method" new="2"}
Create a pipeline component from a factory.
<Infobox title="Changed in v3.0" variant="warning">
As of v3.0, the [`Language.add_pipe`](/api/language#add_pipe) method also takes
the string name of the factory, creates the component, adds it to the pipeline
and returns it. The `Language.create_pipe` method is now mostly used internally.
To create a component and add it to the pipeline, you should always use
`Language.add_pipe`.
</Infobox>
> #### Example
>
> ```python
> parser = nlp.create_pipe("parser")
> ```
| Name | Description |
| ------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `factory_name` | Name of the registered component factory. ~~str~~ |
| `name` | Optional unique name of pipeline component instance. If not set, the factory name is used. An error is raised if the name already exists in the pipeline. ~~Optional[str]~~ |
| _keyword-only_ | |
| `config` <Tag variant="new">3</Tag> | Optional config parameters to use for this component. Will be merged with the `default_config` specified by the component factory. ~~Optional[Dict[str, Any]]~~ |
| `validate` <Tag variant="new">3</Tag> | Whether to validate the component config and arguments against the types expected by the factory. Defaults to `True`. ~~bool~~ |
| **RETURNS** | The pipeline component. ~~Callable[[Doc], Doc]~~ |
## Language.add_pipe {#add_pipe tag="method" new="2"}
Add a component to the processing pipeline. Expects a name that maps to a
@ -434,6 +405,35 @@ component, adds it to the pipeline and returns it.
| `validate` <Tag variant="new">3</Tag> | Whether to validate the component config and arguments against the types expected by the factory. Defaults to `True`. ~~bool~~ |
| **RETURNS** | The pipeline component. ~~Callable[[Doc], Doc]~~ |
## Language.create_pipe {#create_pipe tag="method" new="2"}
Create a pipeline component from a factory.
<Infobox title="Changed in v3.0" variant="warning">
As of v3.0, the [`Language.add_pipe`](/api/language#add_pipe) method also takes
the string name of the factory, creates the component, adds it to the pipeline
and returns it. The `Language.create_pipe` method is now mostly used internally.
To create a component and add it to the pipeline, you should always use
`Language.add_pipe`.
</Infobox>
> #### Example
>
> ```python
> parser = nlp.create_pipe("parser")
> ```
| Name | Description |
| ------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `factory_name` | Name of the registered component factory. ~~str~~ |
| `name` | Optional unique name of pipeline component instance. If not set, the factory name is used. An error is raised if the name already exists in the pipeline. ~~Optional[str]~~ |
| _keyword-only_ | |
| `config` <Tag variant="new">3</Tag> | Optional config parameters to use for this component. Will be merged with the `default_config` specified by the component factory. ~~Optional[Dict[str, Any]]~~ |
| `validate` <Tag variant="new">3</Tag> | Whether to validate the component config and arguments against the types expected by the factory. Defaults to `True`. ~~bool~~ |
| **RETURNS** | The pipeline component. ~~Callable[[Doc], Doc]~~ |
## Language.has_factory {#has_factory tag="classmethod" new="3"}
Check whether a factory name is registered on the `Language` class or subclass.
@ -561,6 +561,54 @@ component function.
| `name` | Name of the component to remove. ~~str~~ |
| **RETURNS** | A `(name, component)` tuple of the removed component. ~~Tuple[str, Callable[[Doc], Doc]]~~ |
## Language.disable_pipe {#disable_pipe tag="method" new="3"}
Temporarily disable a pipeline component so it's not run as part of the
pipeline. Disabled components are listed in
[`nlp.disabled`](/api/language#attributes) and included in
[`nlp.components`](/api/language#attributes), but not in
[`nlp.pipeline`](/api/language#pipeline), so they're not run when you process a
`Doc` with the `nlp` object. If the component is already disabled, this method
does nothing.
> #### Example
>
> ```python
> nlp.add_pipe("ner")
> nlp.add_pipe("textcat")
> assert nlp.pipe_names == ["ner", "textcat"]
> nlp.disable_pipe("ner")
> assert nlp.pipe_names == ["textcat"]
> assert nlp.component_names == ["ner", "textcat"]
> assert nlp.disabled == ["ner"]
> ```
| Name | Description |
| ------ | ----------------------------------------- |
| `name` | Name of the component to disable. ~~str~~ |
## Language.enable_pipe {#enable_pipe tag="method" new="3"}
Enable a previously disabled component (e.g. via
[`Language.disable_pipe`](/api/language#disable_pipe)) so it's run as part of
the pipeline, [`nlp.pipeline`](/api/language#pipeline). If the component is
already enabled, this method does nothing.
> #### Example
>
> ```python
> nlp.disable_pipe("ner")
> assert "ner" in nlp.disabled
> assert not "ner" in nlp.pipe_names
> nlp.enable_pipe("ner")
> assert not "ner" in nlp.disabled
> assert "ner" in nlp.pipe_names
> ```
| Name | Description |
| ------ | ---------------------------------------- |
| `name` | Name of the component to enable. ~~str~~ |
## Language.select_pipes {#select_pipes tag="contextmanager, method" new="3"}
Disable one or more pipeline components. If used as a context manager, the
@ -568,7 +616,9 @@ pipeline will be restored to the initial state at the end of the block.
Otherwise, a `DisabledPipes` object is returned, that has a `.restore()` method
you can use to undo your changes. You can specify either `disable` (as a list or
string), or `enable`. In the latter case, all components not in the `enable`
list, will be disabled.
list will be disabled. Under the hood, this method calls into
[`disable_pipe`](/api/language#disable_pipe) and
[`enable_pipe`](/api/language#enable_pipe).
> #### Example
>
@ -861,16 +911,19 @@ available to the loaded object.
## Attributes {#attributes}
| Name | Description |
| --------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------- |
| --------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- |
| `vocab` | A container for the lexical types. ~~Vocab~~ |
| `tokenizer` | The tokenizer. ~~Tokenizer~~ |
| `make_doc` | Callable that takes a string and returns a `Doc`. ~~Callable[[str], Doc]~~ |
| `pipeline` | List of `(name, component)` tuples describing the current processing pipeline, in order. ~~List[str, Callable[[Doc], Doc]]~~ |
| `pipeline` | List of `(name, component)` tuples describing the current processing pipeline, in order. ~~List[Tuple[str, Callable[[Doc], Doc]]]~~ |
| `pipe_names` <Tag variant="new">2</Tag> | List of pipeline component names, in order. ~~List[str]~~ |
| `pipe_labels` <Tag variant="new">2.2</Tag> | List of labels set by the pipeline components, if available, keyed by component name. ~~Dict[str, List[str]]~~ |
| `pipe_factories` <Tag variant="new">2.2</Tag> | Dictionary of pipeline component names, mapped to their factory names. ~~Dict[str, str]~~ |
| `factories` | All available factory functions, keyed by name. ~~Dict[str, Callable[[...], Callable[[Doc], Doc]]]~~ |
| `factory_names` <Tag variant="new">3</Tag> | List of all available factory names. ~~List[str]~~ |
| `components` <Tag variant="new">3</Tag> | List of all available `(name, component)` tuples, including components that are currently disabled. ~~List[Tuple[str, Callable[[Doc], Doc]]]~~ |
| `component_names` <Tag variant="new">3</Tag> | List of all available component names, including components that are currently disabled. ~~List[str]~~ |
| `disabled` <Tag variant="new">3</Tag> | Names of components that are currently disabled and don't run as part of the pipeline. ~~List[str]~~ |
| `path` <Tag variant="new">2</Tag> | Path to the model data directory, if a model is loaded. Otherwise `None`. ~~Optional[Path]~~ |
## Class attributes {#class-attributes}

View File

@ -23,6 +23,14 @@ path, spaCy will assume it's a data directory, load its
information to construct the `Language` class. The data will be loaded in via
[`Language.from_disk`](/api/language#from_disk).
<Infobox variant="warning" title="Changed in v3.0">
As of v3.0, the `disable` keyword argument specifies components to load but
disable, instead of components to not load at all. Those components can now be
specified separately using the new `exclude` keyword argument.
</Infobox>
> #### Example
>
> ```python
@ -30,14 +38,15 @@ information to construct the `Language` class. The data will be loaded in via
> nlp = spacy.load("/path/to/en") # string path
> nlp = spacy.load(Path("/path/to/en")) # pathlib Path
>
> nlp = spacy.load("en_core_web_sm", disable=["parser", "tagger"])
> nlp = spacy.load("en_core_web_sm", exclude=["parser", "tagger"])
> ```
| Name | Description |
| ----------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| ------------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `name` | Model to load, i.e. package name or path. ~~Union[str, Path]~~ |
| _keyword-only_ | |
| `disable` | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). ~~List[str]~~ |
| `disable` | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). Disabled pipes will be loaded but they won't be run unless you explicitly enable them by calling [nlp.enable_pipe](/api/language#enable_pipe). ~~List[str]~~ |
| `exclude` <Tag variant="new">3</Tag> | Names of pipeline components to [exclude](/usage/processing-pipelines#disabling). Excluded components won't be loaded. ~~List[str]~~ |
| `config` <Tag variant="new">3</Tag> | Optional config overrides, either as nested dict or dict keyed by section value in dot notation, e.g. `"components.name.value"`. ~~Union[Dict[str, Any], Config]~~ |
| **RETURNS** | A `Language` object with the loaded model. ~~Language~~ |
@ -562,15 +571,16 @@ and create a `Language` object. The model data will then be loaded in via
>
> ```python
> nlp = util.load_model("en_core_web_sm")
> nlp = util.load_model("en_core_web_sm", disable=["ner"])
> nlp = util.load_model("en_core_web_sm", exclude=["ner"])
> nlp = util.load_model("/path/to/data")
> ```
| Name | Description |
| ----------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- |
| ------------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `name` | Package name or model path. ~~str~~ |
| `vocab` <Tag variant="new">3</Tag> | Optional shared vocab to pass in on initialization. If `True` (default), a new `Vocab` object will be created. ~~Union[Vocab, bool]~~. |
| `disable` | Names of pipeline components to disable. ~~Iterable[str]~~ |
| `disable` | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). Disabled pipes will be loaded but they won't be run unless you explicitly enable them by calling [nlp.enable_pipe](/api/language#enable_pipe). ~~List[str]~~ |
| `exclude` <Tag variant="new">3</Tag> | Names of pipeline components to [exclude](/usage/processing-pipelines#disabling). Excluded components won't be loaded. ~~List[str]~~ |
| `config` <Tag variant="new">3</Tag> | Config overrides as nested dict or flat dict keyed by section values in dot notation, e.g. `"nlp.pipeline"`. ~~Union[Dict[str, Any], Config]~~ |
| **RETURNS** | `Language` class with the loaded model. ~~Language~~ |
@ -589,10 +599,11 @@ A helper function to use in the `load()` method of a model package's
> ```
| Name | Description |
| ----------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- |
| ------------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `init_file` | Path to model's `__init__.py`, i.e. `__file__`. ~~Union[str, Path]~~ |
| `vocab` <Tag variant="new">3</Tag> | Optional shared vocab to pass in on initialization. If `True` (default), a new `Vocab` object will be created. ~~Union[Vocab, bool]~~. |
| `disable` | Names of pipeline components to disable. ~~Iterable[str]~~ |
| `disable` | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). Disabled pipes will be loaded but they won't be run unless you explicitly enable them by calling [nlp.enable_pipe](/api/language#enable_pipe). ~~List[str]~~ |
| `exclude` <Tag variant="new">3</Tag> | Names of pipeline components to [exclude](/usage/processing-pipelines#disabling). Excluded components won't be loaded. ~~List[str]~~ |
| `config` <Tag variant="new">3</Tag> | Config overrides as nested dict or flat dict keyed by section values in dot notation, e.g. `"nlp.pipeline"`. ~~Union[Dict[str, Any], Config]~~ |
| **RETURNS** | `Language` class with the loaded model. ~~Language~~ |

View File

@ -235,38 +235,54 @@ available pipeline components and component functions.
| `tok2vec` | [`Tok2Vec`](/api/tok2vec) | Assign token-to-vector embeddings. |
| `transformer` | [`Transformer`](/api/transformer) | Assign the tokens and outputs of a transformer model. |
### Disabling and modifying pipeline components {#disabling}
### Disabling, excluding and modifying components {#disabling}
If you don't need a particular component of the pipeline, for example, the
tagger or the parser, you can **disable loading** it. This can sometimes make a
big difference and improve loading speed. Disabled component names can be
provided to [`spacy.load`](/api/top-level#spacy.load),
[`Language.from_disk`](/api/language#from_disk) or the `nlp` object itself as a
list:
tagger or the parser, you can **disable or exclude** it. This can sometimes make
a big difference and improve loading and inference speed. There are two
different mechanisms you can use:
1. **Disable:** The component and its data will be loaded with the model, but it
will be disabled by default and not run as part of the processing pipeline.
To run it, you can explicitly enable it by calling
[`nlp.enable_pipe`](/api/language#enable_pipe). When you save out the `nlp`
object, the disabled component will be included but disabled by default.
2. **Exclude:** Don't load the component and its data with the model. Once the
model is loaded, there will be no reference to the excluded component.
Disabled and excluded component names can be provided to
[`spacy.load`](/api/top-level#spacy.load) as a list.
<!-- TODO: update with info on our models shipped with optional components -->
> #### 💡 Models with optional components
>
> The `disable` mechanism makes it easy to distribute models with optional
> components that you can enable or disable at runtime. For instance, your model
> may include a statistical _and_ a rule-based component for sentence
> segmentation, and you can choose which one to run depending on your use case.
```python
### Disable loading
# Load the model without the entity recognizer
nlp = spacy.load("en_core_web_sm", exclude=["ner"])
# Load the tagger and parser but don't enable them
nlp = spacy.load("en_core_web_sm", disable=["tagger", "parser"])
# Explicitly enable the tagger later on
nlp.enable_pipe("tagger")
```
In some cases, you do want to load all pipeline components and their weights,
because you need them at different points in your application. However, if you
only need a `Doc` object with named entities, there's no need to run all
pipeline components on it – that can potentially make processing much slower.
Instead, you can use the `disable` keyword argument on
[`nlp.pipe`](/api/language#pipe) to temporarily disable the components **during
processing**:
<Infobox variant="warning" title="Changed in v3.0">
```python
### Disable for processing
for doc in nlp.pipe(texts, disable=["tagger", "parser"]):
# Do something with the doc here
```
As of v3.0, the `disable` keyword argument specifies components to load but
disable, instead of components to not load at all. Those components can now be
specified separately using the new `exclude` keyword argument.
If you need to **execute more code** with components disabled – e.g. to reset
the weights or update only some components during training – you can use the
[`nlp.select_pipes`](/api/language#select_pipes) context manager. At the end of
the `with` block, the disabled pipeline components will be restored
</Infobox>
As a shortcut, you can use the [`nlp.select_pipes`](/api/language#select_pipes)
context manager to temporarily disable certain components for a given block. At
the end of the `with` block, the disabled pipeline components will be restored
automatically. Alternatively, `select_pipes` returns an object that lets you
call its `restore()` method to restore the disabled components when needed. This
can be useful if you want to prevent unnecessary code indentation of large
@ -295,6 +311,14 @@ with nlp.select_pipes(enable="parser"):
doc = nlp("I will only be parsed")
```
The [`nlp.pipe`](/api/language#pipe) method also supports a `disable` keyword
argument if you only want to disable components during processing:
```python
for doc in nlp.pipe(texts, disable=["tagger", "parser"]):
# Do something with the doc here
```
Finally, you can also use the [`remove_pipe`](/api/language#remove_pipe) method
to remove pipeline components from an existing pipeline, the
[`rename_pipe`](/api/language#rename_pipe) method to rename them, or the
@ -308,6 +332,31 @@ nlp.rename_pipe("ner", "entityrecognizer")
nlp.replace_pipe("tagger", my_custom_tagger)
```
The `Language` object exposes different [attributes](/api/language#attributes)
that let you inspect all available components and the components that currently
run as part of the pipeline.
> #### Example
>
> ```python
> nlp = spacy.blank("en")
> nlp.add_pipe("ner")
> nlp.add_pipe("textcat")
> assert nlp.pipe_names == ["ner", "textcat"]
> nlp.disable_pipe("ner")
> assert nlp.pipe_names == ["textcat"]
> assert nlp.component_names == ["ner", "textcat"]
> assert nlp.disabled == ["ner"]
> ```
| Name | Description |
| --------------------- | ---------------------------------------------------------------- |
| `nlp.pipeline` | `(name, component)` tuples of the processing pipeline, in order. |
| `nlp.pipe_names` | Pipeline component names, in order. |
| `nlp.components` | All `(name, component)` tuples, including disabled components. |
| `nlp.component_names` | All component names, including disabled components. |
| `nlp.disabled` | Names of components that are currently disabled. |
### Sourcing pipeline components from existing models {#sourced-components new="3"}
Pipeline components that are independent can also be reused across models.

View File

@ -254,12 +254,15 @@ The following methods, attributes and commands are new in spaCy v3.0.
| [`Token.lex`](/api/token#attributes) | Access a token's [`Lexeme`](/api/lexeme). |
| [`Token.morph`](/api/token#attributes) [`Token.morph_`](/api/token#attributes) | Access a token's morphological analysis. |
| [`Language.select_pipes`](/api/language#select_pipes) | Context manager for enabling or disabling specific pipeline components for a block. |
| [`Language.disable_pipe`](/api/language#disable_pipe) [`Language.enable_pipe`](/api/language#enable_pipe) | Disable or enable a loaded pipeline component (but don't remove it). |
| [`Language.analyze_pipes`](/api/language#analyze_pipes) | [Analyze](/usage/processing-pipelines#analysis) components and their interdependencies. |
| [`Language.resume_training`](/api/language#resume_training) | Experimental: continue training a pretrained model and initialize "rehearsal" for components that implement a `rehearse` method to prevent catastrophic forgetting. |
| [`@Language.factory`](/api/language#factory) [`@Language.component`](/api/language#component) | Decorators for [registering](/usage/processing-pipelines#custom-components) pipeline component factories and simple stateless component functions. |
| [`Language.has_factory`](/api/language#has_factory) | Check whether a component factory is registered on a language class. |
| [`Language.get_factory_meta`](/api/language#get_factory_meta) [`Language.get_pipe_meta`](/api/language#get_factory_meta) | Get the [`FactoryMeta`](/api/language#factorymeta) with component metadata for a factory or instance name. |
| [`Language.config`](/api/language#config) | The [config](/usage/training#config) used to create the current `nlp` object. It is an instance of [`Config`](https://thinc.ai/docs/api-config#config) and can be saved to disk and used for training. |
| [`Language.components`](/api/language#attributes) [`Language.component_names`](/api/language#attributes) | All available components and component names, including disabled components that are not run as part of the pipeline. |
| [`Language.disabled`](/api/language#attributes) | Names of disabled components that are not run as part of the pipeline. |
| [`Pipe.score`](/api/pipe#score) | Method on trainable pipeline components that returns a dictionary of evaluation scores. |
| [`registry`](/api/top-level#registry) | Function registry to map functions to string names that can be referenced in [configs](/usage/training#config). |
| [`util.load_meta`](/api/top-level#util.load_meta) [`util.load_config`](/api/top-level#util.load_config) | Updated helpers for loading a model's [`meta.json`](/api/data-formats#meta) and [`config.cfg`](/api/data-formats#config). |