mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-24 17:06:29 +03:00
Merge pull request #5993 from explosion/feature/disabled-components
This commit is contained in:
commit
45f46a5c85
|
@ -27,18 +27,23 @@ if sys.maxunicode == 65535:
|
|||
|
||||
def load(
|
||||
name: Union[str, Path],
|
||||
disable: Iterable[str] = tuple(),
|
||||
disable: Iterable[str] = util.SimpleFrozenList(),
|
||||
exclude: Iterable[str] = util.SimpleFrozenList(),
|
||||
config: Union[Dict[str, Any], Config] = util.SimpleFrozenDict(),
|
||||
) -> Language:
|
||||
"""Load a spaCy model from an installed package or a local path.
|
||||
|
||||
name (str): Package name or model path.
|
||||
disable (Iterable[str]): Names of pipeline components to disable.
|
||||
disable (Iterable[str]): Names of pipeline components to disable. Disabled
|
||||
pipes will be loaded but they won't be run unless you explicitly
|
||||
enable them by calling nlp.enable_pipe.
|
||||
exclude (Iterable[str]): Names of pipeline components to exclude. Excluded
|
||||
components won't be loaded.
|
||||
config (Dict[str, Any] / Config): Config overrides as nested dict or dict
|
||||
keyed by section values in dot notation.
|
||||
RETURNS (Language): The loaded nlp object.
|
||||
"""
|
||||
return util.load_model(name, disable=disable, config=config)
|
||||
return util.load_model(name, disable=disable, exclude=exclude, config=config)
|
||||
|
||||
|
||||
def blank(name: str, **overrides) -> Language:
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
"""This module contains helpers and subcommands for integrating spaCy projects
|
||||
with Data Version Controk (DVC). https://dvc.org"""
|
||||
from typing import Dict, Any, List, Optional
|
||||
from typing import Dict, Any, List, Optional, Iterable
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
from wasabi import msg
|
||||
|
@ -8,6 +8,7 @@ from wasabi import msg
|
|||
from .._util import PROJECT_FILE, load_project_config, get_hash, project_cli
|
||||
from .._util import Arg, Opt, NAME, COMMAND
|
||||
from ...util import working_dir, split_command, join_command, run_command
|
||||
from ...util import SimpleFrozenList
|
||||
|
||||
|
||||
DVC_CONFIG = "dvc.yaml"
|
||||
|
@ -130,7 +131,7 @@ def update_dvc_config(
|
|||
|
||||
|
||||
def run_dvc_commands(
|
||||
commands: List[str] = tuple(), flags: Dict[str, bool] = {},
|
||||
commands: Iterable[str] = SimpleFrozenList(), flags: Dict[str, bool] = {},
|
||||
) -> None:
|
||||
"""Run a sequence of DVC commands in a subprocess, in order.
|
||||
|
||||
|
|
|
@ -1,10 +1,11 @@
|
|||
from typing import Optional, List, Dict, Sequence, Any
|
||||
from typing import Optional, List, Dict, Sequence, Any, Iterable
|
||||
from pathlib import Path
|
||||
from wasabi import msg
|
||||
import sys
|
||||
import srsly
|
||||
|
||||
from ...util import working_dir, run_command, split_command, is_cwd, join_command
|
||||
from ...util import SimpleFrozenList
|
||||
from .._util import PROJECT_FILE, PROJECT_LOCK, load_project_config, get_hash
|
||||
from .._util import get_checksum, project_cli, Arg, Opt, COMMAND
|
||||
|
||||
|
@ -115,7 +116,9 @@ def print_run_help(project_dir: Path, subcommand: Optional[str] = None) -> None:
|
|||
|
||||
|
||||
def run_commands(
|
||||
commands: List[str] = tuple(), silent: bool = False, dry: bool = False,
|
||||
commands: Iterable[str] = SimpleFrozenList(),
|
||||
silent: bool = False,
|
||||
dry: bool = False,
|
||||
) -> None:
|
||||
"""Run a sequence of commands in a subprocess, in order.
|
||||
|
||||
|
|
|
@ -11,6 +11,7 @@ use_pytorch_for_gpu_memory = false
|
|||
[nlp]
|
||||
lang = null
|
||||
pipeline = []
|
||||
disabled = []
|
||||
load_vocab_data = true
|
||||
before_creation = null
|
||||
after_creation = null
|
||||
|
|
|
@ -137,11 +137,10 @@ class Errors:
|
|||
"after (component name or index), first (True) or last (True). "
|
||||
"Invalid configuration: {args}. Existing components: {opts}")
|
||||
E007 = ("'{name}' already exists in pipeline. Existing names: {opts}")
|
||||
E008 = ("Some current components would be lost when restoring previous "
|
||||
"pipeline state. If you added components after calling "
|
||||
"`nlp.select_pipes()`, you should remove them explicitly with "
|
||||
"`nlp.remove_pipe()` before the pipeline is restored. Names of "
|
||||
"the new components: {names}")
|
||||
E008 = ("Can't restore disabled pipeline component '{name}' because it "
|
||||
"doesn't exist in the pipeline anymore. If you want to remove "
|
||||
"components from the pipeline, you should do it before calling "
|
||||
"`nlp.select_pipes()` or after restoring the disabled components.")
|
||||
E010 = ("Word vectors set to length 0. This may be because you don't have "
|
||||
"a model installed or loaded, or because your model doesn't "
|
||||
"include word vectors. For more info, see the docs:\n"
|
||||
|
@ -474,6 +473,13 @@ class Errors:
|
|||
E199 = ("Unable to merge 0-length span at doc[{start}:{end}].")
|
||||
|
||||
# TODO: fix numbering after merging develop into master
|
||||
E926 = ("It looks like you're trying to modify nlp.{attr} directly. This "
|
||||
"doesn't work because it's an immutable computed property. If you "
|
||||
"need to modify the pipeline, use the built-in methods like "
|
||||
"nlp.add_pipe, nlp.remove_pipe, nlp.disable_pipe or nlp.enable_pipe "
|
||||
"instead.")
|
||||
E927 = ("Can't write to frozen list Maybe you're trying to modify a computed "
|
||||
"property or default function argument?")
|
||||
E928 = ("A 'KnowledgeBase' should be written to / read from a file, but the "
|
||||
"provided argument {loc} is an existing directory.")
|
||||
E929 = ("A 'KnowledgeBase' could not be read from {loc} - the path does "
|
||||
|
|
|
@ -6,7 +6,7 @@ import itertools
|
|||
import weakref
|
||||
import functools
|
||||
from contextlib import contextmanager
|
||||
from copy import copy, deepcopy
|
||||
from copy import deepcopy
|
||||
from pathlib import Path
|
||||
import warnings
|
||||
from thinc.api import get_current_ops, Config, require_gpu, Optimizer
|
||||
|
@ -20,7 +20,7 @@ from .vocab import Vocab, create_vocab
|
|||
from .pipe_analysis import validate_attrs, analyze_pipes, print_pipe_analysis
|
||||
from .gold import Example, validate_examples
|
||||
from .scorer import Scorer
|
||||
from .util import create_default_optimizer, registry
|
||||
from .util import create_default_optimizer, registry, SimpleFrozenList
|
||||
from .util import SimpleFrozenDict, combine_score_weights, CONFIG_SECTION_ORDER
|
||||
from .lang.tokenizer_exceptions import URL_MATCH, BASE_EXCEPTIONS
|
||||
from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
|
||||
|
@ -159,7 +159,8 @@ class Language:
|
|||
self.vocab: Vocab = vocab
|
||||
if self.lang is None:
|
||||
self.lang = self.vocab.lang
|
||||
self.pipeline = []
|
||||
self._components = []
|
||||
self._disabled = set()
|
||||
self.max_length = max_length
|
||||
self.resolved = {}
|
||||
# Create the default tokenizer from the default config
|
||||
|
@ -206,10 +207,11 @@ class Language:
|
|||
"keys": self.vocab.vectors.n_keys,
|
||||
"name": self.vocab.vectors.name,
|
||||
}
|
||||
self._meta["labels"] = self.pipe_labels
|
||||
self._meta["labels"] = dict(self.pipe_labels)
|
||||
# TODO: Adding this back to prevent breaking people's code etc., but
|
||||
# we should consider removing it
|
||||
self._meta["pipeline"] = self.pipe_names
|
||||
self._meta["pipeline"] = list(self.pipe_names)
|
||||
self._meta["disabled"] = list(self.disabled)
|
||||
return self._meta
|
||||
|
||||
@meta.setter
|
||||
|
@ -232,13 +234,14 @@ class Language:
|
|||
# we can populate the config again later
|
||||
pipeline = {}
|
||||
score_weights = []
|
||||
for pipe_name in self.pipe_names:
|
||||
for pipe_name in self.component_names:
|
||||
pipe_meta = self.get_pipe_meta(pipe_name)
|
||||
pipe_config = self.get_pipe_config(pipe_name)
|
||||
pipeline[pipe_name] = {"factory": pipe_meta.factory, **pipe_config}
|
||||
if pipe_meta.default_score_weights:
|
||||
score_weights.append(pipe_meta.default_score_weights)
|
||||
self._config["nlp"]["pipeline"] = self.pipe_names
|
||||
self._config["nlp"]["pipeline"] = list(self.component_names)
|
||||
self._config["nlp"]["disabled"] = list(self.disabled)
|
||||
self._config["components"] = pipeline
|
||||
self._config["training"]["score_weights"] = combine_score_weights(score_weights)
|
||||
if not srsly.is_json_serializable(self._config):
|
||||
|
@ -249,21 +252,64 @@ class Language:
|
|||
def config(self, value: Config) -> None:
|
||||
self._config = value
|
||||
|
||||
@property
|
||||
def disabled(self) -> List[str]:
|
||||
"""Get the names of all disabled components.
|
||||
|
||||
RETURNS (List[str]): The disabled components.
|
||||
"""
|
||||
# Make sure the disabled components are returned in the order they
|
||||
# appear in the pipeline (which isn't guaranteed by the set)
|
||||
names = [name for name, _ in self._components if name in self._disabled]
|
||||
return SimpleFrozenList(names, error=Errors.E926.format(attr="disabled"))
|
||||
|
||||
@property
|
||||
def factory_names(self) -> List[str]:
|
||||
"""Get names of all available factories.
|
||||
|
||||
RETURNS (List[str]): The factory names.
|
||||
"""
|
||||
return list(self.factories.keys())
|
||||
names = list(self.factories.keys())
|
||||
return SimpleFrozenList(names)
|
||||
|
||||
@property
|
||||
def pipe_names(self) -> List[str]:
|
||||
"""Get names of available pipeline components.
|
||||
def components(self) -> List[Tuple[str, Callable[[Doc], Doc]]]:
|
||||
"""Get all (name, component) tuples in the pipeline, including the
|
||||
currently disabled components.
|
||||
"""
|
||||
return SimpleFrozenList(
|
||||
self._components, error=Errors.E926.format(attr="components")
|
||||
)
|
||||
|
||||
@property
|
||||
def component_names(self) -> List[str]:
|
||||
"""Get the names of the available pipeline components. Includes all
|
||||
active and inactive pipeline components.
|
||||
|
||||
RETURNS (List[str]): List of component name strings, in order.
|
||||
"""
|
||||
return [pipe_name for pipe_name, _ in self.pipeline]
|
||||
names = [pipe_name for pipe_name, _ in self._components]
|
||||
return SimpleFrozenList(names, error=Errors.E926.format(attr="component_names"))
|
||||
|
||||
@property
|
||||
def pipeline(self) -> List[Tuple[str, Callable[[Doc], Doc]]]:
|
||||
"""The processing pipeline consisting of (name, component) tuples. The
|
||||
components are called on the Doc in order as it passes through the
|
||||
pipeline.
|
||||
|
||||
RETURNS (List[Tuple[str, Callable[[Doc], Doc]]]): The pipeline.
|
||||
"""
|
||||
pipes = [(n, p) for n, p in self._components if n not in self._disabled]
|
||||
return SimpleFrozenList(pipes, error=Errors.E926.format(attr="pipeline"))
|
||||
|
||||
@property
|
||||
def pipe_names(self) -> List[str]:
|
||||
"""Get names of available active pipeline components.
|
||||
|
||||
RETURNS (List[str]): List of component name strings, in order.
|
||||
"""
|
||||
names = [pipe_name for pipe_name, _ in self.pipeline]
|
||||
return SimpleFrozenList(names, error=Errors.E926.format(attr="pipe_names"))
|
||||
|
||||
@property
|
||||
def pipe_factories(self) -> Dict[str, str]:
|
||||
|
@ -272,9 +318,9 @@ class Language:
|
|||
RETURNS (Dict[str, str]): Factory names, keyed by component names.
|
||||
"""
|
||||
factories = {}
|
||||
for pipe_name, pipe in self.pipeline:
|
||||
for pipe_name, pipe in self._components:
|
||||
factories[pipe_name] = self.get_pipe_meta(pipe_name).factory
|
||||
return factories
|
||||
return SimpleFrozenDict(factories)
|
||||
|
||||
@property
|
||||
def pipe_labels(self) -> Dict[str, List[str]]:
|
||||
|
@ -284,10 +330,10 @@ class Language:
|
|||
RETURNS (Dict[str, List[str]]): Labels keyed by component name.
|
||||
"""
|
||||
labels = {}
|
||||
for name, pipe in self.pipeline:
|
||||
for name, pipe in self._components:
|
||||
if hasattr(pipe, "labels"):
|
||||
labels[name] = list(pipe.labels)
|
||||
return labels
|
||||
return SimpleFrozenDict(labels)
|
||||
|
||||
@classmethod
|
||||
def has_factory(cls, name: str) -> bool:
|
||||
|
@ -358,10 +404,10 @@ class Language:
|
|||
name: str,
|
||||
*,
|
||||
default_config: Dict[str, Any] = SimpleFrozenDict(),
|
||||
assigns: Iterable[str] = tuple(),
|
||||
requires: Iterable[str] = tuple(),
|
||||
assigns: Iterable[str] = SimpleFrozenList(),
|
||||
requires: Iterable[str] = SimpleFrozenList(),
|
||||
retokenizes: bool = False,
|
||||
scores: Iterable[str] = tuple(),
|
||||
scores: Iterable[str] = SimpleFrozenList(),
|
||||
default_score_weights: Dict[str, float] = SimpleFrozenDict(),
|
||||
func: Optional[Callable] = None,
|
||||
) -> Callable:
|
||||
|
@ -447,8 +493,8 @@ class Language:
|
|||
cls,
|
||||
name: Optional[str] = None,
|
||||
*,
|
||||
assigns: Iterable[str] = tuple(),
|
||||
requires: Iterable[str] = tuple(),
|
||||
assigns: Iterable[str] = SimpleFrozenList(),
|
||||
requires: Iterable[str] = SimpleFrozenList(),
|
||||
retokenizes: bool = False,
|
||||
func: Optional[Callable[[Doc], Doc]] = None,
|
||||
) -> Callable:
|
||||
|
@ -535,10 +581,10 @@ class Language:
|
|||
|
||||
DOCS: https://spacy.io/api/language#get_pipe
|
||||
"""
|
||||
for pipe_name, component in self.pipeline:
|
||||
for pipe_name, component in self._components:
|
||||
if pipe_name == name:
|
||||
return component
|
||||
raise KeyError(Errors.E001.format(name=name, opts=self.pipe_names))
|
||||
raise KeyError(Errors.E001.format(name=name, opts=self.component_names))
|
||||
|
||||
def create_pipe(
|
||||
self,
|
||||
|
@ -683,8 +729,8 @@ class Language:
|
|||
err = Errors.E966.format(component=bad_val, name=name)
|
||||
raise ValueError(err)
|
||||
name = name if name is not None else factory_name
|
||||
if name in self.pipe_names:
|
||||
raise ValueError(Errors.E007.format(name=name, opts=self.pipe_names))
|
||||
if name in self.component_names:
|
||||
raise ValueError(Errors.E007.format(name=name, opts=self.component_names))
|
||||
if source is not None:
|
||||
# We're loading the component from a model. After loading the
|
||||
# component, we know its real factory name
|
||||
|
@ -709,7 +755,7 @@ class Language:
|
|||
)
|
||||
pipe_index = self._get_pipe_index(before, after, first, last)
|
||||
self._pipe_meta[name] = self.get_factory_meta(factory_name)
|
||||
self.pipeline.insert(pipe_index, (name, pipe_component))
|
||||
self._components.insert(pipe_index, (name, pipe_component))
|
||||
return pipe_component
|
||||
|
||||
def _get_pipe_index(
|
||||
|
@ -730,32 +776,42 @@ class Language:
|
|||
"""
|
||||
all_args = {"before": before, "after": after, "first": first, "last": last}
|
||||
if sum(arg is not None for arg in [before, after, first, last]) >= 2:
|
||||
raise ValueError(Errors.E006.format(args=all_args, opts=self.pipe_names))
|
||||
raise ValueError(
|
||||
Errors.E006.format(args=all_args, opts=self.component_names)
|
||||
)
|
||||
if last or not any(value is not None for value in [first, before, after]):
|
||||
return len(self.pipeline)
|
||||
return len(self._components)
|
||||
elif first:
|
||||
return 0
|
||||
elif isinstance(before, str):
|
||||
if before not in self.pipe_names:
|
||||
raise ValueError(Errors.E001.format(name=before, opts=self.pipe_names))
|
||||
return self.pipe_names.index(before)
|
||||
if before not in self.component_names:
|
||||
raise ValueError(
|
||||
Errors.E001.format(name=before, opts=self.component_names)
|
||||
)
|
||||
return self.component_names.index(before)
|
||||
elif isinstance(after, str):
|
||||
if after not in self.pipe_names:
|
||||
raise ValueError(Errors.E001.format(name=after, opts=self.pipe_names))
|
||||
return self.pipe_names.index(after) + 1
|
||||
if after not in self.component_names:
|
||||
raise ValueError(
|
||||
Errors.E001.format(name=after, opts=self.component_names)
|
||||
)
|
||||
return self.component_names.index(after) + 1
|
||||
# We're only accepting indices referring to components that exist
|
||||
# (can't just do isinstance here because bools are instance of int, too)
|
||||
elif type(before) == int:
|
||||
if before >= len(self.pipeline) or before < 0:
|
||||
err = Errors.E959.format(dir="before", idx=before, opts=self.pipe_names)
|
||||
if before >= len(self._components) or before < 0:
|
||||
err = Errors.E959.format(
|
||||
dir="before", idx=before, opts=self.component_names
|
||||
)
|
||||
raise ValueError(err)
|
||||
return before
|
||||
elif type(after) == int:
|
||||
if after >= len(self.pipeline) or after < 0:
|
||||
err = Errors.E959.format(dir="after", idx=after, opts=self.pipe_names)
|
||||
if after >= len(self._components) or after < 0:
|
||||
err = Errors.E959.format(
|
||||
dir="after", idx=after, opts=self.component_names
|
||||
)
|
||||
raise ValueError(err)
|
||||
return after + 1
|
||||
raise ValueError(Errors.E006.format(args=all_args, opts=self.pipe_names))
|
||||
raise ValueError(Errors.E006.format(args=all_args, opts=self.component_names))
|
||||
|
||||
def has_pipe(self, name: str) -> bool:
|
||||
"""Check if a component name is present in the pipeline. Equivalent to
|
||||
|
@ -796,7 +852,7 @@ class Language:
|
|||
# to Language.pipeline to make sure the configs are handled correctly
|
||||
pipe_index = self.pipe_names.index(name)
|
||||
self.remove_pipe(name)
|
||||
if not len(self.pipeline) or pipe_index == len(self.pipeline):
|
||||
if not len(self._components) or pipe_index == len(self._components):
|
||||
# we have no components to insert before/after, or we're replacing the last component
|
||||
self.add_pipe(factory_name, name=name, config=config, validate=validate)
|
||||
else:
|
||||
|
@ -816,12 +872,16 @@ class Language:
|
|||
|
||||
DOCS: https://spacy.io/api/language#rename_pipe
|
||||
"""
|
||||
if old_name not in self.pipe_names:
|
||||
raise ValueError(Errors.E001.format(name=old_name, opts=self.pipe_names))
|
||||
if new_name in self.pipe_names:
|
||||
raise ValueError(Errors.E007.format(name=new_name, opts=self.pipe_names))
|
||||
i = self.pipe_names.index(old_name)
|
||||
self.pipeline[i] = (new_name, self.pipeline[i][1])
|
||||
if old_name not in self.component_names:
|
||||
raise ValueError(
|
||||
Errors.E001.format(name=old_name, opts=self.component_names)
|
||||
)
|
||||
if new_name in self.component_names:
|
||||
raise ValueError(
|
||||
Errors.E007.format(name=new_name, opts=self.component_names)
|
||||
)
|
||||
i = self.component_names.index(old_name)
|
||||
self._components[i] = (new_name, self._components[i][1])
|
||||
self._pipe_meta[new_name] = self._pipe_meta.pop(old_name)
|
||||
self._pipe_configs[new_name] = self._pipe_configs.pop(old_name)
|
||||
|
||||
|
@ -833,20 +893,45 @@ class Language:
|
|||
|
||||
DOCS: https://spacy.io/api/language#remove_pipe
|
||||
"""
|
||||
if name not in self.pipe_names:
|
||||
raise ValueError(Errors.E001.format(name=name, opts=self.pipe_names))
|
||||
removed = self.pipeline.pop(self.pipe_names.index(name))
|
||||
if name not in self.component_names:
|
||||
raise ValueError(Errors.E001.format(name=name, opts=self.component_names))
|
||||
removed = self._components.pop(self.component_names.index(name))
|
||||
# We're only removing the component itself from the metas/configs here
|
||||
# because factory may be used for something else
|
||||
self._pipe_meta.pop(name)
|
||||
self._pipe_configs.pop(name)
|
||||
# Make sure the name is also removed from the set of disabled components
|
||||
if name in self.disabled:
|
||||
self._disabled.remove(name)
|
||||
return removed
|
||||
|
||||
def disable_pipe(self, name: str) -> None:
|
||||
"""Disable a pipeline component. The component will still exist on
|
||||
the nlp object, but it won't be run as part of the pipeline. Does
|
||||
nothing if the component is already disabled.
|
||||
|
||||
name (str): The name of the component to disable.
|
||||
"""
|
||||
if name not in self.component_names:
|
||||
raise ValueError(Errors.E001.format(name=name, opts=self.component_names))
|
||||
self._disabled.add(name)
|
||||
|
||||
def enable_pipe(self, name: str) -> None:
|
||||
"""Enable a previously disabled pipeline component so it's run as part
|
||||
of the pipeline. Does nothing if the component is already enabled.
|
||||
|
||||
name (str): The name of the component to enable.
|
||||
"""
|
||||
if name not in self.component_names:
|
||||
raise ValueError(Errors.E001.format(name=name, opts=self.component_names))
|
||||
if name in self.disabled:
|
||||
self._disabled.remove(name)
|
||||
|
||||
def __call__(
|
||||
self,
|
||||
text: str,
|
||||
*,
|
||||
disable: Iterable[str] = tuple(),
|
||||
disable: Iterable[str] = SimpleFrozenList(),
|
||||
component_cfg: Optional[Dict[str, Dict[str, Any]]] = None,
|
||||
) -> Doc:
|
||||
"""Apply the pipeline to some text. The text can span multiple sentences,
|
||||
|
@ -892,7 +977,7 @@ class Language:
|
|||
warnings.warn(Warnings.W096, DeprecationWarning)
|
||||
if len(names) == 1 and isinstance(names[0], (list, tuple)):
|
||||
names = names[0] # support list of names instead of spread
|
||||
return DisabledPipes(self, names)
|
||||
return self.select_pipes(disable=names)
|
||||
|
||||
def select_pipes(
|
||||
self,
|
||||
|
@ -945,7 +1030,7 @@ class Language:
|
|||
sgd: Optional[Optimizer] = None,
|
||||
losses: Optional[Dict[str, float]] = None,
|
||||
component_cfg: Optional[Dict[str, Dict[str, Any]]] = None,
|
||||
exclude: Iterable[str] = tuple(),
|
||||
exclude: Iterable[str] = SimpleFrozenList(),
|
||||
):
|
||||
"""Update the models in the pipeline.
|
||||
|
||||
|
@ -999,7 +1084,7 @@ class Language:
|
|||
sgd: Optional[Optimizer] = None,
|
||||
losses: Optional[Dict[str, float]] = None,
|
||||
component_cfg: Optional[Dict[str, Dict[str, Any]]] = None,
|
||||
exclude: Iterable[str] = tuple(),
|
||||
exclude: Iterable[str] = SimpleFrozenList(),
|
||||
) -> Dict[str, float]:
|
||||
"""Make a "rehearsal" update to the models in the pipeline, to prevent
|
||||
forgetting. Rehearsal updates run an initial copy of the model over some
|
||||
|
@ -1228,7 +1313,7 @@ class Language:
|
|||
*,
|
||||
as_tuples: bool = False,
|
||||
batch_size: int = 1000,
|
||||
disable: Iterable[str] = tuple(),
|
||||
disable: Iterable[str] = SimpleFrozenList(),
|
||||
cleanup: bool = False,
|
||||
component_cfg: Optional[Dict[str, Dict[str, Any]]] = None,
|
||||
n_process: int = 1,
|
||||
|
@ -1388,7 +1473,8 @@ class Language:
|
|||
config: Union[Dict[str, Any], Config] = {},
|
||||
*,
|
||||
vocab: Union[Vocab, bool] = True,
|
||||
disable: Iterable[str] = tuple(),
|
||||
disable: Iterable[str] = SimpleFrozenList(),
|
||||
exclude: Iterable[str] = SimpleFrozenList(),
|
||||
auto_fill: bool = True,
|
||||
validate: bool = True,
|
||||
) -> "Language":
|
||||
|
@ -1398,7 +1484,11 @@ class Language:
|
|||
|
||||
config (Dict[str, Any] / Config): The loaded config.
|
||||
vocab (Vocab): A Vocab object. If True, a vocab is created.
|
||||
disable (Iterable[str]): List of pipeline component names to disable.
|
||||
disable (Iterable[str]): Names of pipeline components to disable.
|
||||
Disabled pipes will be loaded but they won't be run unless you
|
||||
explicitly enable them by calling nlp.enable_pipe.
|
||||
exclude (Iterable[str]): Names of pipeline components to exclude.
|
||||
Excluded components won't be loaded.
|
||||
auto_fill (bool): Automatically fill in missing values in config based
|
||||
on defaults and function argument annotations.
|
||||
validate (bool): Validate the component config and arguments against
|
||||
|
@ -1471,7 +1561,7 @@ class Language:
|
|||
raise ValueError(Errors.E956.format(name=pipe_name, opts=opts))
|
||||
pipe_cfg = util.copy_config(pipeline[pipe_name])
|
||||
raw_config = Config(filled["components"][pipe_name])
|
||||
if pipe_name not in disable:
|
||||
if pipe_name not in exclude:
|
||||
if "factory" not in pipe_cfg and "source" not in pipe_cfg:
|
||||
err = Errors.E984.format(name=pipe_name, config=pipe_cfg)
|
||||
raise ValueError(err)
|
||||
|
@ -1496,6 +1586,8 @@ class Language:
|
|||
)
|
||||
source_name = pipe_cfg.get("component", pipe_name)
|
||||
nlp.add_pipe(source_name, source=source_nlps[model], name=pipe_name)
|
||||
disabled_pipes = [*config["nlp"]["disabled"], *disable]
|
||||
nlp._disabled = set(p for p in disabled_pipes if p not in exclude)
|
||||
nlp.config = filled if auto_fill else config
|
||||
nlp.resolved = resolved
|
||||
if after_pipeline_creation is not None:
|
||||
|
@ -1507,7 +1599,7 @@ class Language:
|
|||
return nlp
|
||||
|
||||
def to_disk(
|
||||
self, path: Union[str, Path], *, exclude: Iterable[str] = tuple()
|
||||
self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
|
||||
) -> None:
|
||||
"""Save the current state to a directory. If a model is loaded, this
|
||||
will include the model.
|
||||
|
@ -1525,9 +1617,7 @@ class Language:
|
|||
)
|
||||
serializers["meta.json"] = lambda p: srsly.write_json(p, self.meta)
|
||||
serializers["config.cfg"] = lambda p: self.config.to_disk(p)
|
||||
for name, proc in self.pipeline:
|
||||
if not hasattr(proc, "name"):
|
||||
continue
|
||||
for name, proc in self._components:
|
||||
if name in exclude:
|
||||
continue
|
||||
if not hasattr(proc, "to_disk"):
|
||||
|
@ -1537,7 +1627,7 @@ class Language:
|
|||
util.to_disk(path, serializers, exclude)
|
||||
|
||||
def from_disk(
|
||||
self, path: Union[str, Path], *, exclude: Iterable[str] = tuple()
|
||||
self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
|
||||
) -> "Language":
|
||||
"""Loads state from a directory. Modifies the object in place and
|
||||
returns it. If the saved `Language` object contains a model, the
|
||||
|
@ -1573,7 +1663,7 @@ class Language:
|
|||
deserializers["tokenizer"] = lambda p: self.tokenizer.from_disk(
|
||||
p, exclude=["vocab"]
|
||||
)
|
||||
for name, proc in self.pipeline:
|
||||
for name, proc in self._components:
|
||||
if name in exclude:
|
||||
continue
|
||||
if not hasattr(proc, "from_disk"):
|
||||
|
@ -1589,7 +1679,7 @@ class Language:
|
|||
self._link_components()
|
||||
return self
|
||||
|
||||
def to_bytes(self, *, exclude: Iterable[str] = tuple()) -> bytes:
|
||||
def to_bytes(self, *, exclude: Iterable[str] = SimpleFrozenList()) -> bytes:
|
||||
"""Serialize the current state to a binary string.
|
||||
|
||||
exclude (list): Names of components or serialization fields to exclude.
|
||||
|
@ -1602,7 +1692,7 @@ class Language:
|
|||
serializers["tokenizer"] = lambda: self.tokenizer.to_bytes(exclude=["vocab"])
|
||||
serializers["meta.json"] = lambda: srsly.json_dumps(self.meta)
|
||||
serializers["config.cfg"] = lambda: self.config.to_bytes()
|
||||
for name, proc in self.pipeline:
|
||||
for name, proc in self._components:
|
||||
if name in exclude:
|
||||
continue
|
||||
if not hasattr(proc, "to_bytes"):
|
||||
|
@ -1611,7 +1701,7 @@ class Language:
|
|||
return util.to_bytes(serializers, exclude)
|
||||
|
||||
def from_bytes(
|
||||
self, bytes_data: bytes, *, exclude: Iterable[str] = tuple()
|
||||
self, bytes_data: bytes, *, exclude: Iterable[str] = SimpleFrozenList()
|
||||
) -> "Language":
|
||||
"""Load state from a binary string.
|
||||
|
||||
|
@ -1638,7 +1728,7 @@ class Language:
|
|||
deserializers["tokenizer"] = lambda b: self.tokenizer.from_bytes(
|
||||
b, exclude=["vocab"]
|
||||
)
|
||||
for name, proc in self.pipeline:
|
||||
for name, proc in self._components:
|
||||
if name in exclude:
|
||||
continue
|
||||
if not hasattr(proc, "from_bytes"):
|
||||
|
@ -1674,14 +1764,10 @@ class DisabledPipes(list):
|
|||
def __init__(self, nlp: Language, names: List[str]) -> None:
|
||||
self.nlp = nlp
|
||||
self.names = names
|
||||
# Important! Not deep copy -- we just want the container (but we also
|
||||
# want to support people providing arbitrarily typed nlp.pipeline
|
||||
# objects.)
|
||||
self.original_pipeline = copy(nlp.pipeline)
|
||||
self.metas = {name: nlp.get_pipe_meta(name) for name in names}
|
||||
self.configs = {name: nlp.get_pipe_config(name) for name in names}
|
||||
for name in self.names:
|
||||
self.nlp.disable_pipe(name)
|
||||
list.__init__(self)
|
||||
self.extend(nlp.remove_pipe(name) for name in names)
|
||||
self.extend(self.names)
|
||||
|
||||
def __enter__(self):
|
||||
return self
|
||||
|
@ -1691,14 +1777,10 @@ class DisabledPipes(list):
|
|||
|
||||
def restore(self) -> None:
|
||||
"""Restore the pipeline to its state when DisabledPipes was created."""
|
||||
current, self.nlp.pipeline = self.nlp.pipeline, self.original_pipeline
|
||||
unexpected = [name for name, pipe in current if not self.nlp.has_pipe(name)]
|
||||
if unexpected:
|
||||
# Don't change the pipeline if we're raising an error.
|
||||
self.nlp.pipeline = current
|
||||
raise ValueError(Errors.E008.format(names=unexpected))
|
||||
self.nlp._pipe_meta.update(self.metas)
|
||||
self.nlp._pipe_configs.update(self.configs)
|
||||
for name in self.names:
|
||||
if name not in self.nlp.component_names:
|
||||
raise ValueError(Errors.E008.format(name=name))
|
||||
self.nlp.enable_pipe(name)
|
||||
self[:] = []
|
||||
|
||||
|
||||
|
|
|
@ -12,6 +12,7 @@ from ..symbols import IDS, TAG, POS, MORPH, LEMMA
|
|||
from ..tokens import Doc, Span
|
||||
from ..tokens._retokenize import normalize_token_attrs, set_token_attrs
|
||||
from ..vocab import Vocab
|
||||
from ..util import SimpleFrozenList
|
||||
from .. import util
|
||||
|
||||
|
||||
|
@ -220,7 +221,7 @@ class AttributeRuler(Pipe):
|
|||
results.update(Scorer.score_token_attr(examples, "lemma", **kwargs))
|
||||
return results
|
||||
|
||||
def to_bytes(self, exclude: Iterable[str] = tuple()) -> bytes:
|
||||
def to_bytes(self, exclude: Iterable[str] = SimpleFrozenList()) -> bytes:
|
||||
"""Serialize the AttributeRuler to a bytestring.
|
||||
|
||||
exclude (Iterable[str]): String names of serialization fields to exclude.
|
||||
|
@ -233,7 +234,9 @@ class AttributeRuler(Pipe):
|
|||
serialize["patterns"] = lambda: srsly.msgpack_dumps(self.patterns)
|
||||
return util.to_bytes(serialize, exclude)
|
||||
|
||||
def from_bytes(self, bytes_data: bytes, exclude: Iterable[str] = tuple()):
|
||||
def from_bytes(
|
||||
self, bytes_data: bytes, exclude: Iterable[str] = SimpleFrozenList()
|
||||
):
|
||||
"""Load the AttributeRuler from a bytestring.
|
||||
|
||||
bytes_data (bytes): The data to load.
|
||||
|
@ -254,7 +257,9 @@ class AttributeRuler(Pipe):
|
|||
|
||||
return self
|
||||
|
||||
def to_disk(self, path: Union[Path, str], exclude: Iterable[str] = tuple()) -> None:
|
||||
def to_disk(
|
||||
self, path: Union[Path, str], exclude: Iterable[str] = SimpleFrozenList()
|
||||
) -> None:
|
||||
"""Serialize the AttributeRuler to disk.
|
||||
|
||||
path (Union[Path, str]): A path to a directory.
|
||||
|
@ -268,7 +273,7 @@ class AttributeRuler(Pipe):
|
|||
util.to_disk(path, serialize, exclude)
|
||||
|
||||
def from_disk(
|
||||
self, path: Union[Path, str], exclude: Iterable[str] = tuple()
|
||||
self, path: Union[Path, str], exclude: Iterable[str] = SimpleFrozenList()
|
||||
) -> None:
|
||||
"""Load the AttributeRuler from disk.
|
||||
|
||||
|
|
|
@ -13,6 +13,7 @@ from ..language import Language
|
|||
from ..vocab import Vocab
|
||||
from ..gold import Example, validate_examples
|
||||
from ..errors import Errors, Warnings
|
||||
from ..util import SimpleFrozenList
|
||||
from .. import util
|
||||
|
||||
|
||||
|
@ -404,7 +405,7 @@ class EntityLinker(Pipe):
|
|||
token.ent_kb_id_ = kb_id
|
||||
|
||||
def to_disk(
|
||||
self, path: Union[str, Path], *, exclude: Iterable[str] = tuple()
|
||||
self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList(),
|
||||
) -> None:
|
||||
"""Serialize the pipe to disk.
|
||||
|
||||
|
@ -421,7 +422,7 @@ class EntityLinker(Pipe):
|
|||
util.to_disk(path, serialize, exclude)
|
||||
|
||||
def from_disk(
|
||||
self, path: Union[str, Path], *, exclude: Iterable[str] = tuple()
|
||||
self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList(),
|
||||
) -> "EntityLinker":
|
||||
"""Load the pipe from disk. Modifies the object in place and returns it.
|
||||
|
||||
|
|
|
@ -5,7 +5,7 @@ import srsly
|
|||
|
||||
from ..language import Language
|
||||
from ..errors import Errors
|
||||
from ..util import ensure_path, to_disk, from_disk
|
||||
from ..util import ensure_path, to_disk, from_disk, SimpleFrozenList
|
||||
from ..tokens import Doc, Span
|
||||
from ..matcher import Matcher, PhraseMatcher
|
||||
from ..scorer import Scorer
|
||||
|
@ -317,7 +317,7 @@ class EntityRuler:
|
|||
return Scorer.score_spans(examples, "ents", **kwargs)
|
||||
|
||||
def from_bytes(
|
||||
self, patterns_bytes: bytes, *, exclude: Iterable[str] = tuple()
|
||||
self, patterns_bytes: bytes, *, exclude: Iterable[str] = SimpleFrozenList()
|
||||
) -> "EntityRuler":
|
||||
"""Load the entity ruler from a bytestring.
|
||||
|
||||
|
@ -341,7 +341,7 @@ class EntityRuler:
|
|||
self.add_patterns(cfg)
|
||||
return self
|
||||
|
||||
def to_bytes(self, *, exclude: Iterable[str] = tuple()) -> bytes:
|
||||
def to_bytes(self, *, exclude: Iterable[str] = SimpleFrozenList()) -> bytes:
|
||||
"""Serialize the entity ruler patterns to a bytestring.
|
||||
|
||||
RETURNS (bytes): The serialized patterns.
|
||||
|
@ -357,7 +357,7 @@ class EntityRuler:
|
|||
return srsly.msgpack_dumps(serial)
|
||||
|
||||
def from_disk(
|
||||
self, path: Union[str, Path], *, exclude: Iterable[str] = tuple()
|
||||
self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
|
||||
) -> "EntityRuler":
|
||||
"""Load the entity ruler from a file. Expects a file containing
|
||||
newline-delimited JSON (JSONL) with one entry per line.
|
||||
|
@ -394,7 +394,7 @@ class EntityRuler:
|
|||
return self
|
||||
|
||||
def to_disk(
|
||||
self, path: Union[str, Path], *, exclude: Iterable[str] = tuple()
|
||||
self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
|
||||
) -> None:
|
||||
"""Save the entity ruler patterns to a directory. The patterns will be
|
||||
saved as newline-delimited JSON (JSONL).
|
||||
|
|
|
@ -223,6 +223,7 @@ class ConfigSchemaNlp(BaseModel):
|
|||
# fmt: off
|
||||
lang: StrictStr = Field(..., title="The base language to use")
|
||||
pipeline: List[StrictStr] = Field(..., title="The pipeline component names in order")
|
||||
disabled: List[StrictStr] = Field(..., title="Pipeline components to disable by default")
|
||||
tokenizer: Callable = Field(..., title="The tokenizer to use")
|
||||
load_vocab_data: StrictBool = Field(..., title="Whether to load additional vocab data from spacy-lookups-data")
|
||||
before_creation: Optional[Callable[[Type["Language"]], Type["Language"]]] = Field(..., title="Optional callback to modify Language class before initialization")
|
||||
|
|
|
@ -1,10 +1,10 @@
|
|||
from typing import Optional, Iterable, Dict, Any, Callable, Tuple, TYPE_CHECKING
|
||||
from typing import Optional, Iterable, Dict, Any, Callable, TYPE_CHECKING
|
||||
import numpy as np
|
||||
|
||||
from .gold import Example
|
||||
from .tokens import Token, Doc, Span
|
||||
from .errors import Errors
|
||||
from .util import get_lang_class
|
||||
from .util import get_lang_class, SimpleFrozenList
|
||||
from .morphology import Morphology
|
||||
|
||||
if TYPE_CHECKING:
|
||||
|
@ -317,7 +317,7 @@ class Scorer:
|
|||
attr: str,
|
||||
*,
|
||||
getter: Callable[[Doc, str], Any] = getattr,
|
||||
labels: Iterable[str] = tuple(),
|
||||
labels: Iterable[str] = SimpleFrozenList(),
|
||||
multi_label: bool = True,
|
||||
positive_label: Optional[str] = None,
|
||||
threshold: Optional[float] = None,
|
||||
|
@ -447,7 +447,7 @@ class Scorer:
|
|||
getter: Callable[[Token, str], Any] = getattr,
|
||||
head_attr: str = "head",
|
||||
head_getter: Callable[[Token, str], Token] = getattr,
|
||||
ignore_labels: Tuple[str] = tuple(),
|
||||
ignore_labels: Iterable[str] = SimpleFrozenList(),
|
||||
**cfg,
|
||||
) -> Dict[str, Any]:
|
||||
"""Returns the UAS, LAS, and LAS per type scores for dependency
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
import pytest
|
||||
from spacy.language import Language
|
||||
from spacy.util import SimpleFrozenList
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
|
@ -181,6 +182,11 @@ def test_select_pipes_errors(nlp):
|
|||
with pytest.raises(ValueError):
|
||||
nlp.select_pipes(enable=[], disable=["c3"])
|
||||
|
||||
disabled = nlp.select_pipes(disable=["c2"])
|
||||
nlp.remove_pipe("c2")
|
||||
with pytest.raises(ValueError):
|
||||
disabled.restore()
|
||||
|
||||
|
||||
@pytest.mark.parametrize("n_pipes", [100])
|
||||
def test_add_lots_of_pipes(nlp, n_pipes):
|
||||
|
@ -249,3 +255,94 @@ def test_add_pipe_before_after():
|
|||
nlp.add_pipe("entity_ruler", before=True)
|
||||
with pytest.raises(ValueError):
|
||||
nlp.add_pipe("entity_ruler", first=False)
|
||||
|
||||
|
||||
def test_disable_enable_pipes():
|
||||
name = "test_disable_enable_pipes"
|
||||
results = {}
|
||||
|
||||
def make_component(name):
|
||||
results[name] = ""
|
||||
|
||||
def component(doc):
|
||||
nonlocal results
|
||||
results[name] = doc.text
|
||||
return doc
|
||||
|
||||
return component
|
||||
|
||||
c1 = Language.component(f"{name}1", func=make_component(f"{name}1"))
|
||||
c2 = Language.component(f"{name}2", func=make_component(f"{name}2"))
|
||||
|
||||
nlp = Language()
|
||||
nlp.add_pipe(f"{name}1")
|
||||
nlp.add_pipe(f"{name}2")
|
||||
assert results[f"{name}1"] == ""
|
||||
assert results[f"{name}2"] == ""
|
||||
assert nlp.pipeline == [(f"{name}1", c1), (f"{name}2", c2)]
|
||||
assert nlp.pipe_names == [f"{name}1", f"{name}2"]
|
||||
nlp.disable_pipe(f"{name}1")
|
||||
assert nlp.disabled == [f"{name}1"]
|
||||
assert nlp.component_names == [f"{name}1", f"{name}2"]
|
||||
assert nlp.pipe_names == [f"{name}2"]
|
||||
assert nlp.config["nlp"]["disabled"] == [f"{name}1"]
|
||||
nlp("hello")
|
||||
assert results[f"{name}1"] == "" # didn't run
|
||||
assert results[f"{name}2"] == "hello" # ran
|
||||
nlp.enable_pipe(f"{name}1")
|
||||
assert nlp.disabled == []
|
||||
assert nlp.pipe_names == [f"{name}1", f"{name}2"]
|
||||
assert nlp.config["nlp"]["disabled"] == []
|
||||
nlp("world")
|
||||
assert results[f"{name}1"] == "world"
|
||||
assert results[f"{name}2"] == "world"
|
||||
nlp.disable_pipe(f"{name}2")
|
||||
nlp.remove_pipe(f"{name}2")
|
||||
assert nlp.components == [(f"{name}1", c1)]
|
||||
assert nlp.pipeline == [(f"{name}1", c1)]
|
||||
assert nlp.component_names == [f"{name}1"]
|
||||
assert nlp.pipe_names == [f"{name}1"]
|
||||
assert nlp.disabled == []
|
||||
assert nlp.config["nlp"]["disabled"] == []
|
||||
nlp.rename_pipe(f"{name}1", name)
|
||||
assert nlp.components == [(name, c1)]
|
||||
assert nlp.component_names == [name]
|
||||
nlp("!")
|
||||
assert results[f"{name}1"] == "!"
|
||||
assert results[f"{name}2"] == "world"
|
||||
with pytest.raises(ValueError):
|
||||
nlp.disable_pipe(f"{name}2")
|
||||
nlp.disable_pipe(name)
|
||||
assert nlp.component_names == [name]
|
||||
assert nlp.pipe_names == []
|
||||
assert nlp.config["nlp"]["disabled"] == [name]
|
||||
nlp("?")
|
||||
assert results[f"{name}1"] == "!"
|
||||
|
||||
|
||||
def test_pipe_methods_frozen():
|
||||
"""Test that spaCy raises custom error messages if "frozen" properties are
|
||||
accessed. We still want to use a list here to not break backwards
|
||||
compatibility, but users should see an error if they're trying to append
|
||||
to nlp.pipeline etc."""
|
||||
nlp = Language()
|
||||
ner = nlp.add_pipe("ner")
|
||||
assert nlp.pipe_names == ["ner"]
|
||||
for prop in [
|
||||
nlp.pipeline,
|
||||
nlp.pipe_names,
|
||||
nlp.components,
|
||||
nlp.component_names,
|
||||
nlp.disabled,
|
||||
nlp.factory_names,
|
||||
]:
|
||||
assert isinstance(prop, list)
|
||||
assert isinstance(prop, SimpleFrozenList)
|
||||
with pytest.raises(NotImplementedError):
|
||||
nlp.pipeline.append(("ner2", ner))
|
||||
with pytest.raises(NotImplementedError):
|
||||
nlp.pipe_names.pop()
|
||||
with pytest.raises(NotImplementedError):
|
||||
nlp.components.sort()
|
||||
with pytest.raises(NotImplementedError):
|
||||
nlp.component_names.clear()
|
||||
|
|
|
@ -161,6 +161,7 @@ def test_issue4674():
|
|||
assert kb2.get_size_entities() == 1
|
||||
|
||||
|
||||
@pytest.mark.skip(reason="API change: disable just disables, new exclude arg")
|
||||
def test_issue4707():
|
||||
"""Tests that disabled component names are also excluded from nlp.from_disk
|
||||
by default when loading a model.
|
||||
|
|
|
@ -6,6 +6,8 @@ from spacy.pipeline.dep_parser import DEFAULT_PARSER_MODEL
|
|||
from spacy.pipeline.tagger import DEFAULT_TAGGER_MODEL
|
||||
from spacy.pipeline.textcat import DEFAULT_TEXTCAT_MODEL
|
||||
from spacy.pipeline.senter import DEFAULT_SENTER_MODEL
|
||||
from spacy.lang.en import English
|
||||
import spacy
|
||||
|
||||
from ..util import make_tempdir
|
||||
|
||||
|
@ -173,3 +175,34 @@ def test_serialize_sentencerecognizer(en_vocab):
|
|||
sr_b = sr.to_bytes()
|
||||
sr_d = SentenceRecognizer(en_vocab, model).from_bytes(sr_b)
|
||||
assert sr.to_bytes() == sr_d.to_bytes()
|
||||
|
||||
|
||||
def test_serialize_pipeline_disable_enable():
|
||||
nlp = English()
|
||||
nlp.add_pipe("ner")
|
||||
nlp.add_pipe("tagger")
|
||||
nlp.disable_pipe("tagger")
|
||||
assert nlp.config["nlp"]["disabled"] == ["tagger"]
|
||||
config = nlp.config.copy()
|
||||
nlp2 = English.from_config(config)
|
||||
assert nlp2.pipe_names == ["ner"]
|
||||
assert nlp2.component_names == ["ner", "tagger"]
|
||||
assert nlp2.disabled == ["tagger"]
|
||||
assert nlp2.config["nlp"]["disabled"] == ["tagger"]
|
||||
with make_tempdir() as d:
|
||||
nlp2.to_disk(d)
|
||||
nlp3 = spacy.load(d)
|
||||
assert nlp3.pipe_names == ["ner"]
|
||||
assert nlp3.component_names == ["ner", "tagger"]
|
||||
with make_tempdir() as d:
|
||||
nlp3.to_disk(d)
|
||||
nlp4 = spacy.load(d, disable=["ner"])
|
||||
assert nlp4.pipe_names == []
|
||||
assert nlp4.component_names == ["ner", "tagger"]
|
||||
assert nlp4.disabled == ["ner", "tagger"]
|
||||
with make_tempdir() as d:
|
||||
nlp.to_disk(d)
|
||||
nlp5 = spacy.load(d, exclude=["tagger"])
|
||||
assert nlp5.pipe_names == ["ner"]
|
||||
assert nlp5.component_names == ["ner"]
|
||||
assert nlp5.disabled == []
|
||||
|
|
|
@ -3,10 +3,9 @@ import pytest
|
|||
from .util import get_random_doc
|
||||
|
||||
from spacy import util
|
||||
from spacy.util import dot_to_object
|
||||
from spacy.util import dot_to_object, SimpleFrozenList
|
||||
from thinc.api import Config, Optimizer
|
||||
from spacy.gold.batchers import minibatch_by_words
|
||||
|
||||
from ..lang.en import English
|
||||
from ..lang.nl import Dutch
|
||||
from ..language import DEFAULT_CONFIG_PATH
|
||||
|
@ -106,3 +105,20 @@ def test_util_dot_section():
|
|||
assert not dot_to_object(en_config, "nlp.load_vocab_data")
|
||||
assert dot_to_object(nl_config, "nlp.load_vocab_data")
|
||||
assert isinstance(dot_to_object(nl_config, "training.optimizer"), Optimizer)
|
||||
|
||||
|
||||
def test_simple_frozen_list():
|
||||
t = SimpleFrozenList(["foo", "bar"])
|
||||
assert t == ["foo", "bar"]
|
||||
assert t.index("bar") == 1 # okay method
|
||||
with pytest.raises(NotImplementedError):
|
||||
t.append("baz")
|
||||
with pytest.raises(NotImplementedError):
|
||||
t.sort()
|
||||
with pytest.raises(NotImplementedError):
|
||||
t.extend(["baz"])
|
||||
with pytest.raises(NotImplementedError):
|
||||
t.pop()
|
||||
t = SimpleFrozenList(["foo", "bar"], error="Error!")
|
||||
with pytest.raises(NotImplementedError):
|
||||
t.append("baz")
|
||||
|
|
|
@ -10,7 +10,7 @@ from ..vocab import Vocab
|
|||
from ..compat import copy_reg
|
||||
from ..attrs import SPACY, ORTH, intify_attr
|
||||
from ..errors import Errors
|
||||
from ..util import ensure_path
|
||||
from ..util import ensure_path, SimpleFrozenList
|
||||
|
||||
# fmt: off
|
||||
ALL_ATTRS = ("ORTH", "TAG", "HEAD", "DEP", "ENT_IOB", "ENT_TYPE", "ENT_KB_ID", "LEMMA", "MORPH", "POS")
|
||||
|
@ -52,7 +52,7 @@ class DocBin:
|
|||
self,
|
||||
attrs: Iterable[str] = ALL_ATTRS,
|
||||
store_user_data: bool = False,
|
||||
docs: Iterable[Doc] = tuple(),
|
||||
docs: Iterable[Doc] = SimpleFrozenList(),
|
||||
) -> None:
|
||||
"""Create a DocBin object to hold serialized annotations.
|
||||
|
||||
|
|
104
spacy/util.py
104
spacy/util.py
|
@ -120,6 +120,47 @@ class SimpleFrozenDict(dict):
|
|||
raise NotImplementedError(self.error)
|
||||
|
||||
|
||||
class SimpleFrozenList(list):
|
||||
"""Wrapper class around a list that lets us raise custom errors if certain
|
||||
attributes/methods are accessed. Mostly used for properties like
|
||||
Language.pipeline that return an immutable list (and that we don't want to
|
||||
convert to a tuple to not break too much backwards compatibility). If a user
|
||||
accidentally calls nlp.pipeline.append(), we can raise a more helpful error.
|
||||
"""
|
||||
|
||||
def __init__(self, *args, error: str = Errors.E927) -> None:
|
||||
"""Initialize the frozen list.
|
||||
|
||||
error (str): The error message when user tries to mutate the list.
|
||||
"""
|
||||
self.error = error
|
||||
super().__init__(*args)
|
||||
|
||||
def append(self, *args, **kwargs):
|
||||
raise NotImplementedError(self.error)
|
||||
|
||||
def clear(self, *args, **kwargs):
|
||||
raise NotImplementedError(self.error)
|
||||
|
||||
def extend(self, *args, **kwargs):
|
||||
raise NotImplementedError(self.error)
|
||||
|
||||
def insert(self, *args, **kwargs):
|
||||
raise NotImplementedError(self.error)
|
||||
|
||||
def pop(self, *args, **kwargs):
|
||||
raise NotImplementedError(self.error)
|
||||
|
||||
def remove(self, *args, **kwargs):
|
||||
raise NotImplementedError(self.error)
|
||||
|
||||
def reverse(self, *args, **kwargs):
|
||||
raise NotImplementedError(self.error)
|
||||
|
||||
def sort(self, *args, **kwargs):
|
||||
raise NotImplementedError(self.error)
|
||||
|
||||
|
||||
def lang_class_is_loaded(lang: str) -> bool:
|
||||
"""Check whether a Language class is already loaded. Language classes are
|
||||
loaded lazily, to avoid expensive setup code associated with the language
|
||||
|
@ -215,7 +256,8 @@ def load_model(
|
|||
name: Union[str, Path],
|
||||
*,
|
||||
vocab: Union["Vocab", bool] = True,
|
||||
disable: Iterable[str] = tuple(),
|
||||
disable: Iterable[str] = SimpleFrozenList(),
|
||||
exclude: Iterable[str] = SimpleFrozenList(),
|
||||
config: Union[Dict[str, Any], Config] = SimpleFrozenDict(),
|
||||
) -> "Language":
|
||||
"""Load a model from a package or data path.
|
||||
|
@ -228,7 +270,7 @@ def load_model(
|
|||
keyed by section values in dot notation.
|
||||
RETURNS (Language): The loaded nlp object.
|
||||
"""
|
||||
kwargs = {"vocab": vocab, "disable": disable, "config": config}
|
||||
kwargs = {"vocab": vocab, "disable": disable, "exclude": exclude, "config": config}
|
||||
if isinstance(name, str): # name or string path
|
||||
if name.startswith("blank:"): # shortcut for blank model
|
||||
return get_lang_class(name.replace("blank:", ""))()
|
||||
|
@ -247,7 +289,8 @@ def load_model_from_package(
|
|||
name: str,
|
||||
*,
|
||||
vocab: Union["Vocab", bool] = True,
|
||||
disable: Iterable[str] = tuple(),
|
||||
disable: Iterable[str] = SimpleFrozenList(),
|
||||
exclude: Iterable[str] = SimpleFrozenList(),
|
||||
config: Union[Dict[str, Any], Config] = SimpleFrozenDict(),
|
||||
) -> "Language":
|
||||
"""Load a model from an installed package.
|
||||
|
@ -255,13 +298,17 @@ def load_model_from_package(
|
|||
name (str): The package name.
|
||||
vocab (Vocab / True): Optional vocab to pass in on initialization. If True,
|
||||
a new Vocab object will be created.
|
||||
disable (Iterable[str]): Names of pipeline components to disable.
|
||||
disable (Iterable[str]): Names of pipeline components to disable. Disabled
|
||||
pipes will be loaded but they won't be run unless you explicitly
|
||||
enable them by calling nlp.enable_pipe.
|
||||
exclude (Iterable[str]): Names of pipeline components to exclude. Excluded
|
||||
components won't be loaded.
|
||||
config (Dict[str, Any] / Config): Config overrides as nested dict or dict
|
||||
keyed by section values in dot notation.
|
||||
RETURNS (Language): The loaded nlp object.
|
||||
"""
|
||||
cls = importlib.import_module(name)
|
||||
return cls.load(vocab=vocab, disable=disable, config=config)
|
||||
return cls.load(vocab=vocab, disable=disable, exclude=exclude, config=config)
|
||||
|
||||
|
||||
def load_model_from_path(
|
||||
|
@ -269,7 +316,8 @@ def load_model_from_path(
|
|||
*,
|
||||
meta: Optional[Dict[str, Any]] = None,
|
||||
vocab: Union["Vocab", bool] = True,
|
||||
disable: Iterable[str] = tuple(),
|
||||
disable: Iterable[str] = SimpleFrozenList(),
|
||||
exclude: Iterable[str] = SimpleFrozenList(),
|
||||
config: Union[Dict[str, Any], Config] = SimpleFrozenDict(),
|
||||
) -> "Language":
|
||||
"""Load a model from a data directory path. Creates Language class with
|
||||
|
@ -279,7 +327,11 @@ def load_model_from_path(
|
|||
meta (Dict[str, Any]): Optional model meta.
|
||||
vocab (Vocab / True): Optional vocab to pass in on initialization. If True,
|
||||
a new Vocab object will be created.
|
||||
disable (Iterable[str]): Names of pipeline components to disable.
|
||||
disable (Iterable[str]): Names of pipeline components to disable. Disabled
|
||||
pipes will be loaded but they won't be run unless you explicitly
|
||||
enable them by calling nlp.enable_pipe.
|
||||
exclude (Iterable[str]): Names of pipeline components to exclude. Excluded
|
||||
components won't be loaded.
|
||||
config (Dict[str, Any] / Config): Config overrides as nested dict or dict
|
||||
keyed by section values in dot notation.
|
||||
RETURNS (Language): The loaded nlp object.
|
||||
|
@ -290,15 +342,18 @@ def load_model_from_path(
|
|||
meta = get_model_meta(model_path)
|
||||
config_path = model_path / "config.cfg"
|
||||
config = load_config(config_path, overrides=dict_to_dot(config))
|
||||
nlp, _ = load_model_from_config(config, vocab=vocab, disable=disable)
|
||||
return nlp.from_disk(model_path, exclude=disable)
|
||||
nlp, _ = load_model_from_config(
|
||||
config, vocab=vocab, disable=disable, exclude=exclude
|
||||
)
|
||||
return nlp.from_disk(model_path, exclude=exclude)
|
||||
|
||||
|
||||
def load_model_from_config(
|
||||
config: Union[Dict[str, Any], Config],
|
||||
*,
|
||||
vocab: Union["Vocab", bool] = True,
|
||||
disable: Iterable[str] = tuple(),
|
||||
disable: Iterable[str] = SimpleFrozenList(),
|
||||
exclude: Iterable[str] = SimpleFrozenList(),
|
||||
auto_fill: bool = False,
|
||||
validate: bool = True,
|
||||
) -> Tuple["Language", Config]:
|
||||
|
@ -309,7 +364,11 @@ def load_model_from_config(
|
|||
meta (Dict[str, Any]): Optional model meta.
|
||||
vocab (Vocab / True): Optional vocab to pass in on initialization. If True,
|
||||
a new Vocab object will be created.
|
||||
disable (Iterable[str]): Names of pipeline components to disable.
|
||||
disable (Iterable[str]): Names of pipeline components to disable. Disabled
|
||||
pipes will be loaded but they won't be run unless you explicitly
|
||||
enable them by calling nlp.enable_pipe.
|
||||
exclude (Iterable[str]): Names of pipeline components to exclude. Excluded
|
||||
components won't be loaded.
|
||||
auto_fill (bool): Whether to auto-fill config with missing defaults.
|
||||
validate (bool): Whether to show config validation errors.
|
||||
RETURNS (Language): The loaded nlp object.
|
||||
|
@ -323,7 +382,12 @@ def load_model_from_config(
|
|||
# registry, including custom subclasses provided via entry points
|
||||
lang_cls = get_lang_class(nlp_config["lang"])
|
||||
nlp = lang_cls.from_config(
|
||||
config, vocab=vocab, disable=disable, auto_fill=auto_fill, validate=validate,
|
||||
config,
|
||||
vocab=vocab,
|
||||
disable=disable,
|
||||
exclude=exclude,
|
||||
auto_fill=auto_fill,
|
||||
validate=validate,
|
||||
)
|
||||
return nlp, nlp.resolved
|
||||
|
||||
|
@ -332,7 +396,8 @@ def load_model_from_init_py(
|
|||
init_file: Union[Path, str],
|
||||
*,
|
||||
vocab: Union["Vocab", bool] = True,
|
||||
disable: Iterable[str] = tuple(),
|
||||
disable: Iterable[str] = SimpleFrozenList(),
|
||||
exclude: Iterable[str] = SimpleFrozenList(),
|
||||
config: Union[Dict[str, Any], Config] = SimpleFrozenDict(),
|
||||
) -> "Language":
|
||||
"""Helper function to use in the `load()` method of a model package's
|
||||
|
@ -340,7 +405,11 @@ def load_model_from_init_py(
|
|||
|
||||
vocab (Vocab / True): Optional vocab to pass in on initialization. If True,
|
||||
a new Vocab object will be created.
|
||||
disable (Iterable[str]): Names of pipeline components to disable.
|
||||
disable (Iterable[str]): Names of pipeline components to disable. Disabled
|
||||
pipes will be loaded but they won't be run unless you explicitly
|
||||
enable them by calling nlp.enable_pipe.
|
||||
exclude (Iterable[str]): Names of pipeline components to exclude. Excluded
|
||||
components won't be loaded.
|
||||
config (Dict[str, Any] / Config): Config overrides as nested dict or dict
|
||||
keyed by section values in dot notation.
|
||||
RETURNS (Language): The loaded nlp object.
|
||||
|
@ -352,7 +421,12 @@ def load_model_from_init_py(
|
|||
if not model_path.exists():
|
||||
raise IOError(Errors.E052.format(path=data_path))
|
||||
return load_model_from_path(
|
||||
data_path, vocab=vocab, meta=meta, disable=disable, config=config
|
||||
data_path,
|
||||
vocab=vocab,
|
||||
meta=meta,
|
||||
disable=disable,
|
||||
exclude=exclude,
|
||||
config=config,
|
||||
)
|
||||
|
||||
|
||||
|
|
|
@ -74,15 +74,16 @@ your config and check that it's valid, you can run the
|
|||
Defines the `nlp` object, its tokenizer and
|
||||
[processing pipeline](/usage/processing-pipelines) component names.
|
||||
|
||||
| Name | Description |
|
||||
| ------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `lang` | Model language [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes). Defaults to `null`. ~~str~~ |
|
||||
| `pipeline` | Names of pipeline components in order. Should correspond to sections in the `[components]` block, e.g. `[components.ner]`. See docs on [defining components](/usage/training#config-components). Defaults to `[]`. ~~List[str]~~ |
|
||||
| `load_vocab_data` | Whether to load additional lexeme and vocab data from [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) if available. Defaults to `true`. ~~bool~~ |
|
||||
| `before_creation` | Optional [callback](/usage/training#custom-code-nlp-callbacks) to modify `Language` subclass before it's initialized. Defaults to `null`. ~~Optional[Callable[[Type[Language]], Type[Language]]]~~ |
|
||||
| `after_creation` | Optional [callback](/usage/training#custom-code-nlp-callbacks) to modify `nlp` object right after it's initialized. Defaults to `null`. ~~Optional[Callable[[Language], Language]]~~ |
|
||||
| `after_pipeline_creation` | Optional [callback](/usage/training#custom-code-nlp-callbacks) to modify `nlp` object after the pipeline components have been added. Defaults to `null`. ~~Optional[Callable[[Language], Language]]~~ |
|
||||
| `tokenizer` | The tokenizer to use. Defaults to [`Tokenizer`](/api/tokenizer). ~~Callable[[str], Doc]~~ |
|
||||
| Name | Description |
|
||||
| ------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `lang` | Model language [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes). Defaults to `null`. ~~str~~ |
|
||||
| `pipeline` | Names of pipeline components in order. Should correspond to sections in the `[components]` block, e.g. `[components.ner]`. See docs on [defining components](/usage/training#config-components). Defaults to `[]`. ~~List[str]~~ |
|
||||
| `disabled` | Names of pipeline components that are loaded but disabled by default and not run as part of the pipeline. Should correspond to components listed in `pipeline`. After a model is loaded, disabled components can be enabled using [`Language.enable_pipe`](/api/language#enable_pipe). ~~List[str]~~ |
|
||||
| `load_vocab_data` | Whether to load additional lexeme and vocab data from [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) if available. Defaults to `true`. ~~bool~~ |
|
||||
| `before_creation` | Optional [callback](/usage/training#custom-code-nlp-callbacks) to modify `Language` subclass before it's initialized. Defaults to `null`. ~~Optional[Callable[[Type[Language]], Type[Language]]]~~ |
|
||||
| `after_creation` | Optional [callback](/usage/training#custom-code-nlp-callbacks) to modify `nlp` object right after it's initialized. Defaults to `null`. ~~Optional[Callable[[Language], Language]]~~ |
|
||||
| `after_pipeline_creation` | Optional [callback](/usage/training#custom-code-nlp-callbacks) to modify `nlp` object after the pipeline components have been added. Defaults to `null`. ~~Optional[Callable[[Language], Language]]~~ |
|
||||
| `tokenizer` | The tokenizer to use. Defaults to [`Tokenizer`](/api/tokenizer). ~~Callable[[str], Doc]~~ |
|
||||
|
||||
### components {#config-components tag="section"}
|
||||
|
||||
|
|
|
@ -357,35 +357,6 @@ their original weights after the block.
|
|||
| -------- | ------------------------------------------------------ |
|
||||
| `params` | A dictionary of parameters keyed by model ID. ~~dict~~ |
|
||||
|
||||
## Language.create_pipe {#create_pipe tag="method" new="2"}
|
||||
|
||||
Create a pipeline component from a factory.
|
||||
|
||||
<Infobox title="Changed in v3.0" variant="warning">
|
||||
|
||||
As of v3.0, the [`Language.add_pipe`](/api/language#add_pipe) method also takes
|
||||
the string name of the factory, creates the component, adds it to the pipeline
|
||||
and returns it. The `Language.create_pipe` method is now mostly used internally.
|
||||
To create a component and add it to the pipeline, you should always use
|
||||
`Language.add_pipe`.
|
||||
|
||||
</Infobox>
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> parser = nlp.create_pipe("parser")
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| ------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `factory_name` | Name of the registered component factory. ~~str~~ |
|
||||
| `name` | Optional unique name of pipeline component instance. If not set, the factory name is used. An error is raised if the name already exists in the pipeline. ~~Optional[str]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `config` <Tag variant="new">3</Tag> | Optional config parameters to use for this component. Will be merged with the `default_config` specified by the component factory. ~~Optional[Dict[str, Any]]~~ |
|
||||
| `validate` <Tag variant="new">3</Tag> | Whether to validate the component config and arguments against the types expected by the factory. Defaults to `True`. ~~bool~~ |
|
||||
| **RETURNS** | The pipeline component. ~~Callable[[Doc], Doc]~~ |
|
||||
|
||||
## Language.add_pipe {#add_pipe tag="method" new="2"}
|
||||
|
||||
Add a component to the processing pipeline. Expects a name that maps to a
|
||||
|
@ -434,6 +405,35 @@ component, adds it to the pipeline and returns it.
|
|||
| `validate` <Tag variant="new">3</Tag> | Whether to validate the component config and arguments against the types expected by the factory. Defaults to `True`. ~~bool~~ |
|
||||
| **RETURNS** | The pipeline component. ~~Callable[[Doc], Doc]~~ |
|
||||
|
||||
## Language.create_pipe {#create_pipe tag="method" new="2"}
|
||||
|
||||
Create a pipeline component from a factory.
|
||||
|
||||
<Infobox title="Changed in v3.0" variant="warning">
|
||||
|
||||
As of v3.0, the [`Language.add_pipe`](/api/language#add_pipe) method also takes
|
||||
the string name of the factory, creates the component, adds it to the pipeline
|
||||
and returns it. The `Language.create_pipe` method is now mostly used internally.
|
||||
To create a component and add it to the pipeline, you should always use
|
||||
`Language.add_pipe`.
|
||||
|
||||
</Infobox>
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> parser = nlp.create_pipe("parser")
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| ------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `factory_name` | Name of the registered component factory. ~~str~~ |
|
||||
| `name` | Optional unique name of pipeline component instance. If not set, the factory name is used. An error is raised if the name already exists in the pipeline. ~~Optional[str]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `config` <Tag variant="new">3</Tag> | Optional config parameters to use for this component. Will be merged with the `default_config` specified by the component factory. ~~Optional[Dict[str, Any]]~~ |
|
||||
| `validate` <Tag variant="new">3</Tag> | Whether to validate the component config and arguments against the types expected by the factory. Defaults to `True`. ~~bool~~ |
|
||||
| **RETURNS** | The pipeline component. ~~Callable[[Doc], Doc]~~ |
|
||||
|
||||
## Language.has_factory {#has_factory tag="classmethod" new="3"}
|
||||
|
||||
Check whether a factory name is registered on the `Language` class or subclass.
|
||||
|
@ -561,6 +561,54 @@ component function.
|
|||
| `name` | Name of the component to remove. ~~str~~ |
|
||||
| **RETURNS** | A `(name, component)` tuple of the removed component. ~~Tuple[str, Callable[[Doc], Doc]]~~ |
|
||||
|
||||
## Language.disable_pipe {#disable_pipe tag="method" new="3"}
|
||||
|
||||
Temporarily disable a pipeline component so it's not run as part of the
|
||||
pipeline. Disabled components are listed in
|
||||
[`nlp.disabled`](/api/language#attributes) and included in
|
||||
[`nlp.components`](/api/language#attributes), but not in
|
||||
[`nlp.pipeline`](/api/language#pipeline), so they're not run when you process a
|
||||
`Doc` with the `nlp` object. If the component is already disabled, this method
|
||||
does nothing.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> nlp.add_pipe("ner")
|
||||
> nlp.add_pipe("textcat")
|
||||
> assert nlp.pipe_names == ["ner", "textcat"]
|
||||
> nlp.disable_pipe("ner")
|
||||
> assert nlp.pipe_names == ["textcat"]
|
||||
> assert nlp.component_names == ["ner", "textcat"]
|
||||
> assert nlp.disabled == ["ner"]
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| ------ | ----------------------------------------- |
|
||||
| `name` | Name of the component to disable. ~~str~~ |
|
||||
|
||||
## Language.enable_pipe {#enable_pipe tag="method" new="3"}
|
||||
|
||||
Enable a previously disable component (e.g. via
|
||||
[`Language.disable_pipes`](/api/language#disable_pipes)) so it's run as part of
|
||||
the pipeline, [`nlp.pipeline`](/api/language#pipeline). If the component is
|
||||
already enabled, this method does nothing.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> nlp.disable_pipe("ner")
|
||||
> assert "ner" in nlp.disabled
|
||||
> assert not "ner" in nlp.pipe_names
|
||||
> nlp.enable_pipe("ner")
|
||||
> assert not "ner" in nlp.disabled
|
||||
> assert "ner" in nlp.pipe_names
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| ------ | ---------------------------------------- |
|
||||
| `name` | Name of the component to enable. ~~str~~ |
|
||||
|
||||
## Language.select_pipes {#select_pipes tag="contextmanager, method" new="3"}
|
||||
|
||||
Disable one or more pipeline components. If used as a context manager, the
|
||||
|
@ -568,7 +616,9 @@ pipeline will be restored to the initial state at the end of the block.
|
|||
Otherwise, a `DisabledPipes` object is returned, that has a `.restore()` method
|
||||
you can use to undo your changes. You can specify either `disable` (as a list or
|
||||
string), or `enable`. In the latter case, all components not in the `enable`
|
||||
list, will be disabled.
|
||||
list, will be disabled. Under the hood, this method calls into
|
||||
[`disable_pipe`](/api/language#disable_pipe) and
|
||||
[`enable_pipe`](/api/language#enable_pipe).
|
||||
|
||||
> #### Example
|
||||
>
|
||||
|
@ -860,18 +910,21 @@ available to the loaded object.
|
|||
|
||||
## Attributes {#attributes}
|
||||
|
||||
| Name | Description |
|
||||
| --------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `vocab` | A container for the lexical types. ~~Vocab~~ |
|
||||
| `tokenizer` | The tokenizer. ~~Tokenizer~~ |
|
||||
| `make_doc` | Callable that takes a string and returns a `Doc`. ~~Callable[[str], Doc]~~ |
|
||||
| `pipeline` | List of `(name, component)` tuples describing the current processing pipeline, in order. ~~List[str, Callable[[Doc], Doc]]~~ |
|
||||
| `pipe_names` <Tag variant="new">2</Tag> | List of pipeline component names, in order. ~~List[str]~~ |
|
||||
| `pipe_labels` <Tag variant="new">2.2</Tag> | List of labels set by the pipeline components, if available, keyed by component name. ~~Dict[str, List[str]]~~ |
|
||||
| `pipe_factories` <Tag variant="new">2.2</Tag> | Dictionary of pipeline component names, mapped to their factory names. ~~Dict[str, str]~~ |
|
||||
| `factories` | All available factory functions, keyed by name. ~~Dict[str, Callable[[...], Callable[[Doc], Doc]]]~~ |
|
||||
| `factory_names` <Tag variant="new">3</Tag> | List of all available factory names. ~~List[str]~~ |
|
||||
| `path` <Tag variant="new">2</Tag> | Path to the model data directory, if a model is loaded. Otherwise `None`. ~~Optional[Path]~~ |
|
||||
| Name | Description |
|
||||
| --------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `vocab` | A container for the lexical types. ~~Vocab~~ |
|
||||
| `tokenizer` | The tokenizer. ~~Tokenizer~~ |
|
||||
| `make_doc` | Callable that takes a string and returns a `Doc`. ~~Callable[[str], Doc]~~ |
|
||||
| `pipeline` | List of `(name, component)` tuples describing the current processing pipeline, in order. ~~List[Tuple[str, Callable[[Doc], Doc]]]~~ |
|
||||
| `pipe_names` <Tag variant="new">2</Tag> | List of pipeline component names, in order. ~~List[str]~~ |
|
||||
| `pipe_labels` <Tag variant="new">2.2</Tag> | List of labels set by the pipeline components, if available, keyed by component name. ~~Dict[str, List[str]]~~ |
|
||||
| `pipe_factories` <Tag variant="new">2.2</Tag> | Dictionary of pipeline component names, mapped to their factory names. ~~Dict[str, str]~~ |
|
||||
| `factories` | All available factory functions, keyed by name. ~~Dict[str, Callable[[...], Callable[[Doc], Doc]]]~~ |
|
||||
| `factory_names` <Tag variant="new">3</Tag> | List of all available factory names. ~~List[str]~~ |
|
||||
| `components` <Tag variant="new">3</Tag> | List of all available `(name, component)` tuples, including components that are currently disabled. ~~List[Tuple[str, Callable[[Doc], Doc]]]~~ |
|
||||
| `component_names` <Tag variant="new">3</Tag> | List of all available component names, including components that are currently disabled. ~~List[str]~~ |
|
||||
| `disabled` <Tag variant="new">3</Tag> | Names of components that are currently disabled and don't run as part of the pipeline. ~~List[str]~~ |
|
||||
| `path` <Tag variant="new">2</Tag> | Path to the model data directory, if a model is loaded. Otherwise `None`. ~~Optional[Path]~~ |
|
||||
|
||||
## Class attributes {#class-attributes}
|
||||
|
||||
|
|
|
@ -23,6 +23,14 @@ path, spaCy will assume it's a data directory, load its
|
|||
information to construct the `Language` class. The data will be loaded in via
|
||||
[`Language.from_disk`](/api/language#from_disk).
|
||||
|
||||
<Infobox variant="warning" title="Changed in v3.0">
|
||||
|
||||
As of v3.0, the `disable` keyword argument specifies components to load but
|
||||
disable, instead of components to not load at all. Those components can now be
|
||||
specified separately using the new `exclude` keyword argument.
|
||||
|
||||
</Infobox>
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
|
@ -30,16 +38,17 @@ information to construct the `Language` class. The data will be loaded in via
|
|||
> nlp = spacy.load("/path/to/en") # string path
|
||||
> nlp = spacy.load(Path("/path/to/en")) # pathlib Path
|
||||
>
|
||||
> nlp = spacy.load("en_core_web_sm", disable=["parser", "tagger"])
|
||||
> nlp = spacy.load("en_core_web_sm", exclude=["parser", "tagger"])
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| ----------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `name` | Model to load, i.e. package name or path. ~~Union[str, Path]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `disable` | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). ~~List[str]~~ |
|
||||
| `config` <Tag variant="new">3</Tag> | Optional config overrides, either as nested dict or dict keyed by section value in dot notation, e.g. `"components.name.value"`. ~~Union[Dict[str, Any], Config]~~ |
|
||||
| **RETURNS** | A `Language` object with the loaded model. ~~Language~~ |
|
||||
| Name | Description |
|
||||
| ------------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `name` | Model to load, i.e. package name or path. ~~Union[str, Path]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `disable` | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). Disabled pipes will be loaded but they won't be run unless you explicitly enable them by calling [nlp.enable_pipe](/api/language#enable_pipe). ~~List[str]~~ |
|
||||
| `exclude` <Tag variant="new">3</Tag> | Names of pipeline components to [exclude](/usage/processing-pipelines#disabling). Excluded components won't be loaded. ~~List[str]~~ |
|
||||
| `config` <Tag variant="new">3</Tag> | Optional config overrides, either as nested dict or dict keyed by section value in dot notation, e.g. `"components.name.value"`. ~~Union[Dict[str, Any], Config]~~ |
|
||||
| **RETURNS** | A `Language` object with the loaded model. ~~Language~~ |
|
||||
|
||||
Essentially, `spacy.load()` is a convenience wrapper that reads the model's
|
||||
[`config.cfg`](/api/data-formats#config), uses the language and pipeline
|
||||
|
@ -562,17 +571,18 @@ and create a `Language` object. The model data will then be loaded in via
|
|||
>
|
||||
> ```python
|
||||
> nlp = util.load_model("en_core_web_sm")
|
||||
> nlp = util.load_model("en_core_web_sm", disable=["ner"])
|
||||
> nlp = util.load_model("en_core_web_sm", exclude=["ner"])
|
||||
> nlp = util.load_model("/path/to/data")
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| ----------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `name` | Package name or model path. ~~str~~ |
|
||||
| `vocab` <Tag variant="new">3</Tag> | Optional shared vocab to pass in on initialization. If `True` (default), a new `Vocab` object will be created. ~~Union[Vocab, bool]~~. |
|
||||
| `disable` | Names of pipeline components to disable. ~~Iterable[str]~~ |
|
||||
| `config` <Tag variant="new">3</Tag> | Config overrides as nested dict or flat dict keyed by section values in dot notation, e.g. `"nlp.pipeline"`. ~~Union[Dict[str, Any], Config]~~ |
|
||||
| **RETURNS** | `Language` class with the loaded model. ~~Language~~ |
|
||||
| Name | Description |
|
||||
| ------------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `name` | Package name or model path. ~~str~~ |
|
||||
| `vocab` <Tag variant="new">3</Tag> | Optional shared vocab to pass in on initialization. If `True` (default), a new `Vocab` object will be created. ~~Union[Vocab, bool]~~. |
|
||||
| `disable` | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). Disabled pipes will be loaded but they won't be run unless you explicitly enable them by calling [nlp.enable_pipe](/api/language#enable_pipe). ~~List[str]~~ |
|
||||
| `exclude` <Tag variant="new">3</Tag> | Names of pipeline components to [exclude](/usage/processing-pipelines#disabling). Excluded components won't be loaded. ~~List[str]~~ |
|
||||
| `config` <Tag variant="new">3</Tag> | Config overrides as nested dict or flat dict keyed by section values in dot notation, e.g. `"nlp.pipeline"`. ~~Union[Dict[str, Any], Config]~~ |
|
||||
| **RETURNS** | `Language` class with the loaded model. ~~Language~~ |
|
||||
|
||||
### util.load_model_from_init_py {#util.load_model_from_init_py tag="function" new="2"}
|
||||
|
||||
|
@ -588,13 +598,14 @@ A helper function to use in the `load()` method of a model package's
|
|||
> return load_model_from_init_py(__file__, **overrides)
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| ----------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `init_file` | Path to model's `__init__.py`, i.e. `__file__`. ~~Union[str, Path]~~ |
|
||||
| `vocab` <Tag variant="new">3</Tag> | Optional shared vocab to pass in on initialization. If `True` (default), a new `Vocab` object will be created. ~~Union[Vocab, bool]~~. |
|
||||
| `disable` | Names of pipeline components to disable. ~~Iterable[str]~~ |
|
||||
| `config` <Tag variant="new">3</Tag> | Config overrides as nested dict or flat dict keyed by section values in dot notation, e.g. `"nlp.pipeline"`. ~~Union[Dict[str, Any], Config]~~ |
|
||||
| **RETURNS** | `Language` class with the loaded model. ~~Language~~ |
|
||||
| Name | Description |
|
||||
| ------------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `init_file` | Path to model's `__init__.py`, i.e. `__file__`. ~~Union[str, Path]~~ |
|
||||
| `vocab` <Tag variant="new">3</Tag> | Optional shared vocab to pass in on initialization. If `True` (default), a new `Vocab` object will be created. ~~Union[Vocab, bool]~~. |
|
||||
| `disable` | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). Disabled pipes will be loaded but they won't be run unless you explicitly enable them by calling [nlp.enable_pipe](/api/language#enable_pipe). ~~List[str]~~ |
|
||||
| `exclude` <Tag variant="new">3</Tag> | Names of pipeline components to [exclude](/usage/processing-pipelines#disabling). Excluded components won't be loaded. ~~List[str]~~ |
|
||||
| `config` <Tag variant="new">3</Tag> | Config overrides as nested dict or flat dict keyed by section values in dot notation, e.g. `"nlp.pipeline"`. ~~Union[Dict[str, Any], Config]~~ |
|
||||
| **RETURNS** | `Language` class with the loaded model. ~~Language~~ |
|
||||
|
||||
### util.load_config {#util.load_config tag="function" new="3"}
|
||||
|
||||
|
|
|
@ -235,38 +235,54 @@ available pipeline components and component functions.
|
|||
| `tok2vec` | [`Tok2Vec`](/api/tok2vec) | Assign token-to-vector embeddings. |
|
||||
| `transformer` | [`Transformer`](/api/transformer) | Assign the tokens and outputs of a transformer model. |
|
||||
|
||||
### Disabling and modifying pipeline components {#disabling}
|
||||
### Disabling, excluding and modifying components {#disabling}
|
||||
|
||||
If you don't need a particular component of the pipeline – for example, the
|
||||
tagger or the parser, you can **disable loading** it. This can sometimes make a
|
||||
big difference and improve loading speed. Disabled component names can be
|
||||
provided to [`spacy.load`](/api/top-level#spacy.load),
|
||||
[`Language.from_disk`](/api/language#from_disk) or the `nlp` object itself as a
|
||||
list:
|
||||
tagger or the parser, you can **disable or exclude** it. This can sometimes make
|
||||
a big difference and improve loading and inference speed. There are two
|
||||
different mechanisms you can use:
|
||||
|
||||
1. **Disable:** The component and its data will be loaded with the model, but it
|
||||
will be disabled by default and not run as part of the processing pipeline.
|
||||
To run it, you can explicitly enable it by calling
|
||||
[`nlp.enable_pipe`](/api/language#enable_pipe). When you save out the `nlp`
|
||||
object, the disabled component will be included but disabled by default.
|
||||
2. **Exclude:** Don't load the component and its data with the model. Once the
|
||||
model is loaded, there will be no reference to the excluded component.
|
||||
|
||||
Disabled and excluded component names can be provided to
|
||||
[`spacy.load`](/api/top-level#spacy.load) as a list.
|
||||
|
||||
<!-- TODO: update with info on our models shipped with optional components -->
|
||||
|
||||
> #### 💡 Models with optional components
|
||||
>
|
||||
> The `disable` mechanism makes it easy to distribute models with optional
|
||||
> components that you can enable or disable at runtime. For instance, your model
|
||||
> may include a statistical _and_ a rule-based component for sentence
|
||||
> segmentation, and you can choose which one to run depending on your use case.
|
||||
|
||||
```python
|
||||
### Disable loading
|
||||
# Load the model without the entity recognizer
|
||||
nlp = spacy.load("en_core_web_sm", exclude=["ner"])
|
||||
|
||||
# Load the tagger and parser but don't enable them
|
||||
nlp = spacy.load("en_core_web_sm", disable=["tagger", "parser"])
|
||||
# Explicitly enable the tagger later on
|
||||
nlp.enable_pipe("tagger")
|
||||
```
|
||||
|
||||
In some cases, you do want to load all pipeline components and their weights,
|
||||
because you need them at different points in your application. However, if you
|
||||
only need a `Doc` object with named entities, there's no need to run all
|
||||
pipeline components on it – that can potentially make processing much slower.
|
||||
Instead, you can use the `disable` keyword argument on
|
||||
[`nlp.pipe`](/api/language#pipe) to temporarily disable the components **during
|
||||
processing**:
|
||||
<Infobox variant="warning" title="Changed in v3.0">
|
||||
|
||||
```python
|
||||
### Disable for processing
|
||||
for doc in nlp.pipe(texts, disable=["tagger", "parser"]):
|
||||
# Do something with the doc here
|
||||
```
|
||||
As of v3.0, the `disable` keyword argument specifies components to load but
|
||||
disable, instead of components to not load at all. Those components can now be
|
||||
specified separately using the new `exclude` keyword argument.
|
||||
|
||||
If you need to **execute more code** with components disabled – e.g. to reset
|
||||
the weights or update only some components during training – you can use the
|
||||
[`nlp.select_pipes`](/api/language#select_pipes) context manager. At the end of
|
||||
the `with` block, the disabled pipeline components will be restored
|
||||
</Infobox>
|
||||
|
||||
As a shortcut, you can use the [`nlp.select_pipes`](/api/language#select_pipes)
|
||||
context manager to temporarily disable certain components for a given block. At
|
||||
the end of the `with` block, the disabled pipeline components will be restored
|
||||
automatically. Alternatively, `select_pipes` returns an object that lets you
|
||||
call its `restore()` method to restore the disabled components when needed. This
|
||||
can be useful if you want to prevent unnecessary code indentation of large
|
||||
|
@ -295,6 +311,14 @@ with nlp.select_pipes(enable="parser"):
|
|||
doc = nlp("I will only be parsed")
|
||||
```
|
||||
|
||||
The [`nlp.pipe`](/api/language#pipe) method also supports a `disable` keyword
|
||||
argument if you only want to disable components during processing:
|
||||
|
||||
```python
|
||||
for doc in nlp.pipe(texts, disable=["tagger", "parser"]):
|
||||
# Do something with the doc here
|
||||
```
|
||||
|
||||
Finally, you can also use the [`remove_pipe`](/api/language#remove_pipe) method
|
||||
to remove pipeline components from an existing pipeline, the
|
||||
[`rename_pipe`](/api/language#rename_pipe) method to rename them, or the
|
||||
|
@ -308,6 +332,31 @@ nlp.rename_pipe("ner", "entityrecognizer")
|
|||
nlp.replace_pipe("tagger", my_custom_tagger)
|
||||
```
|
||||
|
||||
The `Language` object exposes different [attributes](/api/language#attributes)
|
||||
that let you inspect all available components and the components that currently
|
||||
run as part of the pipeline.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> nlp = spacy.blank("en")
|
||||
> nlp.add_pipe("ner")
|
||||
> nlp.add_pipe("textcat")
|
||||
> assert nlp.pipe_names == ["ner", "textcat"]
|
||||
> nlp.disable_pipe("ner")
|
||||
> assert nlp.pipe_names == ["textcat"]
|
||||
> assert nlp.component_names == ["ner", "textcat"]
|
||||
> assert nlp.disabled == ["ner"]
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| --------------------- | ---------------------------------------------------------------- |
|
||||
| `nlp.pipeline` | `(name, component)` tuples of the processing pipeline, in order. |
|
||||
| `nlp.pipe_names` | Pipeline component names, in order. |
|
||||
| `nlp.components` | All `(name, component)` tuples, including disabled components. |
|
||||
| `nlp.component_names` | All component names, including disabled components. |
|
||||
| `nlp.disabled` | Names of components that are currently disabled. |
|
||||
|
||||
### Sourcing pipeline components from existing models {#sourced-components new="3"}
|
||||
|
||||
Pipeline components that are independent can also be reused across models.
|
||||
|
|
|
@ -254,12 +254,15 @@ The following methods, attributes and commands are new in spaCy v3.0.
|
|||
| [`Token.lex`](/api/token#attributes) | Access a token's [`Lexeme`](/api/lexeme). |
|
||||
| [`Token.morph`](/api/token#attributes) [`Token.morph_`](/api/token#attributes) | Access a token's morphological analysis. |
|
||||
| [`Language.select_pipes`](/api/language#select_pipes) | Context manager for enabling or disabling specific pipeline components for a block. |
|
||||
| [`Language.disable_pipe`](/api/language#disable_pipe) [`Language.enable_pipe`](/api/language#enable_pipe) | Disable or enable a loaded pipeline component (but don't remove it). |
|
||||
| [`Language.analyze_pipes`](/api/language#analyze_pipes) | [Analyze](/usage/processing-pipelines#analysis) components and their interdependencies. |
|
||||
| [`Language.resume_training`](/api/language#resume_training) | Experimental: continue training a pretrained model and initialize "rehearsal" for components that implement a `rehearse` method to prevent catastrophic forgetting. |
|
||||
| [`@Language.factory`](/api/language#factory) [`@Language.component`](/api/language#component) | Decorators for [registering](/usage/processing-pipelines#custom-components) pipeline component factories and simple stateless component functions. |
|
||||
| [`Language.has_factory`](/api/language#has_factory) | Check whether a component factory is registered on a language class.s |
|
||||
| [`Language.get_factory_meta`](/api/language#get_factory_meta) [`Language.get_pipe_meta`](/api/language#get_factory_meta) | Get the [`FactoryMeta`](/api/language#factorymeta) with component metadata for a factory or instance name. |
|
||||
| [`Language.config`](/api/language#config) | The [config](/usage/training#config) used to create the current `nlp` object. An instance of [`Config`](https://thinc.ai/docs/api-config#config) and can be saved to disk and used for training. |
|
||||
| [`Language.components`](/api/language#attributes) [`Language.component_names`](/api/language#attributes) | All available components and component names, including disabled components that are not run as part of the pipeline. |
|
||||
| [`Language.disabled`](/api/language#attributes) | Names of disabled components that are not run as part of the pipeline. |
|
||||
| [`Pipe.score`](/api/pipe#score) | Method on trainable pipeline components that returns a dictionary of evaluation scores. |
|
||||
| [`registry`](/api/top-level#registry) | Function registry to map functions to string names that can be referenced in [configs](/usage/training#config). |
|
||||
| [`util.load_meta`](/api/top-level#util.load_meta) [`util.load_config`](/api/top-level#util.load_config) | Updated helpers for loading a model's [`meta.json`](/api/data-formats#meta) and [`config.cfg`](/api/data-formats#config). |
|
||||
|
|
Loading…
Reference in New Issue
Block a user