mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-07 15:56:32 +03:00
2702db9fef
* Recommend lookups tables from URLs or other loaders Shift away from the `lookups` extra (which isn't removed, just no longer mentioned) and recommend loading data from the `spacy-lookups-data` repo or other sources rather than the `spacy-lookups-data` package. If the tables can't be loaded from the `lookups` registry in the lemmatizer, show how to specify the tables in `[initialize]` rather than recommending the `spacy-lookups-data` package. * Add tests for some rule-based lemmatizers * Apply suggestions from code review Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> --------- Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
2456 lines
104 KiB
Python
2456 lines
104 KiB
Python
import functools
|
||
import itertools
|
||
import multiprocessing as mp
|
||
import random
|
||
import traceback
|
||
import warnings
|
||
from contextlib import contextmanager
|
||
from copy import deepcopy
|
||
from dataclasses import dataclass
|
||
from itertools import chain, cycle
|
||
from pathlib import Path
|
||
from timeit import default_timer as timer
|
||
from typing import (
|
||
Any,
|
||
Callable,
|
||
Dict,
|
||
Iterable,
|
||
Iterator,
|
||
List,
|
||
Literal,
|
||
NoReturn,
|
||
Optional,
|
||
Pattern,
|
||
Sequence,
|
||
Set,
|
||
Tuple,
|
||
TypeVar,
|
||
Union,
|
||
cast,
|
||
overload,
|
||
)
|
||
|
||
import srsly
|
||
from thinc.api import Config, CupyOps, Optimizer, get_current_ops
|
||
|
||
from . import about, ty, util
|
||
from .errors import Errors, Warnings
|
||
from .git_info import GIT_VERSION
|
||
from .lang.punctuation import TOKENIZER_INFIXES, TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
|
||
from .lang.tokenizer_exceptions import BASE_EXCEPTIONS, URL_MATCH
|
||
from .lookups import load_lookups
|
||
from .pipe_analysis import analyze_pipes, print_pipe_analysis, validate_attrs
|
||
from .schemas import (
|
||
ConfigSchema,
|
||
ConfigSchemaInit,
|
||
ConfigSchemaNlp,
|
||
ConfigSchemaPretrain,
|
||
validate_init_settings,
|
||
)
|
||
from .scorer import Scorer
|
||
from .tokenizer import Tokenizer
|
||
from .tokens import Doc
|
||
from .tokens.underscore import Underscore
|
||
from .training import Example, validate_distillation_examples, validate_examples
|
||
from .training.initialize import init_tok2vec, init_vocab
|
||
from .util import (
|
||
_DEFAULT_EMPTY_PIPES,
|
||
CONFIG_SECTION_ORDER,
|
||
SimpleFrozenDict,
|
||
SimpleFrozenList,
|
||
_pipe,
|
||
combine_score_weights,
|
||
raise_error,
|
||
registry,
|
||
warn_if_jupyter_cupy,
|
||
)
|
||
from .vocab import Vocab, create_vocab
|
||
|
||
PipeCallable = Callable[[Doc], Doc]
|
||
|
||
|
||
# This is the base config will all settings (training etc.)
|
||
DEFAULT_CONFIG_PATH = Path(__file__).parent / "default_config.cfg"
|
||
DEFAULT_CONFIG = util.load_config(DEFAULT_CONFIG_PATH)
|
||
# This is the base config for the [distillation] block and currently not included
|
||
# in the main config and only added via the 'init fill-config' command
|
||
DEFAULT_CONFIG_DISTILL_PATH = Path(__file__).parent / "default_config_distillation.cfg"
|
||
# This is the base config for the [pretraining] block and currently not included
|
||
# in the main config and only added via the 'init fill-config' command
|
||
DEFAULT_CONFIG_PRETRAIN_PATH = Path(__file__).parent / "default_config_pretraining.cfg"
|
||
|
||
# Type variable for contexts piped with documents
|
||
_AnyContext = TypeVar("_AnyContext")
|
||
|
||
|
||
class BaseDefaults:
|
||
"""Language data defaults, available via Language.Defaults. Can be
|
||
overwritten by language subclasses by defining their own subclasses of
|
||
Language.Defaults.
|
||
"""
|
||
|
||
config: Config = Config(section_order=CONFIG_SECTION_ORDER)
|
||
tokenizer_exceptions: Dict[str, List[dict]] = BASE_EXCEPTIONS
|
||
prefixes: Optional[Sequence[Union[str, Pattern]]] = TOKENIZER_PREFIXES
|
||
suffixes: Optional[Sequence[Union[str, Pattern]]] = TOKENIZER_SUFFIXES
|
||
infixes: Optional[Sequence[Union[str, Pattern]]] = TOKENIZER_INFIXES
|
||
token_match: Optional[Callable] = None
|
||
url_match: Optional[Callable] = URL_MATCH
|
||
syntax_iterators: Dict[str, Callable] = {}
|
||
lex_attr_getters: Dict[int, Callable[[str], Any]] = {}
|
||
stop_words: Set[str] = set()
|
||
writing_system = {"direction": "ltr", "has_case": True, "has_letters": True}
|
||
|
||
|
||
@registry.tokenizers("spacy.Tokenizer.v1")
|
||
def create_tokenizer() -> Callable[["Language"], Tokenizer]:
|
||
"""Registered function to create a tokenizer. Returns a factory that takes
|
||
the nlp object and returns a Tokenizer instance using the language detaults.
|
||
"""
|
||
|
||
def tokenizer_factory(nlp: "Language") -> Tokenizer:
|
||
prefixes = nlp.Defaults.prefixes
|
||
suffixes = nlp.Defaults.suffixes
|
||
infixes = nlp.Defaults.infixes
|
||
prefix_search = util.compile_prefix_regex(prefixes).search if prefixes else None
|
||
suffix_search = util.compile_suffix_regex(suffixes).search if suffixes else None
|
||
infix_finditer = util.compile_infix_regex(infixes).finditer if infixes else None
|
||
return Tokenizer(
|
||
nlp.vocab,
|
||
rules=nlp.Defaults.tokenizer_exceptions,
|
||
prefix_search=prefix_search,
|
||
suffix_search=suffix_search,
|
||
infix_finditer=infix_finditer,
|
||
token_match=nlp.Defaults.token_match,
|
||
url_match=nlp.Defaults.url_match,
|
||
)
|
||
|
||
return tokenizer_factory
|
||
|
||
|
||
class Language:
|
||
"""A text-processing pipeline. Usually you'll load this once per process,
|
||
and pass the instance around your application.
|
||
|
||
Defaults (class): Settings, data and factory methods for creating the `nlp`
|
||
object and processing pipeline.
|
||
lang (str): IETF language code, such as 'en'.
|
||
|
||
DOCS: https://spacy.io/api/language
|
||
"""
|
||
|
||
Defaults = BaseDefaults
|
||
lang: Optional[str] = None
|
||
default_config = DEFAULT_CONFIG
|
||
|
||
factories = SimpleFrozenDict(error=Errors.E957)
|
||
_factory_meta: Dict[str, "FactoryMeta"] = {} # meta by factory
|
||
|
||
def __init__(
|
||
self,
|
||
vocab: Union[Vocab, bool] = True,
|
||
*,
|
||
max_length: int = 10**6,
|
||
meta: Dict[str, Any] = {},
|
||
create_tokenizer: Optional[Callable[["Language"], Callable[[str], Doc]]] = None,
|
||
batch_size: int = 1000,
|
||
**kwargs,
|
||
) -> None:
|
||
"""Initialise a Language object.
|
||
|
||
vocab (Vocab): A `Vocab` object. If `True`, a vocab is created.
|
||
meta (dict): Custom meta data for the Language class. Is written to by
|
||
models to add model meta data.
|
||
max_length (int): Maximum number of characters in a single text. The
|
||
current models may run out memory on extremely long texts, due to
|
||
large internal allocations. You should segment these texts into
|
||
meaningful units, e.g. paragraphs, subsections etc, before passing
|
||
them to spaCy. Default maximum length is 1,000,000 charas (1mb). As
|
||
a rule of thumb, if all pipeline components are enabled, spaCy's
|
||
default models currently requires roughly 1GB of temporary memory per
|
||
100,000 characters in one text.
|
||
create_tokenizer (Callable): Function that takes the nlp object and
|
||
returns a tokenizer.
|
||
batch_size (int): Default batch size for pipe and evaluate.
|
||
|
||
DOCS: https://spacy.io/api/language#init
|
||
"""
|
||
# We're only calling this to import all factories provided via entry
|
||
# points. The factory decorator applied to these functions takes care
|
||
# of the rest.
|
||
util.registry._entry_point_factories.get_all()
|
||
|
||
self._config = DEFAULT_CONFIG.merge(self.default_config)
|
||
self._meta = dict(meta)
|
||
self._path = None
|
||
self._optimizer: Optional[Optimizer] = None
|
||
# Component meta and configs are only needed on the instance
|
||
self._pipe_meta: Dict[str, "FactoryMeta"] = {} # meta by component
|
||
self._pipe_configs: Dict[str, Config] = {} # config by component
|
||
|
||
if not isinstance(vocab, Vocab) and vocab is not True:
|
||
raise ValueError(Errors.E918.format(vocab=vocab, vocab_type=type(Vocab)))
|
||
if vocab is True:
|
||
vocab = create_vocab(self.lang, self.Defaults)
|
||
else:
|
||
if (self.lang and vocab.lang) and (self.lang != vocab.lang):
|
||
raise ValueError(Errors.E150.format(nlp=self.lang, vocab=vocab.lang))
|
||
self.vocab: Vocab = vocab
|
||
if self.lang is None:
|
||
self.lang = self.vocab.lang
|
||
self._components: List[Tuple[str, PipeCallable]] = []
|
||
self._disabled: Set[str] = set()
|
||
self.max_length = max_length
|
||
# Create the default tokenizer from the default config
|
||
if not create_tokenizer:
|
||
tokenizer_cfg = {"tokenizer": self._config["nlp"]["tokenizer"]}
|
||
create_tokenizer = registry.resolve(tokenizer_cfg)["tokenizer"]
|
||
self.tokenizer = create_tokenizer(self)
|
||
self.batch_size = batch_size
|
||
self.default_error_handler = raise_error
|
||
|
||
def __init_subclass__(cls, **kwargs):
|
||
super().__init_subclass__(**kwargs)
|
||
cls.default_config = DEFAULT_CONFIG.merge(cls.Defaults.config)
|
||
cls.default_config["nlp"]["lang"] = cls.lang
|
||
|
||
@property
|
||
def path(self):
|
||
return self._path
|
||
|
||
@property
|
||
def meta(self) -> Dict[str, Any]:
|
||
"""Custom meta data of the language class. If a model is loaded, this
|
||
includes details from the model's meta.json.
|
||
|
||
RETURNS (Dict[str, Any]): The meta.
|
||
|
||
DOCS: https://spacy.io/api/language#meta
|
||
"""
|
||
spacy_version = util.get_minor_version_range(about.__version__)
|
||
if self.vocab.lang:
|
||
self._meta.setdefault("lang", self.vocab.lang)
|
||
else:
|
||
self._meta.setdefault("lang", self.lang)
|
||
self._meta.setdefault("name", "pipeline")
|
||
self._meta.setdefault("version", "0.0.0")
|
||
self._meta.setdefault("spacy_version", spacy_version)
|
||
self._meta.setdefault("description", "")
|
||
self._meta.setdefault("author", "")
|
||
self._meta.setdefault("email", "")
|
||
self._meta.setdefault("url", "")
|
||
self._meta.setdefault("license", "")
|
||
self._meta.setdefault("spacy_git_version", GIT_VERSION)
|
||
self._meta["vectors"] = {
|
||
"width": self.vocab.vectors_length,
|
||
"vectors": len(self.vocab.vectors),
|
||
"keys": self.vocab.vectors.n_keys,
|
||
"mode": self.vocab.vectors.mode,
|
||
}
|
||
self._meta["labels"] = dict(self.pipe_labels)
|
||
# TODO: Adding this back to prevent breaking people's code etc., but
|
||
# we should consider removing it
|
||
self._meta["pipeline"] = list(self.pipe_names)
|
||
self._meta["components"] = list(self.component_names)
|
||
self._meta["disabled"] = list(self.disabled)
|
||
return self._meta
|
||
|
||
@meta.setter
|
||
def meta(self, value: Dict[str, Any]) -> None:
|
||
self._meta = value
|
||
|
||
@property
|
||
def config(self) -> Config:
|
||
"""Trainable config for the current language instance. Includes the
|
||
current pipeline components, as well as default training config.
|
||
|
||
RETURNS (thinc.api.Config): The config.
|
||
|
||
DOCS: https://spacy.io/api/language#config
|
||
"""
|
||
self._config.setdefault("nlp", {})
|
||
self._config.setdefault("training", {})
|
||
self._config["nlp"]["lang"] = self.lang
|
||
# We're storing the filled config for each pipeline component and so
|
||
# we can populate the config again later
|
||
pipeline = {}
|
||
score_weights = []
|
||
for pipe_name in self.component_names:
|
||
pipe_meta = self.get_pipe_meta(pipe_name)
|
||
pipe_config = self.get_pipe_config(pipe_name)
|
||
pipeline[pipe_name] = {"factory": pipe_meta.factory, **pipe_config}
|
||
if pipe_meta.default_score_weights:
|
||
score_weights.append(pipe_meta.default_score_weights)
|
||
self._config["nlp"]["pipeline"] = list(self.component_names)
|
||
self._config["nlp"]["disabled"] = list(self.disabled)
|
||
self._config["components"] = pipeline
|
||
# We're merging the existing score weights back into the combined
|
||
# weights to make sure we're preserving custom settings in the config
|
||
# but also reflect updates (e.g. new components added)
|
||
prev_weights = self._config["training"].get("score_weights", {})
|
||
combined_score_weights = combine_score_weights(score_weights, prev_weights)
|
||
self._config["training"]["score_weights"] = combined_score_weights
|
||
if not srsly.is_json_serializable(self._config):
|
||
raise ValueError(Errors.E961.format(config=self._config))
|
||
return self._config
|
||
|
||
@config.setter
|
||
def config(self, value: Config) -> None:
|
||
self._config = value
|
||
|
||
@property
|
||
def disabled(self) -> List[str]:
|
||
"""Get the names of all disabled components.
|
||
|
||
RETURNS (List[str]): The disabled components.
|
||
"""
|
||
# Make sure the disabled components are returned in the order they
|
||
# appear in the pipeline (which isn't guaranteed by the set)
|
||
names = [name for name, _ in self._components if name in self._disabled]
|
||
return SimpleFrozenList(names, error=Errors.E926.format(attr="disabled"))
|
||
|
||
@property
|
||
def factory_names(self) -> List[str]:
|
||
"""Get names of all available factories.
|
||
|
||
RETURNS (List[str]): The factory names.
|
||
"""
|
||
names = list(self.factories.keys())
|
||
return SimpleFrozenList(names)
|
||
|
||
@property
|
||
def components(self) -> List[Tuple[str, PipeCallable]]:
|
||
"""Get all (name, component) tuples in the pipeline, including the
|
||
currently disabled components.
|
||
"""
|
||
return SimpleFrozenList(
|
||
self._components, error=Errors.E926.format(attr="components")
|
||
)
|
||
|
||
@property
|
||
def component_names(self) -> List[str]:
|
||
"""Get the names of the available pipeline components. Includes all
|
||
active and inactive pipeline components.
|
||
|
||
RETURNS (List[str]): List of component name strings, in order.
|
||
"""
|
||
names = [pipe_name for pipe_name, _ in self._components]
|
||
return SimpleFrozenList(names, error=Errors.E926.format(attr="component_names"))
|
||
|
||
@property
|
||
def pipeline(self) -> List[Tuple[str, PipeCallable]]:
|
||
"""The processing pipeline consisting of (name, component) tuples. The
|
||
components are called on the Doc in order as it passes through the
|
||
pipeline.
|
||
|
||
RETURNS (List[Tuple[str, Callable[[Doc], Doc]]]): The pipeline.
|
||
"""
|
||
pipes = [(n, p) for n, p in self._components if n not in self._disabled]
|
||
return SimpleFrozenList(pipes, error=Errors.E926.format(attr="pipeline"))
|
||
|
||
@property
|
||
def pipe_names(self) -> List[str]:
|
||
"""Get names of available active pipeline components.
|
||
|
||
RETURNS (List[str]): List of component name strings, in order.
|
||
"""
|
||
names = [pipe_name for pipe_name, _ in self.pipeline]
|
||
return SimpleFrozenList(names, error=Errors.E926.format(attr="pipe_names"))
|
||
|
||
@property
|
||
def pipe_factories(self) -> Dict[str, str]:
|
||
"""Get the component factories for the available pipeline components.
|
||
|
||
RETURNS (Dict[str, str]): Factory names, keyed by component names.
|
||
"""
|
||
factories = {}
|
||
for pipe_name, pipe in self._components:
|
||
factories[pipe_name] = self.get_pipe_meta(pipe_name).factory
|
||
return SimpleFrozenDict(factories)
|
||
|
||
@property
|
||
def pipe_labels(self) -> Dict[str, List[str]]:
|
||
"""Get the labels set by the pipeline components, if available (if
|
||
the component exposes a labels property and the labels are not
|
||
hidden).
|
||
|
||
RETURNS (Dict[str, List[str]]): Labels keyed by component name.
|
||
"""
|
||
labels = {}
|
||
for name, pipe in self._components:
|
||
if hasattr(pipe, "hide_labels") and pipe.hide_labels is True:
|
||
continue
|
||
if hasattr(pipe, "labels"):
|
||
labels[name] = list(pipe.labels)
|
||
return SimpleFrozenDict(labels)
|
||
|
||
@classmethod
|
||
def has_factory(cls, name: str) -> bool:
|
||
"""RETURNS (bool): Whether a factory of that name is registered."""
|
||
internal_name = cls.get_factory_name(name)
|
||
return name in registry.factories or internal_name in registry.factories
|
||
|
||
@classmethod
|
||
def get_factory_name(cls, name: str) -> str:
|
||
"""Get the internal factory name based on the language subclass.
|
||
|
||
name (str): The factory name.
|
||
RETURNS (str): The internal factory name.
|
||
"""
|
||
if cls.lang is None:
|
||
return name
|
||
return f"{cls.lang}.{name}"
|
||
|
||
@classmethod
|
||
def get_factory_meta(cls, name: str) -> "FactoryMeta":
|
||
"""Get the meta information for a given factory name.
|
||
|
||
name (str): The component factory name.
|
||
RETURNS (FactoryMeta): The meta for the given factory name.
|
||
"""
|
||
internal_name = cls.get_factory_name(name)
|
||
if internal_name in cls._factory_meta:
|
||
return cls._factory_meta[internal_name]
|
||
if name in cls._factory_meta:
|
||
return cls._factory_meta[name]
|
||
raise ValueError(Errors.E967.format(meta="factory", name=name))
|
||
|
||
@classmethod
|
||
def set_factory_meta(cls, name: str, value: "FactoryMeta") -> None:
|
||
"""Set the meta information for a given factory name.
|
||
|
||
name (str): The component factory name.
|
||
value (FactoryMeta): The meta to set.
|
||
"""
|
||
cls._factory_meta[cls.get_factory_name(name)] = value
|
||
|
||
def get_pipe_meta(self, name: str) -> "FactoryMeta":
|
||
"""Get the meta information for a given component name.
|
||
|
||
name (str): The component name.
|
||
RETURNS (FactoryMeta): The meta for the given component name.
|
||
"""
|
||
if name not in self._pipe_meta:
|
||
raise ValueError(Errors.E967.format(meta="component", name=name))
|
||
return self._pipe_meta[name]
|
||
|
||
def get_pipe_config(self, name: str) -> Config:
|
||
"""Get the config used to create a pipeline component.
|
||
|
||
name (str): The component name.
|
||
RETURNS (Config): The config used to create the pipeline component.
|
||
"""
|
||
if name not in self._pipe_configs:
|
||
raise ValueError(Errors.E960.format(name=name))
|
||
pipe_config = self._pipe_configs[name]
|
||
return pipe_config
|
||
|
||
@classmethod
|
||
def factory(
|
||
cls,
|
||
name: str,
|
||
*,
|
||
default_config: Dict[str, Any] = SimpleFrozenDict(),
|
||
assigns: Iterable[str] = SimpleFrozenList(),
|
||
requires: Iterable[str] = SimpleFrozenList(),
|
||
retokenizes: bool = False,
|
||
default_score_weights: Dict[str, Optional[float]] = SimpleFrozenDict(),
|
||
func: Optional[Callable] = None,
|
||
) -> Callable:
|
||
"""Register a new pipeline component factory. Can be used as a decorator
|
||
on a function or classmethod, or called as a function with the factory
|
||
provided as the func keyword argument. To create a component and add
|
||
it to the pipeline, you can use nlp.add_pipe(name).
|
||
|
||
name (str): The name of the component factory.
|
||
default_config (Dict[str, Any]): Default configuration, describing the
|
||
default values of the factory arguments.
|
||
assigns (Iterable[str]): Doc/Token attributes assigned by this component,
|
||
e.g. "token.ent_id". Used for pipeline analysis.
|
||
requires (Iterable[str]): Doc/Token attributes required by this component,
|
||
e.g. "token.ent_id". Used for pipeline analysis.
|
||
retokenizes (bool): Whether the component changes the tokenization.
|
||
Used for pipeline analysis.
|
||
default_score_weights (Dict[str, Optional[float]]): The scores to report during
|
||
training, and their default weight towards the final score used to
|
||
select the best model. Weights should sum to 1.0 per component and
|
||
will be combined and normalized for the whole pipeline. If None,
|
||
the score won't be shown in the logs or be weighted.
|
||
func (Optional[Callable]): Factory function if not used as a decorator.
|
||
|
||
DOCS: https://spacy.io/api/language#factory
|
||
"""
|
||
if not isinstance(name, str):
|
||
raise ValueError(Errors.E963.format(decorator="factory"))
|
||
if "." in name:
|
||
raise ValueError(Errors.E853.format(name=name))
|
||
if not isinstance(default_config, dict):
|
||
err = Errors.E962.format(
|
||
style="default config", name=name, cfg_type=type(default_config)
|
||
)
|
||
raise ValueError(err)
|
||
|
||
def add_factory(factory_func: Callable) -> Callable:
|
||
internal_name = cls.get_factory_name(name)
|
||
if internal_name in registry.factories:
|
||
# We only check for the internal name here – it's okay if it's a
|
||
# subclass and the base class has a factory of the same name. We
|
||
# also only raise if the function is different to prevent raising
|
||
# if module is reloaded.
|
||
existing_func = registry.factories.get(internal_name)
|
||
if not util.is_same_func(factory_func, existing_func):
|
||
err = Errors.E004.format(
|
||
name=name, func=existing_func, new_func=factory_func
|
||
)
|
||
raise ValueError(err)
|
||
|
||
arg_names = util.get_arg_names(factory_func)
|
||
if "nlp" not in arg_names or "name" not in arg_names:
|
||
raise ValueError(Errors.E964.format(name=name))
|
||
# Officially register the factory so we can later call
|
||
# registry.resolve and refer to it in the config as
|
||
# @factories = "spacy.Language.xyz". We use the class name here so
|
||
# different classes can have different factories.
|
||
registry.factories.register(internal_name, func=factory_func)
|
||
factory_meta = FactoryMeta(
|
||
factory=name,
|
||
default_config=default_config,
|
||
assigns=validate_attrs(assigns),
|
||
requires=validate_attrs(requires),
|
||
scores=list(default_score_weights.keys()),
|
||
default_score_weights=default_score_weights,
|
||
retokenizes=retokenizes,
|
||
)
|
||
cls.set_factory_meta(name, factory_meta)
|
||
# We're overwriting the class attr with a frozen dict to handle
|
||
# backwards-compat (writing to Language.factories directly). This
|
||
# wouldn't work with an instance property and just produce a
|
||
# confusing error – here we can show a custom error
|
||
cls.factories = SimpleFrozenDict(
|
||
registry.factories.get_all(), error=Errors.E957
|
||
)
|
||
return factory_func
|
||
|
||
if func is not None: # Support non-decorator use cases
|
||
return add_factory(func)
|
||
return add_factory
|
||
|
||
@classmethod
|
||
def component(
|
||
cls,
|
||
name: str,
|
||
*,
|
||
assigns: Iterable[str] = SimpleFrozenList(),
|
||
requires: Iterable[str] = SimpleFrozenList(),
|
||
retokenizes: bool = False,
|
||
func: Optional[PipeCallable] = None,
|
||
) -> Callable[..., Any]:
|
||
"""Register a new pipeline component. Can be used for stateless function
|
||
components that don't require a separate factory. Can be used as a
|
||
decorator on a function or classmethod, or called as a function with the
|
||
factory provided as the func keyword argument. To create a component and
|
||
add it to the pipeline, you can use nlp.add_pipe(name).
|
||
|
||
name (str): The name of the component factory.
|
||
assigns (Iterable[str]): Doc/Token attributes assigned by this component,
|
||
e.g. "token.ent_id". Used for pipeline analysis.
|
||
requires (Iterable[str]): Doc/Token attributes required by this component,
|
||
e.g. "token.ent_id". Used for pipeline analysis.
|
||
retokenizes (bool): Whether the component changes the tokenization.
|
||
Used for pipeline analysis.
|
||
func (Optional[Callable[[Doc], Doc]): Factory function if not used as a decorator.
|
||
|
||
DOCS: https://spacy.io/api/language#component
|
||
"""
|
||
if name is not None:
|
||
if not isinstance(name, str):
|
||
raise ValueError(Errors.E963.format(decorator="component"))
|
||
if "." in name:
|
||
raise ValueError(Errors.E853.format(name=name))
|
||
component_name = name if name is not None else util.get_object_name(func)
|
||
|
||
def add_component(component_func: PipeCallable) -> Callable:
|
||
if isinstance(func, type): # function is a class
|
||
raise ValueError(Errors.E965.format(name=component_name))
|
||
|
||
def factory_func(nlp, name: str) -> PipeCallable:
|
||
return component_func
|
||
|
||
internal_name = cls.get_factory_name(name)
|
||
if internal_name in registry.factories:
|
||
# We only check for the internal name here – it's okay if it's a
|
||
# subclass and the base class has a factory of the same name. We
|
||
# also only raise if the function is different to prevent raising
|
||
# if module is reloaded. It's hacky, but we need to check the
|
||
# existing functure for a closure and whether that's identical
|
||
# to the component function (because factory_func created above
|
||
# will always be different, even for the same function)
|
||
existing_func = registry.factories.get(internal_name)
|
||
closure = existing_func.__closure__
|
||
wrapped = [c.cell_contents for c in closure][0] if closure else None
|
||
if util.is_same_func(wrapped, component_func):
|
||
factory_func = existing_func # noqa: F811
|
||
|
||
cls.factory(
|
||
component_name,
|
||
assigns=assigns,
|
||
requires=requires,
|
||
retokenizes=retokenizes,
|
||
func=factory_func,
|
||
)
|
||
return component_func
|
||
|
||
if func is not None: # Support non-decorator use cases
|
||
return add_component(func)
|
||
return add_component
|
||
|
||
def analyze_pipes(
|
||
self,
|
||
*,
|
||
keys: List[str] = ["assigns", "requires", "scores", "retokenizes"],
|
||
pretty: bool = False,
|
||
) -> Optional[Dict[str, Any]]:
|
||
"""Analyze the current pipeline components, print a summary of what
|
||
they assign or require and check that all requirements are met.
|
||
|
||
keys (List[str]): The meta values to display in the table. Corresponds
|
||
to values in FactoryMeta, defined by @Language.factory decorator.
|
||
pretty (bool): Pretty-print the results.
|
||
RETURNS (dict): The data.
|
||
"""
|
||
analysis = analyze_pipes(self, keys=keys)
|
||
if pretty:
|
||
print_pipe_analysis(analysis, keys=keys)
|
||
return analysis
|
||
|
||
def get_pipe(self, name: str) -> PipeCallable:
|
||
"""Get a pipeline component for a given component name.
|
||
|
||
name (str): Name of pipeline component to get.
|
||
RETURNS (callable): The pipeline component.
|
||
|
||
DOCS: https://spacy.io/api/language#get_pipe
|
||
"""
|
||
for pipe_name, component in self._components:
|
||
if pipe_name == name:
|
||
return component
|
||
raise KeyError(Errors.E001.format(name=name, opts=self.component_names))
|
||
|
||
def create_pipe(
|
||
self,
|
||
factory_name: str,
|
||
name: Optional[str] = None,
|
||
*,
|
||
config: Dict[str, Any] = SimpleFrozenDict(),
|
||
raw_config: Optional[Config] = None,
|
||
validate: bool = True,
|
||
) -> PipeCallable:
|
||
"""Create a pipeline component. Mostly used internally. To create and
|
||
add a component to the pipeline, you can use nlp.add_pipe.
|
||
|
||
factory_name (str): Name of component factory.
|
||
name (Optional[str]): Optional name to assign to component instance.
|
||
Defaults to factory name if not set.
|
||
config (Dict[str, Any]): Config parameters to use for this component.
|
||
Will be merged with default config, if available.
|
||
raw_config (Optional[Config]): Internals: the non-interpolated config.
|
||
validate (bool): Whether to validate the component config against the
|
||
arguments and types expected by the factory.
|
||
RETURNS (Callable[[Doc], Doc]): The pipeline component.
|
||
|
||
DOCS: https://spacy.io/api/language#create_pipe
|
||
"""
|
||
name = name if name is not None else factory_name
|
||
if not isinstance(config, dict):
|
||
err = Errors.E962.format(style="config", name=name, cfg_type=type(config))
|
||
raise ValueError(err)
|
||
if not srsly.is_json_serializable(config):
|
||
raise ValueError(Errors.E961.format(config=config))
|
||
if not self.has_factory(factory_name):
|
||
err = Errors.E002.format(
|
||
name=factory_name,
|
||
opts=", ".join(self.factory_names),
|
||
method="create_pipe",
|
||
lang=util.get_object_name(self),
|
||
lang_code=self.lang,
|
||
)
|
||
raise ValueError(err)
|
||
pipe_meta = self.get_factory_meta(factory_name)
|
||
# This is unideal, but the alternative would mean you always need to
|
||
# specify the full config settings, which is not really viable.
|
||
if pipe_meta.default_config:
|
||
config = Config(pipe_meta.default_config).merge(config)
|
||
internal_name = self.get_factory_name(factory_name)
|
||
# If the language-specific factory doesn't exist, try again with the
|
||
# not-specific name
|
||
if internal_name not in registry.factories:
|
||
internal_name = factory_name
|
||
# The name allows components to know their pipe name and use it in the
|
||
# losses etc. (even if multiple instances of the same factory are used)
|
||
config = {"nlp": self, "name": name, **config, "@factories": internal_name}
|
||
# We need to create a top-level key because Thinc doesn't allow resolving
|
||
# top-level references to registered functions. Also gives nicer errors.
|
||
cfg = {factory_name: config}
|
||
# We're calling the internal _fill here to avoid constructing the
|
||
# registered functions twice
|
||
resolved = registry.resolve(cfg, validate=validate)
|
||
filled = registry.fill({"cfg": cfg[factory_name]}, validate=validate)["cfg"]
|
||
filled = Config(filled)
|
||
filled["factory"] = factory_name
|
||
filled.pop("@factories", None)
|
||
# Remove the extra values we added because we don't want to keep passing
|
||
# them around, copying them etc.
|
||
filled.pop("nlp", None)
|
||
filled.pop("name", None)
|
||
# Merge the final filled config with the raw config (including non-
|
||
# interpolated variables)
|
||
if raw_config:
|
||
filled = filled.merge(raw_config)
|
||
self._pipe_configs[name] = filled
|
||
return resolved[factory_name]
|
||
|
||
def create_pipe_from_source(
|
||
self, source_name: str, source: "Language", *, name: str
|
||
) -> Tuple[PipeCallable, str]:
|
||
"""Create a pipeline component by copying it from an existing model.
|
||
|
||
source_name (str): Name of the component in the source pipeline.
|
||
source (Language): The source nlp object to copy from.
|
||
name (str): Optional alternative name to use in current pipeline.
|
||
RETURNS (Tuple[Callable[[Doc], Doc], str]): The component and its factory name.
|
||
"""
|
||
# Check source type
|
||
if not isinstance(source, Language):
|
||
raise ValueError(Errors.E945.format(name=source_name, source=type(source)))
|
||
if self.vocab.vectors != source.vocab.vectors:
|
||
warnings.warn(Warnings.W113.format(name=source_name))
|
||
if source_name not in source.component_names:
|
||
raise KeyError(
|
||
Errors.E944.format(
|
||
name=source_name,
|
||
model=f"{source.meta['lang']}_{source.meta['name']}",
|
||
opts=", ".join(source.component_names),
|
||
)
|
||
)
|
||
pipe = source.get_pipe(source_name)
|
||
# There is no actual solution here. Either the component has the right
|
||
# name for the source pipeline or the component has the right name for
|
||
# the current pipeline. This prioritizes the current pipeline.
|
||
if hasattr(pipe, "name"):
|
||
pipe.name = name
|
||
# Make sure the source config is interpolated so we don't end up with
|
||
# orphaned variables in our final config
|
||
source_config = source.config.interpolate()
|
||
pipe_config = util.copy_config(source_config["components"][source_name])
|
||
self._pipe_configs[name] = pipe_config
|
||
if self.vocab.strings != source.vocab.strings:
|
||
for s in source.vocab.strings:
|
||
self.vocab.strings.add(s)
|
||
return pipe, pipe_config["factory"]
|
||
|
||
def add_pipe(
|
||
self,
|
||
factory_name: str,
|
||
name: Optional[str] = None,
|
||
*,
|
||
before: Optional[Union[str, int]] = None,
|
||
after: Optional[Union[str, int]] = None,
|
||
first: Optional[Literal[True]] = None,
|
||
last: Optional[Literal[True]] = None,
|
||
source: Optional["Language"] = None,
|
||
config: Dict[str, Any] = SimpleFrozenDict(),
|
||
raw_config: Optional[Config] = None,
|
||
validate: bool = True,
|
||
) -> PipeCallable:
|
||
"""Add a component to the processing pipeline. Valid components are
|
||
callables that take a `Doc` object, modify it and return it. Only one
|
||
of before/after/first/last can be set. Default behaviour is "last".
|
||
|
||
factory_name (str): Name of the component factory.
|
||
name (str): Name of pipeline component. Overwrites existing
|
||
component.name attribute if available. If no name is set and
|
||
the component exposes no name attribute, component.__name__ is
|
||
used. An error is raised if a name already exists in the pipeline.
|
||
before (Union[str, int]): Name or index of the component to insert new
|
||
component directly before.
|
||
after (Union[str, int]): Name or index of the component to insert new
|
||
component directly after.
|
||
first (Optional[Literal[True]]): If True, insert component first in the pipeline.
|
||
last (Optional[Literal[True]]): If True, insert component last in the pipeline.
|
||
source (Language): Optional loaded nlp object to copy the pipeline
|
||
component from.
|
||
config (Dict[str, Any]): Config parameters to use for this component.
|
||
Will be merged with default config, if available.
|
||
raw_config (Optional[Config]): Internals: the non-interpolated config.
|
||
validate (bool): Whether to validate the component config against the
|
||
arguments and types expected by the factory.
|
||
RETURNS (Callable[[Doc], Doc]): The pipeline component.
|
||
|
||
DOCS: https://spacy.io/api/language#add_pipe
|
||
"""
|
||
if not isinstance(factory_name, str):
|
||
bad_val = repr(factory_name)
|
||
err = Errors.E966.format(component=bad_val, name=name)
|
||
raise ValueError(err)
|
||
name = name if name is not None else factory_name
|
||
if name in self.component_names:
|
||
raise ValueError(Errors.E007.format(name=name, opts=self.component_names))
|
||
# Overriding pipe name in the config is not supported and will be ignored.
|
||
if "name" in config:
|
||
warnings.warn(Warnings.W119.format(name_in_config=config.pop("name")))
|
||
if source is not None:
|
||
# We're loading the component from a model. After loading the
|
||
# component, we know its real factory name
|
||
pipe_component, factory_name = self.create_pipe_from_source(
|
||
factory_name, source, name=name
|
||
)
|
||
else:
|
||
pipe_component = self.create_pipe(
|
||
factory_name,
|
||
name=name,
|
||
config=config,
|
||
raw_config=raw_config,
|
||
validate=validate,
|
||
)
|
||
pipe_index = self._get_pipe_index(before, after, first, last)
|
||
self._pipe_meta[name] = self.get_factory_meta(factory_name)
|
||
self._components.insert(pipe_index, (name, pipe_component))
|
||
self._link_components()
|
||
return pipe_component
|
||
|
||
def _get_pipe_index(
|
||
self,
|
||
before: Optional[Union[str, int]] = None,
|
||
after: Optional[Union[str, int]] = None,
|
||
first: Optional[Literal[True]] = None,
|
||
last: Optional[Literal[True]] = None,
|
||
) -> int:
|
||
"""Determine where to insert a pipeline component based on the before/
|
||
after/first/last values.
|
||
|
||
before (str): Name or index of the component to insert directly before.
|
||
after (str): Name or index of component to insert directly after.
|
||
first (Optional[Literal[True]]): If True, insert component first in the pipeline.
|
||
last (Optional[Literal[True]]): If True, insert component last in the pipeline.
|
||
RETURNS (int): The index of the new pipeline component.
|
||
"""
|
||
if first is not None and first is not True:
|
||
raise ValueError(Errors.E4009.format(attr="first", value=first))
|
||
if last is not None and last is not True:
|
||
raise ValueError(Errors.E4009.format(attr="last", value=last))
|
||
all_args = {"before": before, "after": after, "first": first, "last": last}
|
||
if sum(arg is not None for arg in [before, after, first, last]) >= 2:
|
||
raise ValueError(
|
||
Errors.E006.format(args=all_args, opts=self.component_names)
|
||
)
|
||
if last or not any(value is not None for value in [first, before, after]):
|
||
return len(self._components)
|
||
elif first:
|
||
return 0
|
||
elif isinstance(before, str):
|
||
if before not in self.component_names:
|
||
raise ValueError(
|
||
Errors.E001.format(name=before, opts=self.component_names)
|
||
)
|
||
return self.component_names.index(before)
|
||
elif isinstance(after, str):
|
||
if after not in self.component_names:
|
||
raise ValueError(
|
||
Errors.E001.format(name=after, opts=self.component_names)
|
||
)
|
||
return self.component_names.index(after) + 1
|
||
# We're only accepting indices referring to components that exist
|
||
# (can't just do isinstance here because bools are instance of int, too)
|
||
elif type(before) == int:
|
||
if before >= len(self._components) or before < 0:
|
||
err = Errors.E959.format(
|
||
dir="before", idx=before, opts=self.component_names
|
||
)
|
||
raise ValueError(err)
|
||
return before
|
||
elif type(after) == int:
|
||
if after >= len(self._components) or after < 0:
|
||
err = Errors.E959.format(
|
||
dir="after", idx=after, opts=self.component_names
|
||
)
|
||
raise ValueError(err)
|
||
return after + 1
|
||
raise ValueError(Errors.E006.format(args=all_args, opts=self.component_names))
|
||
|
||
def has_pipe(self, name: str) -> bool:
|
||
"""Check if a component name is present in the pipeline. Equivalent to
|
||
`name in nlp.pipe_names`.
|
||
|
||
name (str): Name of the component.
|
||
RETURNS (bool): Whether a component of the name exists in the pipeline.
|
||
|
||
DOCS: https://spacy.io/api/language#has_pipe
|
||
"""
|
||
return name in self.pipe_names
|
||
|
||
def replace_pipe(
|
||
self,
|
||
name: str,
|
||
factory_name: str,
|
||
*,
|
||
config: Dict[str, Any] = SimpleFrozenDict(),
|
||
validate: bool = True,
|
||
) -> PipeCallable:
|
||
"""Replace a component in the pipeline.
|
||
|
||
name (str): Name of the component to replace.
|
||
factory_name (str): Factory name of replacement component.
|
||
config (Optional[Dict[str, Any]]): Config parameters to use for this
|
||
component. Will be merged with default config, if available.
|
||
validate (bool): Whether to validate the component config against the
|
||
arguments and types expected by the factory.
|
||
RETURNS (Callable[[Doc], Doc]): The new pipeline component.
|
||
|
||
DOCS: https://spacy.io/api/language#replace_pipe
|
||
"""
|
||
if name not in self.component_names:
|
||
raise ValueError(Errors.E001.format(name=name, opts=self.pipe_names))
|
||
if hasattr(factory_name, "__call__"):
|
||
err = Errors.E968.format(component=repr(factory_name), name=name)
|
||
raise ValueError(err)
|
||
# We need to delegate to Language.add_pipe here instead of just writing
|
||
# to Language.pipeline to make sure the configs are handled correctly
|
||
pipe_index = self.component_names.index(name)
|
||
self.remove_pipe(name)
|
||
if not len(self._components) or pipe_index == len(self._components):
|
||
# we have no components to insert before/after, or we're replacing the last component
|
||
return self.add_pipe(
|
||
factory_name, name=name, config=config, validate=validate
|
||
)
|
||
else:
|
||
return self.add_pipe(
|
||
factory_name,
|
||
name=name,
|
||
before=pipe_index,
|
||
config=config,
|
||
validate=validate,
|
||
)
|
||
|
||
def rename_pipe(self, old_name: str, new_name: str) -> None:
|
||
"""Rename a pipeline component.
|
||
|
||
old_name (str): Name of the component to rename.
|
||
new_name (str): New name of the component.
|
||
|
||
DOCS: https://spacy.io/api/language#rename_pipe
|
||
"""
|
||
if old_name not in self.component_names:
|
||
raise ValueError(
|
||
Errors.E001.format(name=old_name, opts=self.component_names)
|
||
)
|
||
if new_name in self.component_names:
|
||
raise ValueError(
|
||
Errors.E007.format(name=new_name, opts=self.component_names)
|
||
)
|
||
i = self.component_names.index(old_name)
|
||
self._components[i] = (new_name, self._components[i][1])
|
||
self._pipe_meta[new_name] = self._pipe_meta.pop(old_name)
|
||
self._pipe_configs[new_name] = self._pipe_configs.pop(old_name)
|
||
# Make sure [initialize] config is adjusted
|
||
if old_name in self._config["initialize"]["components"]:
|
||
init_cfg = self._config["initialize"]["components"].pop(old_name)
|
||
self._config["initialize"]["components"][new_name] = init_cfg
|
||
self._link_components()
|
||
|
||
def remove_pipe(self, name: str) -> Tuple[str, PipeCallable]:
|
||
"""Remove a component from the pipeline.
|
||
|
||
name (str): Name of the component to remove.
|
||
RETURNS (Tuple[str, Callable[[Doc], Doc]]): A `(name, component)` tuple of the removed component.
|
||
|
||
DOCS: https://spacy.io/api/language#remove_pipe
|
||
"""
|
||
if name not in self.component_names:
|
||
raise ValueError(Errors.E001.format(name=name, opts=self.component_names))
|
||
removed = self._components.pop(self.component_names.index(name))
|
||
# We're only removing the component itself from the metas/configs here
|
||
# because factory may be used for something else
|
||
self._pipe_meta.pop(name)
|
||
self._pipe_configs.pop(name)
|
||
self.meta.get("_sourced_vectors_hashes", {}).pop(name, None)
|
||
# Make sure name is removed from the [initialize] config
|
||
if name in self._config["initialize"]["components"]:
|
||
self._config["initialize"]["components"].pop(name)
|
||
# Make sure the name is also removed from the set of disabled components
|
||
if name in self.disabled:
|
||
self._disabled.remove(name)
|
||
self._link_components()
|
||
return removed
|
||
|
||
def disable_pipe(self, name: str) -> None:
|
||
"""Disable a pipeline component. The component will still exist on
|
||
the nlp object, but it won't be run as part of the pipeline. Does
|
||
nothing if the component is already disabled.
|
||
|
||
name (str): The name of the component to disable.
|
||
"""
|
||
if name not in self.component_names:
|
||
raise ValueError(Errors.E001.format(name=name, opts=self.component_names))
|
||
self._disabled.add(name)
|
||
|
||
def enable_pipe(self, name: str) -> None:
|
||
"""Enable a previously disabled pipeline component so it's run as part
|
||
of the pipeline. Does nothing if the component is already enabled.
|
||
|
||
name (str): The name of the component to enable.
|
||
"""
|
||
if name not in self.component_names:
|
||
raise ValueError(Errors.E001.format(name=name, opts=self.component_names))
|
||
if name in self.disabled:
|
||
self._disabled.remove(name)
|
||
|
||
def __call__(
|
||
self,
|
||
text: Union[str, Doc],
|
||
*,
|
||
disable: Iterable[str] = SimpleFrozenList(),
|
||
component_cfg: Optional[Dict[str, Dict[str, Any]]] = None,
|
||
) -> Doc:
|
||
"""Apply the pipeline to some text. The text can span multiple sentences,
|
||
and can contain arbitrary whitespace. Alignment into the original string
|
||
is preserved.
|
||
|
||
text (Union[str, Doc]): If `str`, the text to be processed. If `Doc`,
|
||
the doc will be passed directly to the pipeline, skipping
|
||
`Language.make_doc`.
|
||
disable (List[str]): Names of the pipeline components to disable.
|
||
component_cfg (Dict[str, dict]): An optional dictionary with extra
|
||
keyword arguments for specific components.
|
||
RETURNS (Doc): A container for accessing the annotations.
|
||
|
||
DOCS: https://spacy.io/api/language#call
|
||
"""
|
||
doc = self._ensure_doc(text)
|
||
if component_cfg is None:
|
||
component_cfg = {}
|
||
for name, proc in self.pipeline:
|
||
if name in disable:
|
||
continue
|
||
if not hasattr(proc, "__call__"):
|
||
raise ValueError(Errors.E003.format(component=type(proc), name=name))
|
||
error_handler = self.default_error_handler
|
||
if hasattr(proc, "get_error_handler"):
|
||
error_handler = proc.get_error_handler()
|
||
try:
|
||
doc = proc(doc, **component_cfg.get(name, {})) # type: ignore[call-arg]
|
||
except KeyError as e:
|
||
# This typically happens if a component is not initialized
|
||
raise ValueError(Errors.E109.format(name=name)) from e
|
||
except Exception as e:
|
||
error_handler(name, proc, [doc], e)
|
||
if not isinstance(doc, Doc):
|
||
raise ValueError(Errors.E005.format(name=name, returned_type=type(doc)))
|
||
return doc
|
||
|
||
def distill(
|
||
self,
|
||
teacher: "Language",
|
||
examples: Iterable[Example],
|
||
*,
|
||
drop: float = 0.0,
|
||
sgd: Union[Optimizer, None, Literal[False]] = None,
|
||
losses: Optional[Dict[str, float]] = None,
|
||
component_cfg: Optional[Dict[str, Dict[str, Any]]] = None,
|
||
exclude: Iterable[str] = SimpleFrozenList(),
|
||
annotates: Iterable[str] = SimpleFrozenList(),
|
||
student_to_teacher: Optional[Dict[str, str]] = None,
|
||
):
|
||
"""Distill the models in a student pipeline from a teacher pipeline.
|
||
teacher (Language): Teacher to distill from.
|
||
examples (Iterable[Example]): Distillation examples. The reference
|
||
(teacher) and predicted (student) docs must have the same number of
|
||
tokens and the same orthography.
|
||
drop (float): The dropout rate.
|
||
sgd (Union[Optimizer, None, Literal[False]]): An optimizer. Will
|
||
be created via create_optimizer if 'None'. No optimizer will
|
||
be used when set to 'False'.
|
||
losses (Optional(Dict[str, float])): Dictionary to update with the loss,
|
||
keyed by component.
|
||
component_cfg (Optional[Dict[str, Dict[str, Any]]]): Config parameters
|
||
for specific pipeline components, keyed by component name.
|
||
exclude (Iterable[str]): Names of components that shouldn't be updated.
|
||
annotates (Iterable[str]): Names of components that should set
|
||
annotations on the predicted examples after updating.
|
||
student_to_teacher (Optional[Dict[str, str]]): Map student pipe name to
|
||
teacher pipe name, only needed for pipes where the student pipe
|
||
name does not match the teacher pipe name.
|
||
RETURNS (Dict[str, float]): The updated losses dictionary
|
||
|
||
DOCS: https://spacy.io/api/language#distill
|
||
"""
|
||
if student_to_teacher is None:
|
||
student_to_teacher = {}
|
||
if losses is None:
|
||
losses = {}
|
||
if isinstance(examples, list) and len(examples) == 0:
|
||
return losses
|
||
|
||
validate_distillation_examples(examples, "Language.distill")
|
||
examples = _copy_examples(examples, copy_x=True, copy_y=True)
|
||
|
||
if sgd is None:
|
||
if self._optimizer is None:
|
||
self._optimizer = self.create_optimizer()
|
||
sgd = self._optimizer
|
||
|
||
if component_cfg is None:
|
||
component_cfg = {}
|
||
pipe_kwargs = {}
|
||
for student_name, student_proc in self.pipeline:
|
||
component_cfg.setdefault(student_name, {})
|
||
pipe_kwargs[student_name] = deepcopy(component_cfg[student_name])
|
||
component_cfg[student_name].setdefault("drop", drop)
|
||
pipe_kwargs[student_name].setdefault("batch_size", self.batch_size)
|
||
|
||
teacher_pipes = dict(teacher.pipeline)
|
||
for student_name, student_proc in self.pipeline:
|
||
if student_name in annotates:
|
||
for doc, eg in zip(
|
||
_pipe(
|
||
(eg.predicted for eg in examples),
|
||
proc=student_proc,
|
||
name=student_name,
|
||
default_error_handler=self.default_error_handler,
|
||
kwargs=pipe_kwargs[student_name],
|
||
),
|
||
examples,
|
||
):
|
||
eg.predicted = doc
|
||
|
||
if (
|
||
student_name not in exclude
|
||
and isinstance(student_proc, ty.DistillableComponent)
|
||
and student_proc.is_distillable
|
||
):
|
||
# A missing teacher pipe is not an error, some student pipes
|
||
# do not need a teacher, such as tok2vec layer losses.
|
||
teacher_name = (
|
||
student_to_teacher[student_name]
|
||
if student_name in student_to_teacher
|
||
else student_name
|
||
)
|
||
teacher_pipe = teacher_pipes.get(teacher_name, None)
|
||
student_proc.distill(
|
||
teacher_pipe,
|
||
examples,
|
||
sgd=None,
|
||
losses=losses,
|
||
**component_cfg[student_name],
|
||
)
|
||
|
||
# Only finish the update after all component updates are done. Some
|
||
# components may share weights (such as tok2vec) and we only want
|
||
# to apply weight updates after all gradients are accumulated.
|
||
for student_name, student_proc in self.pipeline:
|
||
if (
|
||
student_name not in exclude
|
||
and isinstance(student_proc, ty.DistillableComponent)
|
||
and student_proc.is_distillable
|
||
and sgd not in (None, False)
|
||
):
|
||
student_proc.finish_update(sgd)
|
||
|
||
return losses
|
||
|
||
def disable_pipes(self, *names) -> "DisabledPipes":
|
||
"""Disable one or more pipeline components. If used as a context
|
||
manager, the pipeline will be restored to the initial state at the end
|
||
of the block. Otherwise, a DisabledPipes object is returned, that has
|
||
a `.restore()` method you can use to undo your changes.
|
||
|
||
This method has been deprecated since 3.0
|
||
"""
|
||
warnings.warn(Warnings.W096, DeprecationWarning)
|
||
if len(names) == 1 and isinstance(names[0], (list, tuple)):
|
||
names = names[0] # type: ignore[assignment] # support list of names instead of spread
|
||
return self.select_pipes(disable=names)
|
||
|
||
def select_pipes(
|
||
self,
|
||
*,
|
||
disable: Optional[Union[str, Iterable[str]]] = None,
|
||
enable: Optional[Union[str, Iterable[str]]] = None,
|
||
) -> "DisabledPipes":
|
||
"""Disable one or more pipeline components. If used as a context
|
||
manager, the pipeline will be restored to the initial state at the end
|
||
of the block. Otherwise, a DisabledPipes object is returned, that has
|
||
a `.restore()` method you can use to undo your changes.
|
||
|
||
disable (str or iterable): The name(s) of the pipes to disable
|
||
enable (str or iterable): The name(s) of the pipes to enable - all others will be disabled
|
||
|
||
DOCS: https://spacy.io/api/language#select_pipes
|
||
"""
|
||
if enable is None and disable is None:
|
||
raise ValueError(Errors.E991)
|
||
if isinstance(disable, str):
|
||
disable = [disable]
|
||
if enable is not None:
|
||
if isinstance(enable, str):
|
||
enable = [enable]
|
||
to_disable = [pipe for pipe in self.pipe_names if pipe not in enable]
|
||
# raise an error if the enable and disable keywords are not consistent
|
||
if disable is not None and disable != to_disable:
|
||
raise ValueError(
|
||
Errors.E992.format(
|
||
enable=enable, disable=disable, names=self.pipe_names
|
||
)
|
||
)
|
||
disable = to_disable
|
||
assert disable is not None
|
||
# DisabledPipes will restore the pipes in 'disable' when it's done, so we need to exclude
|
||
# those pipes that were already disabled.
|
||
disable = [d for d in disable if d not in self._disabled]
|
||
return DisabledPipes(self, disable)
|
||
|
||
def make_doc(self, text: str) -> Doc:
|
||
"""Turn a text into a Doc object.
|
||
|
||
text (str): The text to process.
|
||
RETURNS (Doc): The processed doc.
|
||
"""
|
||
if len(text) > self.max_length:
|
||
raise ValueError(
|
||
Errors.E088.format(length=len(text), max_length=self.max_length)
|
||
)
|
||
return self.tokenizer(text)
|
||
|
||
def _ensure_doc(self, doc_like: Union[str, Doc, bytes]) -> Doc:
|
||
"""Create a Doc if need be, or raise an error if the input is not
|
||
a Doc, string, or a byte array (generated by Doc.to_bytes())."""
|
||
if isinstance(doc_like, Doc):
|
||
return doc_like
|
||
if isinstance(doc_like, str):
|
||
return self.make_doc(doc_like)
|
||
if isinstance(doc_like, bytes):
|
||
return Doc(self.vocab).from_bytes(doc_like)
|
||
raise ValueError(Errors.E1041.format(type=type(doc_like)))
|
||
|
||
def _ensure_doc_with_context(
|
||
self, doc_like: Union[str, Doc, bytes], context: _AnyContext
|
||
) -> Doc:
|
||
"""Call _ensure_doc to generate a Doc and set its context object."""
|
||
doc = self._ensure_doc(doc_like)
|
||
doc._context = context
|
||
return doc
|
||
|
||
def update(
|
||
self,
|
||
examples: Iterable[Example],
|
||
_: Optional[Any] = None,
|
||
*,
|
||
drop: float = 0.0,
|
||
sgd: Union[Optimizer, None, Literal[False]] = None,
|
||
losses: Optional[Dict[str, float]] = None,
|
||
component_cfg: Optional[Dict[str, Dict[str, Any]]] = None,
|
||
exclude: Iterable[str] = SimpleFrozenList(),
|
||
annotates: Iterable[str] = SimpleFrozenList(),
|
||
):
|
||
"""Update the models in the pipeline.
|
||
|
||
examples (Iterable[Example]): A batch of examples
|
||
_: Should not be set - serves to catch backwards-incompatible scripts.
|
||
drop (float): The dropout rate.
|
||
sgd (Union[Optimizer, None, Literal[False]]): An optimizer. Will
|
||
be created via create_optimizer if 'None'. No optimizer will
|
||
be used when set to 'False'.
|
||
losses (Dict[str, float]): Dictionary to update with the loss, keyed by
|
||
component.
|
||
component_cfg (Dict[str, Dict]): Config parameters for specific pipeline
|
||
components, keyed by component name.
|
||
exclude (Iterable[str]): Names of components that shouldn't be updated.
|
||
annotates (Iterable[str]): Names of components that should set
|
||
annotations on the predicted examples after updating.
|
||
RETURNS (Dict[str, float]): The updated losses dictionary
|
||
|
||
DOCS: https://spacy.io/api/language#update
|
||
"""
|
||
if _ is not None:
|
||
raise ValueError(Errors.E989)
|
||
if losses is None:
|
||
losses = {}
|
||
if isinstance(examples, list) and len(examples) == 0:
|
||
return losses
|
||
validate_examples(examples, "Language.update")
|
||
examples = _copy_examples(examples)
|
||
if sgd is None:
|
||
if self._optimizer is None:
|
||
self._optimizer = self.create_optimizer()
|
||
sgd = self._optimizer
|
||
if component_cfg is None:
|
||
component_cfg = {}
|
||
pipe_kwargs = {}
|
||
for i, (name, proc) in enumerate(self.pipeline):
|
||
component_cfg.setdefault(name, {})
|
||
pipe_kwargs[name] = deepcopy(component_cfg[name])
|
||
component_cfg[name].setdefault("drop", drop)
|
||
pipe_kwargs[name].setdefault("batch_size", self.batch_size)
|
||
for name, proc in self.pipeline:
|
||
if (
|
||
name not in exclude
|
||
and isinstance(proc, ty.TrainableComponent)
|
||
and proc.is_trainable
|
||
):
|
||
proc.update(examples, sgd=None, losses=losses, **component_cfg[name])
|
||
if name in annotates:
|
||
for doc, eg in zip(
|
||
_pipe(
|
||
(eg.predicted for eg in examples),
|
||
proc=proc,
|
||
name=name,
|
||
default_error_handler=self.default_error_handler,
|
||
kwargs=pipe_kwargs[name],
|
||
),
|
||
examples,
|
||
):
|
||
eg.predicted = doc
|
||
# Only finish the update after all component updates are done. Some
|
||
# components may share weights (such as tok2vec) and we only want
|
||
# to apply weight updates after all gradients are accumulated.
|
||
for name, proc in self.pipeline:
|
||
if (
|
||
name not in exclude
|
||
and isinstance(proc, ty.TrainableComponent)
|
||
and proc.is_trainable
|
||
and sgd not in (None, False)
|
||
):
|
||
proc.finish_update(sgd)
|
||
|
||
return losses
|
||
|
||
def rehearse(
|
||
self,
|
||
examples: Iterable[Example],
|
||
*,
|
||
sgd: Optional[Optimizer] = None,
|
||
losses: Optional[Dict[str, float]] = None,
|
||
component_cfg: Optional[Dict[str, Dict[str, Any]]] = None,
|
||
exclude: Iterable[str] = SimpleFrozenList(),
|
||
) -> Dict[str, float]:
|
||
"""Make a "rehearsal" update to the models in the pipeline, to prevent
|
||
forgetting. Rehearsal updates run an initial copy of the model over some
|
||
data, and update the model so its current predictions are more like the
|
||
initial ones. This is useful for keeping a pretrained model on-track,
|
||
even if you're updating it with a smaller set of examples.
|
||
|
||
examples (Iterable[Example]): A batch of `Example` objects.
|
||
sgd (Optional[Optimizer]): An optimizer.
|
||
component_cfg (Dict[str, Dict]): Config parameters for specific pipeline
|
||
components, keyed by component name.
|
||
exclude (Iterable[str]): Names of components that shouldn't be updated.
|
||
RETURNS (dict): Results from the update.
|
||
|
||
EXAMPLE:
|
||
>>> raw_text_batches = minibatch(raw_texts)
|
||
>>> for labelled_batch in minibatch(examples):
|
||
>>> nlp.update(labelled_batch)
|
||
>>> raw_batch = [Example.from_dict(nlp.make_doc(text), {}) for text in next(raw_text_batches)]
|
||
>>> nlp.rehearse(raw_batch)
|
||
|
||
DOCS: https://spacy.io/api/language#rehearse
|
||
"""
|
||
if losses is None:
|
||
losses = {}
|
||
if isinstance(examples, list) and len(examples) == 0:
|
||
return losses
|
||
validate_examples(examples, "Language.rehearse")
|
||
if sgd is None:
|
||
if self._optimizer is None:
|
||
self._optimizer = self.create_optimizer()
|
||
sgd = self._optimizer
|
||
pipes = list(self.pipeline)
|
||
random.shuffle(pipes)
|
||
if component_cfg is None:
|
||
component_cfg = {}
|
||
grads = {}
|
||
|
||
def get_grads(key, W, dW):
|
||
grads[key] = (W, dW)
|
||
return W, dW
|
||
|
||
get_grads.learn_rate = sgd.learn_rate # type: ignore[attr-defined, union-attr]
|
||
get_grads.b1 = sgd.b1 # type: ignore[attr-defined, union-attr]
|
||
get_grads.b2 = sgd.b2 # type: ignore[attr-defined, union-attr]
|
||
for name, proc in pipes:
|
||
if name in exclude or not hasattr(proc, "rehearse"):
|
||
continue
|
||
grads = {}
|
||
proc.rehearse( # type: ignore[attr-defined]
|
||
examples, sgd=get_grads, losses=losses, **component_cfg.get(name, {})
|
||
)
|
||
for key, (W, dW) in grads.items():
|
||
sgd(key, W, dW) # type: ignore[call-arg, misc]
|
||
return losses
|
||
|
||
def initialize(
|
||
self,
|
||
get_examples: Optional[Callable[[], Iterable[Example]]] = None,
|
||
*,
|
||
labels: Optional[Dict[str, Any]] = None,
|
||
sgd: Optional[Optimizer] = None,
|
||
) -> Optimizer:
|
||
"""Initialize the pipe for training, using data examples if available.
|
||
|
||
get_examples (Callable[[], Iterable[Example]]): Optional function that
|
||
returns gold-standard Example objects.
|
||
labels (Optional[Dict[str, Any]]): Labels to pass to pipe initialization,
|
||
using the names of the pipes as keys. Overrides labels that are in
|
||
the model configuration.
|
||
sgd (Optional[Optimizer]): An optimizer to use for updates. If not
|
||
provided, will be created using the .create_optimizer() method.
|
||
RETURNS (thinc.api.Optimizer): The optimizer.
|
||
|
||
DOCS: https://spacy.io/api/language#initialize
|
||
"""
|
||
if get_examples is None:
|
||
util.logger.debug(
|
||
"No 'get_examples' callback provided to 'Language.initialize', creating dummy examples"
|
||
)
|
||
doc = Doc(self.vocab, words=["x", "y", "z"])
|
||
|
||
def get_examples():
|
||
return [Example.from_dict(doc, {})]
|
||
|
||
if not hasattr(get_examples, "__call__"):
|
||
err = Errors.E930.format(
|
||
method="Language.initialize", obj=type(get_examples)
|
||
)
|
||
raise TypeError(err)
|
||
# Make sure the config is interpolated so we can resolve subsections
|
||
config = self.config.interpolate()
|
||
# These are the settings provided in the [initialize] block in the config
|
||
I = registry.resolve(config["initialize"], schema=ConfigSchemaInit)
|
||
before_init = I["before_init"]
|
||
if before_init is not None:
|
||
before_init(self)
|
||
try:
|
||
init_vocab(
|
||
self, data=I["vocab_data"], lookups=I["lookups"], vectors=I["vectors"]
|
||
)
|
||
except IOError:
|
||
raise IOError(Errors.E884.format(vectors=I["vectors"]))
|
||
if self.vocab.vectors.shape[1] >= 1:
|
||
ops = get_current_ops()
|
||
self.vocab.vectors.to_ops(ops)
|
||
if hasattr(self.tokenizer, "initialize"):
|
||
tok_settings = validate_init_settings(
|
||
self.tokenizer.initialize, # type: ignore[union-attr]
|
||
I["tokenizer"],
|
||
section="tokenizer",
|
||
name="tokenizer",
|
||
)
|
||
self.tokenizer.initialize(get_examples, nlp=self, **tok_settings) # type: ignore[union-attr]
|
||
for name, proc in self.pipeline:
|
||
if isinstance(proc, ty.InitializableComponent):
|
||
p_settings = I["components"].get(name, {})
|
||
if labels is not None and name in labels:
|
||
p_settings["labels"] = labels[name]
|
||
p_settings = validate_init_settings(
|
||
proc.initialize, p_settings, section="components", name=name
|
||
)
|
||
proc.initialize(get_examples, nlp=self, **p_settings)
|
||
pretrain_cfg = config.get("pretraining")
|
||
if pretrain_cfg:
|
||
P = registry.resolve(pretrain_cfg, schema=ConfigSchemaPretrain)
|
||
init_tok2vec(self, P, I)
|
||
self._link_components()
|
||
self._optimizer = sgd
|
||
if sgd is not None:
|
||
self._optimizer = sgd
|
||
elif self._optimizer is None:
|
||
self._optimizer = self.create_optimizer()
|
||
after_init = I["after_init"]
|
||
if after_init is not None:
|
||
after_init(self)
|
||
return self._optimizer
|
||
|
||
def resume_training(self, *, sgd: Optional[Optimizer] = None) -> Optimizer:
|
||
"""Continue training a pretrained model.
|
||
|
||
Create and return an optimizer, and initialize "rehearsal" for any pipeline
|
||
component that has a .rehearse() method. Rehearsal is used to prevent
|
||
models from "forgetting" their initialized "knowledge". To perform
|
||
rehearsal, collect samples of text you want the models to retain performance
|
||
on, and call nlp.rehearse() with a batch of Example objects.
|
||
|
||
RETURNS (Optimizer): The optimizer.
|
||
|
||
DOCS: https://spacy.io/api/language#resume_training
|
||
"""
|
||
ops = get_current_ops()
|
||
if self.vocab.vectors.shape[1] >= 1:
|
||
self.vocab.vectors.to_ops(ops)
|
||
for name, proc in self.pipeline:
|
||
if hasattr(proc, "_rehearsal_model"):
|
||
proc._rehearsal_model = deepcopy(proc.model) # type: ignore[attr-defined]
|
||
if sgd is not None:
|
||
self._optimizer = sgd
|
||
elif self._optimizer is None:
|
||
self._optimizer = self.create_optimizer()
|
||
return self._optimizer
|
||
|
||
def set_error_handler(
|
||
self,
|
||
error_handler: Callable[[str, PipeCallable, List[Doc], Exception], NoReturn],
|
||
):
|
||
"""Set an error handler object for all the components in the pipeline
|
||
that implement a set_error_handler function.
|
||
|
||
error_handler (Callable[[str, Callable[[Doc], Doc], List[Doc], Exception], NoReturn]):
|
||
Function that deals with a failing batch of documents. This callable
|
||
function should take in the component's name, the component itself,
|
||
the offending batch of documents, and the exception that was thrown.
|
||
DOCS: https://spacy.io/api/language#set_error_handler
|
||
"""
|
||
self.default_error_handler = error_handler
|
||
for name, pipe in self.pipeline:
|
||
if hasattr(pipe, "set_error_handler"):
|
||
pipe.set_error_handler(error_handler)
|
||
|
||
def evaluate(
|
||
self,
|
||
examples: Iterable[Example],
|
||
*,
|
||
batch_size: Optional[int] = None,
|
||
scorer: Optional[Scorer] = None,
|
||
component_cfg: Optional[Dict[str, Dict[str, Any]]] = None,
|
||
scorer_cfg: Optional[Dict[str, Any]] = None,
|
||
per_component: bool = False,
|
||
) -> Dict[str, Any]:
|
||
"""Evaluate a model's pipeline components.
|
||
|
||
examples (Iterable[Example]): `Example` objects.
|
||
batch_size (Optional[int]): Batch size to use.
|
||
scorer (Optional[Scorer]): Scorer to use. If not passed in, a new one
|
||
will be created.
|
||
component_cfg (dict): An optional dictionary with extra keyword
|
||
arguments for specific components.
|
||
scorer_cfg (dict): An optional dictionary with extra keyword arguments
|
||
for the scorer.
|
||
per_component (bool): Whether to return the scores keyed by component
|
||
name. Defaults to False.
|
||
|
||
RETURNS (Scorer): The scorer containing the evaluation results.
|
||
|
||
DOCS: https://spacy.io/api/language#evaluate
|
||
"""
|
||
examples = list(examples)
|
||
validate_examples(examples, "Language.evaluate")
|
||
examples = _copy_examples(examples)
|
||
if batch_size is None:
|
||
batch_size = self.batch_size
|
||
if component_cfg is None:
|
||
component_cfg = {}
|
||
if scorer_cfg is None:
|
||
scorer_cfg = {}
|
||
if scorer is None:
|
||
kwargs = dict(scorer_cfg)
|
||
kwargs.setdefault("nlp", self)
|
||
scorer = Scorer(**kwargs)
|
||
# reset annotation in predicted docs and time tokenization
|
||
start_time = timer()
|
||
# this is purely for timing
|
||
for eg in examples:
|
||
self.make_doc(eg.reference.text)
|
||
# apply all pipeline components
|
||
docs = self.pipe(
|
||
(eg.predicted for eg in examples),
|
||
batch_size=batch_size,
|
||
component_cfg=component_cfg,
|
||
)
|
||
for eg, doc in zip(examples, docs):
|
||
eg.predicted = doc
|
||
end_time = timer()
|
||
results = scorer.score(examples, per_component=per_component)
|
||
n_words = sum(len(eg.predicted) for eg in examples)
|
||
results["speed"] = n_words / (end_time - start_time)
|
||
return results
|
||
|
||
def create_optimizer(self):
|
||
"""Create an optimizer, usually using the [training.optimizer] config."""
|
||
subconfig = {"optimizer": self.config["training"]["optimizer"]}
|
||
return registry.resolve(subconfig)["optimizer"]
|
||
|
||
@contextmanager
|
||
def use_params(self, params: Optional[dict]):
|
||
"""Replace weights of models in the pipeline with those provided in the
|
||
params dictionary. Can be used as a contextmanager, in which case,
|
||
models go back to their original weights after the block.
|
||
|
||
params (dict): A dictionary of parameters keyed by model ID.
|
||
|
||
EXAMPLE:
|
||
>>> with nlp.use_params(optimizer.averages):
|
||
>>> nlp.to_disk("/tmp/checkpoint")
|
||
|
||
DOCS: https://spacy.io/api/language#use_params
|
||
"""
|
||
if not params:
|
||
yield
|
||
else:
|
||
contexts = [
|
||
pipe.use_params(params) # type: ignore[attr-defined]
|
||
for name, pipe in self.pipeline
|
||
if hasattr(pipe, "use_params") and hasattr(pipe, "model")
|
||
]
|
||
# TODO: Having trouble with contextlib
|
||
# Workaround: these aren't actually context managers atm.
|
||
for context in contexts:
|
||
try:
|
||
next(context)
|
||
except StopIteration:
|
||
pass
|
||
yield
|
||
for context in contexts:
|
||
try:
|
||
next(context)
|
||
except StopIteration:
|
||
pass
|
||
|
||
@overload
|
||
def pipe(
|
||
self,
|
||
texts: Iterable[Union[str, Doc]],
|
||
*,
|
||
as_tuples: Literal[False] = ...,
|
||
batch_size: Optional[int] = ...,
|
||
disable: Iterable[str] = ...,
|
||
component_cfg: Optional[Dict[str, Dict[str, Any]]] = ...,
|
||
n_process: int = ...,
|
||
) -> Iterator[Doc]:
|
||
...
|
||
|
||
@overload
|
||
def pipe( # noqa: F811
|
||
self,
|
||
texts: Iterable[Tuple[Union[str, Doc], _AnyContext]],
|
||
*,
|
||
as_tuples: Literal[True] = ...,
|
||
batch_size: Optional[int] = ...,
|
||
disable: Iterable[str] = ...,
|
||
component_cfg: Optional[Dict[str, Dict[str, Any]]] = ...,
|
||
n_process: int = ...,
|
||
) -> Iterator[Tuple[Doc, _AnyContext]]:
|
||
...
|
||
|
||
def pipe( # noqa: F811
|
||
self,
|
||
texts: Union[
|
||
Iterable[Union[str, Doc]], Iterable[Tuple[Union[str, Doc], _AnyContext]]
|
||
],
|
||
*,
|
||
as_tuples: bool = False,
|
||
batch_size: Optional[int] = None,
|
||
disable: Iterable[str] = SimpleFrozenList(),
|
||
component_cfg: Optional[Dict[str, Dict[str, Any]]] = None,
|
||
n_process: int = 1,
|
||
) -> Union[Iterator[Doc], Iterator[Tuple[Doc, _AnyContext]]]:
|
||
"""Process texts as a stream, and yield `Doc` objects in order.
|
||
|
||
texts (Iterable[Union[str, Doc]]): A sequence of texts or docs to
|
||
process.
|
||
as_tuples (bool): If set to True, inputs should be a sequence of
|
||
(text, context) tuples. Output will then be a sequence of
|
||
(doc, context) tuples. Defaults to False.
|
||
batch_size (Optional[int]): The number of texts to buffer.
|
||
disable (List[str]): Names of the pipeline components to disable.
|
||
component_cfg (Dict[str, Dict]): An optional dictionary with extra keyword
|
||
arguments for specific components.
|
||
n_process (int): Number of processors to process texts. If -1, set `multiprocessing.cpu_count()`.
|
||
YIELDS (Doc): Documents in the order of the original text.
|
||
|
||
DOCS: https://spacy.io/api/language#pipe
|
||
"""
|
||
if as_tuples:
|
||
texts = cast(Iterable[Tuple[Union[str, Doc], _AnyContext]], texts)
|
||
docs_with_contexts = (
|
||
self._ensure_doc_with_context(text, context) for text, context in texts
|
||
)
|
||
docs = self.pipe(
|
||
docs_with_contexts,
|
||
batch_size=batch_size,
|
||
disable=disable,
|
||
n_process=n_process,
|
||
component_cfg=component_cfg,
|
||
)
|
||
for doc in docs:
|
||
context = doc._context
|
||
doc._context = None
|
||
yield (doc, context)
|
||
return
|
||
|
||
texts = cast(Iterable[Union[str, Doc]], texts)
|
||
|
||
# Set argument defaults
|
||
if n_process == -1:
|
||
n_process = mp.cpu_count()
|
||
if component_cfg is None:
|
||
component_cfg = {}
|
||
if batch_size is None:
|
||
batch_size = self.batch_size
|
||
|
||
pipes = (
|
||
[]
|
||
) # contains functools.partial objects to easily create multiprocess worker.
|
||
for name, proc in self.pipeline:
|
||
if name in disable:
|
||
continue
|
||
kwargs = component_cfg.get(name, {})
|
||
# Allow component_cfg to overwrite the top-level kwargs.
|
||
kwargs.setdefault("batch_size", batch_size)
|
||
f = functools.partial(
|
||
_pipe,
|
||
proc=proc,
|
||
name=name,
|
||
kwargs=kwargs,
|
||
default_error_handler=self.default_error_handler,
|
||
)
|
||
pipes.append(f)
|
||
|
||
if n_process != 1:
|
||
if self._has_gpu_model(disable):
|
||
warnings.warn(Warnings.W114)
|
||
|
||
docs = self._multiprocessing_pipe(texts, pipes, n_process, batch_size)
|
||
else:
|
||
# if n_process == 1, no processes are forked.
|
||
docs = (self._ensure_doc(text) for text in texts)
|
||
for pipe in pipes:
|
||
docs = pipe(docs)
|
||
for doc in docs:
|
||
yield doc
|
||
|
||
def _has_gpu_model(self, disable: Iterable[str]):
|
||
for name, proc in self.pipeline:
|
||
is_trainable = hasattr(proc, "is_trainable") and proc.is_trainable # type: ignore
|
||
if name in disable or not is_trainable:
|
||
continue
|
||
|
||
if hasattr(proc, "model") and hasattr(proc.model, "ops") and isinstance(proc.model.ops, CupyOps): # type: ignore
|
||
return True
|
||
|
||
return False
|
||
|
||
def _multiprocessing_pipe(
|
||
self,
|
||
texts: Iterable[Union[str, Doc]],
|
||
pipes: Iterable[Callable[..., Iterator[Doc]]],
|
||
n_process: int,
|
||
batch_size: int,
|
||
) -> Iterator[Doc]:
|
||
def prepare_input(
|
||
texts: Iterable[Union[str, Doc]]
|
||
) -> Iterable[Tuple[Union[str, bytes], _AnyContext]]:
|
||
# Serialize Doc inputs to bytes to avoid incurring pickling
|
||
# overhead when they are passed to child processes. Also yield
|
||
# any context objects they might have separately (as they are not serialized).
|
||
for doc_like in texts:
|
||
if isinstance(doc_like, Doc):
|
||
yield (doc_like.to_bytes(), cast(_AnyContext, doc_like._context))
|
||
else:
|
||
yield (doc_like, cast(_AnyContext, None))
|
||
|
||
serialized_texts_with_ctx = prepare_input(texts) # type: ignore
|
||
# raw_texts is used later to stop iteration.
|
||
texts, raw_texts = itertools.tee(serialized_texts_with_ctx) # type: ignore
|
||
# for sending texts to worker
|
||
texts_q: List[mp.Queue] = [mp.Queue() for _ in range(n_process)]
|
||
# for receiving byte-encoded docs from worker
|
||
bytedocs_recv_ch, bytedocs_send_ch = zip(
|
||
*[mp.Pipe(False) for _ in range(n_process)]
|
||
)
|
||
|
||
batch_texts = util.minibatch(texts, batch_size)
|
||
# Sender sends texts to the workers.
|
||
# This is necessary to properly handle infinite length of texts.
|
||
# (In this case, all data cannot be sent to the workers at once)
|
||
sender = _Sender(batch_texts, texts_q, chunk_size=n_process)
|
||
# send twice to make process busy
|
||
sender.send()
|
||
sender.send()
|
||
|
||
procs = [
|
||
mp.Process(
|
||
target=_apply_pipes,
|
||
args=(
|
||
self._ensure_doc_with_context,
|
||
pipes,
|
||
rch,
|
||
sch,
|
||
Underscore.get_state(),
|
||
),
|
||
)
|
||
for rch, sch in zip(texts_q, bytedocs_send_ch)
|
||
]
|
||
for proc in procs:
|
||
proc.start()
|
||
|
||
# Cycle channels not to break the order of docs.
|
||
# The received object is a batch of byte-encoded docs, so flatten them with chain.from_iterable.
|
||
byte_tuples = chain.from_iterable(
|
||
recv.recv() for recv in cycle(bytedocs_recv_ch)
|
||
)
|
||
try:
|
||
for i, (_, (byte_doc, context, byte_error)) in enumerate(
|
||
zip(raw_texts, byte_tuples), 1
|
||
):
|
||
if byte_doc is not None:
|
||
doc = Doc(self.vocab).from_bytes(byte_doc)
|
||
doc._context = context
|
||
yield doc
|
||
elif byte_error is not None:
|
||
error = srsly.msgpack_loads(byte_error)
|
||
self.default_error_handler(
|
||
None, None, None, ValueError(Errors.E871.format(error=error))
|
||
)
|
||
if i % batch_size == 0:
|
||
# tell `sender` that one batch was consumed.
|
||
sender.step()
|
||
finally:
|
||
for proc in procs:
|
||
proc.terminate()
|
||
|
||
def _link_components(self) -> None:
|
||
"""Register 'listeners' within pipeline components, to allow them to
|
||
effectively share weights.
|
||
"""
|
||
# I had thought, "Why do we do this inside the Language object? Shouldn't
|
||
# it be the tok2vec/transformer/etc's job?
|
||
# The problem is we need to do it during deserialization...And the
|
||
# components don't receive the pipeline then. So this does have to be
|
||
# here :(
|
||
# First, fix up all the internal component names in case they have
|
||
# gotten out of sync due to sourcing components from different
|
||
# pipelines, since find_listeners uses proc2.name for the listener
|
||
# map.
|
||
for name, proc in self.pipeline:
|
||
if hasattr(proc, "name"):
|
||
proc.name = name
|
||
for i, (name1, proc1) in enumerate(self.pipeline):
|
||
if isinstance(proc1, ty.ListenedToComponent):
|
||
proc1.listener_map = {}
|
||
for name2, proc2 in self.pipeline[i + 1 :]:
|
||
proc1.find_listeners(proc2)
|
||
|
||
@classmethod
|
||
def from_config(
|
||
cls,
|
||
config: Union[Dict[str, Any], Config] = {},
|
||
*,
|
||
vocab: Union[Vocab, bool] = True,
|
||
disable: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES,
|
||
enable: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES,
|
||
exclude: Union[str, Iterable[str]] = _DEFAULT_EMPTY_PIPES,
|
||
meta: Dict[str, Any] = SimpleFrozenDict(),
|
||
auto_fill: bool = True,
|
||
validate: bool = True,
|
||
) -> "Language":
|
||
"""Create the nlp object from a loaded config. Will set up the tokenizer
|
||
and language data, add pipeline components etc. If no config is provided,
|
||
the default config of the given language is used.
|
||
|
||
config (Dict[str, Any] / Config): The loaded config.
|
||
vocab (Vocab): A Vocab object. If True, a vocab is created.
|
||
disable (Union[str, Iterable[str]]): Name(s) of pipeline component(s) to disable.
|
||
Disabled pipes will be loaded but they won't be run unless you
|
||
explicitly enable them by calling nlp.enable_pipe.
|
||
enable (Union[str, Iterable[str]]): Name(s) of pipeline component(s) to enable. All other
|
||
pipes will be disabled (and can be enabled using `nlp.enable_pipe`).
|
||
exclude (Union[str, Iterable[str]]): Name(s) of pipeline component(s) to exclude.
|
||
Excluded components won't be loaded.
|
||
meta (Dict[str, Any]): Meta overrides for nlp.meta.
|
||
auto_fill (bool): Automatically fill in missing values in config based
|
||
on defaults and function argument annotations.
|
||
validate (bool): Validate the component config and arguments against
|
||
the types expected by the factory.
|
||
RETURNS (Language): The initialized Language class.
|
||
|
||
DOCS: https://spacy.io/api/language#from_config
|
||
"""
|
||
if auto_fill:
|
||
config = Config(
|
||
cls.default_config, section_order=CONFIG_SECTION_ORDER
|
||
).merge(config)
|
||
if "nlp" not in config:
|
||
raise ValueError(Errors.E985.format(config=config))
|
||
config_lang = config["nlp"].get("lang")
|
||
if config_lang is not None and config_lang != cls.lang:
|
||
raise ValueError(
|
||
Errors.E958.format(
|
||
bad_lang_code=config["nlp"]["lang"],
|
||
lang_code=cls.lang,
|
||
lang=util.get_object_name(cls),
|
||
)
|
||
)
|
||
config["nlp"]["lang"] = cls.lang
|
||
# This isn't very elegant, but we remove the [components] block here to prevent
|
||
# it from getting resolved (causes problems because we expect to pass in
|
||
# the nlp and name args for each component). If we're auto-filling, we're
|
||
# using the nlp.config with all defaults.
|
||
config = util.copy_config(config)
|
||
orig_pipeline = config.pop("components", {})
|
||
orig_distill = config.pop("distillation", None)
|
||
orig_pretraining = config.pop("pretraining", None)
|
||
config["components"] = {}
|
||
if auto_fill:
|
||
filled = registry.fill(config, validate=validate, schema=ConfigSchema)
|
||
else:
|
||
filled = config
|
||
filled["components"] = orig_pipeline
|
||
config["components"] = orig_pipeline
|
||
if orig_distill is not None:
|
||
filled["distillation"] = orig_distill
|
||
config["distillation"] = orig_distill
|
||
if orig_pretraining is not None:
|
||
filled["pretraining"] = orig_pretraining
|
||
config["pretraining"] = orig_pretraining
|
||
resolved_nlp = registry.resolve(
|
||
filled["nlp"], validate=validate, schema=ConfigSchemaNlp
|
||
)
|
||
create_tokenizer = resolved_nlp["tokenizer"]
|
||
before_creation = resolved_nlp["before_creation"]
|
||
after_creation = resolved_nlp["after_creation"]
|
||
after_pipeline_creation = resolved_nlp["after_pipeline_creation"]
|
||
lang_cls = cls
|
||
if before_creation is not None:
|
||
lang_cls = before_creation(cls)
|
||
if (
|
||
not isinstance(lang_cls, type)
|
||
or not issubclass(lang_cls, cls)
|
||
or lang_cls is not cls
|
||
):
|
||
raise ValueError(Errors.E943.format(value=type(lang_cls)))
|
||
|
||
# Warn about require_gpu usage in jupyter notebook
|
||
warn_if_jupyter_cupy()
|
||
|
||
# Note that we don't load vectors here, instead they get loaded explicitly
|
||
# inside stuff like the spacy train function. If we loaded them here,
|
||
# then we would load them twice at runtime: once when we make from config,
|
||
# and then again when we load from disk.
|
||
nlp = lang_cls(vocab=vocab, create_tokenizer=create_tokenizer, meta=meta)
|
||
if after_creation is not None:
|
||
nlp = after_creation(nlp)
|
||
if not isinstance(nlp, cls):
|
||
raise ValueError(Errors.E942.format(name="creation", value=type(nlp)))
|
||
# To create the components we need to use the final interpolated config
|
||
# so all values are available (if component configs use variables).
|
||
# Later we replace the component config with the raw config again.
|
||
interpolated = filled.interpolate() if not filled.is_interpolated else filled
|
||
pipeline = interpolated.get("components", {})
|
||
# If components are loaded from a source (existing models), we cache
|
||
# them here so they're only loaded once
|
||
source_nlps = {}
|
||
source_nlp_vectors_hashes = {}
|
||
vocab_b = None
|
||
for pipe_name in config["nlp"]["pipeline"]:
|
||
if pipe_name not in pipeline:
|
||
opts = ", ".join(pipeline.keys())
|
||
raise ValueError(Errors.E956.format(name=pipe_name, opts=opts))
|
||
pipe_cfg = util.copy_config(pipeline[pipe_name])
|
||
raw_config = Config(filled["components"][pipe_name])
|
||
if pipe_name not in exclude:
|
||
if "factory" not in pipe_cfg and "source" not in pipe_cfg:
|
||
err = Errors.E984.format(name=pipe_name, config=pipe_cfg)
|
||
raise ValueError(err)
|
||
if "factory" in pipe_cfg:
|
||
factory = pipe_cfg.pop("factory")
|
||
# The pipe name (key in the config) here is the unique name
|
||
# of the component, not necessarily the factory
|
||
nlp.add_pipe(
|
||
factory,
|
||
name=pipe_name,
|
||
config=pipe_cfg,
|
||
validate=validate,
|
||
raw_config=raw_config,
|
||
)
|
||
else:
|
||
assert "source" in pipe_cfg
|
||
# We need the sourced components to reference the same
|
||
# vocab without modifying the current vocab state **AND**
|
||
# we still want to load the source model vectors to perform
|
||
# the vectors check. Since the source vectors clobber the
|
||
# current ones, we save the original vocab state and
|
||
# restore after this loop. Existing strings are preserved
|
||
# during deserialization, so they do not need any
|
||
# additional handling.
|
||
if vocab_b is None:
|
||
vocab_b = nlp.vocab.to_bytes(exclude=["lookups", "strings"])
|
||
model = pipe_cfg["source"]
|
||
if model not in source_nlps:
|
||
# Load with the same vocab, adding any strings
|
||
source_nlps[model] = util.load_model(
|
||
model, vocab=nlp.vocab, exclude=["lookups"]
|
||
)
|
||
source_name = pipe_cfg.get("component", pipe_name)
|
||
listeners_replaced = False
|
||
if "replace_listeners" in pipe_cfg:
|
||
# Make sure that the listened-to component has the
|
||
# state of the source pipeline listener map so that the
|
||
# replace_listeners method below works as intended.
|
||
source_nlps[model]._link_components()
|
||
for name, proc in source_nlps[model].pipeline:
|
||
if source_name in getattr(proc, "listening_components", []):
|
||
source_nlps[model].replace_listeners(
|
||
name, source_name, pipe_cfg["replace_listeners"]
|
||
)
|
||
listeners_replaced = True
|
||
with warnings.catch_warnings():
|
||
warnings.filterwarnings("ignore", message="\\[W113\\]")
|
||
nlp.add_pipe(
|
||
source_name, source=source_nlps[model], name=pipe_name
|
||
)
|
||
# At this point after nlp.add_pipe, the listener map
|
||
# corresponds to the new pipeline.
|
||
if model not in source_nlp_vectors_hashes:
|
||
source_nlp_vectors_hashes[model] = hash(
|
||
source_nlps[model].vocab.vectors.to_bytes(
|
||
exclude=["strings"]
|
||
)
|
||
)
|
||
if "_sourced_vectors_hashes" not in nlp.meta:
|
||
nlp.meta["_sourced_vectors_hashes"] = {}
|
||
nlp.meta["_sourced_vectors_hashes"][
|
||
pipe_name
|
||
] = source_nlp_vectors_hashes[model]
|
||
# Delete from cache if listeners were replaced
|
||
if listeners_replaced:
|
||
del source_nlps[model]
|
||
# Restore the original vocab after sourcing if necessary
|
||
if vocab_b is not None:
|
||
nlp.vocab.from_bytes(vocab_b)
|
||
|
||
# Resolve disabled/enabled settings.
|
||
if isinstance(disable, str):
|
||
disable = [disable]
|
||
if isinstance(enable, str):
|
||
enable = [enable]
|
||
if isinstance(exclude, str):
|
||
exclude = [exclude]
|
||
|
||
# `enable` should not be merged with `enabled` (the opposite is true for `disable`/`disabled`). If the config
|
||
# specifies values for `enabled` not included in `enable`, emit warning.
|
||
if id(enable) != id(_DEFAULT_EMPTY_PIPES):
|
||
enabled = config["nlp"].get("enabled", [])
|
||
if len(enabled) and not set(enabled).issubset(enable):
|
||
warnings.warn(
|
||
Warnings.W123.format(
|
||
enable=enable,
|
||
enabled=enabled,
|
||
)
|
||
)
|
||
|
||
# Ensure sets of disabled/enabled pipe names are not contradictory.
|
||
disabled_pipes = cls._resolve_component_status(
|
||
list({*disable, *config["nlp"].get("disabled", [])}),
|
||
enable,
|
||
config["nlp"]["pipeline"],
|
||
)
|
||
nlp._disabled = set(p for p in disabled_pipes if p not in exclude)
|
||
|
||
nlp.batch_size = config["nlp"]["batch_size"]
|
||
nlp.config = filled if auto_fill else config
|
||
if after_pipeline_creation is not None:
|
||
nlp = after_pipeline_creation(nlp)
|
||
if not isinstance(nlp, cls):
|
||
raise ValueError(
|
||
Errors.E942.format(name="pipeline_creation", value=type(nlp))
|
||
)
|
||
return nlp
|
||
|
||
def replace_listeners(
|
||
self,
|
||
tok2vec_name: str,
|
||
pipe_name: str,
|
||
listeners: Iterable[str],
|
||
) -> None:
|
||
"""Find listener layers (connecting to a token-to-vector embedding
|
||
component) of a given pipeline component model and replace
|
||
them with a standalone copy of the token-to-vector layer. This can be
|
||
useful when training a pipeline with components sourced from an existing
|
||
pipeline: if multiple components (e.g. tagger, parser, NER) listen to
|
||
the same tok2vec component, but some of them are frozen and not updated,
|
||
their performance may degrade significantly as the tok2vec component is
|
||
updated with new data. To prevent this, listeners can be replaced with
|
||
a standalone tok2vec layer that is owned by the component and doesn't
|
||
change if the component isn't updated.
|
||
|
||
tok2vec_name (str): Name of the token-to-vector component, typically
|
||
"tok2vec" or "transformer".
|
||
pipe_name (str): Name of pipeline component to replace listeners for.
|
||
listeners (Iterable[str]): The paths to the listeners, relative to the
|
||
component config, e.g. ["model.tok2vec"]. Typically, implementations
|
||
will only connect to one tok2vec component, [model.tok2vec], but in
|
||
theory, custom models can use multiple listeners. The value here can
|
||
either be an empty list to not replace any listeners, or a complete
|
||
(!) list of the paths to all listener layers used by the model.
|
||
|
||
DOCS: https://spacy.io/api/language#replace_listeners
|
||
"""
|
||
if tok2vec_name not in self.pipe_names:
|
||
err = Errors.E889.format(
|
||
tok2vec=tok2vec_name,
|
||
name=pipe_name,
|
||
unknown=tok2vec_name,
|
||
opts=", ".join(self.pipe_names),
|
||
)
|
||
raise ValueError(err)
|
||
if pipe_name not in self.pipe_names:
|
||
err = Errors.E889.format(
|
||
tok2vec=tok2vec_name,
|
||
name=pipe_name,
|
||
unknown=pipe_name,
|
||
opts=", ".join(self.pipe_names),
|
||
)
|
||
raise ValueError(err)
|
||
tok2vec = self.get_pipe(tok2vec_name)
|
||
tok2vec_cfg = self.get_pipe_config(tok2vec_name)
|
||
if not isinstance(tok2vec, ty.ListenedToComponent):
|
||
raise ValueError(Errors.E888.format(name=tok2vec_name, pipe=type(tok2vec)))
|
||
tok2vec_model = tok2vec.model
|
||
pipe_listeners = tok2vec.listener_map.get(pipe_name, [])
|
||
pipe = self.get_pipe(pipe_name)
|
||
pipe_cfg = self._pipe_configs[pipe_name]
|
||
if listeners:
|
||
util.logger.debug("Replacing listeners of component '%s'", pipe_name)
|
||
if len(list(listeners)) != len(pipe_listeners):
|
||
# The number of listeners defined in the component model doesn't
|
||
# match the listeners to replace, so we won't be able to update
|
||
# the nodes and generate a matching config
|
||
err = Errors.E887.format(
|
||
name=pipe_name,
|
||
tok2vec=tok2vec_name,
|
||
paths=listeners,
|
||
n_listeners=len(pipe_listeners),
|
||
)
|
||
raise ValueError(err)
|
||
# Update the config accordingly by copying the tok2vec model to all
|
||
# sections defined in the listener paths
|
||
for listener_path in listeners:
|
||
# Check if the path actually exists in the config
|
||
try:
|
||
util.dot_to_object(pipe_cfg, listener_path)
|
||
except KeyError:
|
||
err = Errors.E886.format(
|
||
name=pipe_name, tok2vec=tok2vec_name, path=listener_path
|
||
)
|
||
raise ValueError(err)
|
||
new_config = tok2vec_cfg["model"]
|
||
if "replace_listener_cfg" in tok2vec_model.attrs:
|
||
replace_func = tok2vec_model.attrs["replace_listener_cfg"]
|
||
new_config = replace_func(
|
||
tok2vec_cfg["model"], pipe_cfg["model"]["tok2vec"]
|
||
)
|
||
util.set_dot_to_object(pipe_cfg, listener_path, new_config)
|
||
# Go over the listener layers and replace them
|
||
for listener in pipe_listeners:
|
||
new_model = tok2vec_model.copy()
|
||
if "replace_listener" in tok2vec_model.attrs:
|
||
new_model = tok2vec_model.attrs["replace_listener"](new_model)
|
||
util.replace_model_node(pipe.model, listener, new_model) # type: ignore[attr-defined]
|
||
tok2vec.remove_listener(listener, pipe_name)
|
||
|
||
def to_disk(
|
||
self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
|
||
) -> None:
|
||
"""Save the current state to a directory. If a model is loaded, this
|
||
will include the model.
|
||
|
||
path (str / Path): Path to a directory, which will be created if
|
||
it doesn't exist.
|
||
exclude (Iterable[str]): Names of components or serialization fields to exclude.
|
||
|
||
DOCS: https://spacy.io/api/language#to_disk
|
||
"""
|
||
path = util.ensure_path(path)
|
||
serializers = {}
|
||
serializers["tokenizer"] = lambda p: self.tokenizer.to_disk( # type: ignore[union-attr]
|
||
p, exclude=["vocab"]
|
||
)
|
||
serializers["meta.json"] = lambda p: srsly.write_json(p, self.meta)
|
||
serializers["config.cfg"] = lambda p: self.config.to_disk(p)
|
||
for name, proc in self._components:
|
||
if name in exclude:
|
||
continue
|
||
if not hasattr(proc, "to_disk"):
|
||
continue
|
||
serializers[name] = lambda p, proc=proc: proc.to_disk(p, exclude=["vocab"]) # type: ignore[misc]
|
||
serializers["vocab"] = lambda p: self.vocab.to_disk(p, exclude=exclude)
|
||
util.to_disk(path, serializers, exclude)
|
||
|
||
@staticmethod
|
||
def _resolve_component_status(
|
||
disable: Union[str, Iterable[str]],
|
||
enable: Union[str, Iterable[str]],
|
||
pipe_names: Iterable[str],
|
||
) -> Tuple[str, ...]:
|
||
"""Derives whether (1) `disable` and `enable` values are consistent and (2)
|
||
resolves those to a single set of disabled components. Raises an error in
|
||
case of inconsistency.
|
||
|
||
disable (Union[str, Iterable[str]]): Name(s) of component(s) or serialization fields to disable.
|
||
enable (Union[str, Iterable[str]]): Name(s) of pipeline component(s) to enable.
|
||
pipe_names (Iterable[str]): Names of all pipeline components.
|
||
|
||
RETURNS (Tuple[str, ...]): Names of components to exclude from pipeline w.r.t.
|
||
specified includes and excludes.
|
||
"""
|
||
|
||
if isinstance(disable, str):
|
||
disable = [disable]
|
||
to_disable = disable
|
||
|
||
if enable:
|
||
if isinstance(enable, str):
|
||
enable = [enable]
|
||
to_disable = {
|
||
*[pipe_name for pipe_name in pipe_names if pipe_name not in enable],
|
||
*disable,
|
||
}
|
||
# If any pipe to be enabled is in to_disable, the specification is inconsistent.
|
||
if len(set(enable) & to_disable):
|
||
raise ValueError(Errors.E1042.format(enable=enable, disable=disable))
|
||
|
||
return tuple(to_disable)
|
||
|
||
def from_disk(
|
||
self,
|
||
path: Union[str, Path],
|
||
*,
|
||
exclude: Iterable[str] = SimpleFrozenList(),
|
||
overrides: Dict[str, Any] = SimpleFrozenDict(),
|
||
) -> "Language":
|
||
"""Loads state from a directory. Modifies the object in place and
|
||
returns it. If the saved `Language` object contains a model, the
|
||
model will be loaded.
|
||
|
||
path (str / Path): A path to a directory.
|
||
exclude (Iterable[str]): Names of components or serialization fields to exclude.
|
||
RETURNS (Language): The modified `Language` object.
|
||
|
||
DOCS: https://spacy.io/api/language#from_disk
|
||
"""
|
||
|
||
def deserialize_meta(path: Path) -> None:
|
||
if path.exists():
|
||
data = srsly.read_json(path)
|
||
self.meta.update(data)
|
||
|
||
def deserialize_vocab(path: Path) -> None:
|
||
if path.exists():
|
||
self.vocab.from_disk(path, exclude=exclude)
|
||
|
||
path = util.ensure_path(path)
|
||
deserializers = {}
|
||
if Path(path / "config.cfg").exists(): # type: ignore[operator]
|
||
deserializers["config.cfg"] = lambda p: self.config.from_disk(
|
||
p, interpolate=False, overrides=overrides
|
||
)
|
||
deserializers["meta.json"] = deserialize_meta # type: ignore[assignment]
|
||
deserializers["vocab"] = deserialize_vocab # type: ignore[assignment]
|
||
deserializers["tokenizer"] = lambda p: self.tokenizer.from_disk( # type: ignore[union-attr]
|
||
p, exclude=["vocab"]
|
||
)
|
||
for name, proc in self._components:
|
||
if name in exclude:
|
||
continue
|
||
if not hasattr(proc, "from_disk"):
|
||
continue
|
||
deserializers[name] = lambda p, proc=proc: proc.from_disk( # type: ignore[misc]
|
||
p, exclude=["vocab"]
|
||
)
|
||
if not (path / "vocab").exists() and "vocab" not in exclude: # type: ignore[operator]
|
||
# Convert to list here in case exclude is (default) tuple
|
||
exclude = list(exclude) + ["vocab"]
|
||
util.from_disk(path, deserializers, exclude) # type: ignore[arg-type]
|
||
self._path = path # type: ignore[assignment]
|
||
self._link_components()
|
||
return self
|
||
|
||
def to_bytes(self, *, exclude: Iterable[str] = SimpleFrozenList()) -> bytes:
|
||
"""Serialize the current state to a binary string.
|
||
|
||
exclude (Iterable[str]): Names of components or serialization fields to exclude.
|
||
RETURNS (bytes): The serialized form of the `Language` object.
|
||
|
||
DOCS: https://spacy.io/api/language#to_bytes
|
||
"""
|
||
serializers: Dict[str, Callable[[], bytes]] = {}
|
||
serializers["vocab"] = lambda: self.vocab.to_bytes(exclude=exclude)
|
||
serializers["tokenizer"] = lambda: self.tokenizer.to_bytes(exclude=["vocab"]) # type: ignore[union-attr]
|
||
serializers["meta.json"] = lambda: srsly.json_dumps(self.meta)
|
||
serializers["config.cfg"] = lambda: self.config.to_bytes()
|
||
for name, proc in self._components:
|
||
if name in exclude:
|
||
continue
|
||
if not hasattr(proc, "to_bytes"):
|
||
continue
|
||
serializers[name] = lambda proc=proc: proc.to_bytes(exclude=["vocab"]) # type: ignore[misc]
|
||
return util.to_bytes(serializers, exclude)
|
||
|
||
def from_bytes(
|
||
self, bytes_data: bytes, *, exclude: Iterable[str] = SimpleFrozenList()
|
||
) -> "Language":
|
||
"""Load state from a binary string.
|
||
|
||
bytes_data (bytes): The data to load from.
|
||
exclude (Iterable[str]): Names of components or serialization fields to exclude.
|
||
RETURNS (Language): The `Language` object.
|
||
|
||
DOCS: https://spacy.io/api/language#from_bytes
|
||
"""
|
||
|
||
def deserialize_meta(b):
|
||
data = srsly.json_loads(b)
|
||
self.meta.update(data)
|
||
|
||
deserializers: Dict[str, Callable[[bytes], Any]] = {}
|
||
deserializers["config.cfg"] = lambda b: self.config.from_bytes(
|
||
b, interpolate=False
|
||
)
|
||
deserializers["meta.json"] = deserialize_meta
|
||
deserializers["vocab"] = lambda b: self.vocab.from_bytes(b, exclude=exclude)
|
||
deserializers["tokenizer"] = lambda b: self.tokenizer.from_bytes( # type: ignore[union-attr]
|
||
b, exclude=["vocab"]
|
||
)
|
||
for name, proc in self._components:
|
||
if name in exclude:
|
||
continue
|
||
if not hasattr(proc, "from_bytes"):
|
||
continue
|
||
deserializers[name] = lambda b, proc=proc: proc.from_bytes( # type: ignore[misc]
|
||
b, exclude=["vocab"]
|
||
)
|
||
util.from_bytes(bytes_data, deserializers, exclude)
|
||
self._link_components()
|
||
return self
|
||
|
||
|
||
@dataclass
|
||
class FactoryMeta:
|
||
"""Dataclass containing information about a component and its defaults
|
||
provided by the @Language.component or @Language.factory decorator. It's
|
||
created whenever a component is defined and stored on the Language class for
|
||
each component instance and factory instance.
|
||
"""
|
||
|
||
factory: str
|
||
default_config: Optional[Dict[str, Any]] = None # noqa: E704
|
||
assigns: Iterable[str] = tuple()
|
||
requires: Iterable[str] = tuple()
|
||
retokenizes: bool = False
|
||
scores: Iterable[str] = tuple()
|
||
default_score_weights: Optional[Dict[str, Optional[float]]] = None # noqa: E704
|
||
|
||
|
||
class DisabledPipes(list):
|
||
"""Manager for temporary pipeline disabling."""
|
||
|
||
def __init__(self, nlp: Language, names: List[str]) -> None:
|
||
self.nlp = nlp
|
||
self.names = names
|
||
for name in self.names:
|
||
self.nlp.disable_pipe(name)
|
||
list.__init__(self)
|
||
self.extend(self.names)
|
||
|
||
def __enter__(self):
|
||
return self
|
||
|
||
def __exit__(self, *args):
|
||
self.restore()
|
||
|
||
def restore(self) -> None:
|
||
"""Restore the pipeline to its state when DisabledPipes was created."""
|
||
for name in self.names:
|
||
if name not in self.nlp.component_names:
|
||
raise ValueError(Errors.E008.format(name=name))
|
||
self.nlp.enable_pipe(name)
|
||
self[:] = []
|
||
|
||
|
||
def _copy_examples(
|
||
examples: Iterable[Example], *, copy_x: bool = True, copy_y: bool = False
|
||
) -> List[Example]:
|
||
"""Make a copy of a batch of examples, copying the predicted Doc as well.
|
||
This is used in contexts where we need to take ownership of the examples
|
||
so that they can be mutated, for instance during Language.evaluate and
|
||
Language.update.
|
||
"""
|
||
return [
|
||
Example(eg.x.copy() if copy_x else eg.x, eg.y.copy() if copy_y else eg.y)
|
||
for eg in examples
|
||
]
|
||
|
||
|
||
def _apply_pipes(
|
||
ensure_doc: Callable[[Union[str, Doc, bytes], _AnyContext], Doc],
|
||
pipes: Iterable[Callable[..., Iterator[Doc]]],
|
||
receiver,
|
||
sender,
|
||
underscore_state: Tuple[dict, dict, dict],
|
||
) -> None:
|
||
"""Worker for Language.pipe
|
||
|
||
ensure_doc (Callable[[Union[str, Doc]], Doc]): Function to create Doc from text
|
||
or raise an error if the input is neither a Doc nor a string.
|
||
pipes (Iterable[Pipe]): The components to apply.
|
||
receiver (multiprocessing.Connection): Pipe to receive text. Usually
|
||
created by `multiprocessing.Pipe()`
|
||
sender (multiprocessing.Connection): Pipe to send doc. Usually created by
|
||
`multiprocessing.Pipe()`
|
||
underscore_state (Tuple[dict, dict, dict]): The data in the Underscore class
|
||
of the parent.
|
||
"""
|
||
Underscore.load_state(underscore_state)
|
||
while True:
|
||
try:
|
||
texts_with_ctx = receiver.get()
|
||
docs = (
|
||
ensure_doc(doc_like, context) for doc_like, context in texts_with_ctx
|
||
)
|
||
for pipe in pipes:
|
||
docs = pipe(docs) # type: ignore[arg-type, assignment]
|
||
# Connection does not accept unpickable objects, so send list.
|
||
byte_docs = [(doc.to_bytes(), doc._context, None) for doc in docs]
|
||
padding = [(None, None, None)] * (len(texts_with_ctx) - len(byte_docs))
|
||
sender.send(byte_docs + padding) # type: ignore[operator]
|
||
except Exception:
|
||
error_msg = [(None, None, srsly.msgpack_dumps(traceback.format_exc()))]
|
||
padding = [(None, None, None)] * (len(texts_with_ctx) - 1)
|
||
sender.send(error_msg + padding)
|
||
|
||
|
||
class _Sender:
|
||
"""Util for sending data to multiprocessing workers in Language.pipe"""
|
||
|
||
def __init__(
|
||
self, data: Iterable[Any], queues: List[mp.Queue], chunk_size: int
|
||
) -> None:
|
||
self.data = iter(data)
|
||
self.queues = iter(cycle(queues))
|
||
self.chunk_size = chunk_size
|
||
self.count = 0
|
||
|
||
def send(self) -> None:
|
||
"""Send chunk_size items from self.data to channels."""
|
||
for item, q in itertools.islice(
|
||
zip(self.data, cycle(self.queues)), self.chunk_size
|
||
):
|
||
# cycle channels so that distribute the texts evenly
|
||
q.put(item)
|
||
|
||
def step(self) -> None:
|
||
"""Tell sender that comsumed one item. Data is sent to the workers after
|
||
every chunk_size calls.
|
||
"""
|
||
self.count += 1
|
||
if self.count >= self.chunk_size:
|
||
self.count = 0
|
||
self.send()
|